]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_cluster.c
xnu-6153.81.5.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_cluster.c
1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <sys/malloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <miscfs/specfs/specdev.h>
75 #include <sys/uio_internal.h>
76 #include <libkern/libkern.h>
77 #include <machine/machine_routines.h>
78
79 #include <sys/ubc_internal.h>
80 #include <vm/vnode_pager.h>
81
82 #include <mach/mach_types.h>
83 #include <mach/memory_object_types.h>
84 #include <mach/vm_map.h>
85 #include <mach/upl.h>
86 #include <kern/task.h>
87 #include <kern/policy_internal.h>
88
89 #include <vm/vm_kern.h>
90 #include <vm/vm_map.h>
91 #include <vm/vm_pageout.h>
92 #include <vm/vm_fault.h>
93
94 #include <sys/kdebug.h>
95 #include <libkern/OSAtomic.h>
96
97 #include <sys/sdt.h>
98
99 #include <stdbool.h>
100
101 #include <vfs/vfs_disk_conditioner.h>
102
103 #if 0
104 #undef KERNEL_DEBUG
105 #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT
106 #endif
107
108
109 #define CL_READ 0x01
110 #define CL_WRITE 0x02
111 #define CL_ASYNC 0x04
112 #define CL_COMMIT 0x08
113 #define CL_PAGEOUT 0x10
114 #define CL_AGE 0x20
115 #define CL_NOZERO 0x40
116 #define CL_PAGEIN 0x80
117 #define CL_DEV_MEMORY 0x100
118 #define CL_PRESERVE 0x200
119 #define CL_THROTTLE 0x400
120 #define CL_KEEPCACHED 0x800
121 #define CL_DIRECT_IO 0x1000
122 #define CL_PASSIVE 0x2000
123 #define CL_IOSTREAMING 0x4000
124 #define CL_CLOSE 0x8000
125 #define CL_ENCRYPTED 0x10000
126 #define CL_RAW_ENCRYPTED 0x20000
127 #define CL_NOCACHE 0x40000
128
129 #define MAX_VECTOR_UPL_ELEMENTS 8
130 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE_BYTES)
131
132 #define CLUSTER_IO_WAITING ((buf_t)1)
133
134 extern upl_t vector_upl_create(vm_offset_t);
135 extern boolean_t vector_upl_is_valid(upl_t);
136 extern boolean_t vector_upl_set_subupl(upl_t, upl_t, u_int32_t);
137 extern void vector_upl_set_pagelist(upl_t);
138 extern void vector_upl_set_iostate(upl_t, upl_t, vm_offset_t, u_int32_t);
139
140 struct clios {
141 lck_mtx_t io_mtxp;
142 u_int io_completed; /* amount of io that has currently completed */
143 u_int io_issued; /* amount of io that was successfully issued */
144 int io_error; /* error code of first error encountered */
145 int io_wanted; /* someone is sleeping waiting for a change in state */
146 };
147
148 struct cl_direct_read_lock {
149 LIST_ENTRY(cl_direct_read_lock) chain;
150 int32_t ref_count;
151 vnode_t vp;
152 lck_rw_t rw_lock;
153 };
154
155 #define CL_DIRECT_READ_LOCK_BUCKETS 61
156
157 static LIST_HEAD(cl_direct_read_locks, cl_direct_read_lock)
158 cl_direct_read_locks[CL_DIRECT_READ_LOCK_BUCKETS];
159
160 static lck_spin_t cl_direct_read_spin_lock;
161
162 static lck_grp_t *cl_mtx_grp;
163 static lck_attr_t *cl_mtx_attr;
164 static lck_grp_attr_t *cl_mtx_grp_attr;
165 static lck_mtx_t *cl_transaction_mtxp;
166
167 #define IO_UNKNOWN 0
168 #define IO_DIRECT 1
169 #define IO_CONTIG 2
170 #define IO_COPY 3
171
172 #define PUSH_DELAY 0x01
173 #define PUSH_ALL 0x02
174 #define PUSH_SYNC 0x04
175
176
177 static void cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset);
178 static void cluster_wait_IO(buf_t cbp_head, int async);
179 static void cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait);
180
181 static int cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length);
182
183 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
184 int flags, buf_t real_bp, struct clios *iostate, int (*)(buf_t, void *), void *callback_arg);
185 static int cluster_iodone(buf_t bp, void *callback_arg);
186 static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp);
187 static int cluster_is_throttled(vnode_t vp);
188
189 static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name);
190
191 static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg, int flags);
192
193 static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference);
194 static int cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference);
195
196 static int cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags,
197 int (*)(buf_t, void *), void *callback_arg);
198 static int cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
199 int flags, int (*)(buf_t, void *), void *callback_arg);
200 static int cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
201 int (*)(buf_t, void *), void *callback_arg, int flags);
202
203 static int cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF,
204 off_t headOff, off_t tailOff, int flags, int (*)(buf_t, void *), void *callback_arg);
205 static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
206 int *write_type, u_int32_t *write_length, int flags, int (*)(buf_t, void *), void *callback_arg);
207 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
208 int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
209
210 static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
211 off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
212
213 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
214
215 static int cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
216 static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
217 int (*callback)(buf_t, void *), void *callback_arg, int bflag);
218
219 static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_ioitiated);
220
221 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
222 void *callback_arg, int *err, boolean_t vm_initiated);
223
224 static int sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
225 static int sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
226 int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
227 static int sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
228 int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
229
230 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
231 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
232 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
233 static kern_return_t vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag);
234
235
236 /*
237 * For throttled IO to check whether
238 * a block is cached by the boot cache
239 * and thus it can avoid delaying the IO.
240 *
241 * bootcache_contains_block is initially
242 * NULL. The BootCache will set it while
243 * the cache is active and clear it when
244 * the cache is jettisoned.
245 *
246 * Returns 0 if the block is not
247 * contained in the cache, 1 if it is
248 * contained.
249 *
250 * The function pointer remains valid
251 * after the cache has been evicted even
252 * if bootcache_contains_block has been
253 * cleared.
254 *
255 * See rdar://9974130 The new throttling mechanism breaks the boot cache for throttled IOs
256 */
257 int (*bootcache_contains_block)(dev_t device, u_int64_t blkno) = NULL;
258
259
260 /*
261 * limit the internal I/O size so that we
262 * can represent it in a 32 bit int
263 */
264 #define MAX_IO_REQUEST_SIZE (1024 * 1024 * 512)
265 #define MAX_IO_CONTIG_SIZE MAX_UPL_SIZE_BYTES
266 #define MAX_VECTS 16
267 /*
268 * The MIN_DIRECT_WRITE_SIZE governs how much I/O should be issued before we consider
269 * allowing the caller to bypass the buffer cache. For small I/Os (less than 16k),
270 * we have not historically allowed the write to bypass the UBC.
271 */
272 #define MIN_DIRECT_WRITE_SIZE (16384)
273
274 #define WRITE_THROTTLE 6
275 #define WRITE_THROTTLE_SSD 2
276 #define WRITE_BEHIND 1
277 #define WRITE_BEHIND_SSD 1
278
279 #if CONFIG_EMBEDDED
280 #define PREFETCH 1
281 #define PREFETCH_SSD 1
282 uint32_t speculative_prefetch_max = (2048 * 1024); /* maximum bytes in a specluative read-ahead */
283 uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead */
284 #else
285 #define PREFETCH 3
286 #define PREFETCH_SSD 2
287 uint32_t speculative_prefetch_max = (MAX_UPL_SIZE_BYTES * 3); /* maximum bytes in a specluative read-ahead */
288 uint32_t speculative_prefetch_max_iosize = (512 * 1024); /* maximum I/O size to use in a specluative read-ahead on SSDs*/
289 #endif
290
291
/*
 * Per-mount scaling helpers.
 *
 * IO_SCALE(vp, base)             - 'base' multiplied by the mnt_ioscale of
 *                                  the mount vp lives on
 * MAX_CLUSTER_SIZE(vp)           - cluster_max_io_size() for writes on vp's
 *                                  mount
 * MAX_PREFETCH(vp, size, is_ssd) - 'size' scaled by IO_SCALE of PREFETCH_SSD
 *                                  or PREFETCH, depending on is_ssd
 *
 * Every argument is parenthesized in the expansion so that callers may
 * pass arbitrary expressions (e.g. "a + b" or "&vnode").
 */
#define IO_SCALE(vp, base)              ((vp)->v_mount->mnt_ioscale * (base))
#define MAX_CLUSTER_SIZE(vp)            (cluster_max_io_size((vp)->v_mount, CL_WRITE))
#define MAX_PREFETCH(vp, size, is_ssd)  ((size) * IO_SCALE(vp, ((is_ssd) ? PREFETCH_SSD : PREFETCH)))
295
296 int speculative_reads_disabled = 0;
297
298 /*
299 * throttle the number of async writes that
300 * can be outstanding on a single vnode
301 * before we issue a synchronous write
302 */
303 #define THROTTLE_MAXCNT 0
304
305 uint32_t throttle_max_iosize = (128 * 1024);
306
307 #define THROTTLE_MAX_IOSIZE (throttle_max_iosize)
308
309 SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_max_iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &throttle_max_iosize, 0, "");
310
311
312 void
313 cluster_init(void)
314 {
315 /*
316 * allocate lock group attribute and group
317 */
318 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
319 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
320
321 /*
322 * allocate the lock attribute
323 */
324 cl_mtx_attr = lck_attr_alloc_init();
325
326 cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
327
328 if (cl_transaction_mtxp == NULL) {
329 panic("cluster_init: failed to allocate cl_transaction_mtxp");
330 }
331
332 lck_spin_init(&cl_direct_read_spin_lock, cl_mtx_grp, cl_mtx_attr);
333
334 for (int i = 0; i < CL_DIRECT_READ_LOCK_BUCKETS; ++i) {
335 LIST_INIT(&cl_direct_read_locks[i]);
336 }
337 }
338
339
340 uint32_t
341 cluster_max_io_size(mount_t mp, int type)
342 {
343 uint32_t max_io_size;
344 uint32_t segcnt;
345 uint32_t maxcnt;
346
347 switch (type) {
348 case CL_READ:
349 segcnt = mp->mnt_segreadcnt;
350 maxcnt = mp->mnt_maxreadcnt;
351 break;
352 case CL_WRITE:
353 segcnt = mp->mnt_segwritecnt;
354 maxcnt = mp->mnt_maxwritecnt;
355 break;
356 default:
357 segcnt = min(mp->mnt_segreadcnt, mp->mnt_segwritecnt);
358 maxcnt = min(mp->mnt_maxreadcnt, mp->mnt_maxwritecnt);
359 break;
360 }
361 if (segcnt > (MAX_UPL_SIZE_BYTES >> PAGE_SHIFT)) {
362 /*
363 * don't allow a size beyond the max UPL size we can create
364 */
365 segcnt = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
366 }
367 max_io_size = min((segcnt * PAGE_SIZE), maxcnt);
368
369 if (max_io_size < MAX_UPL_TRANSFER_BYTES) {
370 /*
371 * don't allow a size smaller than the old fixed limit
372 */
373 max_io_size = MAX_UPL_TRANSFER_BYTES;
374 } else {
375 /*
376 * make sure the size specified is a multiple of PAGE_SIZE
377 */
378 max_io_size &= ~PAGE_MASK;
379 }
380 return max_io_size;
381 }
382
383
384
385
/*
 * CLW_* flag bits.  cluster_get_wbp() interprets CLW_ALLOCATE (create
 * the write-behind context if it is missing) and CLW_RETURNLOCKED
 * (return with the context's lock held).
 */
#define CLW_ALLOCATE            0x01
#define CLW_RETURNLOCKED        0x02
#define CLW_IONOCACHE           0x04
#define CLW_IOPASSIVE           0x08
390
391 /*
392 * if the read ahead context doesn't yet exist,
393 * allocate and initialize it...
394 * the vnode lock serializes multiple callers
395 * during the actual assignment... first one
396 * to grab the lock wins... the other callers
397 * will release the now unnecessary storage
398 *
399 * once the context is present, try to grab (but don't block on)
400 * the lock associated with it... if someone
401 * else currently owns it, than the read
402 * will run without read-ahead. this allows
403 * multiple readers to run in parallel and
404 * since there's only 1 read ahead context,
405 * there's no real loss in only allowing 1
406 * reader to have read-ahead enabled.
407 */
408 static struct cl_readahead *
409 cluster_get_rap(vnode_t vp)
410 {
411 struct ubc_info *ubc;
412 struct cl_readahead *rap;
413
414 ubc = vp->v_ubcinfo;
415
416 if ((rap = ubc->cl_rahead) == NULL) {
417 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
418
419 bzero(rap, sizeof *rap);
420 rap->cl_lastr = -1;
421 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
422
423 vnode_lock(vp);
424
425 if (ubc->cl_rahead == NULL) {
426 ubc->cl_rahead = rap;
427 } else {
428 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
429 FREE_ZONE(rap, sizeof *rap, M_CLRDAHEAD);
430 rap = ubc->cl_rahead;
431 }
432 vnode_unlock(vp);
433 }
434 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE) {
435 return rap;
436 }
437
438 return (struct cl_readahead *)NULL;
439 }
440
441
442 /*
443 * if the write behind context doesn't yet exist,
444 * and CLW_ALLOCATE is specified, allocate and initialize it...
445 * the vnode lock serializes multiple callers
446 * during the actual assignment... first one
447 * to grab the lock wins... the other callers
448 * will release the now unnecessary storage
449 *
450 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
451 * the lock associated with the write behind context before
452 * returning
453 */
454
455 static struct cl_writebehind *
456 cluster_get_wbp(vnode_t vp, int flags)
457 {
458 struct ubc_info *ubc;
459 struct cl_writebehind *wbp;
460
461 ubc = vp->v_ubcinfo;
462
463 if ((wbp = ubc->cl_wbehind) == NULL) {
464 if (!(flags & CLW_ALLOCATE)) {
465 return (struct cl_writebehind *)NULL;
466 }
467
468 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
469
470 bzero(wbp, sizeof *wbp);
471 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
472
473 vnode_lock(vp);
474
475 if (ubc->cl_wbehind == NULL) {
476 ubc->cl_wbehind = wbp;
477 } else {
478 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
479 FREE_ZONE(wbp, sizeof *wbp, M_CLWRBEHIND);
480 wbp = ubc->cl_wbehind;
481 }
482 vnode_unlock(vp);
483 }
484 if (flags & CLW_RETURNLOCKED) {
485 lck_mtx_lock(&wbp->cl_lockw);
486 }
487
488 return wbp;
489 }
490
491
492 static void
493 cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, int flags)
494 {
495 struct cl_writebehind *wbp;
496
497 if ((wbp = cluster_get_wbp(vp, 0)) != NULL) {
498 if (wbp->cl_number) {
499 lck_mtx_lock(&wbp->cl_lockw);
500
501 cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
502
503 lck_mtx_unlock(&wbp->cl_lockw);
504 }
505 }
506 }
507
508
509 static int
510 cluster_io_present_in_BC(vnode_t vp, off_t f_offset)
511 {
512 daddr64_t blkno;
513 size_t io_size;
514 int (*bootcache_check_fn)(dev_t device, u_int64_t blkno) = bootcache_contains_block;
515
516 if (bootcache_check_fn && vp->v_mount && vp->v_mount->mnt_devvp) {
517 if (VNOP_BLOCKMAP(vp, f_offset, PAGE_SIZE, &blkno, &io_size, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL)) {
518 return 0;
519 }
520
521 if (io_size == 0) {
522 return 0;
523 }
524
525 if (bootcache_check_fn(vp->v_mount->mnt_devvp->v_rdev, blkno)) {
526 return 1;
527 }
528 }
529 return 0;
530 }
531
532
533 static int
534 cluster_is_throttled(vnode_t vp)
535 {
536 return throttle_io_will_be_throttled(-1, vp->v_mount);
537 }
538
539
540 static void
541 cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name)
542 {
543 lck_mtx_lock(&iostate->io_mtxp);
544
545 while ((iostate->io_issued - iostate->io_completed) > target) {
546 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START,
547 iostate->io_issued, iostate->io_completed, target, 0, 0);
548
549 iostate->io_wanted = 1;
550 msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL);
551
552 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END,
553 iostate->io_issued, iostate->io_completed, target, 0, 0);
554 }
555 lck_mtx_unlock(&iostate->io_mtxp);
556 }
557
558 static void
559 cluster_handle_associated_upl(struct clios *iostate, upl_t upl,
560 upl_offset_t upl_offset, upl_size_t size)
561 {
562 if (!size) {
563 return;
564 }
565
566 upl_t associated_upl = upl_associated_upl(upl);
567
568 if (!associated_upl) {
569 return;
570 }
571
572 #if 0
573 printf("1: %d %d\n", upl_offset, upl_offset + size);
574 #endif
575
576 /*
577 * The associated UPL is page aligned to file offsets whereas the
578 * UPL it's attached to has different alignment requirements. The
579 * upl_offset that we have refers to @upl. The code that follows
580 * has to deal with the first and last pages in this transaction
581 * which might straddle pages in the associated UPL. To keep
582 * track of these pages, we use the mark bits: if the mark bit is
583 * set, we know another transaction has completed its part of that
584 * page and so we can unlock that page here.
585 *
586 * The following illustrates what we have to deal with:
587 *
588 * MEM u <------------ 1 PAGE ------------> e
589 * +-------------+----------------------+-----------------
590 * | |######################|#################
591 * +-------------+----------------------+-----------------
592 * FILE | <--- a ---> o <------------ 1 PAGE ------------>
593 *
594 * So here we show a write to offset @o. The data that is to be
595 * written is in a buffer that is not page aligned; it has offset
596 * @a in the page. The upl that carries the data starts in memory
597 * at @u. The associated upl starts in the file at offset @o. A
598 * transaction will always end on a page boundary (like @e above)
599 * except for the very last transaction in the group. We cannot
600 * unlock the page at @o in the associated upl until both the
601 * transaction ending at @e and the following transaction (that
602 * starts at @e) has completed.
603 */
604
605 /*
606 * We record whether or not the two UPLs are aligned as the mark
607 * bit in the first page of @upl.
608 */
609 upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
610 bool is_unaligned = upl_page_get_mark(pl, 0);
611
612 if (is_unaligned) {
613 upl_page_info_t *assoc_pl = UPL_GET_INTERNAL_PAGE_LIST(associated_upl);
614
615 upl_offset_t upl_end = upl_offset + size;
616 assert(upl_end >= PAGE_SIZE);
617
618 upl_size_t assoc_upl_size = upl_get_size(associated_upl);
619
620 /*
621 * In the very first transaction in the group, upl_offset will
622 * not be page aligned, but after that it will be and in that
623 * case we want the preceding page in the associated UPL hence
624 * the minus one.
625 */
626 assert(upl_offset);
627 if (upl_offset) {
628 upl_offset = trunc_page_32(upl_offset - 1);
629 }
630
631 lck_mtx_lock_spin(&iostate->io_mtxp);
632
633 // Look at the first page...
634 if (upl_offset
635 && !upl_page_get_mark(assoc_pl, upl_offset >> PAGE_SHIFT)) {
636 /*
637 * The first page isn't marked so let another transaction
638 * completion handle it.
639 */
640 upl_page_set_mark(assoc_pl, upl_offset >> PAGE_SHIFT, true);
641 upl_offset += PAGE_SIZE;
642 }
643
644 // And now the last page...
645
646 /*
647 * This needs to be > rather than >= because if it's equal, it
648 * means there's another transaction that is sharing the last
649 * page.
650 */
651 if (upl_end > assoc_upl_size) {
652 upl_end = assoc_upl_size;
653 } else {
654 upl_end = trunc_page_32(upl_end);
655 const int last_pg = (upl_end >> PAGE_SHIFT) - 1;
656
657 if (!upl_page_get_mark(assoc_pl, last_pg)) {
658 /*
659 * The last page isn't marked so mark the page and let another
660 * transaction completion handle it.
661 */
662 upl_page_set_mark(assoc_pl, last_pg, true);
663 upl_end -= PAGE_SIZE;
664 }
665 }
666
667 lck_mtx_unlock(&iostate->io_mtxp);
668
669 #if 0
670 printf("2: %d %d\n", upl_offset, upl_end);
671 #endif
672
673 if (upl_end <= upl_offset) {
674 return;
675 }
676
677 size = upl_end - upl_offset;
678 } else {
679 assert(!(upl_offset & PAGE_MASK));
680 assert(!(size & PAGE_MASK));
681 }
682
683 boolean_t empty;
684
685 /*
686 * We can unlock these pages now and as this is for a
687 * direct/uncached write, we want to dump the pages too.
688 */
689 kern_return_t kr = upl_abort_range(associated_upl, upl_offset, size,
690 UPL_ABORT_DUMP_PAGES, &empty);
691
692 assert(!kr);
693
694 if (!kr && empty) {
695 upl_set_associated_upl(upl, NULL);
696 upl_deallocate(associated_upl);
697 }
698 }
699
700 static int
701 cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags, vnode_t vp)
702 {
703 int upl_abort_code = 0;
704 int page_in = 0;
705 int page_out = 0;
706
707 if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) {
708 /*
709 * direct write of any flavor, or a direct read that wasn't aligned
710 */
711 ubc_upl_commit_range(upl, upl_offset, abort_size, UPL_COMMIT_FREE_ON_EMPTY);
712 } else {
713 if (io_flags & B_PAGEIO) {
714 if (io_flags & B_READ) {
715 page_in = 1;
716 } else {
717 page_out = 1;
718 }
719 }
720 if (io_flags & B_CACHE) {
721 /*
722 * leave pages in the cache unchanged on error
723 */
724 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
725 } else if (((io_flags & B_READ) == 0) && ((error != ENXIO) || vnode_isswap(vp))) {
726 /*
727 * transient error on pageout/write path... leave pages unchanged
728 */
729 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
730 } else if (page_in) {
731 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
732 } else {
733 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
734 }
735
736 ubc_upl_abort_range(upl, upl_offset, abort_size, upl_abort_code);
737 }
738 return upl_abort_code;
739 }
740
741
742 static int
743 cluster_iodone(buf_t bp, void *callback_arg)
744 {
745 int b_flags;
746 int error;
747 int total_size;
748 int total_resid;
749 int upl_offset;
750 int zero_offset;
751 int pg_offset = 0;
752 int commit_size = 0;
753 int upl_flags = 0;
754 int transaction_size = 0;
755 upl_t upl;
756 buf_t cbp;
757 buf_t cbp_head;
758 buf_t cbp_next;
759 buf_t real_bp;
760 vnode_t vp;
761 struct clios *iostate;
762 boolean_t transaction_complete = FALSE;
763
764 __IGNORE_WCASTALIGN(cbp_head = (buf_t)(bp->b_trans_head));
765
766 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
767 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
768
769 if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) {
770 lck_mtx_lock_spin(cl_transaction_mtxp);
771
772 bp->b_flags |= B_TDONE;
773
774 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
775 /*
776 * all I/O requests that are part of this transaction
777 * have to complete before we can process it
778 */
779 if (!(cbp->b_flags & B_TDONE)) {
780 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
781 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
782
783 lck_mtx_unlock(cl_transaction_mtxp);
784
785 return 0;
786 }
787
788 if (cbp->b_trans_next == CLUSTER_IO_WAITING) {
789 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
790 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
791
792 lck_mtx_unlock(cl_transaction_mtxp);
793 wakeup(cbp);
794
795 return 0;
796 }
797
798 if (cbp->b_flags & B_EOT) {
799 transaction_complete = TRUE;
800 }
801 }
802 lck_mtx_unlock(cl_transaction_mtxp);
803
804 if (transaction_complete == FALSE) {
805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
806 cbp_head, 0, 0, 0, 0);
807 return 0;
808 }
809 }
810 error = 0;
811 total_size = 0;
812 total_resid = 0;
813
814 cbp = cbp_head;
815 vp = cbp->b_vp;
816 upl_offset = cbp->b_uploffset;
817 upl = cbp->b_upl;
818 b_flags = cbp->b_flags;
819 real_bp = cbp->b_real_bp;
820 zero_offset = cbp->b_validend;
821 iostate = (struct clios *)cbp->b_iostate;
822
823 if (real_bp) {
824 real_bp->b_dev = cbp->b_dev;
825 }
826
827 while (cbp) {
828 if ((cbp->b_flags & B_ERROR) && error == 0) {
829 error = cbp->b_error;
830 }
831
832 total_resid += cbp->b_resid;
833 total_size += cbp->b_bcount;
834
835 cbp_next = cbp->b_trans_next;
836
837 if (cbp_next == NULL) {
838 /*
839 * compute the overall size of the transaction
840 * in case we created one that has 'holes' in it
841 * 'total_size' represents the amount of I/O we
842 * did, not the span of the transaction w/r to the UPL
843 */
844 transaction_size = cbp->b_uploffset + cbp->b_bcount - upl_offset;
845 }
846
847 if (cbp != cbp_head) {
848 free_io_buf(cbp);
849 }
850
851 cbp = cbp_next;
852 }
853
854 if (ISSET(b_flags, B_COMMIT_UPL)) {
855 cluster_handle_associated_upl(iostate,
856 cbp_head->b_upl,
857 upl_offset,
858 transaction_size);
859 }
860
861 if (error == 0 && total_resid) {
862 error = EIO;
863 }
864
865 if (error == 0) {
866 int (*cliodone_func)(buf_t, void *) = (int (*)(buf_t, void *))(cbp_head->b_cliodone);
867
868 if (cliodone_func != NULL) {
869 cbp_head->b_bcount = transaction_size;
870
871 error = (*cliodone_func)(cbp_head, callback_arg);
872 }
873 }
874 if (zero_offset) {
875 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
876 }
877
878 free_io_buf(cbp_head);
879
880 if (iostate) {
881 int need_wakeup = 0;
882
883 /*
884 * someone has issued multiple I/Os asynchrounsly
885 * and is waiting for them to complete (streaming)
886 */
887 lck_mtx_lock_spin(&iostate->io_mtxp);
888
889 if (error && iostate->io_error == 0) {
890 iostate->io_error = error;
891 }
892
893 iostate->io_completed += total_size;
894
895 if (iostate->io_wanted) {
896 /*
897 * someone is waiting for the state of
898 * this io stream to change
899 */
900 iostate->io_wanted = 0;
901 need_wakeup = 1;
902 }
903 lck_mtx_unlock(&iostate->io_mtxp);
904
905 if (need_wakeup) {
906 wakeup((caddr_t)&iostate->io_wanted);
907 }
908 }
909
910 if (b_flags & B_COMMIT_UPL) {
911 pg_offset = upl_offset & PAGE_MASK;
912 commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
913
914 if (error) {
915 upl_set_iodone_error(upl, error);
916
917 upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
918 } else {
919 upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
920
921 if ((b_flags & B_PHYS) && (b_flags & B_READ)) {
922 upl_flags |= UPL_COMMIT_SET_DIRTY;
923 }
924
925 if (b_flags & B_AGE) {
926 upl_flags |= UPL_COMMIT_INACTIVATE;
927 }
928
929 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags);
930 }
931 }
932 if (real_bp) {
933 if (error) {
934 real_bp->b_flags |= B_ERROR;
935 real_bp->b_error = error;
936 }
937 real_bp->b_resid = total_resid;
938
939 buf_biodone(real_bp);
940 }
941 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
942 upl, upl_offset - pg_offset, commit_size, (error << 24) | upl_flags, 0);
943
944 return error;
945 }
946
947
948 uint32_t
949 cluster_throttle_io_limit(vnode_t vp, uint32_t *limit)
950 {
951 if (cluster_is_throttled(vp)) {
952 *limit = THROTTLE_MAX_IOSIZE;
953 return 1;
954 }
955 return 0;
956 }
957
958
959 void
960 cluster_zero(upl_t upl, upl_offset_t upl_offset, int size, buf_t bp)
961 {
962 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
963 upl_offset, size, bp, 0, 0);
964
965 if (bp == NULL || bp->b_datap == 0) {
966 upl_page_info_t *pl;
967 addr64_t zero_addr;
968
969 pl = ubc_upl_pageinfo(upl);
970
971 if (upl_device_page(pl) == TRUE) {
972 zero_addr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + upl_offset;
973
974 bzero_phys_nc(zero_addr, size);
975 } else {
976 while (size) {
977 int page_offset;
978 int page_index;
979 int zero_cnt;
980
981 page_index = upl_offset / PAGE_SIZE;
982 page_offset = upl_offset & PAGE_MASK;
983
984 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << PAGE_SHIFT) + page_offset;
985 zero_cnt = min(PAGE_SIZE - page_offset, size);
986
987 bzero_phys(zero_addr, zero_cnt);
988
989 size -= zero_cnt;
990 upl_offset += zero_cnt;
991 }
992 }
993 } else {
994 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
995 }
996
997 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
998 upl_offset, size, 0, 0, 0);
999 }
1000
1001
1002 static void
1003 cluster_EOT(buf_t cbp_head, buf_t cbp_tail, int zero_offset)
1004 {
1005 cbp_head->b_validend = zero_offset;
1006 cbp_tail->b_flags |= B_EOT;
1007 }
1008
1009 static void
1010 cluster_wait_IO(buf_t cbp_head, int async)
1011 {
1012 buf_t cbp;
1013
1014 if (async) {
1015 /*
1016 * Async callback completion will not normally generate a
1017 * wakeup upon I/O completion. To get woken up, we set
1018 * b_trans_next (which is safe for us to modify) on the last
1019 * buffer to CLUSTER_IO_WAITING so that cluster_iodone knows
1020 * to wake us up when all buffers as part of this transaction
1021 * are completed. This is done under the umbrella of
1022 * cl_transaction_mtxp which is also taken in cluster_iodone.
1023 */
1024 bool done = true;
1025 buf_t last = NULL;
1026
1027 lck_mtx_lock_spin(cl_transaction_mtxp);
1028
1029 for (cbp = cbp_head; cbp; last = cbp, cbp = cbp->b_trans_next) {
1030 if (!ISSET(cbp->b_flags, B_TDONE)) {
1031 done = false;
1032 }
1033 }
1034
1035 if (!done) {
1036 last->b_trans_next = CLUSTER_IO_WAITING;
1037
1038 DTRACE_IO1(wait__start, buf_t, last);
1039 do {
1040 msleep(last, cl_transaction_mtxp, PSPIN | (PRIBIO + 1), "cluster_wait_IO", NULL);
1041
1042 /*
1043 * We should only have been woken up if all the
1044 * buffers are completed, but just in case...
1045 */
1046 done = true;
1047 for (cbp = cbp_head; cbp != CLUSTER_IO_WAITING; cbp = cbp->b_trans_next) {
1048 if (!ISSET(cbp->b_flags, B_TDONE)) {
1049 done = false;
1050 break;
1051 }
1052 }
1053 } while (!done);
1054 DTRACE_IO1(wait__done, buf_t, last);
1055
1056 last->b_trans_next = NULL;
1057 }
1058
1059 lck_mtx_unlock(cl_transaction_mtxp);
1060 } else { // !async
1061 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
1062 buf_biowait(cbp);
1063 }
1064 }
1065 }
1066
1067 static void
1068 cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, int flags, int needwait)
1069 {
1070 buf_t cbp;
1071 int error;
1072 boolean_t isswapout = FALSE;
1073
1074 /*
1075 * cluster_complete_transaction will
1076 * only be called if we've issued a complete chain in synchronous mode
1077 * or, we've already done a cluster_wait_IO on an incomplete chain
1078 */
1079 if (needwait) {
1080 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1081 buf_biowait(cbp);
1082 }
1083 }
1084 /*
1085 * we've already waited on all of the I/Os in this transaction,
1086 * so mark all of the buf_t's in this transaction as B_TDONE
1087 * so that cluster_iodone sees the transaction as completed
1088 */
1089 for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) {
1090 cbp->b_flags |= B_TDONE;
1091 }
1092 cbp = *cbp_head;
1093
1094 if ((flags & (CL_ASYNC | CL_PAGEOUT)) == CL_PAGEOUT && vnode_isswap(cbp->b_vp)) {
1095 isswapout = TRUE;
1096 }
1097
1098 error = cluster_iodone(cbp, callback_arg);
1099
1100 if (!(flags & CL_ASYNC) && error && *retval == 0) {
1101 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) != CL_PAGEOUT) || (error != ENXIO)) {
1102 *retval = error;
1103 } else if (isswapout == TRUE) {
1104 *retval = error;
1105 }
1106 }
1107 *cbp_head = (buf_t)NULL;
1108 }
1109
1110
/*
 * cluster_io
 *
 * Issue the I/O for a range of a file that is backed by a UPL.
 *
 * The byte range [f_offset, f_offset + non_rounded_size) of 'vp', which
 * lives in 'upl' starting at 'upl_offset', is translated to device
 * extents with VNOP_BLOCKMAP one piece at a time.  Each mapped piece gets
 * an I/O buf (alloc_io_buf) that is linked into the current "transaction"
 * (a b_trans_next chain headed by cbp_head) and handed to VNOP_STRATEGY.
 * A transaction is closed with cluster_EOT when the request is exhausted
 * or the chaining limits below are hit.
 *
 * Holes (blkno == -1):
 *   - on a read, the affected UPL range is zeroed with cluster_zero and,
 *     with CL_COMMIT, fully zeroed pages are committed immediately;
 *   - on a write, the page is pushed individually through vnode_pageout
 *     (CL_PAGEOUT requests fail with EINVAL instead).
 *
 * Parameters:
 *   vp, upl, upl_offset, f_offset, non_rounded_size   as described above
 *   flags         CL_* request flags; translated into B_* buf flags
 *   real_bp       optional caller buf (at most PAGE_SIZE, else panic);
 *                 attached to the first buf of the first transaction, or
 *                 completed with buf_biodone here if no I/O picked it up
 *   iostate       optional stream state; io_issued is bumped per buf and
 *                 rolled back (with io_error set) on failure
 *   callback, callback_arg
 *                 stored in each buf (b_cliodone) / passed to cluster_iodone
 *
 * Returns 0, or the first error recorded in 'retval' (from a completed
 * synchronous transaction or from the issue loop itself).
 */
static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
    int flags, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
{
	buf_t   cbp;
	u_int   size;
	u_int   io_size;
	int     io_flags;
	int     bmap_flags;
	int     error = 0;
	int     retval = 0;
	buf_t   cbp_head = NULL;
	buf_t   cbp_tail = NULL;
	int     trans_count = 0;
	int     max_trans_count;
	u_int   pg_count;
	int     pg_offset;
	u_int   max_iosize;
	u_int   max_vectors;
	int     priv;
	int     zero_offset = 0;
	int     async_throttle = 0;
	mount_t mp;
	vm_offset_t upl_end_offset;
	boolean_t   need_EOT = FALSE;

	/*
	 * we currently don't support buffers larger than a page
	 */
	if (real_bp && non_rounded_size > PAGE_SIZE) {
		panic("%s(): Called with real buffer of size %d bytes which "
		    "is greater than the maximum allowed size of "
		    "%d bytes (the system PAGE_SIZE).\n",
		    __FUNCTION__, non_rounded_size, PAGE_SIZE);
	}

	mp = vp->v_mount;

	/*
	 * we don't want to do any funny rounding of the size for IO requests
	 * coming through the DIRECT or CONTIGUOUS paths...  those pages don't
	 * belong to us... we can't extend (nor do we need to) the I/O to fill
	 * out a page
	 */
	if (mp->mnt_devblocksize > 1 && !(flags & (CL_DEV_MEMORY | CL_DIRECT_IO))) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zero's that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	/* end of the (possibly rounded) request in the UPL; used to size the abort on error */
	upl_end_offset = upl_offset + size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START, (int)f_offset, size, upl_offset, flags, 0);

	/*
	 * Set the maximum transaction size to the maximum desired number of
	 * buffers.
	 */
	max_trans_count = 8;
	if (flags & CL_DEV_MEMORY) {
		max_trans_count = 16;
	}

	if (flags & CL_READ) {
		io_flags = B_READ;
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags = B_WRITE;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	/*
	 * Ensure the maximum iosize is sensible.
	 */
	if (!max_iosize) {
		max_iosize = PAGE_SIZE;
	}

	/*
	 * work out 'async_throttle', the limit later handed to
	 * vnode_waitforwrites before each buf is issued
	 */
	if (flags & CL_THROTTLE) {
		if (!(flags & CL_PAGEOUT) && cluster_is_throttled(vp)) {
			if (max_iosize > THROTTLE_MAX_IOSIZE) {
				max_iosize = THROTTLE_MAX_IOSIZE;
			}
			async_throttle = THROTTLE_MAXCNT;
		} else {
			if ((flags & CL_DEV_MEMORY)) {
				async_throttle = IO_SCALE(vp, VNODE_ASYNC_THROTTLE);
			} else {
				u_int max_cluster;
				u_int max_cluster_size;
				u_int scale;

				if (vp->v_mount->mnt_minsaturationbytecount) {
					max_cluster_size = vp->v_mount->mnt_minsaturationbytecount;

					scale = 1;
				} else {
					max_cluster_size = MAX_CLUSTER_SIZE(vp);

					if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
						scale = WRITE_THROTTLE_SSD;
					} else {
						scale = WRITE_THROTTLE;
					}
				}
				if (max_iosize > max_cluster_size) {
					max_cluster = max_cluster_size;
				} else {
					max_cluster = max_iosize;
				}

				if (size < max_cluster) {
					max_cluster = size;
				}

				if (flags & CL_CLOSE) {
					scale += MAX_CLUSTERS;
				}

				/* NOTE(review): divides by max_cluster; relies on 'size' being non-zero here */
				async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1);
			}
		}
	}
	/*
	 * translate the CL_* request flags into the B_* flags
	 * that get stamped on every buf we issue
	 */
	if (flags & CL_AGE) {
		io_flags |= B_AGE;
	}
	if (flags & (CL_PAGEIN | CL_PAGEOUT)) {
		io_flags |= B_PAGEIO;
	}
	if (flags & (CL_IOSTREAMING)) {
		io_flags |= B_IOSTREAMING;
	}
	if (flags & CL_COMMIT) {
		io_flags |= B_COMMIT_UPL;
	}
	if (flags & CL_DIRECT_IO) {
		io_flags |= B_PHYS;
	}
	if (flags & (CL_PRESERVE | CL_KEEPCACHED)) {
		io_flags |= B_CACHE;
	}
	if (flags & CL_PASSIVE) {
		io_flags |= B_PASSIVE;
	}
	if (flags & CL_ENCRYPTED) {
		io_flags |= B_ENCRYPTED_IO;
	}

	if (vp->v_flag & VSYSTEM) {
		io_flags |= B_META;
	}

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	} else if (!ISSET(flags, CL_READ) && ISSET(flags, CL_DIRECT_IO)) {
		assert(ISSET(flags, CL_COMMIT));

		// For a direct/uncached write, we need to lock pages...

		upl_t cached_upl;

		/*
		 * Create a UPL to lock the pages in the cache whilst the
		 * write is in progress.
		 *
		 * NOTE(review): the return value is not checked here.
		 */
		ubc_create_upl_kernel(vp, f_offset, non_rounded_size, &cached_upl,
		    NULL, UPL_SET_LITE, VM_KERN_MEMORY_FILE);

		/*
		 * Attach this UPL to the other UPL so that we can find it
		 * later.
		 */
		upl_set_associated_upl(upl, cached_upl);

		if (upl_offset & PAGE_MASK) {
			/*
			 * The two UPLs are not aligned, so mark the first page in
			 * @upl so that cluster_handle_associated_upl can handle
			 * it accordingly.
			 */
			upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			upl_page_set_mark(pl, 0, true);
		}
	}

	/*
	 * main issue loop: each pass maps and handles one extent (or hole)
	 * and advances upl_offset / f_offset / size / non_rounded_size
	 */
	while (size) {
		daddr64_t blkno;
		daddr64_t lblkno;
		u_int   io_size_wanted;
		size_t  io_size_tmp;

		if (size > max_iosize) {
			io_size = max_iosize;
		} else {
			io_size = size;
		}

		io_size_wanted = io_size;
		io_size_tmp = (size_t)io_size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, &io_size_tmp, NULL, bmap_flags, NULL))) {
			break;
		}

		/* never accept more than we asked for */
		if (io_size_tmp > io_size_wanted) {
			io_size = io_size_wanted;
		} else {
			io_size = (u_int)io_size_tmp;
		}

		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
			real_bp->b_blkno = blkno;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
		    (int)f_offset, (int)(blkno >> 32), (int)blkno, io_size, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if (!(flags & CL_READ) && blkno == -1) {
			off_t   e_offset;
			int     pageout_flags;

			if (upl_get_internal_vectorupl(upl)) {
				panic("Vector UPLs should not take this code-path\n");
			}
			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will ensure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 *
			 * go direct to vnode_pageout so that we don't have to
			 * unbusy the page from the UPL... we used to do this
			 * so that we could call ubc_msync, but that results
			 * in a potential deadlock if someone else races us to acquire
			 * that page and wins and in addition needs one of the pages
			 * we're continuing to hold in the UPL
			 */
			pageout_flags = UPL_MSYNC | UPL_VNODE_PAGER | UPL_NESTED_PAGEOUT;

			if (!(flags & CL_ASYNC)) {
				pageout_flags |= UPL_IOSYNC;
			}
			if (!(flags & CL_COMMIT)) {
				pageout_flags |= UPL_NOCOMMIT;
			}

			if (cbp_head) {
				buf_t prev_cbp;
				int   bytes_in_last_page;

				/*
				 * first we have to wait for the current outstanding I/Os
				 * to complete... EOT hasn't been set yet on this transaction
				 * so the pages won't be released
				 */
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				/*
				 * figure out how many bytes of the pending transaction
				 * spill into the page we're about to push
				 */
				bytes_in_last_page = cbp_head->b_uploffset & PAGE_MASK;
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
					bytes_in_last_page += cbp->b_bcount;
				}
				bytes_in_last_page &= PAGE_MASK;

				while (bytes_in_last_page) {
					/*
					 * we've got a transaction that
					 * includes the page we're about to push out through vnode_pageout...
					 * find the bp's in the list which intersect this page and either
					 * remove them entirely from the transaction (there could be multiple bp's), or
					 * round its iosize down to the page boundary (there can only be one)...
					 *
					 * find the last bp in the list and act on it
					 */
					for (prev_cbp = cbp = cbp_head; cbp->b_trans_next; cbp = cbp->b_trans_next) {
						prev_cbp = cbp;
					}

					if (bytes_in_last_page >= cbp->b_bcount) {
						/*
						 * this buf no longer has any I/O associated with it
						 */
						bytes_in_last_page -= cbp->b_bcount;
						cbp->b_bcount = 0;

						free_io_buf(cbp);

						if (cbp == cbp_head) {
							assert(bytes_in_last_page == 0);
							/*
							 * the buf we just freed was the only buf in
							 * this transaction... so there's no I/O to do
							 */
							cbp_head = NULL;
							cbp_tail = NULL;
						} else {
							/*
							 * remove the buf we just freed from
							 * the transaction list
							 */
							prev_cbp->b_trans_next = NULL;
							cbp_tail = prev_cbp;
						}
					} else {
						/*
						 * this is the last bp that has I/O
						 * intersecting the page of interest
						 * only some of the I/O is in the intersection
						 * so clip the size but keep it in the transaction list
						 */
						cbp->b_bcount -= bytes_in_last_page;
						cbp_tail = cbp;
						bytes_in_last_page = 0;
					}
				}
				if (cbp_head) {
					/*
					 * there was more to the current transaction
					 * than just the page we are pushing out via vnode_pageout...
					 * mark it as finished and complete it... we've already
					 * waited for the I/Os to complete above in the call to cluster_wait_IO
					 */
					cluster_EOT(cbp_head, cbp_tail, 0);

					cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

					trans_count = 0;
				}
			}
			if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) {
				error = EINVAL;
			}
			/* account for the rest of this page, whether or not the pageout worked */
			e_offset = round_page_64(f_offset + 1);
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size) {
				size -= io_size;
			} else {
				size = 0;
			}
			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (error) {
				if (size == 0) {
					/* nothing left of the UPL for the error path to abort */
					flags &= ~CL_COMMIT;
				}
				break;
			}
			continue;
		}
		/* logical block number in units of 0x1000 bytes */
		lblkno = (daddr64_t)(f_offset / 0x1000);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else {
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
		}

		if ((flags & CL_READ) && blkno == -1) {
			vm_offset_t  commit_offset;
			int bytes_to_zero;
			int complete_transaction_now = 0;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (io_size >= (u_int)non_rounded_size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = non_rounded_size;
				if (!(flags & CL_NOZERO)) {
					bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
				}

				zero_offset = 0;
			} else {
				bytes_to_zero = io_size;
			}

			pg_count = 0;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head) {
				int     pg_resid;

				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				commit_offset = (upl_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

				pg_resid = commit_offset - upl_offset;

				if (bytes_to_zero >= pg_resid) {
					/*
					 * the last page of the current I/O
					 * has been completed...
					 * compute the number of fully zero'd
					 * pages that are beyond it
					 * plus the last page if it's partial
					 * and we have no more I/O to issue...
					 * otherwise a partial page is left
					 * to begin the next I/O
					 */
					if ((int)io_size >= non_rounded_size) {
						pg_count = (bytes_to_zero - pg_resid + (PAGE_SIZE - 1)) / PAGE_SIZE;
					} else {
						pg_count = (bytes_to_zero - pg_resid) / PAGE_SIZE;
					}

					complete_transaction_now = 1;
				}
			} else {
				/*
				 * no pending I/O to deal with
				 * so, commit all of the fully zero'd pages
				 * plus the last page if it's partial
				 * and we have no more I/O to issue...
				 * otherwise a partial page is left
				 * to begin the next I/O
				 */
				if ((int)io_size >= non_rounded_size) {
					pg_count = (pg_offset + bytes_to_zero + (PAGE_SIZE - 1)) / PAGE_SIZE;
				} else {
					pg_count = (pg_offset + bytes_to_zero) / PAGE_SIZE;
				}

				commit_offset = upl_offset & ~PAGE_MASK;
			}

			// Associated UPL is currently only used in the direct write path
			assert(!upl_associated_upl(upl));

			if ((flags & CL_COMMIT) && pg_count) {
				ubc_upl_commit_range(upl, commit_offset, pg_count * PAGE_SIZE,
				    UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			/*
			 * keep track of how much of the original request
			 * that we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e.  size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && (complete_transaction_now || size == 0)) {
				cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

				cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);

				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 0);

				trans_count = 0;
			}
			continue;
		}
		/*
		 * clip the I/O so that it spans no more pages than
		 * the device's segment limit (max_vectors) allows
		 */
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}
		/*
		 * If the transaction is going to reach the maximum number of
		 * desired elements, truncate the i/o to the nearest page so
		 * that the actual i/o is initiated after this buffer is
		 * created and added to the i/o chain.
		 *
		 * I/O directed to physically contiguous memory
		 * doesn't have a requirement to make sure we 'fill' a page
		 */
		if (!(flags & CL_DEV_MEMORY) && trans_count >= max_trans_count &&
		    ((upl_offset + io_size) & PAGE_MASK)) {
			vm_offset_t aligned_ofs;

			aligned_ofs = (upl_offset + io_size) & ~PAGE_MASK;
			/*
			 * If the io_size does not actually finish off even a
			 * single page we have to keep adding buffers to the
			 * transaction despite having reached the desired limit.
			 *
			 * Eventually we get here with the page being finished
			 * off (and exceeded) and then we truncate the size of
			 * this i/o request so that it is page aligned so that
			 * we can finally issue the i/o on the transaction.
			 */
			if (aligned_ofs > upl_offset) {
				io_size = aligned_ofs - upl_offset;
				pg_count--;
			}
		}

		if (!(mp->mnt_kern_flag & MNTK_VIRTUALDEV)) {
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		} else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT)) {
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		} else {
			priv = 1;
		}

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			/*
			 * since blocks are in offsets of 0x1000, scale
			 * iteration to (PAGE_SIZE * pg_count) of blks.
			 */
			for (i = 0; i < (PAGE_SIZE * pg_count) / 0x1000; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY) {
					panic("BUSY bp found in cluster_io");
				}
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, callback_arg)) {
				panic("buf_setcallback failed\n");
			}
		}
		cbp->b_cliodone = (void *)callback;
		cbp->b_flags |= io_flags;
		if (flags & CL_NOCACHE) {
			cbp->b_attr.ba_flags |= BA_NOCACHE;
		}

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset)) {
			panic("buf_setupl failed\n");
		}
#if CONFIG_IOSCHED
		upl_set_blkno(upl, upl_offset, io_size, blkno);
#endif
		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate)) {
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;
		}

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
			    (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		} else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
			    (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}

		/* link the new buf onto the current transaction (or start one) */
		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;

			/* real_bp rides on the first transaction head only */
			if ((cbp_head->b_real_bp = real_bp)) {
				real_bp = (buf_t)NULL;
			}
		}
		*(buf_t *)(&cbp->b_trans_head) = cbp_head;

		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * that we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e.  size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if (size == 0) {
			/*
			 * we have no more I/O to issue, so go
			 * finish the final transaction
			 */
			need_EOT = TRUE;
		} else if (((flags & CL_DEV_MEMORY) || (upl_offset & PAGE_MASK) == 0) &&
		    ((flags & CL_ASYNC) || trans_count > max_trans_count)) {
			/*
			 * I/O directed to physically contiguous memory...
			 * which doesn't have a requirement to make sure we 'fill' a page
			 * or...
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and ...
			 * it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction so mark it as complete so that
			 * it can finish asynchronously or via the cluster_complete_transaction
			 * below if the request is synchronous
			 */
			need_EOT = TRUE;
		}
		if (need_EOT == TRUE) {
			cluster_EOT(cbp_head, cbp_tail, size == 0 ? zero_offset : 0);
		}

		if (flags & CL_THROTTLE) {
			(void)vnode_waitforwrites(vp, async_throttle, 0, 0, "cluster_io");
		}

		if (!(io_flags & B_READ)) {
			vnode_startwrite(vp);
		}

		if (flags & CL_RAW_ENCRYPTED) {
			/*
			 * User requested raw encrypted bytes.
			 * Twiddle the bit in the ba_flags for the buffer
			 */
			cbp->b_attr.ba_flags |= BA_RAW_ENCRYPTED_IO;
		}

		(void) VNOP_STRATEGY(cbp);

		if (need_EOT == TRUE) {
			/*
			 * synchronous requests wait for and complete the transaction
			 * here; async ones had cluster_iodone registered as the
			 * buf callback above
			 */
			if (!(flags & CL_ASYNC)) {
				cluster_complete_transaction(&cbp_head, callback_arg, &retval, flags, 1);
			}

			need_EOT = FALSE;
			trans_count = 0;
			cbp_head = NULL;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		if (cbp_head) {
			/*
			 * Wait until all of the outstanding I/O
			 * for this partial transaction has completed
			 */
			cluster_wait_IO(cbp_head, (flags & CL_ASYNC));

			/*
			 * Rewind the upl offset to the beginning of the
			 * transaction.
			 */
			upl_offset = cbp_head->b_uploffset;
		}

		if (ISSET(flags, CL_COMMIT)) {
			cluster_handle_associated_upl(iostate, upl, upl_offset,
			    upl_end_offset - upl_offset);
		}

		// Free all the IO buffers in this transaction
		for (cbp = cbp_head; cbp;) {
			buf_t   cbp_next;

			size       += cbp->b_bcount;
			io_size    += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}

		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock_spin(&iostate->io_mtxp);

			if (iostate->io_error == 0) {
				iostate->io_error = error;
			}
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(&iostate->io_mtxp);

			/* wake the waiter only after dropping the mutex */
			if (need_wakeup) {
				wakeup((caddr_t)&iostate->io_wanted);
			}
		}

		if (flags & CL_COMMIT) {
			int     upl_flags;

			/* hand the page-aligned remainder of the UPL to cluster_ioerror */
			pg_offset  = upl_offset & PAGE_MASK;
			abort_size = (upl_end_offset - upl_offset + PAGE_MASK) & ~PAGE_MASK;

			upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, abort_size, error, io_flags, vp);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
			    upl, upl_offset - pg_offset, abort_size, (error << 24) | upl_flags, 0);
		}
		if (retval == 0) {
			retval = error;
		}
	} else if (cbp_head) {
		panic("%s(): cbp_head is not NULL.\n", __FUNCTION__);
	}

	if (real_bp) {
		/*
		 * can get here if we either encountered an error
		 * or we completely zero-filled the request and
		 * no I/O was issued
		 */
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		buf_biodone(real_bp);
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END, (int)f_offset, size, upl_offset, retval, 0);

	return retval;
}
1992
/*
 * Zero the five vector-UPL run bookkeeping variables in one statement.
 * The macro expands in the caller's scope, so issueVectorUPL,
 * vector_upl_offset, vector_upl_index, vector_upl_iosize and
 * vector_upl_size must all be declared there.  The expansion already
 * ends in ';'.
 */
#define reset_vector_run_state()                                                                                \
	issueVectorUPL = vector_upl_offset = vector_upl_index = vector_upl_iosize = vector_upl_size = 0;
1995
1996 static int
1997 vector_cluster_io(vnode_t vp, upl_t vector_upl, vm_offset_t vector_upl_offset, off_t v_upl_uio_offset, int vector_upl_iosize,
1998 int io_flag, buf_t real_bp, struct clios *iostate, int (*callback)(buf_t, void *), void *callback_arg)
1999 {
2000 vector_upl_set_pagelist(vector_upl);
2001
2002 if (io_flag & CL_READ) {
2003 if (vector_upl_offset == 0 && ((vector_upl_iosize & PAGE_MASK) == 0)) {
2004 io_flag &= ~CL_PRESERVE; /*don't zero fill*/
2005 } else {
2006 io_flag |= CL_PRESERVE; /*zero fill*/
2007 }
2008 }
2009 return cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, real_bp, iostate, callback, callback_arg);
2010 }
2011
2012 static int
2013 cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2014 {
2015 int pages_in_prefetch;
2016
2017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
2018 (int)f_offset, size, (int)filesize, 0, 0);
2019
2020 if (f_offset >= filesize) {
2021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2022 (int)f_offset, 0, 0, 0, 0);
2023 return 0;
2024 }
2025 if ((off_t)size > (filesize - f_offset)) {
2026 size = filesize - f_offset;
2027 }
2028 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
2029
2030 advisory_read_ext(vp, filesize, f_offset, size, callback, callback_arg, bflag);
2031
2032 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
2033 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
2034
2035 return pages_in_prefetch;
2036 }
2037
2038
2039
2040 static void
2041 cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap, int (*callback)(buf_t, void *), void *callback_arg,
2042 int bflag)
2043 {
2044 daddr64_t r_addr;
2045 off_t f_offset;
2046 int size_of_prefetch;
2047 u_int max_prefetch;
2048
2049
2050 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
2051 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
2052
2053 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
2054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2055 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
2056 return;
2057 }
2058 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1))) {
2059 rap->cl_ralen = 0;
2060 rap->cl_maxra = 0;
2061
2062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2063 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
2064
2065 return;
2066 }
2067 max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), disk_conditioner_mount_is_ssd(vp->v_mount));
2068
2069 if (max_prefetch > speculative_prefetch_max) {
2070 max_prefetch = speculative_prefetch_max;
2071 }
2072
2073 if (max_prefetch <= PAGE_SIZE) {
2074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2075 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0);
2076 return;
2077 }
2078 if (extent->e_addr < rap->cl_maxra && rap->cl_ralen >= 4) {
2079 if ((rap->cl_maxra - extent->e_addr) > (rap->cl_ralen / 4)) {
2080 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2081 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
2082 return;
2083 }
2084 }
2085 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
2086 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
2087
2088 size_of_prefetch = 0;
2089
2090 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
2091
2092 if (size_of_prefetch) {
2093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2094 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
2095 return;
2096 }
2097 if (f_offset < filesize) {
2098 daddr64_t read_size;
2099
2100 rap->cl_ralen = rap->cl_ralen ? min(max_prefetch / PAGE_SIZE, rap->cl_ralen << 1) : 1;
2101
2102 read_size = (extent->e_addr + 1) - extent->b_addr;
2103
2104 if (read_size > rap->cl_ralen) {
2105 if (read_size > max_prefetch / PAGE_SIZE) {
2106 rap->cl_ralen = max_prefetch / PAGE_SIZE;
2107 } else {
2108 rap->cl_ralen = read_size;
2109 }
2110 }
2111 size_of_prefetch = cluster_read_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize, callback, callback_arg, bflag);
2112
2113 if (size_of_prefetch) {
2114 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
2115 }
2116 }
2117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
2118 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
2119 }
2120
2121
2122 int
2123 cluster_pageout(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2124 int size, off_t filesize, int flags)
2125 {
2126 return cluster_pageout_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2127 }
2128
2129
2130 int
2131 cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2132 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2133 {
2134 int io_size;
2135 int rounded_size;
2136 off_t max_size;
2137 int local_flags;
2138
2139 local_flags = CL_PAGEOUT | CL_THROTTLE;
2140
2141 if ((flags & UPL_IOSYNC) == 0) {
2142 local_flags |= CL_ASYNC;
2143 }
2144 if ((flags & UPL_NOCOMMIT) == 0) {
2145 local_flags |= CL_COMMIT;
2146 }
2147 if ((flags & UPL_KEEPCACHED)) {
2148 local_flags |= CL_KEEPCACHED;
2149 }
2150 if (flags & UPL_PAGING_ENCRYPTED) {
2151 local_flags |= CL_ENCRYPTED;
2152 }
2153
2154
2155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
2156 (int)f_offset, size, (int)filesize, local_flags, 0);
2157
2158 /*
2159 * If they didn't specify any I/O, then we are done...
2160 * we can't issue an abort because we don't know how
2161 * big the upl really is
2162 */
2163 if (size <= 0) {
2164 return EINVAL;
2165 }
2166
2167 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
2168 if (local_flags & CL_COMMIT) {
2169 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2170 }
2171 return EROFS;
2172 }
2173 /*
2174 * can't page-in from a negative offset
2175 * or if we're starting beyond the EOF
2176 * or if the file offset isn't page aligned
2177 * or the size requested isn't a multiple of PAGE_SIZE
2178 */
2179 if (f_offset < 0 || f_offset >= filesize ||
2180 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
2181 if (local_flags & CL_COMMIT) {
2182 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
2183 }
2184 return EINVAL;
2185 }
2186 max_size = filesize - f_offset;
2187
2188 if (size < max_size) {
2189 io_size = size;
2190 } else {
2191 io_size = max_size;
2192 }
2193
2194 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2195
2196 if (size > rounded_size) {
2197 if (local_flags & CL_COMMIT) {
2198 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
2199 UPL_ABORT_FREE_ON_EMPTY);
2200 }
2201 }
2202 return cluster_io(vp, upl, upl_offset, f_offset, io_size,
2203 local_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2204 }
2205
2206
2207 int
2208 cluster_pagein(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2209 int size, off_t filesize, int flags)
2210 {
2211 return cluster_pagein_ext(vp, upl, upl_offset, f_offset, size, filesize, flags, NULL, NULL);
2212 }
2213
2214
2215 int
2216 cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offset,
2217 int size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
2218 {
2219 u_int io_size;
2220 int rounded_size;
2221 off_t max_size;
2222 int retval;
2223 int local_flags = 0;
2224
2225 if (upl == NULL || size < 0) {
2226 panic("cluster_pagein: NULL upl passed in");
2227 }
2228
2229 if ((flags & UPL_IOSYNC) == 0) {
2230 local_flags |= CL_ASYNC;
2231 }
2232 if ((flags & UPL_NOCOMMIT) == 0) {
2233 local_flags |= CL_COMMIT;
2234 }
2235 if (flags & UPL_IOSTREAMING) {
2236 local_flags |= CL_IOSTREAMING;
2237 }
2238 if (flags & UPL_PAGING_ENCRYPTED) {
2239 local_flags |= CL_ENCRYPTED;
2240 }
2241
2242
2243 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
2244 (int)f_offset, size, (int)filesize, local_flags, 0);
2245
2246 /*
2247 * can't page-in from a negative offset
2248 * or if we're starting beyond the EOF
2249 * or if the file offset isn't page aligned
2250 * or the size requested isn't a multiple of PAGE_SIZE
2251 */
2252 if (f_offset < 0 || f_offset >= filesize ||
2253 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
2254 if (local_flags & CL_COMMIT) {
2255 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2256 }
2257 return EINVAL;
2258 }
2259 max_size = filesize - f_offset;
2260
2261 if (size < max_size) {
2262 io_size = size;
2263 } else {
2264 io_size = max_size;
2265 }
2266
2267 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2268
2269 if (size > rounded_size && (local_flags & CL_COMMIT)) {
2270 ubc_upl_abort_range(upl, upl_offset + rounded_size,
2271 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
2272 }
2273
2274 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
2275 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
2276
2277 return retval;
2278 }
2279
2280
2281 int
2282 cluster_bp(buf_t bp)
2283 {
2284 return cluster_bp_ext(bp, NULL, NULL);
2285 }
2286
2287
2288 int
2289 cluster_bp_ext(buf_t bp, int (*callback)(buf_t, void *), void *callback_arg)
2290 {
2291 off_t f_offset;
2292 int flags;
2293
2294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
2295 bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
2296
2297 if (bp->b_flags & B_READ) {
2298 flags = CL_ASYNC | CL_READ;
2299 } else {
2300 flags = CL_ASYNC;
2301 }
2302 if (bp->b_flags & B_PASSIVE) {
2303 flags |= CL_PASSIVE;
2304 }
2305
2306 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
2307
2308 return cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL, callback, callback_arg);
2309 }
2310
2311
2312
2313 int
2314 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
2315 {
2316 return cluster_write_ext(vp, uio, oldEOF, newEOF, headOff, tailOff, xflags, NULL, NULL);
2317 }
2318
2319
/*
 * cluster_write_ext
 *
 * Top level write entry point.  On each pass through the loop below it
 * calls one of cluster_write_copy(), cluster_write_contig() or
 * cluster_write_direct() according to 'write_type', and keeps going until
 * the uio is drained, uio_offset reaches newEOF, or a call returns non-zero.
 *
 * uio == NULL is a request to zero-fill only; it is passed straight to
 * cluster_write_copy().
 *
 * write_type starts out as IO_COPY.  It is only probed with
 * cluster_io_type() when IO_NOCACHE is set without IO_NODIRECT and the uio
 * targets user space.  VNOCACHE_DATA on the vnode forces IO_NOCACHE (and
 * CL_NOCACHE in the flags handed to cluster_write_contig()).
 *
 * Returns 0, or the first non-zero value returned by one of the helpers.
 */
2320 int
2321 cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff,
2322 int xflags, int (*callback)(buf_t, void *), void *callback_arg)
2323 {
2324 user_ssize_t cur_resid;
2325 int retval = 0;
2326 int flags;
2327 int zflags;
2328 int bflag;
2329 int write_type = IO_COPY;
2330 u_int32_t write_length;
2331
2332 flags = xflags;
2333
/* bflag is only consumed by cluster_write_contig() below */
2334 if (flags & IO_PASSIVE) {
2335 bflag = CL_PASSIVE;
2336 } else {
2337 bflag = 0;
2338 }
2339
2340 if (vp->v_flag & VNOCACHE_DATA) {
2341 flags |= IO_NOCACHE;
2342 bflag |= CL_NOCACHE;
2343 }
2344 if (uio == NULL) {
2345 /*
2346 * no user data...
2347 * this call is being made to zero-fill some range in the file
2348 */
2349 retval = cluster_write_copy(vp, NULL, (u_int32_t)0, oldEOF, newEOF, headOff, tailOff, flags, callback, callback_arg);
2350
2351 return retval;
2352 }
2353 /*
2354 * do a write through the cache if one of the following is true....
2355 * NOCACHE is not true or NODIRECT is true
2356 * the uio request doesn't target USERSPACE
2357 * otherwise, find out if we want the direct or contig variant for
2358 * the first vector in the uio request
2359 */
2360 if (((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) {
2361 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2362 }
2363
2364 if ((flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) {
2365 /*
2366 * must go through the cached variant in this case
2367 */
2368 write_type = IO_COPY;
2369 }
2370
/*
 * write_type and write_length may be rewritten on every pass: the contig
 * and direct helpers receive pointers to them, and the IO_UNKNOWN case
 * re-probes with cluster_io_type()
 */
2371 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < newEOF && retval == 0) {
2372 switch (write_type) {
2373 case IO_COPY:
2374 /*
2375 * make sure the uio_resid isn't too big...
2376 * internally, we want to handle all of the I/O in
2377 * chunk sizes that fit in a 32 bit int
2378 */
2379 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
2380 /*
2381 * we're going to have to call cluster_write_copy
2382 * more than once...
2383 *
2384 * only want the last call to cluster_write_copy to
2385 * have the IO_TAILZEROFILL flag set and only the
2386 * first call should have IO_HEADZEROFILL
2387 */
2388 zflags = flags & ~IO_TAILZEROFILL;
2389 flags &= ~IO_HEADZEROFILL;
2390
2391 write_length = MAX_IO_REQUEST_SIZE;
2392 } else {
2393 /*
2394 * last call to cluster_write_copy
2395 */
2396 zflags = flags;
2397
2398 write_length = (u_int32_t)cur_resid;
2399 }
2400 retval = cluster_write_copy(vp, uio, write_length, oldEOF, newEOF, headOff, tailOff, zflags, callback, callback_arg);
2401 break;
2402
2403 case IO_CONTIG:
/*
 * zflags is the base for the two zero-fill-only cluster_write_copy
 * calls in this case; each of them adds IO_HEADZEROFILL | IO_SYNC
 * back explicitly
 */
2404 zflags = flags & ~(IO_TAILZEROFILL | IO_HEADZEROFILL);
2405
2406 if (flags & IO_HEADZEROFILL) {
2407 /*
2408 * only do this once per request
2409 */
2410 flags &= ~IO_HEADZEROFILL;
2411
2412 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, uio->uio_offset,
2413 headOff, (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2414 if (retval) {
/* leaves the switch; the loop condition then ends the loop on retval != 0 */
2415 break;
2416 }
2417 }
2418 retval = cluster_write_contig(vp, uio, newEOF, &write_type, &write_length, callback, callback_arg, bflag);
2419
2420 if (retval == 0 && (flags & IO_TAILZEROFILL) && uio_resid(uio) == 0) {
2421 /*
2422 * we're done with the data from the user specified buffer(s)
2423 * and we've been requested to zero fill at the tail
2424 * treat this as an IO_HEADZEROFILL which doesn't require a uio
2425 * by rearranging the args and passing in IO_HEADZEROFILL
2426 */
2427 retval = cluster_write_copy(vp, (struct uio *)0, (u_int32_t)0, (off_t)0, tailOff, uio->uio_offset,
2428 (off_t)0, zflags | IO_HEADZEROFILL | IO_SYNC, callback, callback_arg);
2429 }
2430 break;
2431
2432 case IO_DIRECT:
2433 /*
2434 * cluster_write_direct is never called with IO_TAILZEROFILL || IO_HEADZEROFILL
2435 */
2436 retval = cluster_write_direct(vp, uio, oldEOF, newEOF, &write_type, &write_length, flags, callback, callback_arg);
2437 break;
2438
2439 case IO_UNKNOWN:
2440 retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE);
2441 break;
2442 }
2443 /*
2444 * in case we end up calling cluster_write_copy (from cluster_write_direct)
2445 * multiple times to service a multi-vector request that is not aligned properly
2446 * we need to update the oldEOF so that we
2447 * don't zero-fill the head of a page if we've successfully written
2448 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2449 * page that is beyond the oldEOF if the write is unaligned... we only
2450 * want that to happen for the very first page of the cluster_write,
2451 * NOT the first page of each vector making up a multi-vector write.
2452 */
2453 if (uio->uio_offset > oldEOF) {
2454 oldEOF = uio->uio_offset;
2455 }
2456 }
2457 return retval;
2458 }
2459
2460
/*
 * cluster_write_direct
 *
 * Direct write path.  For the current vector of the uio it repeatedly
 * obtains a UPL over the caller's buffer with vm_map_get_upl() and issues
 * it with cluster_io(); when the uio has more than one iovec
 * (useVectorUPL) the UPLs are instead collected into a vector UPL that is
 * issued through vector_cluster_io().  All I/O is issued with CL_ASYNC and
 * tracked in the local 'iostate'.
 *
 * Control jumps to 'wait_for_dwrites' - which drains every I/O issued here
 * before going any further - when the file offset or the buffer address is
 * misaligned, a UPL can't be obtained, an earlier write recorded an error,
 * or the request has to be throttled.  Whatever is left of the current
 * vector at that point (io_req_size != 0) is pushed through
 * cluster_write_copy(), provided no error has been recorded.
 *
 * *write_length is read at 'next_dwrite' as the size of the current vector.
 * *write_type / *write_length are refreshed by cluster_io_type() once a
 * vector has been fully issued, and *write_type is set to IO_UNKNOWN after
 * the cluster_write_copy() fallback.
 *
 * Returns 0, an error from the I/O or from a helper, or EAGAIN when
 * IO_RETURN_ON_THROTTLE caused an early exit and no other error was seen.
 */
2461 static int
2462 cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, int *write_type, u_int32_t *write_length,
2463 int flags, int (*callback)(buf_t, void *), void *callback_arg)
2464 {
2465 upl_t upl;
2466 upl_page_info_t *pl;
2467 vm_offset_t upl_offset;
2468 vm_offset_t vector_upl_offset = 0;
2469 u_int32_t io_req_size;
2470 u_int32_t offset_in_file;
2471 u_int32_t offset_in_iovbase;
2472 u_int32_t io_size;
2473 int io_flag = 0;
2474 upl_size_t upl_size, vector_upl_size = 0;
2475 vm_size_t upl_needed_size;
2476 mach_msg_type_number_t pages_in_pl;
2477 upl_control_flags_t upl_flags;
2478 kern_return_t kret;
2479 mach_msg_type_number_t i;
2480 int force_data_sync;
2481 int retval = 0;
2482 int first_IO = 1;
2483 struct clios iostate;
2484 user_addr_t iov_base;
2485 u_int32_t mem_alignment_mask;
2486 u_int32_t devblocksize;
2487 u_int32_t max_io_size;
2488 u_int32_t max_upl_size;
2489 u_int32_t max_vector_size;
2490 u_int32_t bytes_outstanding_limit;
2491 boolean_t io_throttled = FALSE;
2492
2493 u_int32_t vector_upl_iosize = 0;
2494 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
2495 off_t v_upl_uio_offset = 0;
2496 int vector_upl_index = 0;
2497 upl_t vector_upl = NULL;
2498
2499
2500 /*
2501 * When we enter this routine, we know
2502 * -- the resid will not exceed iov_len
2503 */
2504 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
2505 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2506
2507 max_upl_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
2508
2509 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE | CL_DIRECT_IO;
2510
2511 if (flags & IO_PASSIVE) {
2512 io_flag |= CL_PASSIVE;
2513 }
2514
2515 if (flags & IO_NOCACHE) {
2516 io_flag |= CL_NOCACHE;
2517 }
2518
2519 if (flags & IO_SKIP_ENCRYPTION) {
2520 io_flag |= CL_ENCRYPTED;
2521 }
2522
/*
 * iostate tracks the async I/O issued from this call; its mutex is
 * destroyed again after the final cluster_iostate_wait() below
 */
2523 iostate.io_completed = 0;
2524 iostate.io_issued = 0;
2525 iostate.io_error = 0;
2526 iostate.io_wanted = 0;
2527
2528 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2529
2530 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2531 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2532
2533 if (devblocksize == 1) {
2534 /*
2535 * the AFP client advertises a devblocksize of 1
2536 * however, its BLOCKMAP routine maps to physical
2537 * blocks that are PAGE_SIZE in size...
2538 * therefore we can't ask for I/Os that aren't page aligned
2539 * or aren't multiples of PAGE_SIZE in size
2540 * by setting devblocksize to PAGE_SIZE, we re-instate
2541 * the old behavior we had before the mem_alignment_mask
2542 * changes went in...
2543 */
2544 devblocksize = PAGE_SIZE;
2545 }
2546
/* re-entered once per vector for as long as cluster_io_type() keeps reporting IO_DIRECT */
2547 next_dwrite:
2548 io_req_size = *write_length;
2549 iov_base = uio_curriovbase(uio);
2550
2551 offset_in_file = (u_int32_t)uio->uio_offset & PAGE_MASK;
2552 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
2553
2554 if (offset_in_file || offset_in_iovbase) {
2555 /*
2556 * one of the 2 important offsets is misaligned
2557 * so fire an I/O through the cache for this entire vector
2558 */
2559 goto wait_for_dwrites;
2560 }
2561 if (iov_base & (devblocksize - 1)) {
2562 /*
2563 * the offset in memory must be on a device block boundary
2564 * so that we can guarantee that we can generate an
2565 * I/O that ends on a page boundary in cluster_io
2566 */
2567 goto wait_for_dwrites;
2568 }
2569
2570 task_update_logical_writes(current_task(), (io_req_size & ~PAGE_MASK), TASK_WRITE_IMMEDIATE, vp);
/* only whole pages are written here; a sub-page remainder is left in io_req_size for the copy path */
2571 while (io_req_size >= PAGE_SIZE && uio->uio_offset < newEOF && retval == 0) {
2572 int throttle_type;
2573
2574 if ((throttle_type = cluster_is_throttled(vp))) {
2575 /*
2576 * we're in the throttle window, at the very least
2577 * we want to limit the size of the I/O we're about
2578 * to issue
2579 */
2580 if ((flags & IO_RETURN_ON_THROTTLE) && throttle_type == THROTTLE_NOW) {
2581 /*
2582 * we're in the throttle window and at least 1 I/O
2583 * has already been issued by a throttleable thread
2584 * in this window, so return with EAGAIN to indicate
2585 * to the FS issuing the cluster_write call that it
2586 * should now throttle after dropping any locks
2587 */
2588 throttle_info_update_by_mount(vp->v_mount);
2589
2590 io_throttled = TRUE;
2591 goto wait_for_dwrites;
2592 }
2593 max_vector_size = THROTTLE_MAX_IOSIZE;
2594 max_io_size = THROTTLE_MAX_IOSIZE;
2595 } else {
2596 max_vector_size = MAX_VECTOR_UPL_SIZE;
2597 max_io_size = max_upl_size;
2598 }
2599
/* cluster_syncup() is called once per call to this routine, ahead of the first direct I/O */
2600 if (first_IO) {
2601 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2602 first_IO = 0;
2603 }
2604 io_size = io_req_size & ~PAGE_MASK;
2605 iov_base = uio_curriovbase(uio);
2606
2607 if (io_size > max_io_size) {
2608 io_size = max_io_size;
2609 }
2610
2611 if (useVectorUPL && (iov_base & PAGE_MASK)) {
2612 /*
2613 * We have an iov_base that's not page-aligned.
2614 * Issue all I/O's that have been collected within
2615 * this Vectored UPL.
2616 */
2617 if (vector_upl_index) {
2618 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
/* NOTE(review): reset_vector_run_state() is a macro defined outside this section; presumably it clears the vector_upl_* accumulators - confirm */
2619 reset_vector_run_state();
2620 }
2621
2622 /*
2623 * After this point, if we are using the Vector UPL path and the base is
2624 * not page-aligned then the UPL with that base will be the first in the vector UPL.
2625 */
2626 }
2627
2628 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2629 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2630
2631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
2632 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
2633
2634 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
/*
 * up to 3 attempts, passing the attempt number through to
 * vm_map_get_upl(); an attempt succeeds only if every page in the
 * returned page list is valid, otherwise the UPL is aborted and
 * the next attempt is made
 */
2635 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2636 pages_in_pl = 0;
2637 upl_size = upl_needed_size;
2638 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2639 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2640
2641 kret = vm_map_get_upl(map,
2642 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2643 &upl_size,
2644 &upl,
2645 NULL,
2646 &pages_in_pl,
2647 &upl_flags,
2648 VM_KERN_MEMORY_FILE,
2649 force_data_sync);
2650
2651 if (kret != KERN_SUCCESS) {
2652 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2653 0, 0, 0, kret, 0);
2654 /*
2655 * failed to get pagelist
2656 *
2657 * we may have already spun some portion of this request
2658 * off as async requests... we need to wait for the I/O
2659 * to complete before returning
2660 */
2661 goto wait_for_dwrites;
2662 }
2663 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2664 pages_in_pl = upl_size / PAGE_SIZE;
2665
2666 for (i = 0; i < pages_in_pl; i++) {
2667 if (!upl_valid_page(pl, i)) {
2668 break;
2669 }
2670 }
2671 if (i == pages_in_pl) {
2672 break;
2673 }
2674
2675 /*
2676 * didn't get all the pages back that we
2677 * needed... release this upl and try again
2678 */
2679 ubc_upl_abort(upl, 0);
2680 }
2681 if (force_data_sync >= 3) {
2682 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2683 i, pages_in_pl, upl_size, kret, 0);
2684 /*
2685 * for some reason, we couldn't acquire a hold on all
2686 * the pages needed in the user's address space
2687 *
2688 * we may have already spun some portion of this request
2689 * off as async requests... we need to wait for the I/O
2690 * to complete before returning
2691 */
2692 goto wait_for_dwrites;
2693 }
2694
2695 /*
2696 * Consider the possibility that upl_size wasn't satisfied.
2697 */
2698 if (upl_size < upl_needed_size) {
2699 if (upl_size && upl_offset == 0) {
2700 io_size = upl_size;
2701 } else {
2702 io_size = 0;
2703 }
2704 }
2705 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
2706 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
2707
2708 if (io_size == 0) {
2709 ubc_upl_abort(upl, 0);
2710 /*
2711 * we may have already spun some portion of this request
2712 * off as async requests... we need to wait for the I/O
2713 * to complete before returning
2714 */
2715 goto wait_for_dwrites;
2716 }
2717
2718 if (useVectorUPL) {
2719 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
2720 if (end_off) {
2721 issueVectorUPL = 1;
2722 }
2723 /*
2724 * After this point, if we are using a vector UPL, then
2725 * either all the UPL elements end on a page boundary OR
2726 * this UPL is the last element because it does not end
2727 * on a page boundary.
2728 */
2729 }
2730
2731 /*
2732 * we want to push out these writes asynchronously so that we can overlap
2733 * the preparation of the next I/O
2734 * if there are already too many outstanding writes
2735 * wait until some complete before issuing the next
2736 */
2737 if (vp->v_mount->mnt_minsaturationbytecount) {
2738 bytes_outstanding_limit = vp->v_mount->mnt_minsaturationbytecount;
2739 } else {
2740 bytes_outstanding_limit = max_upl_size * IO_SCALE(vp, 2);
2741 }
2742
2743 cluster_iostate_wait(&iostate, bytes_outstanding_limit, "cluster_write_direct");
2744
2745 if (iostate.io_error) {
2746 /*
2747 * one of the earlier writes we issued ran into a hard error
2748 * don't issue any more writes, cleanup the UPL
2749 * that was just created but not used, then
2750 * go wait for all writes that are part of this stream
2751 * to complete before returning the error to the caller
2752 */
2753 ubc_upl_abort(upl, 0);
2754
2755 goto wait_for_dwrites;
2756 }
2757
2758 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
2759 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
2760
2761 if (!useVectorUPL) {
2762 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2763 io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2764 } else {
/* first sub-UPL of a new run: create the vector UPL and remember where the run starts */
2765 if (!vector_upl_index) {
2766 vector_upl = vector_upl_create(upl_offset);
2767 v_upl_uio_offset = uio->uio_offset;
2768 vector_upl_offset = upl_offset;
2769 }
2770
2771 vector_upl_set_subupl(vector_upl, upl, upl_size);
2772 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
2773 vector_upl_index++;
2774 vector_upl_iosize += io_size;
2775 vector_upl_size += upl_size;
2776
/*
 * issue the accumulated run when this sub-UPL doesn't end on a page
 * boundary, the element limit has been reached, or the run has
 * grown to max_vector_size
 */
2777 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
2778 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2779 reset_vector_run_state();
2780 }
2781 }
2782
2783 /*
2784 * update the uio structure to
2785 * reflect the I/O that we just issued
2786 */
2787 uio_update(uio, (user_size_t)io_size);
2788
2789 /*
2790 * in case we end up calling through to cluster_write_copy to finish
2791 * the tail of this request, we need to update the oldEOF so that we
2792 * don't zero-fill the head of a page if we've successfully written
2793 * data to that area... 'cluster_write_copy' will zero-fill the head of a
2794 * page that is beyond the oldEOF if the write is unaligned... we only
2795 * want that to happen for the very first page of the cluster_write,
2796 * NOT the first page of each vector making up a multi-vector write.
2797 */
2798 if (uio->uio_offset > oldEOF) {
2799 oldEOF = uio->uio_offset;
2800 }
2801
2802 io_req_size -= io_size;
2803
2804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
2805 (int)upl_offset, (int)uio->uio_offset, io_req_size, retval, 0);
2806 } /* end while */
2807
/* this vector was issued in full with no error: look at the next one and stay on this path if it is also IO_DIRECT */
2808 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0) {
2809 retval = cluster_io_type(uio, write_type, write_length, MIN_DIRECT_WRITE_SIZE);
2810
2811 if (retval == 0 && *write_type == IO_DIRECT) {
2812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_NONE,
2813 (int)uio->uio_offset, *write_length, (int)newEOF, 0, 0);
2814
2815 goto next_dwrite;
2816 }
2817 }
2818
2819 wait_for_dwrites:
2820
/* issue whatever is still collected in the vector UPL, but only if nothing has failed so far */
2821 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
2822 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
2823 reset_vector_run_state();
2824 }
2825 /*
2826 * make sure all async writes issued as part of this stream
2827 * have completed before we return
2828 */
2829 cluster_iostate_wait(&iostate, 0, "cluster_write_direct");
2830
/* an error recorded by the async I/O replaces retval */
2831 if (iostate.io_error) {
2832 retval = iostate.io_error;
2833 }
2834
2835 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
2836
/* setting EAGAIN here also keeps the copy fallback below from running on the throttled exit */
2837 if (io_throttled == TRUE && retval == 0) {
2838 retval = EAGAIN;
2839 }
2840
2841 if (io_req_size && retval == 0) {
2842 /*
2843 * we couldn't handle the tail of this request in DIRECT mode
2844 * so fire it through the copy path
2845 *
2846 * note that flags will never have IO_HEADZEROFILL or IO_TAILZEROFILL set
2847 * so we can just pass 0 in for the headOff and tailOff
2848 */
2849 if (uio->uio_offset > oldEOF) {
2850 oldEOF = uio->uio_offset;
2851 }
2852
2853 retval = cluster_write_copy(vp, uio, io_req_size, oldEOF, newEOF, (off_t)0, (off_t)0, flags, callback, callback_arg);
2854
2855 *write_type = IO_UNKNOWN;
2856 }
2857 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
2858 (int)uio->uio_offset, io_req_size, retval, 4, 0);
2859
2860 return retval;
2861 }
2862
2863
/*
 * cluster_write_contig
 *
 * Write from a physically contiguous source buffer (see the entry
 * conditions in the comment at the top of the body).  For each vector:
 *   - leading bytes up to the next device block boundary (or the whole
 *     request if it is shorter than a device block) go through
 *     cluster_align_phys_io()
 *   - the block-aligned middle is issued to cluster_io() with
 *     CL_DEV_MEMORY | CL_ASYNC in pieces of at most MAX_IO_CONTIG_SIZE
 *   - a trailing partial block goes through cluster_align_phys_io() once
 *     all async I/O has completed
 *
 * At most MAX_VECTS vectors (one UPL each) are handled per call; every UPL
 * obtained is released with ubc_upl_abort(upl, 0) before returning.
 *
 * When the main path runs to completion *write_type is either what
 * cluster_io_type() reported for the next vector or IO_UNKNOWN; the early
 * 'goto wait_for_cwrites' exits leave it untouched.
 *
 * Returns 0 or an errno value: EINVAL when the UPL can't be obtained, is
 * short, or the buffer address fails the mount's alignment mask; otherwise
 * the error reported by the I/O helpers.
 */
2864 static int
2865 cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, u_int32_t *write_length,
2866 int (*callback)(buf_t, void *), void *callback_arg, int bflag)
2867 {
2868 upl_page_info_t *pl;
2869 addr64_t src_paddr = 0;
2870 upl_t upl[MAX_VECTS];
2871 vm_offset_t upl_offset;
2872 u_int32_t tail_size = 0;
2873 u_int32_t io_size;
2874 u_int32_t xsize;
2875 upl_size_t upl_size;
2876 vm_size_t upl_needed_size;
2877 mach_msg_type_number_t pages_in_pl;
2878 upl_control_flags_t upl_flags;
2879 kern_return_t kret;
2880 struct clios iostate;
2881 int error = 0;
2882 int cur_upl = 0;
2883 int num_upl = 0;
2884 int n;
2885 user_addr_t iov_base;
2886 u_int32_t devblocksize;
2887 u_int32_t mem_alignment_mask;
2888
2889 /*
2890 * When we enter this routine, we know
2891 * -- the io_req_size will not exceed iov_len
2892 * -- the target address is physically contiguous
2893 */
2894 cluster_syncup(vp, newEOF, callback, callback_arg, callback ? PUSH_SYNC : 0);
2895
2896 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
2897 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
2898
/*
 * iostate tracks the async I/O issued from this call; its mutex is
 * destroyed again after the final cluster_iostate_wait() below
 */
2899 iostate.io_completed = 0;
2900 iostate.io_issued = 0;
2901 iostate.io_error = 0;
2902 iostate.io_wanted = 0;
2903
2904 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
2905
/* re-entered once per vector, with cur_upl advanced, while cluster_io_type() keeps reporting IO_CONTIG */
2906 next_cwrite:
2907 io_size = *write_length;
2908
2909 iov_base = uio_curriovbase(uio);
2910
2911 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
2912 upl_needed_size = upl_offset + io_size;
2913
2914 pages_in_pl = 0;
2915 upl_size = upl_needed_size;
2916 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
2917 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2918
2919 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
2920 kret = vm_map_get_upl(map,
2921 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2922 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
2923
2924 if (kret != KERN_SUCCESS) {
2925 /*
2926 * failed to get pagelist
2927 */
2928 error = EINVAL;
2929 goto wait_for_cwrites;
2930 }
/* counted only after success, so the release loop at the end never touches an unset slot */
2931 num_upl++;
2932
2933 /*
2934 * Consider the possibility that upl_size wasn't satisfied.
2935 */
2936 if (upl_size < upl_needed_size) {
2937 /*
2938 * This is a failure in the physical memory case.
2939 */
2940 error = EINVAL;
2941 goto wait_for_cwrites;
2942 }
2943 pl = ubc_upl_pageinfo(upl[cur_upl]);
2944
/* physical address of the first source byte: first page of the UPL plus the offset within that page */
2945 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
2946
/* head: runs while the file offset is off a device block boundary or less than one block remains */
2947 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2948 u_int32_t head_size;
2949
2950 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
2951
2952 if (head_size > io_size) {
2953 head_size = io_size;
2954 }
2955
2956 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0, callback, callback_arg);
2957
2958 if (error) {
2959 goto wait_for_cwrites;
2960 }
2961
2962 upl_offset += head_size;
2963 src_paddr += head_size;
2964 io_size -= head_size;
2965
2966 iov_base += head_size;
2967 }
2968 if ((u_int32_t)iov_base & mem_alignment_mask) {
2969 /*
2970 * request doesn't set up on a memory boundary
2971 * the underlying DMA engine can handle...
2972 * return an error instead of going through
2973 * the slow copy path since the intent of this
2974 * path is direct I/O from device memory
2975 */
2976 error = EINVAL;
2977 goto wait_for_cwrites;
2978 }
2979
/* split off the trailing partial device block; it is written after the async I/O has drained */
2980 tail_size = io_size & (devblocksize - 1);
2981 io_size -= tail_size;
2982
2983 while (io_size && error == 0) {
2984 if (io_size > MAX_IO_CONTIG_SIZE) {
2985 xsize = MAX_IO_CONTIG_SIZE;
2986 } else {
2987 xsize = io_size;
2988 }
2989 /*
2990 * request asynchronously so that we can overlap
2991 * the preparation of the next I/O... we'll do
2992 * the commit after all the I/O has completed
2993 * since it's all issued against the same UPL
2994 * if there are already too many outstanding writes
2995 * wait until some have completed before issuing the next
2996 */
2997 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig");
2998
2999 if (iostate.io_error) {
3000 /*
3001 * one of the earlier writes we issued ran into a hard error
3002 * don't issue any more writes...
3003 * go wait for all writes that are part of this stream
3004 * to complete before returning the error to the caller
3005 */
3006 goto wait_for_cwrites;
3007 }
3008 /*
3009 * issue an asynchronous write to cluster_io
3010 */
3011 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset,
3012 xsize, CL_DEV_MEMORY | CL_ASYNC | bflag, (buf_t)NULL, (struct clios *)&iostate, callback, callback_arg);
3013
3014 if (error == 0) {
3015 /*
3016 * The cluster_io write completed successfully,
3017 * update the uio structure
3018 */
3019 uio_update(uio, (user_size_t)xsize);
3020
3021 upl_offset += xsize;
3022 src_paddr += xsize;
3023 io_size -= xsize;
3024 }
3025 }
/* move on to the next vector only if nothing failed, no tail is pending and there is a free upl[] slot */
3026 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS) {
3027 error = cluster_io_type(uio, write_type, write_length, 0);
3028
3029 if (error == 0 && *write_type == IO_CONTIG) {
3030 cur_upl++;
3031 goto next_cwrite;
3032 }
3033 } else {
3034 *write_type = IO_UNKNOWN;
3035 }
3036
3037 wait_for_cwrites:
3038 /*
3039 * make sure all async writes that are part of this stream
3040 * have completed before we proceed
3041 */
3042 cluster_iostate_wait(&iostate, 0, "cluster_write_contig");
3043
/* an error recorded by the async I/O replaces 'error' */
3044 if (iostate.io_error) {
3045 error = iostate.io_error;
3046 }
3047
3048 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
3049
3050 if (error == 0 && tail_size) {
3051 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg);
3052 }
3053
3054 for (n = 0; n < num_upl; n++) {
3055 /*
3056 * just release our hold on each physically contiguous
3057 * region without changing any state
3058 */
3059 ubc_upl_abort(upl[n], 0);
3060 }
3061
3062 return error;
3063 }
3064
3065
3066 /*
3067 * need to avoid a race between an msync of a range of pages dirtied via mmap
3068 * vs a filesystem such as HFS deciding to write a 'hole' to disk via cluster_write's
3069 * zerofill mechanism before it has seen the VNOP_PAGEOUTs for the pages being msync'd
3070 *
3071 * we should never force-zero-fill pages that are already valid in the cache...
3072 * the entire page contains valid data (either from disk, zero-filled or dirtied
3073 * via an mmap) so we can only do damage by trying to zero-fill
3074 *
3075 */
3076 static int
3077 cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off_t zero_off, off_t upl_f_offset, int bytes_to_zero)
3078 {
3079 int zero_pg_index;
3080 boolean_t need_cluster_zero = TRUE;
3081
3082 if ((flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
3083 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
3084 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
3085
3086 if (upl_valid_page(pl, zero_pg_index)) {
3087 /*
3088 * never force zero valid pages - dirty or clean
3089 * we'll leave these in the UPL for cluster_write_copy to deal with
3090 */
3091 need_cluster_zero = FALSE;
3092 }
3093 }
3094 if (need_cluster_zero == TRUE) {
3095 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
3096 }
3097
3098 return bytes_to_zero;
3099 }
3100
3101
3102 void
3103 cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
3104 {
3105 struct cl_extent cl;
3106 boolean_t first_pass = TRUE;
3107
3108 assert(s_offset < e_offset);
3109 assert((s_offset & PAGE_MASK_64) == 0);
3110 assert((e_offset & PAGE_MASK_64) == 0);
3111
3112 cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
3113 cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
3114
3115 cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
3116 vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
3117 }
3118
3119
/*
 * cluster_update_state_internal
 *
 * Record the page extent described by 'cl' (b_addr .. e_addr, in units of
 * pages) in the write-behind state of 'vp' rather than issuing I/O for it
 * right away.
 *
 * The write-behind lock is obtained through cluster_get_wbp(..., CLW_RETURNLOCKED)
 * and is dropped on every return path of this function.
 *
 * Outline:
 *   - if a sparse cluster map (cl_scmap) is active, either add the extent to it
 *     (cached writes) or push the whole map and fall back to normal clustering
 *     (IO_NOCACHE writes)
 *   - on the first pass for a request, update the sequential-write bookkeeping
 *     (cl_seq_written / cl_last_write)
 *   - try to merge the extent into the existing clusters, each of which is
 *     limited to max_cluster_pgcount pages; a partial merge trims cl->b_addr or
 *     cl->e_addr, so the caller's extent may be modified
 *   - whatever is left goes into a new cluster if a slot is free (after an
 *     optional push of existing clusters); otherwise the vnode is switched to
 *     the sparse cluster mechanism
 *
 * vp            vnode whose write-behind state is updated
 * cl            extent to record; may be trimmed in place
 * flags         IO_* flags; IO_NOCACHE and IO_PASSIVE are examined here
 * defer_writes  when TRUE, no cluster_try_push is issued from this function
 * first_pass    in/out; when TRUE the sequential-write counters are updated
 *               and the flag is cleared
 * write_off     byte offset of the caller's write (for sequential detection)
 * write_cnt     byte count of the caller's write
 * newEOF        passed through to the push / sparse-cluster helpers
 * callback,
 * callback_arg  passed through to the push / sparse-cluster helpers
 * vm_initiated  passed through to the push / sparse-cluster helpers
 *
 * NOTE(review): wbp is dereferenced without a NULL check; this presumably
 * relies on CLW_ALLOCATE guaranteeing a non-NULL result -- confirm against
 * cluster_get_wbp.
 */
3120 static void
3121 cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
3122 boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
3123 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
3124 {
3125 struct cl_writebehind *wbp;
3126 int cl_index;
3127 int ret_cluster_try_push;
3128 u_int max_cluster_pgcount;
3129
3130
/* largest span, in pages, that a single write-behind cluster may cover */
3131 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
3132
3133 /*
3134 * take the lock to protect our accesses
3135 * of the writebehind and sparse cluster state
3136 */
3137 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3138
3139 if (wbp->cl_scmap) {
3140 if (!(flags & IO_NOCACHE)) {
3141 /*
3142 * we've fallen into the sparse
3143 * cluster method of delaying dirty pages
3144 */
3145 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3146
3147 lck_mtx_unlock(&wbp->cl_lockw);
3148 return;
3149 }
3150 /*
3151 * must have done cached writes that fell into
3152 * the sparse cluster mechanism... we've switched
3153 * to uncached writes on the file, so go ahead
3154 * and push whatever's in the sparse map
3155 * and switch back to normal clustering
3156 */
3157 wbp->cl_number = 0;
3158
3159 sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
3160 /*
3161 * no clusters of either type present at this point
3162 * so just go directly to start_new_cluster since
3163 * we know we need to delay this I/O since we've
3164 * already released the pages back into the cache
3165 * to avoid the deadlock with sparse_cluster_push
3166 */
3167 goto start_new_cluster;
3168 }
/*
 * sequential-write accounting is done once per caller request: a write that
 * starts where the previous one ended extends the running total, anything
 * else restarts it
 */
3169 if (*first_pass == TRUE) {
3170 if (write_off == wbp->cl_last_write) {
3171 wbp->cl_seq_written += write_cnt;
3172 } else {
3173 wbp->cl_seq_written = write_cnt;
3174 }
3175
3176 wbp->cl_last_write = write_off + write_cnt;
3177
3178 *first_pass = FALSE;
3179 }
3180 if (wbp->cl_number == 0) {
3181 /*
3182 * no clusters currently present
3183 */
3184 goto start_new_cluster;
3185 }
3186
3187 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3188 /*
3189 * check each cluster that we currently hold
3190 * try to merge some or all of this write into
3191 * one or more of the existing clusters... if
3192 * any portion of the write remains, start a
3193 * new cluster
3194 */
3195 if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
3196 /*
3197 * the current write starts at or after the current cluster
3198 */
3199 if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3200 /*
3201 * we have a write that fits entirely
3202 * within the existing cluster limits
3203 */
3204 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3205 /*
3206 * update our idea of where the cluster ends
3207 */
3208 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3209 }
3210 break;
3211 }
3212 if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
3213 /*
3214 * we have a write that starts in the middle of the current cluster
3215 * but extends beyond the cluster's limit... we know this because
3216 * of the previous checks
3217 * we'll extend the current cluster to the max
3218 * and update the b_addr for the current write to reflect that
3219 * the head of it was absorbed into this cluster...
3220 * note that we'll always have a leftover tail in this case since
3221 * full absorption would have occurred in the clause above
3222 */
3223 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
3224
3225 cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
3226 }
3227 /*
3228 * we come here for the case where the current write starts
3229 * beyond the limit of the existing cluster or we have a leftover
3230 * tail after a partial absorption
3231 *
3232 * in either case, we'll check the remaining clusters before
3233 * starting a new one
3234 */
3235 } else {
3236 /*
3237 * the current write starts in front of the cluster we're currently considering
3238 */
3239 if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
3240 /*
3241 * we can just merge the new request into
3242 * this cluster and leave it in the cache
3243 * since the resulting cluster is still
3244 * less than the maximum allowable size
3245 */
3246 wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
3247
3248 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
3249 /*
3250 * the current write completely
3251 * envelops the existing cluster and since
3252 * each write is limited to at most max_cluster_pgcount pages
3253 * we can just use the start and last blocknos of the write
3254 * to generate the cluster limits
3255 */
3256 wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
3257 }
3258 break;
3259 }
3260 /*
3261 * if we were to combine this write with the current cluster
3262 * we would exceed the cluster size limit.... so,
3263 * let's see if there's any overlap of the new I/O with
3264 * the cluster we're currently considering... in fact, we'll
3265 * stretch the cluster out to its full limit and see if we
3266 * get an intersection with the current write
3267 *
3268 */
3269 if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
3270 /*
3271 * the current write extends into the proposed cluster
3272 * clip the length of the current write after first combining its
3273 * tail with the newly shaped cluster
3274 */
3275 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
3276
3277 cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
3278 }
3279 /*
3280 * if we get here, there was no way to merge
3281 * any portion of this write with this cluster
3282 * or we could only merge part of it which
3283 * will leave a tail...
3284 * we'll check the remaining clusters before starting a new one
3285 */
3286 }
3287 }
/* the loop above only exits early (via break) when the write was fully absorbed */
3288 if (cl_index < wbp->cl_number) {
3289 /*
3290 * we found an existing cluster(s) that we
3291 * could entirely merge this I/O into
3292 */
3293 goto delay_io;
3294 }
3295
/*
 * every cluster slot is in use and the sequential byte count has reached
 * MAX_CLUSTERS full clusters: push 'n' clusters now.  'n' comes from the
 * mount's mnt_minsaturationbytecount (capped at MAX_CLUSTERS) when that is
 * set and yields a non-zero count, otherwise from the SSD / non-SSD
 * write-behind constants
 */
3296 if (defer_writes == FALSE &&
3297 wbp->cl_number == MAX_CLUSTERS &&
3298 wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
3299 uint32_t n;
3300
3301 if (vp->v_mount->mnt_minsaturationbytecount) {
3302 n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
3303
3304 if (n > MAX_CLUSTERS) {
3305 n = MAX_CLUSTERS;
3306 }
3307 } else {
3308 n = 0;
3309 }
3310
3311 if (n == 0) {
3312 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
3313 n = WRITE_BEHIND_SSD;
3314 } else {
3315 n = WRITE_BEHIND;
3316 }
3317 }
3318 while (n--) {
3319 cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
3320 }
3321 }
3322 if (wbp->cl_number < MAX_CLUSTERS) {
3323 /*
3324 * we didn't find an existing cluster to
3325 * merge into, but there's room to start
3326 * a new one
3327 */
3328 goto start_new_cluster;
3329 }
3330 /*
3331 * no existing cluster to merge with and no
3332 * room to start a new one... we'll try
3333 * pushing one of the existing ones... if none of
3334 * them are able to be pushed, we'll switch
3335 * to the sparse cluster mechanism
3336 * cluster_try_push updates cl_number to the
3337 * number of remaining clusters... and
3338 * returns the number of currently unused clusters
3339 */
3340 ret_cluster_try_push = 0;
3341
3342 /*
3343 * if writes are not deferred, call cluster push immediately
3344 */
3345 if (defer_writes == FALSE) {
3346 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
3347 }
3348 /*
3349 * execute following regardless of writes being deferred or not
3350 */
3351 if (ret_cluster_try_push == 0) {
3352 /*
3353 * no more room in the normal cluster mechanism
3354 * so let's switch to the more expansive but expensive
3355 * sparse mechanism....
3356 */
3357 sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
3358 sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
3359
3360 lck_mtx_unlock(&wbp->cl_lockw);
3361 return;
3362 }
/*
 * record the (possibly trimmed) extent in the next free cluster slot and tag
 * it with the caller's no-cache / passive hints
 */
3363 start_new_cluster:
3364 wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
3365 wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
3366
3367 wbp->cl_clusters[wbp->cl_number].io_flags = 0;
3368
3369 if (flags & IO_NOCACHE) {
3370 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
3371 }
3372
3373 if (flags & IO_PASSIVE) {
3374 wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
3375 }
3376
3377 wbp->cl_number++;
3378 delay_io:
3379 lck_mtx_unlock(&wbp->cl_lockw);
3380 return;
3381 }
3382
3383
/*
 * cluster_write_copy
 *
 * Cached-write path: move the caller's data (and any required zero fill) into
 * the file's pages by way of UPLs, then either push the affected extent
 * synchronously (IO_SYNC, via cluster_push_now) or record it in the vnode's
 * write-behind cluster state (cluster_update_state_internal).
 *
 * vp            vnode being written
 * uio           source of the data; may be NULL, in which case only head
 *               zero-fill (headOff .. newEOF) is performed
 * io_req_size   number of bytes to take from uio (ignored when uio is NULL)
 * oldEOF,
 * newEOF        EOF values supplied by the caller; used to decide which
 *               partial pages must be pre-read and what must be zero-filled
 * headOff       start of the head zero-fill range when IO_HEADZEROFILL is set
 * tailOff       end of the tail zero-fill range when IO_TAILZEROFILL is set
 * flags         IO_* flags; IO_PASSIVE, IO_NOCACHE, IO_SKIP_ENCRYPTION,
 *               IO_HEADZEROFILL, IO_TAILZEROFILL and IO_SYNC are examined
 *               here, and the flags are also passed to cluster_zero_range
 * callback,
 * callback_arg  passed through to cluster_io and the cluster-state helpers
 *
 * Returns 0 on success, otherwise the error reported by
 * cluster_copy_ubc_data_internal, cluster_io, cluster_copy_upl_data or
 * cluster_push_now.  Panics if ubc_create_upl_kernel fails.
 */
3384 static int
3385 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
3386 off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3387 {
3388 upl_page_info_t *pl;
3389 upl_t upl;
3390 vm_offset_t upl_offset = 0;
3391 vm_size_t upl_size;
3392 off_t upl_f_offset;
3393 int pages_in_upl;
3394 int start_offset;
3395 int xfer_resid;
3396 int io_size;
3397 int io_offset;
3398 int bytes_to_zero;
3399 int bytes_to_move;
3400 kern_return_t kret;
3401 int retval = 0;
3402 int io_resid;
3403 long long total_size;
3404 long long zero_cnt;
3405 off_t zero_off;
3406 long long zero_cnt1;
3407 off_t zero_off1;
3408 off_t write_off = 0;
3409 int write_cnt = 0;
3410 boolean_t first_pass = FALSE;
3411 struct cl_extent cl;
3412 int bflag;
3413 u_int max_io_size;
3414
3415 if (uio) {
3416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3417 (int)uio->uio_offset, io_req_size, (int)oldEOF, (int)newEOF, 0);
3418
3419 io_resid = io_req_size;
3420 } else {
3421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
3422 0, 0, (int)oldEOF, (int)newEOF, 0);
3423
3424 io_resid = 0;
3425 }
/* translate the caller's IO_* flags into the CL_* flags handed to cluster_io */
3426 if (flags & IO_PASSIVE) {
3427 bflag = CL_PASSIVE;
3428 } else {
3429 bflag = 0;
3430 }
3431 if (flags & IO_NOCACHE) {
3432 bflag |= CL_NOCACHE;
3433 }
3434
3435 if (flags & IO_SKIP_ENCRYPTION) {
3436 bflag |= CL_ENCRYPTED;
3437 }
3438
/*
 * zero_off / zero_cnt describe the zero fill that precedes the data,
 * zero_off1 / zero_cnt1 the zero fill that follows it
 */
3439 zero_cnt = 0;
3440 zero_cnt1 = 0;
3441 zero_off = 0;
3442 zero_off1 = 0;
3443
3444 max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
3445
3446 if (flags & IO_HEADZEROFILL) {
3447 /*
3448 * some filesystems (HFS is one) don't support unallocated holes within a file...
3449 * so we zero fill the intervening space between the old EOF and the offset
3450 * where the next chunk of real data begins.... ftruncate will also use this
3451 * routine to zero fill to the new EOF when growing a file... in this case, the
3452 * uio structure will not be provided
3453 */
3454 if (uio) {
3455 if (headOff < uio->uio_offset) {
3456 zero_cnt = uio->uio_offset - headOff;
3457 zero_off = headOff;
3458 }
3459 } else if (headOff < newEOF) {
3460 zero_cnt = newEOF - headOff;
3461 zero_off = headOff;
3462 }
3463 } else {
/*
 * no explicit head zero-fill requested: if the write starts beyond the old
 * EOF and the page it starts in lies entirely at or past the old EOF, zero
 * the part of that page that precedes the data
 */
3464 if (uio && uio->uio_offset > oldEOF) {
3465 zero_off = uio->uio_offset & ~PAGE_MASK_64;
3466
3467 if (zero_off >= oldEOF) {
3468 zero_cnt = uio->uio_offset - zero_off;
3469
3470 flags |= IO_HEADZEROFILL;
3471 }
3472 }
3473 }
3474 if (flags & IO_TAILZEROFILL) {
3475 if (uio) {
3476 zero_off1 = uio->uio_offset + io_req_size;
3477
3478 if (zero_off1 < tailOff) {
3479 zero_cnt1 = tailOff - zero_off1;
3480 }
3481 }
3482 } else {
/*
 * no explicit tail zero-fill requested: if this write ends exactly at the
 * new (larger) EOF and not on a page boundary, zero the rest of that page
 */
3483 if (uio && newEOF > oldEOF) {
3484 zero_off1 = uio->uio_offset + io_req_size;
3485
3486 if (zero_off1 == newEOF && (zero_off1 & PAGE_MASK_64)) {
3487 zero_cnt1 = PAGE_SIZE_64 - (zero_off1 & PAGE_MASK_64);
3488
3489 flags |= IO_TAILZEROFILL;
3490 }
3491 }
3492 }
/* nothing to zero and no data to copy */
3493 if (zero_cnt == 0 && uio == (struct uio *) 0) {
3494 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
3495 retval, 0, 0, 0, 0);
3496 return 0;
3497 }
3498 if (uio) {
3499 write_off = uio->uio_offset;
3500 write_cnt = uio_resid(uio);
3501 /*
3502 * delay updating the sequential write info
3503 * in the control block until we've obtained
3504 * the lock for it
3505 */
3506 first_pass = TRUE;
3507 }
/*
 * each iteration handles at most max_io_size bytes of the remaining
 * head zero-fill + data + tail zero-fill
 */
3508 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
3509 /*
3510 * for this iteration of the loop, figure out where our starting point is
3511 */
3512 if (zero_cnt) {
3513 start_offset = (int)(zero_off & PAGE_MASK_64);
3514 upl_f_offset = zero_off - start_offset;
3515 } else if (io_resid) {
3516 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3517 upl_f_offset = uio->uio_offset - start_offset;
3518 } else {
3519 start_offset = (int)(zero_off1 & PAGE_MASK_64);
3520 upl_f_offset = zero_off1 - start_offset;
3521 }
3522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
3523 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
3524
3525 if (total_size > max_io_size) {
3526 total_size = max_io_size;
3527 }
3528
3529 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
3530
/*
 * plain asynchronous write with no zero fill: first try to copy the data
 * with cluster_copy_ubc_data_internal, which does not need a UPL here;
 * whatever it could not copy (xfer_resid) falls through to the UPL path
 */
3531 if (uio && ((flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
3532 /*
3533 * assumption... total_size <= io_resid
3534 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
3535 */
3536 if ((start_offset + total_size) > max_io_size) {
3537 total_size = max_io_size - start_offset;
3538 }
3539 xfer_resid = total_size;
3540
3541 retval = cluster_copy_ubc_data_internal(vp, uio, &xfer_resid, 1, 1);
3542
3543 if (retval) {
3544 break;
3545 }
3546
3547 io_resid -= (total_size - xfer_resid);
3548 total_size = xfer_resid;
3549 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3550 upl_f_offset = uio->uio_offset - start_offset;
3551
3552 if (total_size == 0) {
3553 if (start_offset) {
3554 /*
3555 * the write did not finish on a page boundary
3556 * which will leave upl_f_offset pointing to the
3557 * beginning of the last page written instead of
3558 * the page beyond it... bump it in this case
3559 * so that the cluster code records the last page
3560 * written as dirty
3561 */
3562 upl_f_offset += PAGE_SIZE_64;
3563 }
3564 upl_size = 0;
3565
3566 goto check_cluster;
3567 }
3568 }
3569 /*
3570 * compute the size of the upl needed to encompass
3571 * the requested write... limit each call to cluster_io
3572 * to the maximum UPL size... cluster_io will clip if
3573 * this exceeds the maximum io_size for the device,
3574 * make sure to account for
3575 * a starting offset that's not page aligned
3576 */
3577 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3578
3579 if (upl_size > max_io_size) {
3580 upl_size = max_io_size;
3581 }
3582
/* NOTE(review): pages_in_upl is assigned here but never read in this function */
3583 pages_in_upl = upl_size / PAGE_SIZE;
3584 io_size = upl_size - start_offset;
3585
3586 if ((long long)io_size > total_size) {
3587 io_size = total_size;
3588 }
3589
3590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
3591
3592
3593 /*
3594 * Gather the pages from the buffer cache.
3595 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
3596 * that we intend to modify these pages.
3597 */
3598 kret = ubc_create_upl_kernel(vp,
3599 upl_f_offset,
3600 upl_size,
3601 &upl,
3602 &pl,
3603 UPL_SET_LITE | ((uio != NULL && (uio->uio_flags & UIO_FLAGS_IS_COMPRESSED_FILE)) ? 0 : UPL_WILL_MODIFY),
3604 VM_KERN_MEMORY_FILE);
3605 if (kret != KERN_SUCCESS) {
3606 panic("cluster_write_copy: failed to get pagelist");
3607 }
3608
3609 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
3610 upl, (int)upl_f_offset, start_offset, 0, 0);
3611
3612 if (start_offset && upl_f_offset < oldEOF && !upl_valid_page(pl, 0)) {
3613 int read_size;
3614
3615 /*
3616 * we're starting in the middle of the first page of the upl
3617 * and the page isn't currently valid, so we're going to have
3618 * to read it in first... this is a synchronous operation
3619 */
3620 read_size = PAGE_SIZE;
3621
3622 if ((upl_f_offset + read_size) > oldEOF) {
3623 read_size = oldEOF - upl_f_offset;
3624 }
3625
3626 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
3627 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3628 if (retval) {
3629 /*
3630 * we had an error during the read which causes us to abort
3631 * the current cluster_write request... before we do, we need
3632 * to release the rest of the pages in the upl without modifying
3633 * their state and mark the failed page in error
3634 */
3635 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3636
3637 if (upl_size > PAGE_SIZE) {
3638 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3639 }
3640
3641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3642 upl, 0, 0, retval, 0);
3643 break;
3644 }
3645 }
3646 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
3647 /*
3648 * the last offset we're writing to in this upl does not end on a page
3649 * boundary... if it's not beyond the old EOF, then we'll also need to
3650 * pre-read this page in if it isn't already valid
3651 */
3652 upl_offset = upl_size - PAGE_SIZE;
3653
3654 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
3655 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
3656 int read_size;
3657
3658 read_size = PAGE_SIZE;
3659
3660 if ((off_t)(upl_f_offset + upl_offset + read_size) > oldEOF) {
3661 read_size = oldEOF - (upl_f_offset + upl_offset);
3662 }
3663
3664 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
3665 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
3666 if (retval) {
3667 /*
3668 * we had an error during the read which causes us to abort
3669 * the current cluster_write request... before we do, we
3670 * need to release the rest of the pages in the upl without
3671 * modifying their state and mark the failed page in error
3672 */
3673 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3674
3675 if (upl_size > PAGE_SIZE) {
3676 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3677 }
3678
3679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3680 upl, 0, 0, retval, 0);
3681 break;
3682 }
3683 }
3684 }
/*
 * fill the UPL in file order: head zero-fill, then the caller's data,
 * then tail zero-fill; io_offset tracks the current byte position in the UPL
 */
3685 xfer_resid = io_size;
3686 io_offset = start_offset;
3687
3688 while (zero_cnt && xfer_resid) {
3689 if (zero_cnt < (long long)xfer_resid) {
3690 bytes_to_zero = zero_cnt;
3691 } else {
3692 bytes_to_zero = xfer_resid;
3693 }
3694
3695 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off, upl_f_offset, bytes_to_zero);
3696
3697 xfer_resid -= bytes_to_zero;
3698 zero_cnt -= bytes_to_zero;
3699 zero_off += bytes_to_zero;
3700 io_offset += bytes_to_zero;
3701 }
3702 if (xfer_resid && io_resid) {
3703 u_int32_t io_requested;
3704
3705 bytes_to_move = min(io_resid, xfer_resid);
3706 io_requested = bytes_to_move;
3707
3708 retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
3709
3710 if (retval) {
/* copy failed: drop the whole UPL; the retval checks below skip the rest and end the loop */
3711 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3712
3713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
3714 upl, 0, 0, retval, 0);
3715 } else {
3716 io_resid -= bytes_to_move;
3717 xfer_resid -= bytes_to_move;
3718 io_offset += bytes_to_move;
3719 }
3720 }
3721 while (xfer_resid && zero_cnt1 && retval == 0) {
3722 if (zero_cnt1 < (long long)xfer_resid) {
3723 bytes_to_zero = zero_cnt1;
3724 } else {
3725 bytes_to_zero = xfer_resid;
3726 }
3727
3728 bytes_to_zero = cluster_zero_range(upl, pl, flags, io_offset, zero_off1, upl_f_offset, bytes_to_zero);
3729
3730 xfer_resid -= bytes_to_zero;
3731 zero_cnt1 -= bytes_to_zero;
3732 zero_off1 += bytes_to_zero;
3733 io_offset += bytes_to_zero;
3734 }
3735 if (retval == 0) {
3736 int do_zeroing = 1;
3737
/* from here on io_size is the end offset of the written data within the UPL */
3738 io_size += start_offset;
3739
3740 /* Force more restrictive zeroing behavior only on APFS */
3741 if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
3742 do_zeroing = 0;
3743 }
3744
3745 if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
3746 /*
3747 * if we're extending the file with this write
3748 * we'll zero fill the rest of the page so that
3749 * if the file gets extended again in such a way as to leave a
3750 * hole starting at this EOF, we'll have zero's in the correct spot
3751 */
3752 cluster_zero(upl, io_size, upl_size - io_size, NULL);
3753 }
3754 /*
3755 * release the upl now if we hold one since...
3756 * 1) pages in it may be present in the sparse cluster map
3757 * and may span 2 separate buckets there... if they do and
3758 * we happen to have to flush a bucket to make room and it intersects
3759 * this upl, a deadlock may result on page BUSY
3760 * 2) we're delaying the I/O... from this point forward we're just updating
3761 * the cluster state... no need to hold the pages, so commit them
3762 * 3) IO_SYNC is set...
3763 * because we had to ask for a UPL that provides currently non-present pages, the
3764 * UPL has been automatically set to clear the dirty flags (both software and hardware)
3765 * upon committing it... this is not the behavior we want since it's possible for
3766 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
3767 * we'll pick these pages back up later with the correct behavior specified.
3768 * 4) we don't want to hold pages busy in a UPL and then block on the cluster lock... if a flush
3769 * of this vnode is in progress, we will deadlock if the pages being flushed intersect the pages
3770 * we hold since the flushing context is holding the cluster lock.
3771 */
3772 ubc_upl_commit_range(upl, 0, upl_size,
3773 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
3774 check_cluster:
3775 /*
3776 * calculate the last logical block number
3777 * that this delayed I/O encompassed
3778 */
3779 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
3780
3781 if (flags & IO_SYNC) {
3782 /*
3783 * if the IO_SYNC flag is set then we need to bypass
3784 * any clustering and immediately issue the I/O
3785 *
3786 * we don't hold the lock at this point
3787 *
3788 * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
3789 * so that we correctly deal with a change in state of the hardware modify bit...
3790 * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
3791 * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
3792 * responsible for generating the correct sized I/O(s)
3793 */
3794 retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
3795 } else {
3796 boolean_t defer_writes = FALSE;
3797
3798 if (vfs_flags(vp->v_mount) & MNT_DEFWRITE) {
3799 defer_writes = TRUE;
3800 }
3801
3802 cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
3803 write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
3804 }
3805 }
3806 }
3807 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
3808
3809 return retval;
3810 }
3811
3812
3813
3814 int
3815 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
3816 {
3817 return cluster_read_ext(vp, uio, filesize, xflags, NULL, NULL);
3818 }
3819
3820
3821 int
3822 cluster_read_ext(vnode_t vp, struct uio *uio, off_t filesize, int xflags, int (*callback)(buf_t, void *), void *callback_arg)
3823 {
3824 int retval = 0;
3825 int flags;
3826 user_ssize_t cur_resid;
3827 u_int32_t io_size;
3828 u_int32_t read_length = 0;
3829 int read_type = IO_COPY;
3830
3831 flags = xflags;
3832
3833 if (vp->v_flag & VNOCACHE_DATA) {
3834 flags |= IO_NOCACHE;
3835 }
3836 if ((vp->v_flag & VRAOFF) || speculative_reads_disabled) {
3837 flags |= IO_RAOFF;
3838 }
3839
3840 if (flags & IO_SKIP_ENCRYPTION) {
3841 flags |= IO_ENCRYPTED;
3842 }
3843
3844 /*
3845 * do a read through the cache if one of the following is true....
3846 * NOCACHE is not true
3847 * the uio request doesn't target USERSPACE
3848 * Alternatively, if IO_ENCRYPTED is set, then we want to bypass the cache as well.
3849 * Reading encrypted data from a CP filesystem should never result in the data touching
3850 * the UBC.
3851 *
3852 * otherwise, find out if we want the direct or contig variant for
3853 * the first vector in the uio request
3854 */
3855 if (((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg)) || (flags & IO_ENCRYPTED)) {
3856 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3857 }
3858
3859 while ((cur_resid = uio_resid(uio)) && uio->uio_offset < filesize && retval == 0) {
3860 switch (read_type) {
3861 case IO_COPY:
3862 /*
3863 * make sure the uio_resid isn't too big...
3864 * internally, we want to handle all of the I/O in
3865 * chunk sizes that fit in a 32 bit int
3866 */
3867 if (cur_resid > (user_ssize_t)(MAX_IO_REQUEST_SIZE)) {
3868 io_size = MAX_IO_REQUEST_SIZE;
3869 } else {
3870 io_size = (u_int32_t)cur_resid;
3871 }
3872
3873 retval = cluster_read_copy(vp, uio, io_size, filesize, flags, callback, callback_arg);
3874 break;
3875
3876 case IO_DIRECT:
3877 retval = cluster_read_direct(vp, uio, filesize, &read_type, &read_length, flags, callback, callback_arg);
3878 break;
3879
3880 case IO_CONTIG:
3881 retval = cluster_read_contig(vp, uio, filesize, &read_type, &read_length, callback, callback_arg, flags);
3882 break;
3883
3884 case IO_UNKNOWN:
3885 retval = cluster_io_type(uio, &read_type, &read_length, 0);
3886 break;
3887 }
3888 }
3889 return retval;
3890 }
3891
3892
3893
3894 static void
3895 cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference)
3896 {
3897 int range;
3898 int abort_flags = UPL_ABORT_FREE_ON_EMPTY;
3899
3900 if ((range = last_pg - start_pg)) {
3901 if (take_reference) {
3902 abort_flags |= UPL_ABORT_REFERENCE;
3903 }
3904
3905 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, range * PAGE_SIZE, abort_flags);
3906 }
3907 }
3908
3909
3910 static int
3911 cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t filesize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
3912 {
3913 upl_page_info_t *pl;
3914 upl_t upl;
3915 vm_offset_t upl_offset;
3916 u_int32_t upl_size;
3917 off_t upl_f_offset;
3918 int start_offset;
3919 int start_pg;
3920 int last_pg;
3921 int uio_last = 0;
3922 int pages_in_upl;
3923 off_t max_size;
3924 off_t last_ioread_offset;
3925 off_t last_request_offset;
3926 kern_return_t kret;
3927 int error = 0;
3928 int retval = 0;
3929 u_int32_t size_of_prefetch;
3930 u_int32_t xsize;
3931 u_int32_t io_size;
3932 u_int32_t max_rd_size;
3933 u_int32_t max_io_size;
3934 u_int32_t max_prefetch;
3935 u_int rd_ahead_enabled = 1;
3936 u_int prefetch_enabled = 1;
3937 struct cl_readahead * rap;
3938 struct clios iostate;
3939 struct cl_extent extent;
3940 int bflag;
3941 int take_reference = 1;
3942 int policy = IOPOL_DEFAULT;
3943 boolean_t iolock_inited = FALSE;
3944
3945 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
3946 (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0);
3947
3948 if (flags & IO_ENCRYPTED) {
3949 panic("encrypted blocks will hit UBC!");
3950 }
3951
3952 policy = throttle_get_io_policy(NULL);
3953
3954 if (policy == THROTTLE_LEVEL_TIER3 || policy == THROTTLE_LEVEL_TIER2 || (flags & IO_NOCACHE)) {
3955 take_reference = 0;
3956 }
3957
3958 if (flags & IO_PASSIVE) {
3959 bflag = CL_PASSIVE;
3960 } else {
3961 bflag = 0;
3962 }
3963
3964 if (flags & IO_NOCACHE) {
3965 bflag |= CL_NOCACHE;
3966 }
3967
3968 if (flags & IO_SKIP_ENCRYPTION) {
3969 bflag |= CL_ENCRYPTED;
3970 }
3971
3972 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
3973 max_prefetch = MAX_PREFETCH(vp, max_io_size, disk_conditioner_mount_is_ssd(vp->v_mount));
3974 max_rd_size = max_prefetch;
3975
3976 last_request_offset = uio->uio_offset + io_req_size;
3977
3978 if (last_request_offset > filesize) {
3979 last_request_offset = filesize;
3980 }
3981
3982 if ((flags & (IO_RAOFF | IO_NOCACHE)) || ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
3983 rd_ahead_enabled = 0;
3984 rap = NULL;
3985 } else {
3986 if (cluster_is_throttled(vp)) {
3987 /*
3988 * we're in the throttle window, at the very least
3989 * we want to limit the size of the I/O we're about
3990 * to issue
3991 */
3992 rd_ahead_enabled = 0;
3993 prefetch_enabled = 0;
3994
3995 max_rd_size = THROTTLE_MAX_IOSIZE;
3996 }
3997 if ((rap = cluster_get_rap(vp)) == NULL) {
3998 rd_ahead_enabled = 0;
3999 } else {
4000 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
4001 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
4002 }
4003 }
4004 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
4005 /*
4006 * determine if we already have a read-ahead in the pipe courtesy of the
4007 * last read systemcall that was issued...
4008 * if so, pick up it's extent to determine where we should start
4009 * with respect to any read-ahead that might be necessary to
4010 * garner all the data needed to complete this read systemcall
4011 */
4012 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
4013
4014 if (last_ioread_offset < uio->uio_offset) {
4015 last_ioread_offset = (off_t)0;
4016 } else if (last_ioread_offset > last_request_offset) {
4017 last_ioread_offset = last_request_offset;
4018 }
4019 } else {
4020 last_ioread_offset = (off_t)0;
4021 }
4022
4023 while (io_req_size && uio->uio_offset < filesize && retval == 0) {
4024 max_size = filesize - uio->uio_offset;
4025
4026 if ((off_t)(io_req_size) < max_size) {
4027 io_size = io_req_size;
4028 } else {
4029 io_size = max_size;
4030 }
4031
4032 if (!(flags & IO_NOCACHE)) {
4033 while (io_size) {
4034 u_int32_t io_resid;
4035 u_int32_t io_requested;
4036
4037 /*
4038 * if we keep finding the pages we need already in the cache, then
4039 * don't bother to call cluster_read_prefetch since it costs CPU cycles
4040 * to determine that we have all the pages we need... once we miss in
4041 * the cache and have issued an I/O, than we'll assume that we're likely
4042 * to continue to miss in the cache and it's to our advantage to try and prefetch
4043 */
4044 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
4045 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
4046 /*
4047 * we've already issued I/O for this request and
4048 * there's still work to do and
4049 * our prefetch stream is running dry, so issue a
4050 * pre-fetch I/O... the I/O latency will overlap
4051 * with the copying of the data
4052 */
4053 if (size_of_prefetch > max_rd_size) {
4054 size_of_prefetch = max_rd_size;
4055 }
4056
4057 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4058
4059 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4060
4061 if (last_ioread_offset > last_request_offset) {
4062 last_ioread_offset = last_request_offset;
4063 }
4064 }
4065 }
4066 /*
4067 * limit the size of the copy we're about to do so that
4068 * we can notice that our I/O pipe is running dry and
4069 * get the next I/O issued before it does go dry
4070 */
4071 if (last_ioread_offset && io_size > (max_io_size / 4)) {
4072 io_resid = (max_io_size / 4);
4073 } else {
4074 io_resid = io_size;
4075 }
4076
4077 io_requested = io_resid;
4078
4079 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference);
4080
4081 xsize = io_requested - io_resid;
4082
4083 io_size -= xsize;
4084 io_req_size -= xsize;
4085
4086 if (retval || io_resid) {
4087 /*
4088 * if we run into a real error or
4089 * a page that is not in the cache
4090 * we need to leave streaming mode
4091 */
4092 break;
4093 }
4094
4095 if (rd_ahead_enabled && (io_size == 0 || last_ioread_offset == last_request_offset)) {
4096 /*
4097 * we're already finished the I/O for this read request
4098 * let's see if we should do a read-ahead
4099 */
4100 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4101 }
4102 }
4103 if (retval) {
4104 break;
4105 }
4106 if (io_size == 0) {
4107 if (rap != NULL) {
4108 if (extent.e_addr < rap->cl_lastr) {
4109 rap->cl_maxra = 0;
4110 }
4111 rap->cl_lastr = extent.e_addr;
4112 }
4113 break;
4114 }
4115 /*
4116 * recompute max_size since cluster_copy_ubc_data_internal
4117 * may have advanced uio->uio_offset
4118 */
4119 max_size = filesize - uio->uio_offset;
4120 }
4121
4122 iostate.io_completed = 0;
4123 iostate.io_issued = 0;
4124 iostate.io_error = 0;
4125 iostate.io_wanted = 0;
4126
4127 if ((flags & IO_RETURN_ON_THROTTLE)) {
4128 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4129 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4130 /*
4131 * we're in the throttle window and at least 1 I/O
4132 * has already been issued by a throttleable thread
4133 * in this window, so return with EAGAIN to indicate
4134 * to the FS issuing the cluster_read call that it
4135 * should now throttle after dropping any locks
4136 */
4137 throttle_info_update_by_mount(vp->v_mount);
4138
4139 retval = EAGAIN;
4140 break;
4141 }
4142 }
4143 }
4144
4145 /*
4146 * compute the size of the upl needed to encompass
4147 * the requested read... limit each call to cluster_io
4148 * to the maximum UPL size... cluster_io will clip if
4149 * this exceeds the maximum io_size for the device,
4150 * make sure to account for
4151 * a starting offset that's not page aligned
4152 */
4153 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4154 upl_f_offset = uio->uio_offset - (off_t)start_offset;
4155
4156 if (io_size > max_rd_size) {
4157 io_size = max_rd_size;
4158 }
4159
4160 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4161
4162 if (flags & IO_NOCACHE) {
4163 if (upl_size > max_io_size) {
4164 upl_size = max_io_size;
4165 }
4166 } else {
4167 if (upl_size > max_io_size / 4) {
4168 upl_size = max_io_size / 4;
4169 upl_size &= ~PAGE_MASK;
4170
4171 if (upl_size == 0) {
4172 upl_size = PAGE_SIZE;
4173 }
4174 }
4175 }
4176 pages_in_upl = upl_size / PAGE_SIZE;
4177
4178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
4179 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4180
4181 kret = ubc_create_upl_kernel(vp,
4182 upl_f_offset,
4183 upl_size,
4184 &upl,
4185 &pl,
4186 UPL_FILE_IO | UPL_SET_LITE,
4187 VM_KERN_MEMORY_FILE);
4188 if (kret != KERN_SUCCESS) {
4189 panic("cluster_read_copy: failed to get pagelist");
4190 }
4191
4192 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
4193 upl, (int)upl_f_offset, upl_size, start_offset, 0);
4194
4195 /*
4196 * scan from the beginning of the upl looking for the first
4197 * non-valid page.... this will become the first page in
4198 * the request we're going to make to 'cluster_io'... if all
4199 * of the pages are valid, we won't call through to 'cluster_io'
4200 */
4201 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
4202 if (!upl_valid_page(pl, start_pg)) {
4203 break;
4204 }
4205 }
4206
4207 /*
4208 * scan from the starting invalid page looking for a valid
4209 * page before the end of the upl is reached, if we
4210 * find one, then it will be the last page of the request to
4211 * 'cluster_io'
4212 */
4213 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4214 if (upl_valid_page(pl, last_pg)) {
4215 break;
4216 }
4217 }
4218
4219 if (start_pg < last_pg) {
4220 /*
4221 * we found a range of 'invalid' pages that must be filled
4222 * if the last page in this range is the last page of the file
4223 * we may have to clip the size of it to keep from reading past
4224 * the end of the last physical block associated with the file
4225 */
4226 if (iolock_inited == FALSE) {
4227 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4228
4229 iolock_inited = TRUE;
4230 }
4231 upl_offset = start_pg * PAGE_SIZE;
4232 io_size = (last_pg - start_pg) * PAGE_SIZE;
4233
4234 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
4235 io_size = filesize - (upl_f_offset + upl_offset);
4236 }
4237
4238 /*
4239 * issue an asynchronous read to cluster_io
4240 */
4241
4242 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
4243 io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg);
4244
4245 if (rap) {
4246 if (extent.e_addr < rap->cl_maxra) {
4247 /*
4248 * we've just issued a read for a block that should have been
4249 * in the cache courtesy of the read-ahead engine... something
4250 * has gone wrong with the pipeline, so reset the read-ahead
4251 * logic which will cause us to restart from scratch
4252 */
4253 rap->cl_maxra = 0;
4254 }
4255 }
4256 }
4257 if (error == 0) {
4258 /*
4259 * if the read completed successfully, or there was no I/O request
4260 * issued, then copy the data into user land via 'cluster_copy_upl_data'
4261 * we'll first add on any 'valid'
4262 * pages that were present in the upl when we acquired it.
4263 */
4264 u_int val_size;
4265
4266 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
4267 if (!upl_valid_page(pl, uio_last)) {
4268 break;
4269 }
4270 }
4271 if (uio_last < pages_in_upl) {
4272 /*
4273 * there were some invalid pages beyond the valid pages
4274 * that we didn't issue an I/O for, just release them
4275 * unchanged now, so that any prefetch/read-ahead can
4276 * include them
4277 */
4278 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
4279 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4280 }
4281
4282 /*
4283 * compute size to transfer this round, if io_req_size is
4284 * still non-zero after this attempt, we'll loop around and
4285 * set up for another I/O.
4286 */
4287 val_size = (uio_last * PAGE_SIZE) - start_offset;
4288
4289 if (val_size > max_size) {
4290 val_size = max_size;
4291 }
4292
4293 if (val_size > io_req_size) {
4294 val_size = io_req_size;
4295 }
4296
4297 if ((uio->uio_offset + val_size) > last_ioread_offset) {
4298 last_ioread_offset = uio->uio_offset + val_size;
4299 }
4300
4301 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
4302 if ((last_ioread_offset - (uio->uio_offset + val_size)) <= upl_size) {
4303 /*
4304 * if there's still I/O left to do for this request, and...
4305 * we're not in hard throttle mode, and...
4306 * we're close to using up the previous prefetch, then issue a
4307 * new pre-fetch I/O... the I/O latency will overlap
4308 * with the copying of the data
4309 */
4310 if (size_of_prefetch > max_rd_size) {
4311 size_of_prefetch = max_rd_size;
4312 }
4313
4314 size_of_prefetch = cluster_read_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, callback, callback_arg, bflag);
4315
4316 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
4317
4318 if (last_ioread_offset > last_request_offset) {
4319 last_ioread_offset = last_request_offset;
4320 }
4321 }
4322 } else if ((uio->uio_offset + val_size) == last_request_offset) {
4323 /*
4324 * this transfer will finish this request, so...
4325 * let's try to read ahead if we're in
4326 * a sequential access pattern and we haven't
4327 * explicitly disabled it
4328 */
4329 if (rd_ahead_enabled) {
4330 cluster_read_ahead(vp, &extent, filesize, rap, callback, callback_arg, bflag);
4331 }
4332
4333 if (rap != NULL) {
4334 if (extent.e_addr < rap->cl_lastr) {
4335 rap->cl_maxra = 0;
4336 }
4337 rap->cl_lastr = extent.e_addr;
4338 }
4339 }
4340 if (iolock_inited == TRUE) {
4341 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4342 }
4343
4344 if (iostate.io_error) {
4345 error = iostate.io_error;
4346 } else {
4347 u_int32_t io_requested;
4348
4349 io_requested = val_size;
4350
4351 retval = cluster_copy_upl_data(uio, upl, start_offset, (int *)&io_requested);
4352
4353 io_req_size -= (val_size - io_requested);
4354 }
4355 } else {
4356 if (iolock_inited == TRUE) {
4357 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4358 }
4359 }
4360 if (start_pg < last_pg) {
4361 /*
4362 * compute the range of pages that we actually issued an I/O for
4363 * and either commit them as valid if the I/O succeeded
4364 * or abort them if the I/O failed or we're not supposed to
4365 * keep them in the cache
4366 */
4367 io_size = (last_pg - start_pg) * PAGE_SIZE;
4368
4369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4370
4371 if (error || (flags & IO_NOCACHE)) {
4372 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
4373 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4374 } else {
4375 int commit_flags = UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY;
4376
4377 if (take_reference) {
4378 commit_flags |= UPL_COMMIT_INACTIVATE;
4379 } else {
4380 commit_flags |= UPL_COMMIT_SPECULATE;
4381 }
4382
4383 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size, commit_flags);
4384 }
4385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, start_pg * PAGE_SIZE, io_size, error, 0);
4386 }
4387 if ((last_pg - start_pg) < pages_in_upl) {
4388 /*
4389 * the set of pages that we issued an I/O for did not encompass
4390 * the entire upl... so just release these without modifying
4391 * their state
4392 */
4393 if (error) {
4394 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
4395 } else {
4396 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
4397 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
4398
4399 /*
4400 * handle any valid pages at the beginning of
4401 * the upl... release these appropriately
4402 */
4403 cluster_read_upl_release(upl, 0, start_pg, take_reference);
4404
4405 /*
4406 * handle any valid pages immediately after the
4407 * pages we issued I/O for... ... release these appropriately
4408 */
4409 cluster_read_upl_release(upl, last_pg, uio_last, take_reference);
4410
4411 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END, upl, -1, -1, 0, 0);
4412 }
4413 }
4414 if (retval == 0) {
4415 retval = error;
4416 }
4417
4418 if (io_req_size) {
4419 if (cluster_is_throttled(vp)) {
4420 /*
4421 * we're in the throttle window, at the very least
4422 * we want to limit the size of the I/O we're about
4423 * to issue
4424 */
4425 rd_ahead_enabled = 0;
4426 prefetch_enabled = 0;
4427 max_rd_size = THROTTLE_MAX_IOSIZE;
4428 } else {
4429 if (max_rd_size == THROTTLE_MAX_IOSIZE) {
4430 /*
4431 * coming out of throttled state
4432 */
4433 if (policy != THROTTLE_LEVEL_TIER3 && policy != THROTTLE_LEVEL_TIER2) {
4434 if (rap != NULL) {
4435 rd_ahead_enabled = 1;
4436 }
4437 prefetch_enabled = 1;
4438 }
4439 max_rd_size = max_prefetch;
4440 last_ioread_offset = 0;
4441 }
4442 }
4443 }
4444 }
4445 if (iolock_inited == TRUE) {
4446 /*
4447 * cluster_io returned an error after it
4448 * had already issued some I/O. we need
4449 * to wait for that I/O to complete before
4450 * we can destroy the iostate mutex...
4451 * 'retval' already contains the early error
4452 * so no need to pick it up from iostate.io_error
4453 */
4454 cluster_iostate_wait(&iostate, 0, "cluster_read_copy");
4455
4456 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
4457 }
4458 if (rap != NULL) {
4459 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4460 (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0);
4461
4462 lck_mtx_unlock(&rap->cl_lockr);
4463 } else {
4464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
4465 (int)uio->uio_offset, io_req_size, 0, retval, 0);
4466 }
4467
4468 return retval;
4469 }
4470
4471 /*
4472 * We don't want another read/write lock for every vnode in the system
4473 * so we keep a hash of them here. There should never be very many of
4474 * these around at any point in time.
4475 */
4476 cl_direct_read_lock_t *
4477 cluster_lock_direct_read(vnode_t vp, lck_rw_type_t type)
4478 {
4479 struct cl_direct_read_locks *head
4480 = &cl_direct_read_locks[(uintptr_t)vp / sizeof(*vp)
4481 % CL_DIRECT_READ_LOCK_BUCKETS];
4482
4483 struct cl_direct_read_lock *lck, *new_lck = NULL;
4484
4485 for (;;) {
4486 lck_spin_lock(&cl_direct_read_spin_lock);
4487
4488 LIST_FOREACH(lck, head, chain) {
4489 if (lck->vp == vp) {
4490 ++lck->ref_count;
4491 lck_spin_unlock(&cl_direct_read_spin_lock);
4492 if (new_lck) {
4493 // Someone beat us to it, ditch the allocation
4494 lck_rw_destroy(&new_lck->rw_lock, cl_mtx_grp);
4495 FREE(new_lck, M_TEMP);
4496 }
4497 lck_rw_lock(&lck->rw_lock, type);
4498 return lck;
4499 }
4500 }
4501
4502 if (new_lck) {
4503 // Use the lock we allocated
4504 LIST_INSERT_HEAD(head, new_lck, chain);
4505 lck_spin_unlock(&cl_direct_read_spin_lock);
4506 lck_rw_lock(&new_lck->rw_lock, type);
4507 return new_lck;
4508 }
4509
4510 lck_spin_unlock(&cl_direct_read_spin_lock);
4511
4512 // Allocate a new lock
4513 MALLOC(new_lck, cl_direct_read_lock_t *, sizeof(*new_lck),
4514 M_TEMP, M_WAITOK);
4515 lck_rw_init(&new_lck->rw_lock, cl_mtx_grp, cl_mtx_attr);
4516 new_lck->vp = vp;
4517 new_lck->ref_count = 1;
4518
4519 // Got to go round again
4520 }
4521 }
4522
4523 void
4524 cluster_unlock_direct_read(cl_direct_read_lock_t *lck)
4525 {
4526 lck_rw_done(&lck->rw_lock);
4527
4528 lck_spin_lock(&cl_direct_read_spin_lock);
4529 if (lck->ref_count == 1) {
4530 LIST_REMOVE(lck, chain);
4531 lck_spin_unlock(&cl_direct_read_spin_lock);
4532 lck_rw_destroy(&lck->rw_lock, cl_mtx_grp);
4533 FREE(lck, M_TEMP);
4534 } else {
4535 --lck->ref_count;
4536 lck_spin_unlock(&cl_direct_read_spin_lock);
4537 }
4538 }
4539
4540 static int
4541 cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
4542 int flags, int (*callback)(buf_t, void *), void *callback_arg)
4543 {
4544 upl_t upl;
4545 upl_page_info_t *pl;
4546 off_t max_io_size;
4547 vm_offset_t upl_offset, vector_upl_offset = 0;
4548 upl_size_t upl_size, vector_upl_size = 0;
4549 vm_size_t upl_needed_size;
4550 unsigned int pages_in_pl;
4551 upl_control_flags_t upl_flags;
4552 kern_return_t kret;
4553 unsigned int i;
4554 int force_data_sync;
4555 int retval = 0;
4556 int no_zero_fill = 0;
4557 int io_flag = 0;
4558 int misaligned = 0;
4559 struct clios iostate;
4560 user_addr_t iov_base;
4561 u_int32_t io_req_size;
4562 u_int32_t offset_in_file;
4563 u_int32_t offset_in_iovbase;
4564 u_int32_t io_size;
4565 u_int32_t io_min;
4566 u_int32_t xsize;
4567 u_int32_t devblocksize;
4568 u_int32_t mem_alignment_mask;
4569 u_int32_t max_upl_size;
4570 u_int32_t max_rd_size;
4571 u_int32_t max_rd_ahead;
4572 u_int32_t max_vector_size;
4573 boolean_t io_throttled = FALSE;
4574
4575 u_int32_t vector_upl_iosize = 0;
4576 int issueVectorUPL = 0, useVectorUPL = (uio->uio_iovcnt > 1);
4577 off_t v_upl_uio_offset = 0;
4578 int vector_upl_index = 0;
4579 upl_t vector_upl = NULL;
4580 cl_direct_read_lock_t *lock = NULL;
4581
4582 user_addr_t orig_iov_base = 0;
4583 user_addr_t last_iov_base = 0;
4584 user_addr_t next_iov_base = 0;
4585
4586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
4587 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
4588
4589 max_upl_size = cluster_max_io_size(vp->v_mount, CL_READ);
4590
4591 max_rd_size = max_upl_size;
4592 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4593
4594 io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO;
4595
4596 if (flags & IO_PASSIVE) {
4597 io_flag |= CL_PASSIVE;
4598 }
4599
4600 if (flags & IO_ENCRYPTED) {
4601 io_flag |= CL_RAW_ENCRYPTED;
4602 }
4603
4604 if (flags & IO_NOCACHE) {
4605 io_flag |= CL_NOCACHE;
4606 }
4607
4608 if (flags & IO_SKIP_ENCRYPTION) {
4609 io_flag |= CL_ENCRYPTED;
4610 }
4611
4612 iostate.io_completed = 0;
4613 iostate.io_issued = 0;
4614 iostate.io_error = 0;
4615 iostate.io_wanted = 0;
4616
4617 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
4618
4619 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
4620 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
4621
4622 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
4623 (int)devblocksize, (int)mem_alignment_mask, 0, 0, 0);
4624
4625 if (devblocksize == 1) {
4626 /*
4627 * the AFP client advertises a devblocksize of 1
4628 * however, its BLOCKMAP routine maps to physical
4629 * blocks that are PAGE_SIZE in size...
4630 * therefore we can't ask for I/Os that aren't page aligned
4631 * or aren't multiples of PAGE_SIZE in size
4632 * by setting devblocksize to PAGE_SIZE, we re-instate
4633 * the old behavior we had before the mem_alignment_mask
4634 * changes went in...
4635 */
4636 devblocksize = PAGE_SIZE;
4637 }
4638
4639 orig_iov_base = uio_curriovbase(uio);
4640 last_iov_base = orig_iov_base;
4641
4642 next_dread:
4643 io_req_size = *read_length;
4644 iov_base = uio_curriovbase(uio);
4645
4646 offset_in_file = (u_int32_t)uio->uio_offset & (devblocksize - 1);
4647 offset_in_iovbase = (u_int32_t)iov_base & mem_alignment_mask;
4648
4649 if (offset_in_file || offset_in_iovbase) {
4650 /*
4651 * one of the 2 important offsets is misaligned
4652 * so fire an I/O through the cache for this entire vector
4653 */
4654 misaligned = 1;
4655 }
4656 if (iov_base & (devblocksize - 1)) {
4657 /*
4658 * the offset in memory must be on a device block boundary
4659 * so that we can guarantee that we can generate an
4660 * I/O that ends on a page boundary in cluster_io
4661 */
4662 misaligned = 1;
4663 }
4664
4665 max_io_size = filesize - uio->uio_offset;
4666
4667 /*
4668 * The user must request IO in aligned chunks. If the
4669 * offset into the file is bad, or the userland pointer
4670 * is non-aligned, then we cannot service the encrypted IO request.
4671 */
4672 if (flags & IO_ENCRYPTED) {
4673 if (misaligned || (io_req_size & (devblocksize - 1))) {
4674 retval = EINVAL;
4675 }
4676
4677 max_io_size = roundup(max_io_size, devblocksize);
4678 }
4679
4680 if ((off_t)io_req_size > max_io_size) {
4681 io_req_size = max_io_size;
4682 }
4683
4684 /*
4685 * When we get to this point, we know...
4686 * -- the offset into the file is on a devblocksize boundary
4687 */
4688
4689 while (io_req_size && retval == 0) {
4690 u_int32_t io_start;
4691
4692 if (cluster_is_throttled(vp)) {
4693 /*
4694 * we're in the throttle window, at the very least
4695 * we want to limit the size of the I/O we're about
4696 * to issue
4697 */
4698 max_rd_size = THROTTLE_MAX_IOSIZE;
4699 max_rd_ahead = THROTTLE_MAX_IOSIZE - 1;
4700 max_vector_size = THROTTLE_MAX_IOSIZE;
4701 } else {
4702 max_rd_size = max_upl_size;
4703 max_rd_ahead = max_rd_size * IO_SCALE(vp, 2);
4704 max_vector_size = MAX_VECTOR_UPL_SIZE;
4705 }
4706 io_start = io_size = io_req_size;
4707
4708 /*
4709 * First look for pages already in the cache
4710 * and move them to user space. But only do this
4711 * check if we are not retrieving encrypted data directly
4712 * from the filesystem; those blocks should never
4713 * be in the UBC.
4714 *
4715 * cluster_copy_ubc_data returns the resid
4716 * in io_size
4717 */
4718 if ((flags & IO_ENCRYPTED) == 0) {
4719 retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
4720 }
4721 /*
4722 * calculate the number of bytes actually copied
4723 * starting size - residual
4724 */
4725 xsize = io_start - io_size;
4726
4727 io_req_size -= xsize;
4728
4729 if (useVectorUPL && (xsize || (iov_base & PAGE_MASK))) {
4730 /*
4731 * We found something in the cache or we have an iov_base that's not
4732 * page-aligned.
4733 *
4734 * Issue all I/O's that have been collected within this Vectored UPL.
4735 */
4736 if (vector_upl_index) {
4737 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4738 reset_vector_run_state();
4739 }
4740
4741 if (xsize) {
4742 useVectorUPL = 0;
4743 }
4744
4745 /*
4746 * After this point, if we are using the Vector UPL path and the base is
4747 * not page-aligned then the UPL with that base will be the first in the vector UPL.
4748 */
4749 }
4750
4751 /*
4752 * check to see if we are finished with this request.
4753 *
4754 * If we satisfied this IO already, then io_req_size will be 0.
4755 * Otherwise, see if the IO was mis-aligned and needs to go through
4756 * the UBC to deal with the 'tail'.
4757 *
4758 */
4759 if (io_req_size == 0 || (misaligned)) {
4760 /*
4761 * see if there's another uio vector to
4762 * process that's of type IO_DIRECT
4763 *
4764 * break out of while loop to get there
4765 */
4766 break;
4767 }
4768 /*
4769 * assume the request ends on a device block boundary
4770 */
4771 io_min = devblocksize;
4772
4773 /*
4774 * we can handle I/O's in multiples of the device block size
4775 * however, if io_size isn't a multiple of devblocksize we
4776 * want to clip it back to the nearest page boundary since
4777 * we are going to have to go through cluster_read_copy to
4778 * deal with the 'overhang'... by clipping it to a PAGE_SIZE
4779 * multiple, we avoid asking the drive for the same physical
4780 * blocks twice.. once for the partial page at the end of the
4781 * request and a 2nd time for the page we read into the cache
4782 * (which overlaps the end of the direct read) in order to
4783 * get at the overhang bytes
4784 */
4785 if (io_size & (devblocksize - 1)) {
4786 assert(!(flags & IO_ENCRYPTED));
4787 /*
4788 * Clip the request to the previous page size boundary
4789 * since request does NOT end on a device block boundary
4790 */
4791 io_size &= ~PAGE_MASK;
4792 io_min = PAGE_SIZE;
4793 }
4794 if (retval || io_size < io_min) {
4795 /*
4796 * either an error or we only have the tail left to
4797 * complete via the copy path...
4798 * we may have already spun some portion of this request
4799 * off as async requests... we need to wait for the I/O
4800 * to complete before returning
4801 */
4802 goto wait_for_dreads;
4803 }
4804
4805 /*
4806 * Don't re-check the UBC data if we are looking for uncached IO
4807 * or asking for encrypted blocks.
4808 */
4809 if ((flags & IO_ENCRYPTED) == 0) {
4810 if ((xsize = io_size) > max_rd_size) {
4811 xsize = max_rd_size;
4812 }
4813
4814 io_size = 0;
4815
4816 if (!lock) {
4817 /*
4818 * We hold a lock here between the time we check the
4819 * cache and the time we issue I/O. This saves us
4820 * from having to lock the pages in the cache. Not
4821 * all clients will care about this lock but some
4822 * clients may want to guarantee stability between
4823 * here and when the I/O is issued in which case they
4824 * will take the lock exclusively.
4825 */
4826 lock = cluster_lock_direct_read(vp, LCK_RW_TYPE_SHARED);
4827 }
4828
4829 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size);
4830
4831 if (io_size == 0) {
4832 /*
4833 * a page must have just come into the cache
4834 * since the first page in this range is no
4835 * longer absent, go back and re-evaluate
4836 */
4837 continue;
4838 }
4839 }
4840 if ((flags & IO_RETURN_ON_THROTTLE)) {
4841 if (cluster_is_throttled(vp) == THROTTLE_NOW) {
4842 if (!cluster_io_present_in_BC(vp, uio->uio_offset)) {
4843 /*
4844 * we're in the throttle window and at least 1 I/O
4845 * has already been issued by a throttleable thread
4846 * in this window, so return with EAGAIN to indicate
4847 * to the FS issuing the cluster_read call that it
4848 * should now throttle after dropping any locks
4849 */
4850 throttle_info_update_by_mount(vp->v_mount);
4851
4852 io_throttled = TRUE;
4853 goto wait_for_dreads;
4854 }
4855 }
4856 }
4857 if (io_size > max_rd_size) {
4858 io_size = max_rd_size;
4859 }
4860
4861 iov_base = uio_curriovbase(uio);
4862
4863 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
4864 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4865
4866 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
4867 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
4868
4869 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
4870 no_zero_fill = 1;
4871 } else {
4872 no_zero_fill = 0;
4873 }
4874
4875 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
4876 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
4877 pages_in_pl = 0;
4878 upl_size = upl_needed_size;
4879 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
4880 if (no_zero_fill) {
4881 upl_flags |= UPL_NOZEROFILL;
4882 }
4883 if (force_data_sync) {
4884 upl_flags |= UPL_FORCE_DATA_SYNC;
4885 }
4886
4887 kret = vm_map_create_upl(map,
4888 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
4889 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE);
4890
4891 if (kret != KERN_SUCCESS) {
4892 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4893 (int)upl_offset, upl_size, io_size, kret, 0);
4894 /*
4895 * failed to get pagelist
4896 *
4897 * we may have already spun some portion of this request
4898 * off as async requests... we need to wait for the I/O
4899 * to complete before returning
4900 */
4901 goto wait_for_dreads;
4902 }
4903 pages_in_pl = upl_size / PAGE_SIZE;
4904 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
4905
4906 for (i = 0; i < pages_in_pl; i++) {
4907 if (!upl_page_present(pl, i)) {
4908 break;
4909 }
4910 }
4911 if (i == pages_in_pl) {
4912 break;
4913 }
4914
4915 ubc_upl_abort(upl, 0);
4916 }
4917 if (force_data_sync >= 3) {
4918 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4919 (int)upl_offset, upl_size, io_size, kret, 0);
4920
4921 goto wait_for_dreads;
4922 }
4923 /*
4924 * Consider the possibility that upl_size wasn't satisfied.
4925 */
4926 if (upl_size < upl_needed_size) {
4927 if (upl_size && upl_offset == 0) {
4928 io_size = upl_size;
4929 } else {
4930 io_size = 0;
4931 }
4932 }
4933 if (io_size == 0) {
4934 ubc_upl_abort(upl, 0);
4935 goto wait_for_dreads;
4936 }
4937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
4938 (int)upl_offset, upl_size, io_size, kret, 0);
4939
4940 if (useVectorUPL) {
4941 vm_offset_t end_off = ((iov_base + io_size) & PAGE_MASK);
4942 if (end_off) {
4943 issueVectorUPL = 1;
4944 }
4945 /*
4946 * After this point, if we are using a vector UPL, then
4947 * either all the UPL elements end on a page boundary OR
4948 * this UPL is the last element because it does not end
4949 * on a page boundary.
4950 */
4951 }
4952
4953 /*
4954 * request asynchronously so that we can overlap
4955 * the preparation of the next I/O
4956 * if there are already too many outstanding reads
4957 * wait until some have completed before issuing the next read
4958 */
4959 cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct");
4960
4961 if (iostate.io_error) {
4962 /*
4963 * one of the earlier reads we issued ran into a hard error
4964 * don't issue any more reads, cleanup the UPL
4965 * that was just created but not used, then
4966 * go wait for any other reads to complete before
4967 * returning the error to the caller
4968 */
4969 ubc_upl_abort(upl, 0);
4970
4971 goto wait_for_dreads;
4972 }
4973 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
4974 upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
4975
4976 if (!useVectorUPL) {
4977 if (no_zero_fill) {
4978 io_flag &= ~CL_PRESERVE;
4979 } else {
4980 io_flag |= CL_PRESERVE;
4981 }
4982
4983 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4984 } else {
4985 if (!vector_upl_index) {
4986 vector_upl = vector_upl_create(upl_offset);
4987 v_upl_uio_offset = uio->uio_offset;
4988 vector_upl_offset = upl_offset;
4989 }
4990
4991 vector_upl_set_subupl(vector_upl, upl, upl_size);
4992 vector_upl_set_iostate(vector_upl, upl, vector_upl_size, upl_size);
4993 vector_upl_index++;
4994 vector_upl_size += upl_size;
4995 vector_upl_iosize += io_size;
4996
4997 if (issueVectorUPL || vector_upl_index == MAX_VECTOR_UPL_ELEMENTS || vector_upl_size >= max_vector_size) {
4998 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
4999 reset_vector_run_state();
5000 }
5001 }
5002 last_iov_base = iov_base + io_size;
5003
5004 if (lock) {
5005 // We don't need to wait for the I/O to complete
5006 cluster_unlock_direct_read(lock);
5007 lock = NULL;
5008 }
5009
5010 /*
5011 * update the uio structure
5012 */
5013 if ((flags & IO_ENCRYPTED) && (max_io_size < io_size)) {
5014 uio_update(uio, (user_size_t)max_io_size);
5015 } else {
5016 uio_update(uio, (user_size_t)io_size);
5017 }
5018
5019 io_req_size -= io_size;
5020
5021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
5022 upl, (int)uio->uio_offset, io_req_size, retval, 0);
5023 } /* end while */
5024
5025 if (retval == 0 && iostate.io_error == 0 && io_req_size == 0 && uio->uio_offset < filesize) {
5026 retval = cluster_io_type(uio, read_type, read_length, 0);
5027
5028 if (retval == 0 && *read_type == IO_DIRECT) {
5029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_NONE,
5030 (int)uio->uio_offset, (int)filesize, *read_type, *read_length, 0);
5031
5032 goto next_dread;
5033 }
5034 }
5035
5036 wait_for_dreads:
5037
5038 if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) {
5039 retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg);
5040 reset_vector_run_state();
5041 }
5042
5043 // We don't need to wait for the I/O to complete
5044 if (lock) {
5045 cluster_unlock_direct_read(lock);
5046 }
5047
5048 /*
5049 * make sure all async reads that are part of this stream
5050 * have completed before we return
5051 */
5052 cluster_iostate_wait(&iostate, 0, "cluster_read_direct");
5053
5054 if (iostate.io_error) {
5055 retval = iostate.io_error;
5056 }
5057
5058 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5059
5060 if (io_throttled == TRUE && retval == 0) {
5061 retval = EAGAIN;
5062 }
5063
5064 for (next_iov_base = orig_iov_base; next_iov_base < last_iov_base; next_iov_base += PAGE_SIZE) {
5065 /*
5066 * This is specifically done for pmap accounting purposes.
5067 * vm_pre_fault() will call vm_fault() to enter the page into
5068 * the pmap if there isn't _a_ physical page for that VA already.
5069 */
5070 vm_pre_fault(vm_map_trunc_page(next_iov_base, PAGE_MASK), VM_PROT_READ);
5071 }
5072
5073 if (io_req_size && retval == 0) {
5074 /*
5075 * we couldn't handle the tail of this request in DIRECT mode
5076 * so fire it through the copy path
5077 */
5078 if (flags & IO_ENCRYPTED) {
5079 /*
5080 * We cannot fall back to the copy path for encrypted I/O. If this
5081 * happens, there is something wrong with the user buffer passed
5082 * down.
5083 */
5084 retval = EFAULT;
5085 } else {
5086 retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
5087 }
5088
5089 *read_type = IO_UNKNOWN;
5090 }
5091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
5092 (int)uio->uio_offset, (int)uio_resid(uio), io_req_size, retval, 0);
5093
5094 return retval;
5095 }
5096
5097
/*
 * cluster_read_contig
 *
 * Read file data directly into destination memory that is physically
 * contiguous (the case cluster_io_type() reports as IO_CONTIG).
 *
 * vp            vnode being read
 * uio           describes the destination vector(s) and the file offset;
 *               it is advanced with uio_update() for every chunk that is
 *               successfully handed to cluster_io()
 * filesize      reads are clipped so they never extend past this offset
 * read_type     in/out: refreshed by cluster_io_type() when the routine
 *               moves on to the next vector; forced to IO_UNKNOWN when the
 *               "try another vector" test below is not satisfied
 * read_length   in/out: length of the current contiguous run
 * callback,
 * callback_arg  passed through to cluster_io()/cluster_align_phys_io()
 * flags         IO_PASSIVE and IO_NOCACHE are translated to CL_PASSIVE and
 *               CL_NOCACHE for cluster_io()
 *
 * Returns 0 on success.  EINVAL is returned when the UPL cannot be
 * created, comes back shorter than requested, or the buffer violates the
 * mount's alignment mask; otherwise the error comes from
 * cluster_align_phys_io(), cluster_io(), cluster_io_type() or from the
 * asynchronous I/O state (iostate.io_error).
 *
 * Up to MAX_VECTS UPLs may be held at once; all of them are released with
 * ubc_upl_abort(upl, 0) before returning.
 */
5098 static int
5099 cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t *read_length,
5100 int (*callback)(buf_t, void *), void *callback_arg, int flags)
5101 {
5102 upl_page_info_t *pl;
5103 upl_t upl[MAX_VECTS];
5104 vm_offset_t upl_offset;
5105 addr64_t dst_paddr = 0;
5106 user_addr_t iov_base;
5107 off_t max_size;
5108 upl_size_t upl_size;
5109 vm_size_t upl_needed_size;
5110 mach_msg_type_number_t pages_in_pl;
5111 upl_control_flags_t upl_flags;
5112 kern_return_t kret;
5113 struct clios iostate;
5114 int error = 0;
5115 int cur_upl = 0;
5116 int num_upl = 0;
5117 int n;
5118 u_int32_t xsize;
5119 u_int32_t io_size;
5120 u_int32_t devblocksize;
5121 u_int32_t mem_alignment_mask;
5122 u_int32_t tail_size = 0;
5123 int bflag;
5124
5125 if (flags & IO_PASSIVE) {
5126 bflag = CL_PASSIVE;
5127 } else {
5128 bflag = 0;
5129 }
5130
5131 if (flags & IO_NOCACHE) {
5132 bflag |= CL_NOCACHE;
5133 }
5134
5135 /*
5136 * When we enter this routine, we know
5137 * -- the read_length will not exceed the current iov_len
5138 * -- the target address is physically contiguous for read_length
5139 */
/*
 * NOTE(review): cluster_syncup() is called with PUSH_SYNC before any
 * direct read is issued -- presumably so delayed writes reach the device
 * first; confirm against cluster_syncup().
 */
5140 cluster_syncup(vp, filesize, callback, callback_arg, PUSH_SYNC);
5141
5142 devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize;
5143 mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask;
5144
/* iostate tracks the async reads issued below so we can wait on them */
5145 iostate.io_completed = 0;
5146 iostate.io_issued = 0;
5147 iostate.io_error = 0;
5148 iostate.io_wanted = 0;
5149
5150 lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr);
5151
/* re-entered (with cur_upl advanced) for each additional contiguous vector */
5152 next_cread:
5153 io_size = *read_length;
5154
5155 max_size = filesize - uio->uio_offset;
5156
5157 if (io_size > max_size) {
5158 io_size = max_size;
5159 }
5160
5161 iov_base = uio_curriovbase(uio);
5162
/*
 * upl_offset is the destination's offset within its first page; the UPL
 * itself is requested from the page-aligned base address below
 */
5163 upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK);
5164 upl_needed_size = upl_offset + io_size;
5165
5166 pages_in_pl = 0;
5167 upl_size = upl_needed_size;
5168 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
5169
5170
5171 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_START,
5172 (int)upl_offset, (int)upl_size, (int)iov_base, io_size, 0);
5173
5174 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5175 kret = vm_map_get_upl(map,
5176 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5177 &upl_size, &upl[cur_upl], NULL, &pages_in_pl, &upl_flags, VM_KERN_MEMORY_FILE, 0);
5178
5179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 92)) | DBG_FUNC_END,
5180 (int)upl_offset, upl_size, io_size, kret, 0);
5181
5182 if (kret != KERN_SUCCESS) {
5183 /*
5184 * failed to get pagelist
5185 */
5186 error = EINVAL;
5187 goto wait_for_creads;
5188 }
/* only count UPLs we actually obtained; the cleanup loop aborts upl[0..num_upl-1] */
5189 num_upl++;
5190
5191 if (upl_size < upl_needed_size) {
5192 /*
5193 * The upl_size wasn't satisfied.
5194 */
5195 error = EINVAL;
5196 goto wait_for_creads;
5197 }
5198 pl = ubc_upl_pageinfo(upl[cur_upl]);
5199
/* physical address of the first destination byte */
5200 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)upl_offset;
5201
/*
 * while the file offset is not device-block aligned (or less than one
 * block remains) move the leading bytes via cluster_align_phys_io()
 * rather than cluster_io()
 */
5202 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
5203 u_int32_t head_size;
5204
5205 head_size = devblocksize - (u_int32_t)(uio->uio_offset & (devblocksize - 1));
5206
5207 if (head_size > io_size) {
5208 head_size = io_size;
5209 }
5210
5211 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ, callback, callback_arg);
5212
5213 if (error) {
5214 goto wait_for_creads;
5215 }
5216
5217 upl_offset += head_size;
5218 dst_paddr += head_size;
5219 io_size -= head_size;
5220
5221 iov_base += head_size;
5222 }
5223 if ((u_int32_t)iov_base & mem_alignment_mask) {
5224 /*
5225 * request doesn't set up on a memory boundary
5226 * the underlying DMA engine can handle...
5227 * return an error instead of going through
5228 * the slow copy path since the intent of this
5229 * path is direct I/O to device memory
5230 */
5231 error = EINVAL;
5232 goto wait_for_creads;
5233 }
5234
/*
 * split off any trailing partial device block; it is read after all of
 * the async I/O has drained (see wait_for_creads)
 */
5235 tail_size = io_size & (devblocksize - 1);
5236
5237 io_size -= tail_size;
5238
5239 while (io_size && error == 0) {
5240 if (io_size > MAX_IO_CONTIG_SIZE) {
5241 xsize = MAX_IO_CONTIG_SIZE;
5242 } else {
5243 xsize = io_size;
5244 }
5245 /*
5246 * request asynchronously so that we can overlap
5247 * the preparation of the next I/O... we'll do
5248 * the commit after all the I/O has completed
5249 * since it's all issued against the same UPL
5250 * if there are already too many outstanding reads
5251 * wait until some have completed before issuing the next
5252 */
5253 cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig");
5254
5255 if (iostate.io_error) {
5256 /*
5257 * one of the earlier reads we issued ran into a hard error
5258 * don't issue any more reads...
5259 * go wait for any other reads to complete before
5260 * returning the error to the caller
5261 */
5262 goto wait_for_creads;
5263 }
5264 error = cluster_io(vp, upl[cur_upl], upl_offset, uio->uio_offset, xsize,
5265 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC | bflag,
5266 (buf_t)NULL, &iostate, callback, callback_arg);
5267 /*
5268 * The cluster_io read was issued successfully,
5269 * update the uio structure
5270 */
5271 if (error == 0) {
5272 uio_update(uio, (user_size_t)xsize);
5273
5274 dst_paddr += xsize;
5275 upl_offset += xsize;
5276 io_size -= xsize;
5277 }
5278 }
/*
 * only look at the next vector when nothing failed, there is no tail
 * left to read, another upl[] slot is free and we are short of EOF
 */
5279 if (error == 0 && iostate.io_error == 0 && tail_size == 0 && num_upl < MAX_VECTS && uio->uio_offset < filesize) {
5280 error = cluster_io_type(uio, read_type, read_length, 0);
5281
5282 if (error == 0 && *read_type == IO_CONTIG) {
5283 cur_upl++;
5284 goto next_cread;
5285 }
5286 } else {
5287 *read_type = IO_UNKNOWN;
5288 }
5289
5290 wait_for_creads:
5291 /*
5292 * make sure all async reads that are part of this stream
5293 * have completed before we proceed
5294 */
5295 cluster_iostate_wait(&iostate, 0, "cluster_read_contig");
5296
/* an error reported by the async completions overrides the local status */
5297 if (iostate.io_error) {
5298 error = iostate.io_error;
5299 }
5300
5301 lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp);
5302
/* dst_paddr now points just past the block-aligned body, i.e. at the tail */
5303 if (error == 0 && tail_size) {
5304 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg);
5305 }
5306
5307 for (n = 0; n < num_upl; n++) {
5308 /*
5309 * just release our hold on each physically contiguous
5310 * region without changing any state
5311 */
5312 ubc_upl_abort(upl[n], 0);
5313 }
5314
5315 return error;
5316 }
5317
5318
5319 static int
5320 cluster_io_type(struct uio *uio, int *io_type, u_int32_t *io_length, u_int32_t min_length)
5321 {
5322 user_size_t iov_len;
5323 user_addr_t iov_base = 0;
5324 upl_t upl;
5325 upl_size_t upl_size;
5326 upl_control_flags_t upl_flags;
5327 int retval = 0;
5328
5329 /*
5330 * skip over any emtpy vectors
5331 */
5332 uio_update(uio, (user_size_t)0);
5333
5334 iov_len = uio_curriovlen(uio);
5335
5336 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_START, uio, (int)iov_len, 0, 0, 0);
5337
5338 if (iov_len) {
5339 iov_base = uio_curriovbase(uio);
5340 /*
5341 * make sure the size of the vector isn't too big...
5342 * internally, we want to handle all of the I/O in
5343 * chunk sizes that fit in a 32 bit int
5344 */
5345 if (iov_len > (user_size_t)MAX_IO_REQUEST_SIZE) {
5346 upl_size = MAX_IO_REQUEST_SIZE;
5347 } else {
5348 upl_size = (u_int32_t)iov_len;
5349 }
5350
5351 upl_flags = UPL_QUERY_OBJECT_TYPE;
5352
5353 vm_map_t map = UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ? current_map() : kernel_map;
5354 if ((vm_map_get_upl(map,
5355 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
5356 &upl_size, &upl, NULL, NULL, &upl_flags, VM_KERN_MEMORY_FILE, 0)) != KERN_SUCCESS) {
5357 /*
5358 * the user app must have passed in an invalid address
5359 */
5360 retval = EFAULT;
5361 }
5362 if (upl_size == 0) {
5363 retval = EFAULT;
5364 }
5365
5366 *io_length = upl_size;
5367
5368 if (upl_flags & UPL_PHYS_CONTIG) {
5369 *io_type = IO_CONTIG;
5370 } else if (iov_len >= min_length) {
5371 *io_type = IO_DIRECT;
5372 } else {
5373 *io_type = IO_COPY;
5374 }
5375 } else {
5376 /*
5377 * nothing left to do for this uio
5378 */
5379 *io_length = 0;
5380 *io_type = IO_UNKNOWN;
5381 }
5382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 94)) | DBG_FUNC_END, iov_base, *io_type, *io_length, retval, 0);
5383
5384 return retval;
5385 }
5386
5387
5388 /*
5389 * generate advisory I/O's in the largest chunks possible
5390 * the completed pages will be released into the VM cache
5391 */
/*
 * advisory_read
 *
 * Convenience wrapper: forwards to advisory_read_ext() with no completion
 * callback (NULL, NULL) and CL_PASSIVE as the extra cluster_io flag.
 * Returns whatever advisory_read_ext() returns.
 */
5392 int
5393 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
5394 {
5395 return advisory_read_ext(vp, filesize, f_offset, resid, NULL, NULL, CL_PASSIVE);
5396 }
5397
5398 int
5399 advisory_read_ext(vnode_t vp, off_t filesize, off_t f_offset, int resid, int (*callback)(buf_t, void *), void *callback_arg, int bflag)
5400 {
5401 upl_page_info_t *pl;
5402 upl_t upl;
5403 vm_offset_t upl_offset;
5404 int upl_size;
5405 off_t upl_f_offset;
5406 int start_offset;
5407 int start_pg;
5408 int last_pg;
5409 int pages_in_upl;
5410 off_t max_size;
5411 int io_size;
5412 kern_return_t kret;
5413 int retval = 0;
5414 int issued_io;
5415 int skip_range;
5416 uint32_t max_io_size;
5417
5418
5419 if (!UBCINFOEXISTS(vp)) {
5420 return EINVAL;
5421 }
5422
5423 if (resid < 0) {
5424 return EINVAL;
5425 }
5426
5427 max_io_size = cluster_max_io_size(vp->v_mount, CL_READ);
5428
5429 #if CONFIG_EMBEDDED
5430 if (max_io_size > speculative_prefetch_max_iosize) {
5431 max_io_size = speculative_prefetch_max_iosize;
5432 }
5433 #else
5434 if (disk_conditioner_mount_is_ssd(vp->v_mount)) {
5435 if (max_io_size > speculative_prefetch_max_iosize) {
5436 max_io_size = speculative_prefetch_max_iosize;
5437 }
5438 }
5439 #endif
5440
5441 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
5442 (int)f_offset, resid, (int)filesize, 0, 0);
5443
5444 while (resid && f_offset < filesize && retval == 0) {
5445 /*
5446 * compute the size of the upl needed to encompass
5447 * the requested read... limit each call to cluster_io
5448 * to the maximum UPL size... cluster_io will clip if
5449 * this exceeds the maximum io_size for the device,
5450 * make sure to account for
5451 * a starting offset that's not page aligned
5452 */
5453 start_offset = (int)(f_offset & PAGE_MASK_64);
5454 upl_f_offset = f_offset - (off_t)start_offset;
5455 max_size = filesize - f_offset;
5456
5457 if (resid < max_size) {
5458 io_size = resid;
5459 } else {
5460 io_size = max_size;
5461 }
5462
5463 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
5464 if ((uint32_t)upl_size > max_io_size) {
5465 upl_size = max_io_size;
5466 }
5467
5468 skip_range = 0;
5469 /*
5470 * return the number of contiguously present pages in the cache
5471 * starting at upl_f_offset within the file
5472 */
5473 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
5474
5475 if (skip_range) {
5476 /*
5477 * skip over pages already present in the cache
5478 */
5479 io_size = skip_range - start_offset;
5480
5481 f_offset += io_size;
5482 resid -= io_size;
5483
5484 if (skip_range == upl_size) {
5485 continue;
5486 }
5487 /*
5488 * have to issue some real I/O
5489 * at this point, we know it's starting on a page boundary
5490 * because we've skipped over at least the first page in the request
5491 */
5492 start_offset = 0;
5493 upl_f_offset += skip_range;
5494 upl_size -= skip_range;
5495 }
5496 pages_in_upl = upl_size / PAGE_SIZE;
5497
5498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
5499 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5500
5501 kret = ubc_create_upl_kernel(vp,
5502 upl_f_offset,
5503 upl_size,
5504 &upl,
5505 &pl,
5506 UPL_RET_ONLY_ABSENT | UPL_SET_LITE,
5507 VM_KERN_MEMORY_FILE);
5508 if (kret != KERN_SUCCESS) {
5509 return retval;
5510 }
5511 issued_io = 0;
5512
5513 /*
5514 * before we start marching forward, we must make sure we end on
5515 * a present page, otherwise we will be working with a freed
5516 * upl
5517 */
5518 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
5519 if (upl_page_present(pl, last_pg)) {
5520 break;
5521 }
5522 }
5523 pages_in_upl = last_pg + 1;
5524
5525
5526 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
5527 upl, (int)upl_f_offset, upl_size, start_offset, 0);
5528
5529
5530 for (last_pg = 0; last_pg < pages_in_upl;) {
5531 /*
5532 * scan from the beginning of the upl looking for the first
5533 * page that is present.... this will become the first page in
5534 * the request we're going to make to 'cluster_io'... if all
5535 * of the pages are absent, we won't call through to 'cluster_io'
5536 */
5537 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
5538 if (upl_page_present(pl, start_pg)) {
5539 break;
5540 }
5541 }
5542
5543 /*
5544 * scan from the starting present page looking for an absent
5545 * page before the end of the upl is reached, if we
5546 * find one, then it will terminate the range of pages being
5547 * presented to 'cluster_io'
5548 */
5549 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
5550 if (!upl_page_present(pl, last_pg)) {
5551 break;
5552 }
5553 }
5554
5555 if (last_pg > start_pg) {
5556 /*
5557 * we found a range of pages that must be filled
5558 * if the last page in this range is the last page of the file
5559 * we may have to clip the size of it to keep from reading past
5560 * the end of the last physical block associated with the file
5561 */
5562 upl_offset = start_pg * PAGE_SIZE;
5563 io_size = (last_pg - start_pg) * PAGE_SIZE;
5564
5565 if ((off_t)(upl_f_offset + upl_offset + io_size) > filesize) {
5566 io_size = filesize - (upl_f_offset + upl_offset);
5567 }
5568
5569 /*
5570 * issue an asynchronous read to cluster_io
5571 */
5572 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
5573 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
5574
5575 issued_io = 1;
5576 }
5577 }
5578 if (issued_io == 0) {
5579 ubc_upl_abort(upl, 0);
5580 }
5581
5582 io_size = upl_size - start_offset;
5583
5584 if (io_size > resid) {
5585 io_size = resid;
5586 }
5587 f_offset += io_size;
5588 resid -= io_size;
5589 }
5590
5591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
5592 (int)f_offset, resid, retval, 0, 0);
5593
5594 return retval;
5595 }
5596
5597
/*
 * cluster_push
 *
 * Convenience wrapper: forwards to cluster_push_ext() with no completion
 * callback (NULL, NULL) and returns its result.
 */
5598 int
5599 cluster_push(vnode_t vp, int flags)
5600 {
5601 return cluster_push_ext(vp, flags, NULL, NULL);
5602 }
5603
5604
/*
 * cluster_push_ext
 *
 * Same as cluster_push_err() for callers that do not want the error
 * reported separately: passes NULL for the 'err' out-parameter and
 * returns cluster_push_err()'s result.
 */
5605 int
5606 cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg)
5607 {
5608 return cluster_push_err(vp, flags, callback, callback_arg, NULL);
5609 }
5610
5611 /* write errors via err, but return the number of clusters written */
/*
 * cluster_push_err
 *
 * Push the dirty data tracked by the vnode's write-behind context, either
 * through the sparse cluster map (when wbp->cl_scmap is set) or through
 * cluster_try_push().
 *
 * vp            vnode whose delayed writes should be pushed
 * flags         IO_* flags; IO_SYNC makes this behave like an fsync (it
 *               serializes against other pushers and waits for the writes
 *               to finish), IO_DEFWRITE combined with a MNT_DEFWRITE mount
 *               turns the call into a no-op
 * callback,
 * callback_arg  passed through to the push routines
 * err           optional out: cleared to 0 on entry, then set to the error
 *               reported by the push
 *
 * Returns 0 when there was nothing to do (no UBC info, deferred write, no
 * write-behind context, or nothing queued and not IO_SYNC), 1 when the
 * sparse cluster path was taken, otherwise the value returned by
 * cluster_try_push().
 *
 * NOTE(review): the one-line comment above says "number of clusters
 * written", but as visible here the sparse path always returns 1 and
 * cluster_try_push() returns MAX_CLUSTERS - wbp->cl_number -- confirm
 * what callers expect.
 */
5612 int
5613 cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
5614 {
5615 int retval;
5616 int my_sparse_wait = 0;
5617 struct cl_writebehind *wbp;
5618 int local_err = 0;
5619
5620 if (err) {
5621 *err = 0;
5622 }
5623
5624 if (!UBCINFOEXISTS(vp)) {
5625 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -1, 0);
5626 return 0;
5627 }
5628 /* return if deferred write is set */
5629 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
5630 return 0;
5631 }
/* on success the write-behind context comes back with cl_lockw held */
5632 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
5633 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -2, 0);
5634 return 0;
5635 }
/* nothing queued and the caller is not asking for fsync semantics */
5636 if (!ISSET(flags, IO_SYNC) && wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
5637 lck_mtx_unlock(&wbp->cl_lockw);
5638
5639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, kdebug_vnode(vp), flags, 0, -3, 0);
5640 return 0;
5641 }
5642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
5643 wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
5644
5645 /*
5646 * if we have an fsync in progress, we don't want to allow any additional
5647 * sync/fsync/close(s) to occur until it finishes.
5648 * note that it's possible for writes to continue to occur to this file
5649 * while we're waiting and also once the fsync starts to clean if we're
5650 * in the sparse map case
5651 */
5652 while (wbp->cl_sparse_wait) {
5653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5654
5655 msleep((caddr_t)&wbp->cl_sparse_wait, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5656
5657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5658 }
5659 if (flags & IO_SYNC) {
/* take the serialization token; it is released at the bottom of this routine */
5660 my_sparse_wait = 1;
5661 wbp->cl_sparse_wait = 1;
5662
5663 /*
5664 * this is an fsync (or equivalent)... we must wait for any existing async
5665 * cleaning operations to complete before we evaluate the current state
5666 * and finish cleaning... this ensures that all writes issued before this
5667 * fsync actually get cleaned to the disk before this fsync returns
5668 */
5669 while (wbp->cl_sparse_pushes) {
5670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_START, kdebug_vnode(vp), 0, 0, 0, 0);
5671
5672 msleep((caddr_t)&wbp->cl_sparse_pushes, &wbp->cl_lockw, PRIBIO + 1, "cluster_push_ext", NULL);
5673
5674 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 98)) | DBG_FUNC_END, kdebug_vnode(vp), 0, 0, 0, 0);
5675 }
5676 }
5677 if (wbp->cl_scmap) {
5678 void *scmap;
5679
5680 if (wbp->cl_sparse_pushes < SPARSE_PUSH_LIMIT) {
/*
 * detach the sparse map and push it with cl_lockw dropped;
 * cl_sparse_pushes counts the pushes running this way
 */
5681 scmap = wbp->cl_scmap;
5682 wbp->cl_scmap = NULL;
5683
5684 wbp->cl_sparse_pushes++;
5685
5686 lck_mtx_unlock(&wbp->cl_lockw);
5687
5688 retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5689
5690 lck_mtx_lock(&wbp->cl_lockw);
5691
5692 wbp->cl_sparse_pushes--;
5693
5694 if (retval) {
/* the push failed: hang the detached map back on the context */
5695 if (wbp->cl_scmap != NULL) {
5696 panic("cluster_push_err: Expected NULL cl_scmap\n");
5697 }
5698
5699 wbp->cl_scmap = scmap;
5700 }
5701
/* let an fsync that is waiting for async pushes to drain continue */
5702 if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) {
5703 wakeup((caddr_t)&wbp->cl_sparse_pushes);
5704 }
5705 } else {
/* too many pushes already in flight: push in place, keeping cl_lockw held */
5706 retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
5707 }
5708
5709 local_err = retval;
5710
5711 if (err) {
5712 *err = retval;
5713 }
/* the sparse path reports its status only through 'err'; the return value is fixed at 1 */
5714 retval = 1;
5715 } else {
5716 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
5717 if (err) {
5718 *err = local_err;
5719 }
5720 }
5721 lck_mtx_unlock(&wbp->cl_lockw);
5722
5723 if (flags & IO_SYNC) {
5724 (void)vnode_waitforwrites(vp, 0, 0, 0, "cluster_push");
5725 }
5726
5727 if (my_sparse_wait) {
5728 /*
5729 * I'm the owner of the serialization token
5730 * clear it and wakeup anyone that is waiting
5731 * for me to finish
5732 */
5733 lck_mtx_lock(&wbp->cl_lockw);
5734
5735 wbp->cl_sparse_wait = 0;
5736 wakeup((caddr_t)&wbp->cl_sparse_wait);
5737
5738 lck_mtx_unlock(&wbp->cl_lockw);
5739 }
/* NOTE(review): wbp fields are read here for tracing after cl_lockw has been dropped */
5740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
5741 wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
5742
5743 return retval;
5744 }
5745
5746
5747 __private_extern__ void
5748 cluster_release(struct ubc_info *ubc)
5749 {
5750 struct cl_writebehind *wbp;
5751 struct cl_readahead *rap;
5752
5753 if ((wbp = ubc->cl_wbehind)) {
5754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, wbp->cl_scmap, 0, 0, 0);
5755
5756 if (wbp->cl_scmap) {
5757 vfs_drt_control(&(wbp->cl_scmap), 0);
5758 }
5759 } else {
5760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, ubc, 0, 0, 0, 0);
5761 }
5762
5763 rap = ubc->cl_rahead;
5764
5765 if (wbp != NULL) {
5766 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
5767 FREE_ZONE(wbp, sizeof *wbp, M_CLWRBEHIND);
5768 }
5769 if ((rap = ubc->cl_rahead)) {
5770 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
5771 FREE_ZONE(rap, sizeof *rap, M_CLRDAHEAD);
5772 }
5773 ubc->cl_rahead = NULL;
5774 ubc->cl_wbehind = NULL;
5775
5776 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, ubc, rap, wbp, 0, 0);
5777 }
5778
5779
/*
 * cluster_try_push
 *
 * Try to write out the delayed-write clusters recorded in 'wbp'.  Must be
 * called with the write-behind lock (wbp->cl_lockw) held; when
 * 'vm_initiated' is TRUE the lock is dropped around the actual pushes and
 * re-taken afterwards.
 *
 * wbp           write-behind context (already locked)
 * vp            vnode the clusters belong to
 * EOF           current end of file, passed through to cluster_push_now()
 *               and sparse_cluster_switch()
 * push_flag     PUSH_ALL   - attempt every cluster instead of only the first
 *               PUSH_SYNC  - push each cluster with IO_SYNC
 *               PUSH_DELAY - together with a full cluster table, enables
 *                            the sequential-write check below
 * io_flags      only IO_PASSIVE and IO_CLOSE are propagated to the pushes
 * callback,
 * callback_arg  passed through to the push routines
 * err           optional out: first error returned by cluster_push_now().
 *               It is NOT written when the sequential-write check bails
 *               out through 'dont_try'.
 * vm_initiated  forwarded to cluster_push_now()/sparse_cluster_switch()
 *
 * Returns the number of free cluster slots, i.e.
 * MAX_CLUSTERS - wbp->cl_number (MAX_CLUSTERS when there was nothing to
 * push).
 */
5780 static int
5781 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
5782 {
5783 int cl_index;
5784 int cl_index1;
5785 int min_index;
5786 int cl_len;
5787 int cl_pushed = 0;
5788 struct cl_wextent l_clusters[MAX_CLUSTERS];
5789 u_int max_cluster_pgcount;
5790 int error = 0;
5791
5792 max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
5793 /*
5794 * the write behind context exists and has
5795 * already been locked...
5796 */
5797 if (wbp->cl_number == 0) {
5798 /*
5799 * no clusters to push
5800 * return number of empty slots
5801 */
5802 return MAX_CLUSTERS;
5803 }
5804
5805 /*
5806 * make a local 'sorted' copy of the clusters
5807 * and clear wbp->cl_number so that new clusters can
5808 * be developed
5809 */
/*
 * selection sort: each outer pass picks the remaining cluster with the
 * lowest b_addr; a cluster is marked as consumed by collapsing its
 * b_addr onto its e_addr, which makes it look empty to later passes
 */
5810 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
5811 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
5812 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr) {
5813 continue;
5814 }
5815 if (min_index == -1) {
5816 min_index = cl_index1;
5817 } else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr) {
5818 min_index = cl_index1;
5819 }
5820 }
5821 if (min_index == -1) {
5822 break;
5823 }
5824
5825 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
5826 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
5827 l_clusters[cl_index].io_flags = wbp->cl_clusters[min_index].io_flags;
5828
5829 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
5830 }
5831 wbp->cl_number = 0;
5832
/* number of non-empty clusters that were copied into l_clusters[] */
5833 cl_len = cl_index;
5834
5835 /* skip switching to the sparse cluster mechanism if on diskimage */
5836 if (((push_flag & PUSH_DELAY) && cl_len == MAX_CLUSTERS) &&
5837 !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)) {
5838 int i;
5839
5840 /*
5841 * determine if we appear to be writing the file sequentially
5842 * if not, by returning without having pushed any clusters
5843 * we will cause this vnode to be pushed into the sparse cluster mechanism
5844 * used for managing more random I/O patterns
5845 *
5846 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
5847 * that's why we're in try_push with PUSH_DELAY...
5848 *
5849 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
5850 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
5851 * so we can just make a simple pass through, up to, but not including the last one...
5852 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
5853 * are sequential
5854 *
5855 * we let the last one be partial as long as it was adjacent to the previous one...
5856 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
5857 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
5858 */
/*
 * jumping to dont_try leaves cl_pushed at 0, so every cluster is merged
 * back below; the lock is still held on that path and *err is untouched
 */
5859 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
5860 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != max_cluster_pgcount) {
5861 goto dont_try;
5862 }
5863 if (l_clusters[i].e_addr != l_clusters[i + 1].b_addr) {
5864 goto dont_try;
5865 }
5866 }
5867 }
5868 if (vm_initiated == TRUE) {
5869 lck_mtx_unlock(&wbp->cl_lockw);
5870 }
5871
5872 for (cl_index = 0; cl_index < cl_len; cl_index++) {
5873 int flags;
5874 struct cl_extent cl;
5875 int retval;
5876
5877 flags = io_flags & (IO_PASSIVE | IO_CLOSE);
5878
5879 /*
5880 * try to push each cluster in turn...
5881 */
5882 if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) {
5883 flags |= IO_NOCACHE;
5884 }
5885
5886 if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) {
5887 flags |= IO_PASSIVE;
5888 }
5889
5890 if (push_flag & PUSH_SYNC) {
5891 flags |= IO_SYNC;
5892 }
5893
5894 cl.b_addr = l_clusters[cl_index].b_addr;
5895 cl.e_addr = l_clusters[cl_index].e_addr;
5896
5897 retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
5898
5899 if (retval == 0) {
5900 cl_pushed++;
5901
/* mark the local copy as empty so it is not merged back below */
5902 l_clusters[cl_index].b_addr = 0;
5903 l_clusters[cl_index].e_addr = 0;
5904 } else if (error == 0) {
/* remember only the first failure */
5905 error = retval;
5906 }
5907
/* without PUSH_ALL only the first (lowest addressed) cluster is attempted */
5908 if (!(push_flag & PUSH_ALL)) {
5909 break;
5910 }
5911 }
5912 if (vm_initiated == TRUE) {
5913 lck_mtx_lock(&wbp->cl_lockw);
5914 }
5915
5916 if (err) {
5917 *err = error;
5918 }
5919
5920 dont_try:
5921 if (cl_len > cl_pushed) {
5922 /*
5923 * we didn't push all of the clusters, so
5924 * lets try to merge them back in to the vnode
5925 */
5926 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
5927 /*
5928 * we picked up some new clusters while we were trying to
5929 * push the old ones... this can happen because I've dropped
5930 * the vnode lock... the sum of the
5931 * leftovers plus the new cluster count exceeds our ability
5932 * to represent them, so switch to the sparse cluster mechanism
5933 *
5934 * collect the active public clusters...
5935 */
5936 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
5937
5938 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
5939 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
5940 continue;
5941 }
5942 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5943 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5944 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5945
5946 cl_index1++;
5947 }
5948 /*
5949 * update the cluster count
5950 */
5951 wbp->cl_number = cl_index1;
5952
5953 /*
5954 * and collect the original clusters that were moved into the
5955 * local storage for sorting purposes
5956 */
5957 sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
5958 } else {
5959 /*
5960 * we've got room to merge the leftovers back in
5961 * just append them starting at the next 'hole'
5962 * represented by wbp->cl_number
5963 */
5964 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
5965 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr) {
5966 continue;
5967 }
5968
5969 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
5970 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
5971 wbp->cl_clusters[cl_index1].io_flags = l_clusters[cl_index].io_flags;
5972
5973 cl_index1++;
5974 }
5975 /*
5976 * update the cluster count
5977 */
5978 wbp->cl_number = cl_index1;
5979 }
5980 }
5981 return MAX_CLUSTERS - wbp->cl_number;
5982 }
5983
5984
5985
/*
 * cluster_push_now
 *
 * Write out the dirty pages of a single cluster.
 *
 * vp            vnode the cluster belongs to
 * cl            extent to push, in page units: [b_addr, e_addr)
 * EOF           current end of file; the push is clipped to it and an
 *               extent lying entirely beyond it is ignored
 * flags         IO_PASSIVE, IO_SKIP_ENCRYPTION, IO_SYNC, IO_CLOSE and
 *               IO_NOCACHE are translated to the matching CL_* / UPL_*
 *               flags below
 * callback,
 * callback_arg  passed through to cluster_io()
 * vm_initiated  when set, the range is handed to vnode_pageout() instead
 *               of building a UPL and calling cluster_io() here
 *
 * Returns 0 when there was nothing to write or every write was issued
 * successfully, otherwise the first error from cluster_io() (or the error
 * reported by vnode_pageout() on the vm_initiated path).  Panics if the
 * UPL cannot be created.
 */
5986 static int
5987 cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
5988 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
5989 {
5990 upl_page_info_t *pl;
5991 upl_t upl;
5992 vm_offset_t upl_offset;
5993 int upl_size;
5994 off_t upl_f_offset;
5995 int pages_in_upl;
5996 int start_pg;
5997 int last_pg;
5998 int io_size;
5999 int io_flags;
6000 int upl_flags;
6001 int bflag;
6002 int size;
6003 int error = 0;
6004 int retval;
6005 kern_return_t kret;
6006
6007 if (flags & IO_PASSIVE) {
6008 bflag = CL_PASSIVE;
6009 } else {
6010 bflag = 0;
6011 }
6012
6013 if (flags & IO_SKIP_ENCRYPTION) {
6014 bflag |= CL_ENCRYPTED;
6015 }
6016
6017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
6018 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
6019
/* an empty extent has nothing to push */
6020 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
6021 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
6022
6023 return 0;
6024 }
6025 upl_size = pages_in_upl * PAGE_SIZE;
6026 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6027
6028 if (upl_f_offset + upl_size >= EOF) {
6029 if (upl_f_offset >= EOF) {
6030 /*
6031 * must have truncated the file and missed
6032 * clearing a dangling cluster (i.e. it's completely
6033 * beyond the new EOF
6034 */
6035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
6036
6037 return 0;
6038 }
/*
 * the cluster straddles EOF: 'size' is the number of valid bytes,
 * upl_size is that rounded up to whole pages
 */
6039 size = EOF - upl_f_offset;
6040
6041 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
6042 pages_in_upl = upl_size / PAGE_SIZE;
6043 } else {
6044 size = upl_size;
6045 }
6046
6047
/*
 * NOTE(review): this path returns without emitting the DBG_FUNC_END
 * that pairs with the code 51 DBG_FUNC_START trace above
 */
6048 if (vm_initiated) {
6049 vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
6050 UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
6051
6052 return error;
6053 }
6054 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
6055
6056 /*
6057 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
6058 *
6059 * - only pages that are currently dirty are returned... these are the ones we need to clean
6060 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
6061 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
6062 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
6063 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
6064 *
6065 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
6066 */
6067
6068 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE)) {
6069 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
6070 } else {
6071 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
6072 }
6073
6074 kret = ubc_create_upl_kernel(vp,
6075 upl_f_offset,
6076 upl_size,
6077 &upl,
6078 &pl,
6079 upl_flags,
6080 VM_KERN_MEMORY_FILE);
6081 if (kret != KERN_SUCCESS) {
6082 panic("cluster_push: failed to get pagelist");
6083 }
6084
6085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, upl, upl_f_offset, 0, 0, 0);
6086
6087 /*
6088 * since we only asked for the dirty pages back
6089 * it's possible that we may only get a few or even none, so...
6090 * before we start marching forward, we must make sure we know
6091 * where the last present page is in the UPL, otherwise we could
6092 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
6093 * employed by commit_range and abort_range.
6094 */
6095 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
6096 if (upl_page_present(pl, last_pg)) {
6097 break;
6098 }
6099 }
6100 pages_in_upl = last_pg + 1;
6101
/* no page came back at all: release the empty UPL and report success */
6102 if (pages_in_upl == 0) {
6103 ubc_upl_abort(upl, 0);
6104
6105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
6106 return 0;
6107 }
6108
6109 for (last_pg = 0; last_pg < pages_in_upl;) {
6110 /*
6111 * find the next dirty page in the UPL
6112 * this will become the first page in the
6113 * next I/O to generate
6114 */
6115 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
6116 if (upl_dirty_page(pl, start_pg)) {
6117 break;
6118 }
6119 if (upl_page_present(pl, start_pg)) {
6120 /*
6121 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
6122 * just release these unchanged since we're not going
6123 * to steal them or change their state
6124 */
6125 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
6126 }
6127 }
6128 if (start_pg >= pages_in_upl) {
6129 /*
6130 * done... no more dirty pages to push
6131 */
6132 break;
6133 }
6134 if (start_pg > last_pg) {
6135 /*
6136 * skipped over some non-dirty pages
6137 */
6138 size -= ((start_pg - last_pg) * PAGE_SIZE);
6139 }
6140
6141 /*
6142 * find a range of dirty pages to write
6143 */
6144 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
6145 if (!upl_dirty_page(pl, last_pg)) {
6146 break;
6147 }
6148 }
6149 upl_offset = start_pg * PAGE_SIZE;
6150
/* 'size' is the byte count still covered by the extent, so the last run is clipped to it */
6151 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
6152
6153 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | bflag;
6154
6155 if (!(flags & IO_SYNC)) {
6156 io_flags |= CL_ASYNC;
6157 }
6158
6159 if (flags & IO_CLOSE) {
6160 io_flags |= CL_CLOSE;
6161 }
6162
6163 if (flags & IO_NOCACHE) {
6164 io_flags |= CL_NOCACHE;
6165 }
6166
6167 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
6168 io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6169
/* keep the first error but continue with the remaining dirty runs */
6170 if (error == 0 && retval) {
6171 error = retval;
6172 }
6173
6174 size -= io_size;
6175 }
6176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
6177
6178 return error;
6179 }
6180
6181
6182 /*
6183 * sparse_cluster_switch is called with the write behind lock held
6184 */
6185 static int
6186 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6187 {
6188 int cl_index;
6189 int error;
6190
6191 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
6192
6193 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
6194 int flags;
6195 struct cl_extent cl;
6196
6197 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
6198 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, NULL, &flags) == KERN_SUCCESS) {
6199 if (flags & UPL_POP_DIRTY) {
6200 cl.e_addr = cl.b_addr + 1;
6201
6202 error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
6203
6204 if (error) {
6205 break;
6206 }
6207 }
6208 }
6209 }
6210 }
6211 wbp->cl_number -= cl_index;
6212
6213 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
6214
6215 return error;
6216 }
6217
6218
6219 /*
6220 * sparse_cluster_push must be called with the write-behind lock held if the scmap is
6221 * still associated with the write-behind context... however, if the scmap has been disassociated
6222 * from the write-behind context (the cluster_push case), the wb lock is not held
6223 */
6224 static int
6225 sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
6226 int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6227 {
6228 struct cl_extent cl;
6229 off_t offset;
6230 u_int length;
6231 void *l_scmap;
6232 int error = 0;
6233
6234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
6235
6236 if (push_flag & PUSH_ALL) {
6237 vfs_drt_control(scmap, 1);
6238 }
6239
6240 l_scmap = *scmap;
6241
6242 for (;;) {
6243 int retval;
6244
6245 if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS) {
6246 break;
6247 }
6248
6249 if (vm_initiated == TRUE) {
6250 lck_mtx_unlock(&wbp->cl_lockw);
6251 }
6252
6253 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
6254 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
6255
6256 retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
6257 if (error == 0 && retval) {
6258 error = retval;
6259 }
6260
6261 if (vm_initiated == TRUE) {
6262 lck_mtx_lock(&wbp->cl_lockw);
6263
6264 if (*scmap != l_scmap) {
6265 break;
6266 }
6267 }
6268
6269 if (error) {
6270 if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
6271 panic("Failed to restore dirty state on failure\n");
6272 }
6273
6274 break;
6275 }
6276
6277 if (!(push_flag & PUSH_ALL)) {
6278 break;
6279 }
6280 }
6281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6282
6283 return error;
6284 }
6285
6286
6287 /*
6288 * sparse_cluster_add is called with the write behind lock held
6289 */
6290 static int
6291 sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
6292 int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
6293 {
6294 u_int new_dirty;
6295 u_int length;
6296 off_t offset;
6297 int error;
6298 int push_flag = 0; /* Is this a valid value? */
6299
6300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
6301
6302 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
6303 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
6304
6305 while (vfs_drt_mark_pages(scmap, offset, length, &new_dirty) != KERN_SUCCESS) {
6306 /*
6307 * no room left in the map
6308 * only a partial update was done
6309 * push out some pages and try again
6310 */
6311
6312 if (vfs_get_scmap_push_behavior_internal(scmap, &push_flag)) {
6313 push_flag = 0;
6314 }
6315
6316 error = sparse_cluster_push(wbp, scmap, vp, EOF, push_flag, 0, callback, callback_arg, vm_initiated);
6317
6318 if (error) {
6319 break;
6320 }
6321
6322 offset += (new_dirty * PAGE_SIZE_64);
6323 length -= (new_dirty * PAGE_SIZE);
6324 }
6325 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
6326
6327 return error;
6328 }
6329
6330
6331 static int
6332 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*callback)(buf_t, void *), void *callback_arg)
6333 {
6334 upl_page_info_t *pl;
6335 upl_t upl;
6336 addr64_t ubc_paddr;
6337 kern_return_t kret;
6338 int error = 0;
6339 int did_read = 0;
6340 int abort_flags;
6341 int upl_flags;
6342 int bflag;
6343
6344 if (flags & IO_PASSIVE) {
6345 bflag = CL_PASSIVE;
6346 } else {
6347 bflag = 0;
6348 }
6349
6350 if (flags & IO_NOCACHE) {
6351 bflag |= CL_NOCACHE;
6352 }
6353
6354 upl_flags = UPL_SET_LITE;
6355
6356 if (!(flags & CL_READ)) {
6357 /*
6358 * "write" operation: let the UPL subsystem know
6359 * that we intend to modify the buffer cache pages
6360 * we're gathering.
6361 */
6362 upl_flags |= UPL_WILL_MODIFY;
6363 } else {
6364 /*
6365 * indicate that there is no need to pull the
6366 * mapping for this page... we're only going
6367 * to read from it, not modify it.
6368 */
6369 upl_flags |= UPL_FILE_IO;
6370 }
6371 kret = ubc_create_upl_kernel(vp,
6372 uio->uio_offset & ~PAGE_MASK_64,
6373 PAGE_SIZE,
6374 &upl,
6375 &pl,
6376 upl_flags,
6377 VM_KERN_MEMORY_FILE);
6378
6379 if (kret != KERN_SUCCESS) {
6380 return EINVAL;
6381 }
6382
6383 if (!upl_valid_page(pl, 0)) {
6384 /*
6385 * issue a synchronous read to cluster_io
6386 */
6387 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6388 CL_READ | bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6389 if (error) {
6390 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
6391
6392 return error;
6393 }
6394 did_read = 1;
6395 }
6396 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << PAGE_SHIFT) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
6397
6398 /*
6399 * NOTE: There is no prototype for the following in BSD. It, and the definitions
6400 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
6401 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
6402 * way to do so without exporting them to kexts as well.
6403 */
6404 if (flags & CL_READ) {
6405 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
6406 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
6407 } else {
6408 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
6409 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
6410 }
6411 if (!(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
6412 /*
6413 * issue a synchronous write to cluster_io
6414 */
6415 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
6416 bflag, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg);
6417 }
6418 if (error == 0) {
6419 uio_update(uio, (user_size_t)xsize);
6420 }
6421
6422 if (did_read) {
6423 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
6424 } else {
6425 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
6426 }
6427
6428 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
6429
6430 return error;
6431 }
6432
6433 int
6434 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int *io_resid)
6435 {
6436 int pg_offset;
6437 int pg_index;
6438 int csize;
6439 int segflg;
6440 int retval = 0;
6441 int xsize;
6442 upl_page_info_t *pl;
6443 int dirty_count;
6444
6445 xsize = *io_resid;
6446
6447 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6448 (int)uio->uio_offset, upl_offset, xsize, 0, 0);
6449
6450 segflg = uio->uio_segflg;
6451
6452 switch (segflg) {
6453 case UIO_USERSPACE32:
6454 case UIO_USERISPACE32:
6455 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6456 break;
6457
6458 case UIO_USERSPACE:
6459 case UIO_USERISPACE:
6460 uio->uio_segflg = UIO_PHYS_USERSPACE;
6461 break;
6462
6463 case UIO_USERSPACE64:
6464 case UIO_USERISPACE64:
6465 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6466 break;
6467
6468 case UIO_SYSSPACE:
6469 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6470 break;
6471 }
6472 pl = ubc_upl_pageinfo(upl);
6473
6474 pg_index = upl_offset / PAGE_SIZE;
6475 pg_offset = upl_offset & PAGE_MASK;
6476 csize = min(PAGE_SIZE - pg_offset, xsize);
6477
6478 dirty_count = 0;
6479 while (xsize && retval == 0) {
6480 addr64_t paddr;
6481
6482 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << PAGE_SHIFT) + pg_offset;
6483 if ((uio->uio_rw == UIO_WRITE) && (upl_dirty_page(pl, pg_index) == FALSE)) {
6484 dirty_count++;
6485 }
6486
6487 retval = uiomove64(paddr, csize, uio);
6488
6489 pg_index += 1;
6490 pg_offset = 0;
6491 xsize -= csize;
6492 csize = min(PAGE_SIZE, xsize);
6493 }
6494 *io_resid = xsize;
6495
6496 uio->uio_segflg = segflg;
6497
6498 task_update_logical_writes(current_task(), (dirty_count * PAGE_SIZE), TASK_WRITE_DEFERRED, upl_lookup_vnode(upl));
6499 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6500 (int)uio->uio_offset, xsize, retval, segflg, 0);
6501
6502 return retval;
6503 }
6504
6505
6506 int
6507 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
6508 {
6509 return cluster_copy_ubc_data_internal(vp, uio, io_resid, mark_dirty, 1);
6510 }
6511
6512
6513 static int
6514 cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty, int take_reference)
6515 {
6516 int segflg;
6517 int io_size;
6518 int xsize;
6519 int start_offset;
6520 int retval = 0;
6521 memory_object_control_t control;
6522
6523 io_size = *io_resid;
6524
6525 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
6526 (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0);
6527
6528 control = ubc_getobject(vp, UBC_FLAGS_NONE);
6529
6530 if (control == MEMORY_OBJECT_CONTROL_NULL) {
6531 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6532 (int)uio->uio_offset, io_size, retval, 3, 0);
6533
6534 return 0;
6535 }
6536 segflg = uio->uio_segflg;
6537
6538 switch (segflg) {
6539 case UIO_USERSPACE32:
6540 case UIO_USERISPACE32:
6541 uio->uio_segflg = UIO_PHYS_USERSPACE32;
6542 break;
6543
6544 case UIO_USERSPACE64:
6545 case UIO_USERISPACE64:
6546 uio->uio_segflg = UIO_PHYS_USERSPACE64;
6547 break;
6548
6549 case UIO_USERSPACE:
6550 case UIO_USERISPACE:
6551 uio->uio_segflg = UIO_PHYS_USERSPACE;
6552 break;
6553
6554 case UIO_SYSSPACE:
6555 uio->uio_segflg = UIO_PHYS_SYSSPACE;
6556 break;
6557 }
6558
6559 if ((io_size = *io_resid)) {
6560 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
6561 xsize = uio_resid(uio);
6562
6563 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset, uio,
6564 start_offset, io_size, mark_dirty, take_reference);
6565 xsize -= uio_resid(uio);
6566 io_size -= xsize;
6567 }
6568 uio->uio_segflg = segflg;
6569 *io_resid = io_size;
6570
6571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
6572 (int)uio->uio_offset, io_size, retval, 0x80000000 | segflg, 0);
6573
6574 return retval;
6575 }
6576
6577
6578 int
6579 is_file_clean(vnode_t vp, off_t filesize)
6580 {
6581 off_t f_offset;
6582 int flags;
6583 int total_dirty = 0;
6584
6585 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
6586 if (ubc_page_op(vp, f_offset, 0, NULL, &flags) == KERN_SUCCESS) {
6587 if (flags & UPL_POP_DIRTY) {
6588 total_dirty++;
6589 }
6590 }
6591 }
6592 if (total_dirty) {
6593 return EINVAL;
6594 }
6595
6596 return 0;
6597 }
6598
6599
6600
6601 /*
6602 * Dirty region tracking/clustering mechanism.
6603 *
6604 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
6605 * dirty regions within a larger space (file). It is primarily intended to
6606 * support clustering in large files with many dirty areas.
6607 *
6608 * The implementation assumes that the dirty regions are pages.
6609 *
6610 * To represent dirty pages within the file, we store bit vectors in a
6611 * variable-size circular hash.
6612 */
6613
/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES             ((1024 * 256) / PAGE_SIZE)

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
 */
#define DRT_ADDRESS_MASK                (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
#define DRT_ALIGN_ADDRESS(addr)         ((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space: dhe_control holds the aligned address in the bits selected by
 * DRT_ADDRESS_MASK and the page count in the bits selected by
 * DRT_HASH_COUNT_MASK.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied (a count equal to
 * DRT_HASH_COUNT_MASK itself marks a vacant bucket).
 *
 * All multi-statement macros are wrapped in do { } while (0) WITHOUT a
 * trailing semicolon so that each use, followed by the caller's own ';',
 * is exactly one statement (and is therefore safe in if/else).
 */
#define DRT_HASH_GET_ADDRESS(scm, i)    ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
	} while (0)
#define DRT_HASH_COUNT_MASK             0x1ff
#define DRT_HASH_GET_COUNT(scm, i)      ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)                                                                                   \
	do {                                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control =                                                                 \
	            ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);       \
	} while (0)
#define DRT_HASH_CLEAR(scm, i)                                  \
	do {                                                    \
	        (scm)->scm_hashtable[(i)].dhe_control = 0;      \
	} while (0)
#define DRT_HASH_VACATE(scm, i)         DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)         (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/* copy both the control word and the bitvector of entry 'oi' of 'oscm' into entry 'i' of 'scm' */
#define DRT_HASH_COPY(oscm, oi, scm, i)                                                                 \
	do {                                                                                            \
	        (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;        \
	        DRT_BITVECTOR_COPY(oscm, oi, scm, i);                                                   \
	} while (0)
6664
6665
#if CONFIG_EMBEDDED
/*
 * Hash table moduli (CONFIG_EMBEDDED configuration).
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
 * (NOTE(review): no DRT_BITVECTOR_SIZE macro is visible in this part of
 * the file; this presumably means a bitvector covering 64 pages --
 * confirm.)
 *
 * The small hashtable allocation is 4096 bytes, so the modulus is 251.
 * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
 * The xlarge hashtable allocation is 131072 bytes, so the modulus is 8179.
 */

#define DRT_HASH_SMALL_MODULUS  251
#define DRT_HASH_LARGE_MODULUS  2039
#define DRT_HASH_XLARGE_MODULUS  8179

/*
 * Physical memory required before the large (and xlarge) hash modulus is
 * permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (1024LL * 1024LL * 1024LL)      /* 1GiB */
#define DRT_HASH_XLARGE_MEMORY_REQUIRED  (8 * 1024LL * 1024LL * 1024LL)  /* 8GiB */

/* allocation size, in bytes, that goes with each modulus above */
#define DRT_SMALL_ALLOCATION    4096    /* 80 bytes spare */
#define DRT_LARGE_ALLOCATION    32768   /* 144 bytes spare */
#define DRT_XLARGE_ALLOCATION    131072  /* 208 bytes spare */

#else
/*
 * Hash table moduli (all other configurations).
 *
 * Since the hashtable entry's size is dependent on the size of
 * the bitvector, and since the hashtable size is constrained to
 * both being prime and fitting within the desired allocation
 * size, these values need to be manually determined.
 *
 * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
 * (NOTE(review): no DRT_BITVECTOR_SIZE macro is visible in this part of
 * the file; this presumably means a bitvector covering 64 pages --
 * confirm.)
 *
 * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
 * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
 * The xlarge hashtable allocation is 524288 bytes, so the modulus is 32749.
 */

#define DRT_HASH_SMALL_MODULUS  1019
#define DRT_HASH_LARGE_MODULUS  8179
#define DRT_HASH_XLARGE_MODULUS 32749

/*
 * Physical memory required before the large (and xlarge) hash modulus is
 * permitted.
 *
 * On small memory systems, the large hash modulus can lead to physical
 * memory starvation, so we avoid using it there.
 */
#define DRT_HASH_LARGE_MEMORY_REQUIRED  (4 * 1024LL * 1024LL * 1024LL)  /* 4GiB */
#define DRT_HASH_XLARGE_MEMORY_REQUIRED  (32 * 1024LL * 1024LL * 1024LL)  /* 32GiB */

/* allocation size, in bytes, that goes with each modulus above */
#define DRT_SMALL_ALLOCATION    16384   /* 80 bytes spare */
#define DRT_LARGE_ALLOCATION    131072  /* 208 bytes spare */
#define DRT_XLARGE_ALLOCATION   524288  /* 304 bytes spare */

#endif
6733
6734 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
6735
/*
 * Hashtable entry.
 *
 * dhe_control packs the entry's aligned file address together with its
 * page count (see the DRT_HASH_* accessors); dhe_bitvector has one bit
 * per page of the region the entry covers.
 */
struct vfs_drt_hashentry {
	u_int64_t       dhe_control;
/*
 * dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
 * DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
 * Since PAGE_SIZE is only known at boot time,
 *	-define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k)
 *	-declare dhe_bitvector array for largest possible length
 *
 * The expansion is fully parenthesised so that the macro behaves as a
 * single value in any expression (e.g. 'x % MAX_DRT_BITVECTOR_PAGES').
 */
#define MAX_DRT_BITVECTOR_PAGES ((1024 * 256) / (4 * 1024))
	u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES / 32];
};
6751
6752 /*
6753 * Hashtable bitvector handling.
6754 *
6755 * Bitvector fields are 32 bits long.
6756 */
6757
6758 #define DRT_HASH_SET_BIT(scm, i, bit) \
6759 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
6760
6761 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
6762 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
6763
6764 #define DRT_HASH_TEST_BIT(scm, i, bit) \
6765 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
6766
6767 #define DRT_BITVECTOR_CLEAR(scm, i) \
6768 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6769
6770 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
6771 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
6772 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
6773 (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
6774
6775 /*
6776 * Dirty Region Tracking structure.
6777 *
6778 * The hashtable is allocated entirely inside the DRT structure.
6779 *
6780 * The hash is a simple circular prime modulus arrangement, the structure
6781 * is resized from small to large if it overflows.
6782 */
6783
6784 struct vfs_drt_clustermap {
6785 u_int32_t scm_magic; /* sanity/detection */
6786 #define DRT_SCM_MAGIC 0x12020003
6787 u_int32_t scm_modulus; /* current ring size */
6788 u_int32_t scm_buckets; /* number of occupied buckets */
6789 u_int32_t scm_lastclean; /* last entry we cleaned */
6790 u_int32_t scm_iskips; /* number of slot skips */
6791
6792 struct vfs_drt_hashentry scm_hashtable[0];
6793 };
6794
6795
6796 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
6797 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
6798
/*
 * Debugging codes and arguments.
 *
 * The comment next to each code lists the arguments logged with it.
 */
#define DRT_DEBUG_EMPTYFREE     (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
#define DRT_DEBUG_RETCLUSTER    (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
#define DRT_DEBUG_ALLOC         (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
#define DRT_DEBUG_INSERT        (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
/*
 * DRT_DEBUG_MARK is logged as a start/end pair:
 *	start:	offset, length, dirty
 *	end:	0, setcount
 *		1 (clean, no map)
 *		2 (map alloc fail)
 *		3, resid (partial)
 */
#define DRT_DEBUG_MARK          (FSDBG_CODE(DBG_FSRW, 86))
#define DRT_DEBUG_6             (FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA       (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
	                                                    * lastclean, iskips */
6815
6816
6817 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
6818 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
6819 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
6820 u_int64_t offset, int *indexp);
6821 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
6822 u_int64_t offset,
6823 int *indexp,
6824 int recursed);
6825 static kern_return_t vfs_drt_do_mark_pages(
6826 void **cmapp,
6827 u_int64_t offset,
6828 u_int length,
6829 u_int *setcountp,
6830 int dirty);
6831 static void vfs_drt_trace(
6832 struct vfs_drt_clustermap *cmap,
6833 int code,
6834 int arg1,
6835 int arg2,
6836 int arg3,
6837 int arg4);
6838
6839
6840 /*
6841 * Allocate and initialise a sparse cluster map.
6842 *
6843 * Will allocate a new map, resize or compact an existing map.
6844 *
6845 * XXX we should probably have at least one intermediate map size,
6846 * as the 1:16 ratio seems a bit drastic.
6847 */
6848 static kern_return_t
6849 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
6850 {
6851 struct vfs_drt_clustermap *cmap = NULL, *ocmap = NULL;
6852 kern_return_t kret = KERN_SUCCESS;
6853 u_int64_t offset = 0;
6854 u_int32_t i = 0;
6855 int modulus_size = 0, map_size = 0, active_buckets = 0, index = 0, copycount = 0;
6856
6857 ocmap = NULL;
6858 if (cmapp != NULL) {
6859 ocmap = *cmapp;
6860 }
6861
6862 /*
6863 * Decide on the size of the new map.
6864 */
6865 if (ocmap == NULL) {
6866 modulus_size = DRT_HASH_SMALL_MODULUS;
6867 map_size = DRT_SMALL_ALLOCATION;
6868 } else {
6869 /* count the number of active buckets in the old map */
6870 active_buckets = 0;
6871 for (i = 0; i < ocmap->scm_modulus; i++) {
6872 if (!DRT_HASH_VACANT(ocmap, i) &&
6873 (DRT_HASH_GET_COUNT(ocmap, i) != 0)) {
6874 active_buckets++;
6875 }
6876 }
6877 /*
6878 * If we're currently using the small allocation, check to
6879 * see whether we should grow to the large one.
6880 */
6881 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
6882 /*
6883 * If the ring is nearly full and we are allowed to
6884 * use the large modulus, upgrade.
6885 */
6886 if ((active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) &&
6887 (max_mem >= DRT_HASH_LARGE_MEMORY_REQUIRED)) {
6888 modulus_size = DRT_HASH_LARGE_MODULUS;
6889 map_size = DRT_LARGE_ALLOCATION;
6890 } else {
6891 modulus_size = DRT_HASH_SMALL_MODULUS;
6892 map_size = DRT_SMALL_ALLOCATION;
6893 }
6894 } else if (ocmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
6895 if ((active_buckets > (DRT_HASH_LARGE_MODULUS - 5)) &&
6896 (max_mem >= DRT_HASH_XLARGE_MEMORY_REQUIRED)) {
6897 modulus_size = DRT_HASH_XLARGE_MODULUS;
6898 map_size = DRT_XLARGE_ALLOCATION;
6899 } else {
6900 /*
6901 * If the ring is completely full and we can't
6902 * expand, there's nothing useful for us to do.
6903 * Behave as though we had compacted into the new
6904 * array and return.
6905 */
6906 return KERN_SUCCESS;
6907 }
6908 } else {
6909 /* already using the xlarge modulus */
6910 modulus_size = DRT_HASH_XLARGE_MODULUS;
6911 map_size = DRT_XLARGE_ALLOCATION;
6912
6913 /*
6914 * If the ring is completely full, there's
6915 * nothing useful for us to do. Behave as
6916 * though we had compacted into the new
6917 * array and return.
6918 */
6919 if (active_buckets >= DRT_HASH_XLARGE_MODULUS) {
6920 return KERN_SUCCESS;
6921 }
6922 }
6923 }
6924
6925 /*
6926 * Allocate and initialise the new map.
6927 */
6928
6929 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap, map_size, VM_KERN_MEMORY_FILE);
6930 if (kret != KERN_SUCCESS) {
6931 return kret;
6932 }
6933 cmap->scm_magic = DRT_SCM_MAGIC;
6934 cmap->scm_modulus = modulus_size;
6935 cmap->scm_buckets = 0;
6936 cmap->scm_lastclean = 0;
6937 cmap->scm_iskips = 0;
6938 for (i = 0; i < cmap->scm_modulus; i++) {
6939 DRT_HASH_CLEAR(cmap, i);
6940 DRT_HASH_VACATE(cmap, i);
6941 DRT_BITVECTOR_CLEAR(cmap, i);
6942 }
6943
6944 /*
6945 * If there's an old map, re-hash entries from it into the new map.
6946 */
6947 copycount = 0;
6948 if (ocmap != NULL) {
6949 for (i = 0; i < ocmap->scm_modulus; i++) {
6950 /* skip empty buckets */
6951 if (DRT_HASH_VACANT(ocmap, i) ||
6952 (DRT_HASH_GET_COUNT(ocmap, i) == 0)) {
6953 continue;
6954 }
6955 /* get new index */
6956 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
6957 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
6958 if (kret != KERN_SUCCESS) {
6959 /* XXX need to bail out gracefully here */
6960 panic("vfs_drt: new cluster map mysteriously too small");
6961 index = 0;
6962 }
6963 /* copy */
6964 DRT_HASH_COPY(ocmap, i, cmap, index);
6965 copycount++;
6966 }
6967 }
6968
6969 /* log what we've done */
6970 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
6971
6972 /*
6973 * It's important to ensure that *cmapp always points to
6974 * a valid map, so we must overwrite it before freeing
6975 * the old map.
6976 */
6977 *cmapp = cmap;
6978 if (ocmap != NULL) {
6979 /* emit stats into trace buffer */
6980 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
6981 ocmap->scm_modulus,
6982 ocmap->scm_buckets,
6983 ocmap->scm_lastclean,
6984 ocmap->scm_iskips);
6985
6986 vfs_drt_free_map(ocmap);
6987 }
6988 return KERN_SUCCESS;
6989 }
6990
6991
6992 /*
6993 * Free a sparse cluster map.
6994 */
6995 static kern_return_t
6996 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
6997 {
6998 vm_size_t map_size = 0;
6999
7000 if (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
7001 map_size = DRT_SMALL_ALLOCATION;
7002 } else if (cmap->scm_modulus == DRT_HASH_LARGE_MODULUS) {
7003 map_size = DRT_LARGE_ALLOCATION;
7004 } else if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7005 map_size = DRT_XLARGE_ALLOCATION;
7006 } else {
7007 panic("vfs_drt_free_map: Invalid modulus %d\n", cmap->scm_modulus);
7008 }
7009
7010 kmem_free(kernel_map, (vm_offset_t)cmap, map_size);
7011 return KERN_SUCCESS;
7012 }
7013
7014
7015 /*
7016 * Find the hashtable slot currently occupied by an entry for the supplied offset.
7017 */
7018 static kern_return_t
7019 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
7020 {
7021 int index;
7022 u_int32_t i;
7023
7024 offset = DRT_ALIGN_ADDRESS(offset);
7025 index = DRT_HASH(cmap, offset);
7026
7027 /* traverse the hashtable */
7028 for (i = 0; i < cmap->scm_modulus; i++) {
7029 /*
7030 * If the slot is vacant, we can stop.
7031 */
7032 if (DRT_HASH_VACANT(cmap, index)) {
7033 break;
7034 }
7035
7036 /*
7037 * If the address matches our offset, we have success.
7038 */
7039 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
7040 *indexp = index;
7041 return KERN_SUCCESS;
7042 }
7043
7044 /*
7045 * Move to the next slot, try again.
7046 */
7047 index = DRT_HASH_NEXT(cmap, index);
7048 }
7049 /*
7050 * It's not there.
7051 */
7052 return KERN_FAILURE;
7053 }
7054
7055 /*
7056 * Find the hashtable slot for the supplied offset. If we haven't allocated
7057 * one yet, allocate one and populate the address field. Note that it will
7058 * not have a nonzero page count and thus will still technically be free, so
7059 * in the case where we are called to clean pages, the slot will remain free.
7060 */
7061 static kern_return_t
7062 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
7063 {
7064 struct vfs_drt_clustermap *cmap;
7065 kern_return_t kret;
7066 u_int32_t index;
7067 u_int32_t i;
7068
7069 cmap = *cmapp;
7070
7071 /* look for an existing entry */
7072 kret = vfs_drt_search_index(cmap, offset, indexp);
7073 if (kret == KERN_SUCCESS) {
7074 return kret;
7075 }
7076
7077 /* need to allocate an entry */
7078 offset = DRT_ALIGN_ADDRESS(offset);
7079 index = DRT_HASH(cmap, offset);
7080
7081 /* scan from the index forwards looking for a vacant slot */
7082 for (i = 0; i < cmap->scm_modulus; i++) {
7083 /* slot vacant? */
7084 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap, index) == 0) {
7085 cmap->scm_buckets++;
7086 if (index < cmap->scm_lastclean) {
7087 cmap->scm_lastclean = index;
7088 }
7089 DRT_HASH_SET_ADDRESS(cmap, index, offset);
7090 DRT_HASH_SET_COUNT(cmap, index, 0);
7091 DRT_BITVECTOR_CLEAR(cmap, index);
7092 *indexp = index;
7093 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
7094 return KERN_SUCCESS;
7095 }
7096 cmap->scm_iskips += i;
7097 index = DRT_HASH_NEXT(cmap, index);
7098 }
7099
7100 /*
7101 * We haven't found a vacant slot, so the map is full. If we're not
7102 * already recursed, try reallocating/compacting it.
7103 */
7104 if (recursed) {
7105 return KERN_FAILURE;
7106 }
7107 kret = vfs_drt_alloc_map(cmapp);
7108 if (kret == KERN_SUCCESS) {
7109 /* now try to insert again */
7110 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
7111 }
7112 return kret;
7113 }
7114
7115 /*
7116 * Implementation of set dirty/clean.
7117 *
7118 * In the 'clean' case, not finding a map is OK.
7119 */
7120 static kern_return_t
7121 vfs_drt_do_mark_pages(
7122 void **private,
7123 u_int64_t offset,
7124 u_int length,
7125 u_int *setcountp,
7126 int dirty)
7127 {
7128 struct vfs_drt_clustermap *cmap, **cmapp;
7129 kern_return_t kret;
7130 int i, index, pgoff, pgcount, setcount, ecount;
7131
7132 cmapp = (struct vfs_drt_clustermap **)private;
7133 cmap = *cmapp;
7134
7135 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
7136
7137 if (setcountp != NULL) {
7138 *setcountp = 0;
7139 }
7140
7141 /* allocate a cluster map if we don't already have one */
7142 if (cmap == NULL) {
7143 /* no cluster map, nothing to clean */
7144 if (!dirty) {
7145 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
7146 return KERN_SUCCESS;
7147 }
7148 kret = vfs_drt_alloc_map(cmapp);
7149 if (kret != KERN_SUCCESS) {
7150 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
7151 return kret;
7152 }
7153 }
7154 setcount = 0;
7155
7156 /*
7157 * Iterate over the length of the region.
7158 */
7159 while (length > 0) {
7160 /*
7161 * Get the hashtable index for this offset.
7162 *
7163 * XXX this will add blank entries if we are clearing a range
7164 * that hasn't been dirtied.
7165 */
7166 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
7167 cmap = *cmapp; /* may have changed! */
7168 /* this may be a partial-success return */
7169 if (kret != KERN_SUCCESS) {
7170 if (setcountp != NULL) {
7171 *setcountp = setcount;
7172 }
7173 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
7174
7175 return kret;
7176 }
7177
7178 /*
7179 * Work out how many pages we're modifying in this
7180 * hashtable entry.
7181 */
7182 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
7183 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
7184
7185 /*
7186 * Iterate over pages, dirty/clearing as we go.
7187 */
7188 ecount = DRT_HASH_GET_COUNT(cmap, index);
7189 for (i = 0; i < pgcount; i++) {
7190 if (dirty) {
7191 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7192 if (ecount >= DRT_BITVECTOR_PAGES) {
7193 panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7194 }
7195 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
7196 ecount++;
7197 setcount++;
7198 }
7199 } else {
7200 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
7201 if (ecount <= 0) {
7202 panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff + i);
7203 }
7204 assert(ecount > 0);
7205 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
7206 ecount--;
7207 setcount++;
7208 }
7209 }
7210 }
7211 DRT_HASH_SET_COUNT(cmap, index, ecount);
7212
7213 offset += pgcount * PAGE_SIZE;
7214 length -= pgcount * PAGE_SIZE;
7215 }
7216 if (setcountp != NULL) {
7217 *setcountp = setcount;
7218 }
7219
7220 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
7221
7222 return KERN_SUCCESS;
7223 }
7224
7225 /*
7226 * Mark a set of pages as dirty/clean.
7227 *
7228 * This is a public interface.
7229 *
7230 * cmapp
7231 * Pointer to storage suitable for holding a pointer. Note that
7232 * this must either be NULL or a value set by this function.
7233 *
7234 * size
7235 * Current file size in bytes.
7236 *
7237 * offset
7238 * Offset of the first page to be marked as dirty, in bytes. Must be
7239 * page-aligned.
7240 *
7241 * length
7242 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
7243 *
7244 * setcountp
7245 * Number of pages newly marked dirty by this call (optional).
7246 *
7247 * Returns KERN_SUCCESS if all the pages were successfully marked.
7248 */
7249 static kern_return_t
7250 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp)
7251 {
7252 /* XXX size unused, drop from interface */
7253 return vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1);
7254 }
7255
#if 0
/*
 * Mark a range of pages as clean.
 *
 * Currently compiled out.  Counterpart of vfs_drt_mark_pages(): calls
 * vfs_drt_do_mark_pages() with the dirty flag cleared and without a
 * change-count output.
 */
static kern_return_t
vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
{
	kern_return_t kret;

	/* the trailing 0 selects the "clean" direction of the worker */
	kret = vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);

	return kret;
}
#endif
7263
7264 /*
7265 * Get a cluster of dirty pages.
7266 *
7267 * This is a public interface.
7268 *
7269 * cmapp
7270 * Pointer to storage managed by drt_mark_pages. Note that this must
7271 * be NULL or a value set by drt_mark_pages.
7272 *
7273 * offsetp
7274 * Returns the byte offset into the file of the first page in the cluster.
7275 *
7276 * lengthp
7277 * Returns the length in bytes of the cluster of dirty pages.
7278 *
7279 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
7280 * are no dirty pages meeting the minmum size criteria. Private storage will
7281 * be released if there are no more dirty pages left in the map
7282 *
7283 */
7284 static kern_return_t
7285 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
7286 {
7287 struct vfs_drt_clustermap *cmap;
7288 u_int64_t offset;
7289 u_int length;
7290 u_int32_t j;
7291 int index, i, fs, ls;
7292
7293 /* sanity */
7294 if ((cmapp == NULL) || (*cmapp == NULL)) {
7295 return KERN_FAILURE;
7296 }
7297 cmap = *cmapp;
7298
7299 /* walk the hashtable */
7300 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
7301 index = DRT_HASH(cmap, offset);
7302
7303 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0)) {
7304 continue;
7305 }
7306
7307 /* scan the bitfield for a string of bits */
7308 fs = -1;
7309
7310 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
7311 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
7312 fs = i;
7313 break;
7314 }
7315 }
7316 if (fs == -1) {
7317 /* didn't find any bits set */
7318 panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
7319 cmap, index, DRT_HASH_GET_COUNT(cmap, index));
7320 }
7321 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
7322 if (!DRT_HASH_TEST_BIT(cmap, index, i)) {
7323 break;
7324 }
7325 }
7326
7327 /* compute offset and length, mark pages clean */
7328 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
7329 length = ls * PAGE_SIZE;
7330 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
7331 cmap->scm_lastclean = index;
7332
7333 /* return successful */
7334 *offsetp = (off_t)offset;
7335 *lengthp = length;
7336
7337 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
7338 return KERN_SUCCESS;
7339 }
7340 /*
7341 * We didn't find anything... hashtable is empty
7342 * emit stats into trace buffer and
7343 * then free it
7344 */
7345 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7346 cmap->scm_modulus,
7347 cmap->scm_buckets,
7348 cmap->scm_lastclean,
7349 cmap->scm_iskips);
7350
7351 vfs_drt_free_map(cmap);
7352 *cmapp = NULL;
7353
7354 return KERN_FAILURE;
7355 }
7356
7357
7358 static kern_return_t
7359 vfs_drt_control(void **cmapp, int op_type)
7360 {
7361 struct vfs_drt_clustermap *cmap;
7362
7363 /* sanity */
7364 if ((cmapp == NULL) || (*cmapp == NULL)) {
7365 return KERN_FAILURE;
7366 }
7367 cmap = *cmapp;
7368
7369 switch (op_type) {
7370 case 0:
7371 /* emit stats into trace buffer */
7372 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
7373 cmap->scm_modulus,
7374 cmap->scm_buckets,
7375 cmap->scm_lastclean,
7376 cmap->scm_iskips);
7377
7378 vfs_drt_free_map(cmap);
7379 *cmapp = NULL;
7380 break;
7381
7382 case 1:
7383 cmap->scm_lastclean = 0;
7384 break;
7385 }
7386 return KERN_SUCCESS;
7387 }
7388
7389
7390
7391 /*
7392 * Emit a summary of the state of the clustermap into the trace buffer
7393 * along with some caller-provided data.
7394 */
7395 #if KDEBUG
7396 static void
7397 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
7398 {
7399 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
7400 }
7401 #else
7402 static void
7403 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
7404 __unused int arg1, __unused int arg2, __unused int arg3,
7405 __unused int arg4)
7406 {
7407 }
7408 #endif
7409
#if 0
/*
 * Consistency check (currently compiled out).
 *
 * For every occupied hashtable entry, count the bits set in its bit
 * vector and panic if that does not match the entry's summary count.
 */
static void
vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
{
	int slot, bit;
	int set_bits;

	for (slot = 0; slot < cmap->scm_modulus; slot++) {
		if (!DRT_HASH_VACANT(cmap, slot)) {
			/* recount the dirty bits of this entry */
			set_bits = 0;
			for (bit = 0; bit < DRT_BITVECTOR_PAGES; bit++) {
				if (DRT_HASH_TEST_BIT(cmap, slot, bit)) {
					set_bits++;
				}
			}

			if (set_bits != DRT_HASH_GET_COUNT(cmap, slot)) {
				panic("bits_on = %d, index = %d\n", set_bits, slot);
			}
		}
	}
}
#endif
7437
7438 /*
7439 * Internal interface only.
7440 */
7441 static kern_return_t
7442 vfs_get_scmap_push_behavior_internal(void **cmapp, int *push_flag)
7443 {
7444 struct vfs_drt_clustermap *cmap;
7445
7446 /* sanity */
7447 if ((cmapp == NULL) || (*cmapp == NULL) || (push_flag == NULL)) {
7448 return KERN_FAILURE;
7449 }
7450 cmap = *cmapp;
7451
7452 if (cmap->scm_modulus == DRT_HASH_XLARGE_MODULUS) {
7453 /*
7454 * If we have a full xlarge sparse cluster,
7455 * we push it out all at once so the cluster
7456 * map can be available to absorb more I/Os.
7457 * This is done on large memory configs so
7458 * the small I/Os don't interfere with the
7459 * pro workloads.
7460 */
7461 *push_flag = PUSH_ALL;
7462 }
7463 return KERN_SUCCESS;
7464 }