/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_cluster.c       8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>

#define CL_READ         0x01
#define CL_ASYNC        0x02
#define CL_COMMIT       0x04
#define CL_PAGEOUT      0x10
#define CL_AGE          0x20
#define CL_DUMP         0x40
#define CL_NOZERO       0x80
#define CL_PAGEIN       0x100
#define CL_DEV_MEMORY   0x200
#define CL_PRESERVE     0x400
#define CL_THROTTLE     0x800
#define CL_KEEPCACHED   0x1000


struct clios {
    u_int  io_completed;  /* amount of io that has currently completed */
    u_int  io_issued;     /* amount of io that was successfully issued */
    int    io_error;      /* error code of first error encountered */
    int    io_wanted;     /* someone is sleeping waiting for a change in state */
};

static lck_grp_t       *cl_mtx_grp;
static lck_attr_t      *cl_mtx_attr;
static lck_grp_attr_t  *cl_mtx_grp_attr;
static lck_mtx_t       *cl_mtxp;


static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
                      int flags, buf_t real_bp, struct clios *iostate);
static int cluster_iodone(buf_t bp, void *dummy);
static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
static int cluster_hard_throttle_on(vnode_t vp);

static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
                           off_t headOff, off_t tailOff, int flags);
static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);

static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);

static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
static void cluster_push_EOF(vnode_t vp, off_t EOF);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int is_file_clean(vnode_t, off_t);

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT    0
#define HARD_THROTTLE_MAXSIZE   (64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;


void
cluster_init(void) {
    /*
     * allocate lock group attribute and group
     */
    cl_mtx_grp_attr = lck_grp_attr_alloc_init();
    cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

    /*
     * allocate the lock attribute
     */
    cl_mtx_attr = lck_attr_alloc_init();

    /*
     * allocate and initialize mutex's used to protect updates and waits
     * on the cluster_io context
     */
    cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

    if (cl_mtxp == NULL)
        panic("cluster_init: failed to allocate cl_mtxp");
}


#define CLW_ALLOCATE        0x01
#define CLW_RETURNLOCKED    0x02
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
    struct ubc_info      *ubc;
    struct cl_readahead  *rap;

    ubc = vp->v_ubcinfo;

    if ((rap = ubc->cl_rahead) == NULL) {
        MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

        bzero(rap, sizeof *rap);
        rap->cl_lastr = -1;
        lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

        vnode_lock(vp);

        if (ubc->cl_rahead == NULL)
            ubc->cl_rahead = rap;
        else {
            lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
            FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
            rap = ubc->cl_rahead;
        }
        vnode_unlock(vp);
    }
    if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
        return(rap);

    return ((struct cl_readahead *)NULL);
}


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
    struct ubc_info       *ubc;
    struct cl_writebehind *wbp;

    ubc = vp->v_ubcinfo;

    if ((wbp = ubc->cl_wbehind) == NULL) {

        if ( !(flags & CLW_ALLOCATE))
            return ((struct cl_writebehind *)NULL);

        MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

        bzero(wbp, sizeof *wbp);
        lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

        vnode_lock(vp);

        if (ubc->cl_wbehind == NULL)
            ubc->cl_wbehind = wbp;
        else {
            lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
            FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
            wbp = ubc->cl_wbehind;
        }
        vnode_unlock(vp);
    }
    if (flags & CLW_RETURNLOCKED)
        lck_mtx_lock(&wbp->cl_lockw);

    return (wbp);
}


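/*
 * cluster_hard_throttle_on() reports whether I/O against this vnode should be
 * throttled: it returns 1 when the vnode lives on the root device and either
 * hard_throttle_on_root is set or a priority I/O to the root was timestamped
 * within the last 200 ms (hard_throttle_maxelapsed); otherwise it returns 0.
 */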
static int
cluster_hard_throttle_on(vnode_t vp)
{
    static struct timeval hard_throttle_maxelapsed = { 0, 200000 };

    if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
        struct timeval elapsed;

        if (hard_throttle_on_root)
            return(1);

        microuptime(&elapsed);
        timevalsub(&elapsed, &priority_IO_timestamp_for_root);

        if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
            return(1);
    }
    return(0);
}

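
/*
 * cluster_iodone() is the completion handler for the component buffers built
 * by cluster_io().  It returns early until every buf in the transaction chain
 * is B_DONE, then rolls up the error and residual counts, frees the component
 * bufs, zero-fills a partial EOF page if needed, updates any clios stream
 * state (waking a sleeping issuer), and finally commits or aborts the UPL
 * range covered by the transaction.
 */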
static int
cluster_iodone(buf_t bp, __unused void *dummy)
{
    int    b_flags;
    int    error;
    int    total_size;
    int    total_resid;
    int    upl_offset;
    int    zero_offset;
    upl_t  upl;
    buf_t  cbp;
    buf_t  cbp_head;
    buf_t  cbp_next;
    buf_t  real_bp;
    struct clios *iostate;
    int    commit_size;
    int    pg_offset;

    cbp_head = (buf_t)(bp->b_trans_head);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
                 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

    for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
        /*
         * all I/O requests that are part of this transaction
         * have to complete before we can process it
         */
        if ( !(cbp->b_flags & B_DONE)) {

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

            return 0;
        }
    }
    error       = 0;
    total_size  = 0;
    total_resid = 0;

    cbp         = cbp_head;
    upl_offset  = cbp->b_uploffset;
    upl         = cbp->b_upl;
    b_flags     = cbp->b_flags;
    real_bp     = cbp->b_real_bp;
    zero_offset = cbp->b_validend;
    iostate     = (struct clios *)cbp->b_iostate;

    if (real_bp)
        real_bp->b_dev = cbp->b_dev;

    while (cbp) {
        if ((cbp->b_flags & B_ERROR) && error == 0)
            error = cbp->b_error;

        total_resid += cbp->b_resid;
        total_size  += cbp->b_bcount;

        cbp_next = cbp->b_trans_next;

        free_io_buf(cbp);

        cbp = cbp_next;
    }
    if (zero_offset)
        cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

    if (iostate) {
        int need_wakeup = 0;

        /*
         * someone has issued multiple I/Os asynchronously
         * and is waiting for them to complete (streaming)
         */
        lck_mtx_lock(cl_mtxp);

        if (error && iostate->io_error == 0)
            iostate->io_error = error;

        iostate->io_completed += total_size;

        if (iostate->io_wanted) {
            /*
             * someone is waiting for the state of
             * this io stream to change
             */
            iostate->io_wanted = 0;
            need_wakeup = 1;
        }
        lck_mtx_unlock(cl_mtxp);

        if (need_wakeup)
            wakeup((caddr_t)&iostate->io_wanted);
    }
    if ((b_flags & B_NEED_IODONE) && real_bp) {
        if (error) {
            real_bp->b_flags |= B_ERROR;
            real_bp->b_error = error;
        }
        real_bp->b_resid = total_resid;

        buf_biodone(real_bp);
    }
    if (error == 0 && total_resid)
        error = EIO;

    if (b_flags & B_COMMIT_UPL) {
        pg_offset   = upl_offset & PAGE_MASK;
        commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

        if (error || (b_flags & B_NOCACHE)) {
            int upl_abort_code;
            int page_in  = 0;
            int page_out = 0;

            if (b_flags & B_PAGEIO) {
                if (b_flags & B_READ)
                    page_in  = 1;
                else
                    page_out = 1;
            }
            if (b_flags & B_CACHE)                  /* leave pages in the cache unchanged on error */
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
            else if (page_out && (error != ENXIO))  /* transient error */
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
            else if (page_in)
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
            else
                upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

            ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
                                upl_abort_code);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         (int)upl, upl_offset - pg_offset, commit_size,
                         0x80000000|upl_abort_code, 0);

        } else {
            int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

            if ((b_flags & B_PHYS) && (b_flags & B_READ))
                upl_commit_flags |= UPL_COMMIT_SET_DIRTY;

            if (b_flags & B_AGE)
                upl_commit_flags |= UPL_COMMIT_INACTIVATE;

            ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
                                 upl_commit_flags);

            KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                         (int)upl, upl_offset - pg_offset, commit_size,
                         upl_commit_flags, 0);
        }
    } else {
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
                     (int)upl, upl_offset, 0, error, 0);
    }

    return (error);
}


void
cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
{
    upl_page_info_t *pl;

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
                 upl_offset, size, (int)bp, 0, 0);

    if (bp == NULL || bp->b_datap == 0) {

        pl = ubc_upl_pageinfo(upl);

        while (size) {
            int      page_offset;
            int      page_index;
            addr64_t zero_addr;
            int      zero_cnt;

            page_index  = upl_offset / PAGE_SIZE;
            page_offset = upl_offset & PAGE_MASK;

            zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
            zero_cnt  = min(PAGE_SIZE - page_offset, size);

            bzero_phys(zero_addr, zero_cnt);

            size       -= zero_cnt;
            upl_offset += zero_cnt;
        }
    } else
        bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

    KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
                 upl_offset, size, 0, 0, 0);
}

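/*
 * cluster_io() is the core I/O engine for this file: it walks the requested
 * UPL range, asks the filesystem for physical extents via VNOP_BLOCKMAP,
 * handles holes (zero-fill on read, push-or-fail on write), splits the work
 * into device-sized buf_t's chained into transactions, and issues them with
 * VNOP_STRATEGY, either waiting for completion or letting cluster_iodone()
 * finish them asynchronously.
 */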
1c79356b 509static int
91447636
A
510cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
511 int flags, buf_t real_bp, struct clios *iostate)
1c79356b 512{
91447636
A
513 buf_t cbp;
514 u_int size;
515 u_int io_size;
516 int io_flags;
517 int bmap_flags;
518 int error = 0;
519 int retval = 0;
520 buf_t cbp_head = NULL;
521 buf_t cbp_tail = NULL;
522 int trans_count = 0;
523 u_int pg_count;
524 int pg_offset;
525 u_int max_iosize;
526 u_int max_vectors;
527 int priv;
528 int zero_offset = 0;
529 int async_throttle = 0;
530 mount_t mp;
531
532 mp = vp->v_mount;
533
534 if (mp->mnt_devblocksize > 1) {
535 /*
536 * round the requested size up so that this I/O ends on a
537 * page boundary in case this is a 'write'... if the filesystem
538 * has blocks allocated to back the page beyond the EOF, we want to
539 * make sure to write out the zero's that are sitting beyond the EOF
540 * so that in case the filesystem doesn't explicitly zero this area
541 * if a hole is created via a lseek/write beyond the current EOF,
542 * it will return zeros when it's read back from the disk. If the
543 * physical allocation doesn't extend for the whole page, we'll
544 * only write/read from the disk up to the end of this allocation
545 * via the extent info returned from the VNOP_BLOCKMAP call.
546 */
547 pg_offset = upl_offset & PAGE_MASK;
55e303ae 548
91447636
A
549 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
550 } else {
551 /*
552 * anyone advertising a blocksize of 1 byte probably
553 * can't deal with us rounding up the request size
554 * AFP is one such filesystem/device
555 */
556 size = non_rounded_size;
557 }
55e303ae
A
558 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
559 (int)f_offset, size, upl_offset, flags, 0);
560
0b4e3aa0 561 if (flags & CL_READ) {
91447636
A
562 io_flags = (B_READ);
563 bmap_flags = VNODE_READ;
0b4e3aa0 564
91447636
A
565 max_iosize = mp->mnt_maxreadcnt;
566 max_vectors = mp->mnt_segreadcnt;
0b4e3aa0 567 } else {
91447636
A
568 io_flags = 0;
569 bmap_flags = VNODE_WRITE;
1c79356b 570
91447636
A
571 max_iosize = mp->mnt_maxwritecnt;
572 max_vectors = mp->mnt_segwritecnt;
0b4e3aa0 573 }
91447636
A
574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
575
55e303ae 576 /*
91447636
A
577 * make sure the maximum iosize is a
578 * multiple of the page size
55e303ae
A
579 */
580 max_iosize &= ~PAGE_MASK;
581
582 if (flags & CL_THROTTLE) {
583 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
584 if (max_iosize > HARD_THROTTLE_MAXSIZE)
585 max_iosize = HARD_THROTTLE_MAXSIZE;
586 async_throttle = HARD_THROTTLE_MAXCNT;
587 } else
91447636 588 async_throttle = VNODE_ASYNC_THROTTLE;
55e303ae 589 }
1c79356b
A
590 if (flags & CL_AGE)
591 io_flags |= B_AGE;
592 if (flags & CL_DUMP)
593 io_flags |= B_NOCACHE;
91447636
A
594 if (flags & (CL_PAGEIN | CL_PAGEOUT))
595 io_flags |= B_PAGEIO;
b4c24cb9
A
596 if (flags & CL_COMMIT)
597 io_flags |= B_COMMIT_UPL;
598 if (flags & CL_PRESERVE)
599 io_flags |= B_PHYS;
91447636
A
600 if (flags & CL_KEEPCACHED)
601 io_flags |= B_CACHE;
1c79356b 602
9bccf70c 603 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1c79356b
A
604 /*
605 * then we are going to end up
606 * with a page that we can't complete (the file size wasn't a multiple
607 * of PAGE_SIZE and we're trying to read to the end of the file
608 * so we'll go ahead and zero out the portion of the page we can't
609 * read in from the file
610 */
9bccf70c 611 zero_offset = upl_offset + non_rounded_size;
1c79356b
A
612 }
613 while (size) {
91447636
A
614 int pg_resid;
615 daddr64_t blkno;
616 daddr64_t lblkno;
1c79356b 617
0b4e3aa0
A
618 if (size > max_iosize)
619 io_size = max_iosize;
1c79356b
A
620 else
621 io_size = size;
91447636
A
622
623 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
1c79356b
A
624 break;
625 }
91447636
A
626 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
627 real_bp->b_blkno = blkno;
1c79356b
A
628
629 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
9bccf70c 630 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
1c79356b 631
91447636
A
632 if (io_size == 0) {
633 /*
634 * vnop_blockmap didn't return an error... however, it did
635 * return an extent size of 0 which means we can't
636 * make forward progress on this I/O... a hole in the
637 * file would be returned as a blkno of -1 with a non-zero io_size
638 * a real extent is returned with a blkno != -1 and a non-zero io_size
639 */
640 error = EINVAL;
641 break;
642 }
643 if ( !(flags & CL_READ) && blkno == -1) {
644 off_t e_offset;
645
646 /*
647 * we're writing into a 'hole'
648 */
0b4e3aa0 649 if (flags & CL_PAGEOUT) {
91447636
A
650 /*
651 * if we got here via cluster_pageout
652 * then just error the request and return
653 * the 'hole' should already have been covered
654 */
0b4e3aa0
A
655 error = EINVAL;
656 break;
91447636
A
657 }
658 if ( !(flags & CL_COMMIT)) {
659 /*
660 * currently writes always request the commit to happen
661 * as part of the io completion... however, if the CL_COMMIT
662 * flag isn't specified, then we can't issue the abort_range
663 * since the call site is going to abort or commit the same upl..
664 * in this case we can only return an error
665 */
666 error = EINVAL;
667 break;
668 }
669 /*
670 * we can get here if the cluster code happens to
671 * pick up a page that was dirtied via mmap vs
672 * a 'write' and the page targets a 'hole'...
673 * i.e. the writes to the cluster were sparse
674 * and the file was being written for the first time
675 *
676 * we can also get here if the filesystem supports
677 * 'holes' that are less than PAGE_SIZE.... because
678 * we can't know if the range in the page that covers
679 * the 'hole' has been dirtied via an mmap or not,
680 * we have to assume the worst and try to push the
681 * entire page to storage.
682 *
683 * Try paging out the page individually before
684 * giving up entirely and dumping it (the pageout
685 * path will ensure that the zero extent accounting
686 * has been taken care of before we get back into cluster_io)
0b4e3aa0 687 */
91447636
A
688 ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
689
690 e_offset = round_page_64(f_offset + 1);
691
692 if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
693 error = EINVAL;
0b4e3aa0 694 break;
91447636
A
695 }
696 io_size = e_offset - f_offset;
697
698 f_offset += io_size;
699 upl_offset += io_size;
700
701 if (size >= io_size)
702 size -= io_size;
703 else
704 size = 0;
705 /*
706 * keep track of how much of the original request
707 * that we've actually completed... non_rounded_size
708 * may go negative due to us rounding the request
709 * to a page size multiple (i.e. size > non_rounded_size)
710 */
711 non_rounded_size -= io_size;
712
713 if (non_rounded_size <= 0) {
714 /*
715 * we've transferred all of the data in the original
716 * request, but we were unable to complete the tail
717 * of the last page because the file didn't have
718 * an allocation to back that portion... this is ok.
719 */
720 size = 0;
721 }
0b4e3aa0 722 continue;
1c79356b 723 }
91447636 724 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
1c79356b
A
725 /*
726 * we have now figured out how much I/O we can do - this is in 'io_size'
1c79356b
A
727 * pg_offset is the starting point in the first page for the I/O
728 * pg_count is the number of full and partial pages that 'io_size' encompasses
729 */
1c79356b 730 pg_offset = upl_offset & PAGE_MASK;
1c79356b 731
0b4e3aa0
A
732 if (flags & CL_DEV_MEMORY) {
733 /*
734 * currently, can't deal with reading 'holes' in file
735 */
91447636 736 if (blkno == -1) {
0b4e3aa0
A
737 error = EINVAL;
738 break;
739 }
740 /*
741 * treat physical requests as one 'giant' page
742 */
743 pg_count = 1;
55e303ae
A
744 } else
745 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
746
91447636 747 if ((flags & CL_READ) && blkno == -1) {
9bccf70c
A
748 int bytes_to_zero;
749
1c79356b
A
750 /*
751 * if we're reading and blkno == -1, then we've got a
752 * 'hole' in the file that we need to deal with by zeroing
753 * out the affected area in the upl
754 */
9bccf70c
A
755 if (zero_offset && io_size == size) {
756 /*
757 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
758 * then 'zero_offset' will be non-zero
91447636 759 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
9bccf70c
A
760 * (indicated by the io_size finishing off the I/O request for this UPL)
761 * then we're not going to issue an I/O for the
762 * last page in this upl... we need to zero both the hole and the tail
763 * of the page beyond the EOF, since the delayed zero-fill won't kick in
764 */
765 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1c79356b 766
9bccf70c
A
767 zero_offset = 0;
768 } else
769 bytes_to_zero = io_size;
1c79356b 770
9bccf70c
A
771 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
772
773 if (cbp_head)
774 /*
775 * if there is a current I/O chain pending
776 * then the first page of the group we just zero'd
777 * will be handled by the I/O completion if the zero
778 * fill started in the middle of the page
779 */
780 pg_count = (io_size - pg_offset) / PAGE_SIZE;
781 else {
782 /*
783 * no pending I/O to pick up that first page
784 * so, we have to make sure it gets committed
785 * here.
786 * set the pg_offset to 0 so that the upl_commit_range
787 * starts with this page
788 */
789 pg_count = (io_size + pg_offset) / PAGE_SIZE;
790 pg_offset = 0;
791 }
1c79356b 792 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
9bccf70c
A
793 /*
794 * if we're done with the request for this UPL
795 * then we have to make sure to commit the last page
796 * even if we only partially zero-filled it
797 */
1c79356b
A
798 pg_count++;
799
800 if (pg_count) {
801 if (pg_offset)
802 pg_resid = PAGE_SIZE - pg_offset;
803 else
804 pg_resid = 0;
9bccf70c 805
1c79356b 806 if (flags & CL_COMMIT)
0b4e3aa0 807 ubc_upl_commit_range(upl,
9bccf70c 808 (upl_offset + pg_resid) & ~PAGE_MASK,
0b4e3aa0
A
809 pg_count * PAGE_SIZE,
810 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b
A
811 }
812 upl_offset += io_size;
813 f_offset += io_size;
814 size -= io_size;
91447636
A
815 /*
816 * keep track of how much of the original request
817 * that we've actually completed... non_rounded_size
818 * may go negative due to us rounding the request
819 * to a page size multiple (i.e. size > non_rounded_size)
820 */
821 non_rounded_size -= io_size;
1c79356b 822
91447636
A
823 if (non_rounded_size <= 0) {
824 /*
825 * we've transferred all of the data in the original
826 * request, but we were unable to complete the tail
827 * of the last page because the file didn't have
828 * an allocation to back that portion... this is ok.
829 */
830 size = 0;
831 }
9bccf70c 832 if (cbp_head && pg_count)
1c79356b
A
833 goto start_io;
834 continue;
9bccf70c 835
1c79356b 836 }
55e303ae 837 if (pg_count > max_vectors) {
91447636 838 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
55e303ae
A
839 io_size = PAGE_SIZE - pg_offset;
840 pg_count = 1;
91447636
A
841 } else {
842 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
55e303ae 843 pg_count = max_vectors;
91447636 844 }
1c79356b 845 }
1c79356b 846
91447636 847 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
55e303ae
A
848 /*
849 * if we're not targeting a virtual device i.e. a disk image
850 * it's safe to dip into the reserve pool since real devices
851 * can complete this I/O request without requiring additional
852 * bufs from the alloc_io_buf pool
853 */
854 priv = 1;
855 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
856 /*
857 * Throttle the speculative IO
858 */
0b4e3aa0
A
859 priv = 0;
860 else
861 priv = 1;
862
863 cbp = alloc_io_buf(vp, priv);
1c79356b 864
55e303ae 865 if (flags & CL_PAGEOUT) {
91447636
A
866 u_int i;
867
55e303ae 868 for (i = 0; i < pg_count; i++) {
91447636
A
869 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
870 panic("BUSY bp found in cluster_io");
1c79356b 871 }
1c79356b 872 }
b4c24cb9 873 if (flags & CL_ASYNC) {
91447636
A
874 if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
875 panic("buf_setcallback failed\n");
b4c24cb9 876 }
1c79356b
A
877 cbp->b_flags |= io_flags;
878
879 cbp->b_lblkno = lblkno;
880 cbp->b_blkno = blkno;
881 cbp->b_bcount = io_size;
1c79356b 882
91447636
A
883 if (buf_setupl(cbp, upl, upl_offset))
884 panic("buf_setupl failed\n");
885
886 cbp->b_trans_next = (buf_t)NULL;
887
888 if ((cbp->b_iostate = (void *)iostate))
d7e50217
A
889 /*
890 * caller wants to track the state of this
891 * io... bump the amount issued against this stream
892 */
b4c24cb9
A
893 iostate->io_issued += io_size;
894
91447636 895 if (flags & CL_READ) {
1c79356b 896 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
91447636
A
897 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
898 }
899 else {
1c79356b 900 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
91447636
A
901 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
902 }
1c79356b
A
903
904 if (cbp_head) {
905 cbp_tail->b_trans_next = cbp;
906 cbp_tail = cbp;
907 } else {
908 cbp_head = cbp;
909 cbp_tail = cbp;
910 }
91447636
A
911 (buf_t)(cbp->b_trans_head) = cbp_head;
912 trans_count++;
1c79356b
A
913
914 upl_offset += io_size;
915 f_offset += io_size;
916 size -= io_size;
91447636
A
917 /*
918 * keep track of how much of the original request
919 * that we've actually completed... non_rounded_size
920 * may go negative due to us rounding the request
921 * to a page size multiple (i.e. size > non_rounded_size)
922 */
923 non_rounded_size -= io_size;
1c79356b 924
91447636
A
925 if (non_rounded_size <= 0) {
926 /*
927 * we've transferred all of the data in the original
928 * request, but we were unable to complete the tail
929 * of the last page because the file didn't have
930 * an allocation to back that portion... this is ok.
931 */
932 size = 0;
933 }
934 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
1c79356b
A
935 /*
936 * if we have no more I/O to issue or
937 * the current I/O we've prepared fully
938 * completes the last page in this request
9bccf70c
A
939 * and it's either an ASYNC request or
940 * we've already accumulated more than 8 I/O's into
941 * this transaction and it's not an I/O directed to
942 * special DEVICE memory
1c79356b
A
943 * then go ahead and issue the I/O
944 */
945start_io:
1c79356b
A
946 if (real_bp) {
947 cbp_head->b_flags |= B_NEED_IODONE;
948 cbp_head->b_real_bp = real_bp;
9bccf70c 949 } else
91447636 950 cbp_head->b_real_bp = (buf_t)NULL;
1c79356b 951
9bccf70c
A
952 if (size == 0) {
953 /*
954 * we're about to issue the last I/O for this upl
955 * if this was a read to the eof and the eof doesn't
956 * finish on a page boundary, then we need to zero-fill
957 * the rest of the page....
958 */
959 cbp_head->b_validend = zero_offset;
960 } else
961 cbp_head->b_validend = 0;
962
91447636
A
963 if (flags & CL_THROTTLE)
964 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
965
1c79356b 966 for (cbp = cbp_head; cbp;) {
91447636 967 buf_t cbp_next;
1c79356b 968
91447636
A
969 if ( !(io_flags & B_READ))
970 vnode_startwrite(vp);
1c79356b
A
971
972 cbp_next = cbp->b_trans_next;
9bccf70c 973
91447636 974 (void) VNOP_STRATEGY(cbp);
1c79356b
A
975 cbp = cbp_next;
976 }
977 if ( !(flags & CL_ASYNC)) {
91447636
A
978 int dummy;
979
1c79356b 980 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
91447636
A
981 buf_biowait(cbp);
982
983 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
cc9f6e38 984 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
91447636
A
985 error = 0; /* drop the error */
986 else {
987 if (retval == 0)
988 retval = error;
989 error = 0;
990 }
1c79356b
A
991 }
992 }
91447636
A
993 cbp_head = (buf_t)NULL;
994 cbp_tail = (buf_t)NULL;
9bccf70c 995
91447636 996 trans_count = 0;
1c79356b
A
997 }
998 }
999 if (error) {
0b4e3aa0
A
1000 int abort_size;
1001
b4c24cb9
A
1002 io_size = 0;
1003
1c79356b 1004 for (cbp = cbp_head; cbp;) {
91447636 1005 buf_t cbp_next;
1c79356b 1006
0b4e3aa0
A
1007 upl_offset -= cbp->b_bcount;
1008 size += cbp->b_bcount;
b4c24cb9 1009 io_size += cbp->b_bcount;
0b4e3aa0 1010
1c79356b
A
1011 cbp_next = cbp->b_trans_next;
1012 free_io_buf(cbp);
1013 cbp = cbp_next;
1c79356b 1014 }
b4c24cb9 1015 if (iostate) {
91447636
A
1016 int need_wakeup = 0;
1017
d7e50217
A
1018 /*
1019 * update the error condition for this stream
1020 * since we never really issued the io
1021 * just go ahead and adjust it back
1022 */
91447636
A
1023 lck_mtx_lock(cl_mtxp);
1024
d7e50217 1025 if (iostate->io_error == 0)
b4c24cb9 1026 iostate->io_error = error;
b4c24cb9
A
1027 iostate->io_issued -= io_size;
1028
1029 if (iostate->io_wanted) {
d7e50217
A
1030 /*
1031 * someone is waiting for the state of
1032 * this io stream to change
1033 */
b4c24cb9 1034 iostate->io_wanted = 0;
91447636 1035 need_wakeup = 1;
b4c24cb9 1036 }
91447636
A
1037 lck_mtx_unlock(cl_mtxp);
1038
1039 if (need_wakeup)
1040 wakeup((caddr_t)&iostate->io_wanted);
b4c24cb9 1041 }
0b4e3aa0 1042 pg_offset = upl_offset & PAGE_MASK;
55e303ae 1043 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b
A
1044
1045 if (flags & CL_COMMIT) {
1046 int upl_abort_code;
1047
55e303ae
A
1048 if (flags & CL_PRESERVE) {
1049 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1050 UPL_COMMIT_FREE_ON_EMPTY);
1051 } else {
1052 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1053 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1054 else if (flags & CL_PAGEIN)
1055 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1056 else
1057 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1c79356b 1058
55e303ae 1059 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
0b4e3aa0 1060 upl_abort_code);
55e303ae 1061 }
1c79356b 1062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
9bccf70c 1063 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1c79356b
A
1064 }
1065 if (real_bp) {
1066 real_bp->b_flags |= B_ERROR;
1067 real_bp->b_error = error;
1068
91447636 1069 buf_biodone(real_bp);
1c79356b
A
1070 }
1071 if (retval == 0)
1072 retval = error;
1073 }
1074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1075 (int)f_offset, size, upl_offset, retval, 0);
1076
1077 return (retval);
1078}
1079
1080
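/*
 * cluster_rd_prefetch() issues an advisory_read() for up to
 * MAX_UPL_TRANSFER * PAGE_SIZE bytes starting at f_offset, clipped to the
 * end of the file, and returns the number of pages it asked for (0 if the
 * offset is already at or beyond EOF).
 */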
1081static int
91447636 1082cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1c79356b 1083{
55e303ae 1084 int pages_in_prefetch;
1c79356b
A
1085
1086 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1087 (int)f_offset, size, (int)filesize, 0, 0);
1088
1089 if (f_offset >= filesize) {
1090 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1091 (int)f_offset, 0, 0, 0, 0);
1092 return(0);
1093 }
0b4e3aa0 1094 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
55e303ae 1095 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1c79356b 1096 else
55e303ae 1097 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 1098
9bccf70c
A
1099 if ((off_t)size > (filesize - f_offset))
1100 size = filesize - f_offset;
55e303ae 1101 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1c79356b 1102
91447636 1103 advisory_read(vp, filesize, f_offset, size);
1c79356b
A
1104
1105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
55e303ae 1106 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1c79356b 1107
55e303ae 1108 return (pages_in_prefetch);
1c79356b
A
1109}
1110
1111
1112
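/*
 * cluster_rd_ahead() implements the sequential read-ahead heuristic: using
 * the per-vnode cl_readahead state it detects a sequential access pattern,
 * grows the read-ahead window (doubling cl_ralen up to MAX_UPL_TRANSFER
 * pages), and prefetches starting just past the furthest page already read
 * ahead, skipping the work entirely when the next page is already resident.
 */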
1113static void
91447636 1114cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1c79356b 1115{
91447636
A
1116 daddr64_t r_addr;
1117 off_t f_offset;
1118 int size_of_prefetch;
1119
1c79356b
A
1120
1121 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
91447636 1122 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1c79356b 1123
91447636 1124 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1c79356b 1125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
91447636 1126 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1c79356b
A
1127 return;
1128 }
91447636
A
1129 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1130 (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1131 rap->cl_ralen = 0;
1132 rap->cl_maxra = 0;
1c79356b
A
1133
1134 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
91447636 1135 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1c79356b
A
1136
1137 return;
1138 }
91447636
A
1139 if (extent->e_addr < rap->cl_maxra) {
1140 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1c79356b
A
1141
1142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
91447636 1143 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1c79356b
A
1144 return;
1145 }
1146 }
91447636
A
1147 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1148 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1c79356b 1149
55e303ae
A
1150 size_of_prefetch = 0;
1151
1152 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1153
1154 if (size_of_prefetch) {
1155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
91447636 1156 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
55e303ae
A
1157 return;
1158 }
9bccf70c 1159 if (f_offset < filesize) {
91447636 1160 daddr64_t read_size;
55e303ae 1161
91447636 1162 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
55e303ae 1163
91447636
A
1164 read_size = (extent->e_addr + 1) - extent->b_addr;
1165
1166 if (read_size > rap->cl_ralen) {
1167 if (read_size > MAX_UPL_TRANSFER)
1168 rap->cl_ralen = MAX_UPL_TRANSFER;
1169 else
1170 rap->cl_ralen = read_size;
1171 }
1172 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1c79356b 1173
9bccf70c 1174 if (size_of_prefetch)
91447636 1175 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
9bccf70c 1176 }
1c79356b 1177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
91447636 1178 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1c79356b
A
1179}
1180
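/*
 * cluster_pageout() is the helper a filesystem's vnop_pageout handler calls
 * to push a UPL of dirty pages through cluster_io().  As an illustration
 * only (field names follow struct vnop_pageout_args; how the filesystem
 * tracks its EOF varies), a call site looks roughly like:
 *
 *      error = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *                              ap->a_size, (off_t)file_size, ap->a_flags);
 */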
9bccf70c 1181int
91447636
A
1182cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1183 int size, off_t filesize, int flags)
1c79356b
A
1184{
1185 int io_size;
55e303ae 1186 int rounded_size;
1c79356b 1187 off_t max_size;
55e303ae 1188 int local_flags;
91447636 1189 struct cl_writebehind *wbp;
55e303ae
A
1190
1191 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1192 /*
1193 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1194 * then we don't want to enforce this throttle... if we do, we can
1195 * potentially deadlock since we're stalling the pageout thread at a time
1196 * when the disk image might need additional memory (which won't be available
1197 * if the pageout thread can't run)... instead we'll just depend on the throttle
1198 * that the pageout thread now has in place to deal with external files
1199 */
1200 local_flags = CL_PAGEOUT;
1201 else
1202 local_flags = CL_PAGEOUT | CL_THROTTLE;
1c79356b
A
1203
1204 if ((flags & UPL_IOSYNC) == 0)
1205 local_flags |= CL_ASYNC;
1206 if ((flags & UPL_NOCOMMIT) == 0)
1207 local_flags |= CL_COMMIT;
91447636
A
1208 if ((flags & UPL_KEEPCACHED))
1209 local_flags |= CL_KEEPCACHED;
1c79356b 1210
1c79356b
A
1211
1212 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1213 (int)f_offset, size, (int)filesize, local_flags, 0);
1214
1215 /*
1216 * If they didn't specify any I/O, then we are done...
1217 * we can't issue an abort because we don't know how
1218 * big the upl really is
1219 */
1220 if (size <= 0)
1221 return (EINVAL);
1222
1223 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1224 if (local_flags & CL_COMMIT)
9bccf70c 1225 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1226 return (EROFS);
1227 }
1228 /*
1229 * can't page-in from a negative offset
1230 * or if we're starting beyond the EOF
1231 * or if the file offset isn't page aligned
1232 * or the size requested isn't a multiple of PAGE_SIZE
1233 */
1234 if (f_offset < 0 || f_offset >= filesize ||
1235 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
0b4e3aa0
A
1236 if (local_flags & CL_COMMIT)
1237 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1238 return (EINVAL);
1239 }
1240 max_size = filesize - f_offset;
1241
1242 if (size < max_size)
1243 io_size = size;
1244 else
9bccf70c 1245 io_size = max_size;
1c79356b 1246
55e303ae 1247 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 1248
55e303ae 1249 if (size > rounded_size) {
0b4e3aa0 1250 if (local_flags & CL_COMMIT)
55e303ae 1251 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1c79356b
A
1252 UPL_ABORT_FREE_ON_EMPTY);
1253 }
91447636
A
1254 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1255 wbp->cl_hasbeenpaged = 1;
1c79356b 1256
91447636
A
1257 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1258 local_flags, (buf_t)NULL, (struct clios *)NULL));
1c79356b
A
1259}
1260
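/*
 * cluster_pagein() is the read-side counterpart used by a filesystem's
 * vnop_pagein handler: it validates the request, clips it to EOF, aborts any
 * pages past the rounded size, issues the read through cluster_io() with
 * CL_READ | CL_PAGEIN, and then kicks off cluster_rd_ahead() when the access
 * pattern allows read-ahead.
 */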
9bccf70c 1261int
91447636
A
1262cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1263 int size, off_t filesize, int flags)
1c79356b
A
1264{
1265 u_int io_size;
9bccf70c 1266 int rounded_size;
1c79356b
A
1267 off_t max_size;
1268 int retval;
1269 int local_flags = 0;
1c79356b 1270
9bccf70c
A
1271 if (upl == NULL || size < 0)
1272 panic("cluster_pagein: NULL upl passed in");
1c79356b 1273
9bccf70c
A
1274 if ((flags & UPL_IOSYNC) == 0)
1275 local_flags |= CL_ASYNC;
1c79356b 1276 if ((flags & UPL_NOCOMMIT) == 0)
9bccf70c
A
1277 local_flags |= CL_COMMIT;
1278
1c79356b
A
1279
1280 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1281 (int)f_offset, size, (int)filesize, local_flags, 0);
1282
1283 /*
1284 * can't page-in from a negative offset
1285 * or if we're starting beyond the EOF
1286 * or if the file offset isn't page aligned
1287 * or the size requested isn't a multiple of PAGE_SIZE
1288 */
1289 if (f_offset < 0 || f_offset >= filesize ||
9bccf70c
A
1290 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1291 if (local_flags & CL_COMMIT)
1292 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1c79356b
A
1293 return (EINVAL);
1294 }
1295 max_size = filesize - f_offset;
1296
1297 if (size < max_size)
1298 io_size = size;
1299 else
9bccf70c 1300 io_size = max_size;
1c79356b 1301
9bccf70c 1302 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 1303
9bccf70c
A
1304 if (size > rounded_size && (local_flags & CL_COMMIT))
1305 ubc_upl_abort_range(upl, upl_offset + rounded_size,
55e303ae 1306 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
9bccf70c 1307
91447636
A
1308 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1309 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1c79356b 1310
91447636
A
1311 if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1312 struct cl_readahead *rap;
1c79356b 1313
91447636 1314 rap = cluster_get_rap(vp);
1c79356b 1315
91447636
A
1316 if (rap != NULL) {
1317 struct cl_extent extent;
1318
1319 extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1320 extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1321
1322 if (rounded_size == PAGE_SIZE) {
1323 /*
1324 * we haven't read the last page in of the file yet
1325 * so let's try to read ahead if we're in
1326 * a sequential access pattern
1327 */
1328 cluster_rd_ahead(vp, &extent, filesize, rap);
1329 }
1330 rap->cl_lastr = extent.e_addr;
1331
1332 lck_mtx_unlock(&rap->cl_lockr);
1c79356b 1333 }
1c79356b
A
1334 }
1335 return (retval);
1336}
1337
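/*
 * cluster_bp() adapts a traditional struct buf to the cluster layer: it maps
 * the buffer's logical block to a file offset with ubc_blktooff() and hands
 * it to cluster_io() as an async read or write, depending on B_READ.
 */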
9bccf70c 1338int
91447636 1339cluster_bp(buf_t bp)
1c79356b
A
1340{
1341 off_t f_offset;
1342 int flags;
1343
9bccf70c 1344 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
91447636 1345 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
9bccf70c 1346
1c79356b 1347 if (bp->b_flags & B_READ)
9bccf70c 1348 flags = CL_ASYNC | CL_READ;
1c79356b 1349 else
9bccf70c 1350 flags = CL_ASYNC;
1c79356b
A
1351
1352 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1353
91447636 1354 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1c79356b
A
1355}
1356
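/*
 * cluster_write() is the top-level write entry point.  For each uio vector it
 * picks one of three paths: the buffered path (cluster_write_x) when caching
 * is enabled or the request is small or unaligned, the physically contiguous
 * path (cluster_phys_write) when vm_map_get_upl() reports UPL_PHYS_CONTIG,
 * and the direct path (cluster_nocopy_write) for page-aligned uncached
 * transfers, falling back to the buffered path for any unaligned tail.
 */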
9bccf70c 1357int
91447636 1358cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1c79356b 1359{
1c79356b 1360 int prev_resid;
91447636 1361 u_int clip_size;
1c79356b 1362 off_t max_io_size;
0b4e3aa0 1363 int upl_size;
0b4e3aa0
A
1364 int upl_flags;
1365 upl_t upl;
1c79356b 1366 int retval = 0;
91447636 1367 int flags;
1c79356b 1368
91447636
A
1369 flags = xflags;
1370
1371 if (vp->v_flag & VNOCACHE_DATA)
1372 flags |= IO_NOCACHE;
1373
1374 if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1375 /*
1376 * go do a write through the cache if one of the following is true....
1377 * NOCACHE is not true
1378 * there is no uio structure or it doesn't target USERSPACE
1379 */
1380 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1381 }
1382
1383#if LP64_DEBUG
1384 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1385 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1386 }
1387#endif /* LP64_DEBUG */
55e303ae 1388
91447636 1389 while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
cc9f6e38
A
1390 user_size_t iov_len;
1391 user_addr_t iov_base;
91447636 1392
0b4e3aa0 1393 /*
91447636
A
1394 * we know we have a resid, so this is safe
1395 * skip over any empty vectors
0b4e3aa0 1396 */
cc9f6e38 1397 uio_update(uio, (user_size_t)0);
0b4e3aa0 1398
cc9f6e38
A
1399 iov_len = uio_curriovlen(uio);
1400 iov_base = uio_curriovbase(uio);
91447636
A
1401
1402 upl_size = PAGE_SIZE;
1403 upl_flags = UPL_QUERY_OBJECT_TYPE;
1404
1405 // LP64todo - fix this!
1406 if ((vm_map_get_upl(current_map(),
cc9f6e38 1407 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
91447636
A
1408 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1409 /*
1410 * the user app must have passed in an invalid address
1411 */
1412 return (EFAULT);
1413 }
0b4e3aa0 1414
0b4e3aa0 1415 /*
91447636
A
1416 * We check every vector target but if it is physically
1417 * contiguous space, we skip the sanity checks.
0b4e3aa0 1418 */
91447636
A
1419 if (upl_flags & UPL_PHYS_CONTIG) {
1420 int zflags;
1421
1422 zflags = flags & ~IO_TAILZEROFILL;
1423 zflags |= IO_HEADZEROFILL;
1424
1425 if (flags & IO_HEADZEROFILL) {
1426 /*
1427 * in case we have additional vectors, we don't want to do this again
1428 */
1429 flags &= ~IO_HEADZEROFILL;
1430
1431 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1432 return(retval);
1433 }
1434 retval = cluster_phys_write(vp, uio, newEOF);
1435
1436 if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1437 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1438 }
1439 }
1440 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1441 /*
1442 * we're here because we don't have a physically contiguous target buffer
1443 * go do a write through the cache if one of the following is true....
1444 * the total xfer size is less than a page...
1445 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1446 */
1447 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1448 }
1449 // LP64todo - fix this!
1450 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1451 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1452 /*
1453 * Bring the file offset write up to a pagesize boundary
1454 * this will also bring the base address to a page boundary
1455 * since they both are currently on the same offset within a page
1456 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1457 * so the computed clip_size must always be less than the current uio_resid
1458 */
1459 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1460
1461 /*
1462 * Fake the resid going into the cluster_write_x call
1463 * and restore it on the way out.
1464 */
1465 // LP64todo - fix this
1466 prev_resid = uio_resid(uio);
1467 uio_setresid(uio, clip_size);
1468
1469 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1470
1471 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1472 } else {
1473 /*
1474 * can't get both the file offset and the buffer offset aligned to a page boundary
1475 * so fire an I/O through the cache for this entire vector
1476 */
1477 // LP64todo - fix this
1478 clip_size = iov_len;
1479 // LP64todo - fix this
1480 prev_resid = uio_resid(uio);
1481 uio_setresid(uio, clip_size);
1482
1483 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1484
1485 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1486 }
1487 } else {
1488 /*
1489 * If we come in here, we know the offset into
1490 * the file is on a pagesize boundary and the
1491 * target buffer address is also on a page boundary
1492 */
1493 max_io_size = newEOF - uio->uio_offset;
1494 // LP64todo - fix this
1495 clip_size = uio_resid(uio);
1496 if (iov_len < clip_size)
1497 // LP64todo - fix this!
1498 clip_size = iov_len;
1499 if (max_io_size < clip_size)
1500 clip_size = max_io_size;
1501
1502 if (clip_size < PAGE_SIZE) {
1503 /*
1504 * Take care of tail end of write in this vector
1505 */
1506 // LP64todo - fix this
1507 prev_resid = uio_resid(uio);
1508 uio_setresid(uio, clip_size);
1509
1510 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1511
1512 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1513 } else {
1514 /* round clip_size down to a multiple of pagesize */
1515 clip_size = clip_size & ~(PAGE_MASK);
1516 // LP64todo - fix this
1517 prev_resid = uio_resid(uio);
1518 uio_setresid(uio, clip_size);
1519
1520 retval = cluster_nocopy_write(vp, uio, newEOF);
1521
1522 if ((retval == 0) && uio_resid(uio))
1523 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1524
1525 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1526 }
1527 } /* end else */
1528 } /* end while */
1529
1c79356b
A
1530 return(retval);
1531}
1532
b4c24cb9 1533
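/*
 * cluster_nocopy_write() is the direct (uncached) write path: it first pushes
 * any existing write-behind clusters, then repeatedly wires the user buffer
 * with vm_map_get_upl(), dumps cached pages over the target range, and
 * streams CL_ASYNC | CL_PRESERVE writes through cluster_io(), using the
 * clios iostate to cap the amount of I/O in flight and to collect errors
 * before returning.
 */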
9bccf70c 1534static int
91447636 1535cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1c79356b
A
1536{
1537 upl_t upl;
1538 upl_page_info_t *pl;
1c79356b 1539 vm_offset_t upl_offset;
1c79356b 1540 int io_size;
d7e50217 1541 int io_flag;
1c79356b
A
1542 int upl_size;
1543 int upl_needed_size;
1544 int pages_in_pl;
1545 int upl_flags;
1546 kern_return_t kret;
1c79356b
A
1547 int i;
1548 int force_data_sync;
1549 int error = 0;
d7e50217 1550 struct clios iostate;
91447636 1551 struct cl_writebehind *wbp;
cc9f6e38 1552
1c79356b
A
1553
1554 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
91447636
A
1555 (int)uio->uio_offset, (int)uio_resid(uio),
1556 (int)newEOF, 0, 0);
1c79356b
A
1557
1558 /*
1559 * When we enter this routine, we know
1560 * -- the offset into the file is on a pagesize boundary
1561 * -- the resid is a page multiple
1562 * -- the resid will not exceed iov_len
1563 */
91447636
A
1564
1565 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1c79356b 1566
91447636
A
1567 cluster_try_push(wbp, vp, newEOF, 0, 1);
1568
1569 lck_mtx_unlock(&wbp->cl_lockw);
1570 }
d7e50217
A
1571 iostate.io_completed = 0;
1572 iostate.io_issued = 0;
1573 iostate.io_error = 0;
1574 iostate.io_wanted = 0;
1575
91447636 1576 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
cc9f6e38
A
1577 user_addr_t iov_base;
1578
91447636 1579 io_size = uio_resid(uio);
1c79356b 1580
d7e50217
A
1581 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1582 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1583
cc9f6e38
A
1584 iov_base = uio_curriovbase(uio);
1585
91447636 1586 // LP64todo - fix this!
cc9f6e38 1587 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
91447636 1588
d7e50217
A
1589 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1590
1591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
cc9f6e38 1592 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
d7e50217
A
1593
1594 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1595 pages_in_pl = 0;
1596 upl_size = upl_needed_size;
1597 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1598 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
d7e50217 1599
91447636 1600 // LP64todo - fix this!
d7e50217 1601 kret = vm_map_get_upl(current_map(),
cc9f6e38 1602 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
d7e50217
A
1603 &upl_size,
1604 &upl,
1605 NULL,
1606 &pages_in_pl,
1607 &upl_flags,
1608 force_data_sync);
1609
1610 if (kret != KERN_SUCCESS) {
1611 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1612 0, 0, 0, kret, 0);
d7e50217
A
1613 /*
1614 * cluster_nocopy_write: failed to get pagelist
1615 *
1616 * we may have already spun some portion of this request
1617 * off as async requests... we need to wait for the I/O
1618 * to complete before returning
1619 */
1620 goto wait_for_writes;
1621 }
1622 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1623 pages_in_pl = upl_size / PAGE_SIZE;
1c79356b 1624
d7e50217
A
1625 for (i = 0; i < pages_in_pl; i++) {
1626 if (!upl_valid_page(pl, i))
1627 break;
1628 }
1629 if (i == pages_in_pl)
1630 break;
1c79356b 1631
d7e50217
A
1632 /*
1633 * didn't get all the pages back that we
1634 * needed... release this upl and try again
1635 */
1636 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1637 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1638 }
d7e50217
A
1639 if (force_data_sync >= 3) {
1640 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1641 i, pages_in_pl, upl_size, kret, 0);
d7e50217
A
1642 /*
1643 * for some reason, we couldn't acquire a hold on all
1644 * the pages needed in the user's address space
1645 *
1646 * we may have already spun some portion of this request
1647 * off as async requests... we need to wait for the I/O
1648 * to complete before returning
1649 */
1650 goto wait_for_writes;
1c79356b 1651 }
0b4e3aa0 1652
d7e50217
A
1653 /*
1654 * Consider the possibility that upl_size wasn't satisfied.
1655 */
1656 if (upl_size != upl_needed_size)
1657 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 1658
d7e50217 1659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
cc9f6e38 1660 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1c79356b 1661
d7e50217
A
1662 if (io_size == 0) {
1663 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1664 UPL_ABORT_FREE_ON_EMPTY);
d7e50217
A
1665 /*
1666 * we may have already spun some portion of this request
1667 * off as async requests... we need to wait for the I/O
1668 * to complete before returning
1669 */
1670 goto wait_for_writes;
1671 }
1672 /*
1673 * Now look for pages already in the cache
1674 * and throw them away.
55e303ae
A
1675 * uio->uio_offset is page aligned within the file
1676 * io_size is a multiple of PAGE_SIZE
d7e50217 1677 */
55e303ae 1678 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1c79356b 1679
d7e50217
A
1680 /*
 1681	 * we want to push out these writes asynchronously so that we can overlap
1682 * the preparation of the next I/O
1683 * if there are already too many outstanding writes
1684 * wait until some complete before issuing the next
1685 */
91447636
A
1686 lck_mtx_lock(cl_mtxp);
1687
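	        /*
	         * don't allow more than 2 full UPLs worth of async write data
	         * (2 * MAX_UPL_TRANSFER * PAGE_SIZE bytes) to be in flight at once...
	         * sleep on io_wanted until enough of the earlier I/O completes
	         */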
d7e50217
A
1688 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1689 iostate.io_wanted = 1;
91447636 1690 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
d7e50217 1691 }
91447636
A
1692 lck_mtx_unlock(cl_mtxp);
1693
d7e50217
A
1694 if (iostate.io_error) {
1695 /*
1696 * one of the earlier writes we issued ran into a hard error
1697 * don't issue any more writes, cleanup the UPL
1698 * that was just created but not used, then
1699 * go wait for all writes that are part of this stream
1700 * to complete before returning the error to the caller
1701 */
1702 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1703 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1704
d7e50217
A
1705 goto wait_for_writes;
1706 }
55e303ae 1707 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1c79356b 1708
d7e50217
A
1709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1710 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1c79356b 1711
d7e50217 1712 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
91447636 1713 io_size, io_flag, (buf_t)NULL, &iostate);
7b1edb79 1714
cc9f6e38 1715 uio_update(uio, (user_size_t)io_size);
1c79356b 1716
d7e50217 1717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
91447636 1718 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1c79356b
A
1719
1720 } /* end while */
1721
d7e50217
A
1722wait_for_writes:
1723 /*
1724 * make sure all async writes issued as part of this stream
1725 * have completed before we return
1726 */
91447636
A
1727 lck_mtx_lock(cl_mtxp);
1728
d7e50217
A
1729 while (iostate.io_issued != iostate.io_completed) {
1730 iostate.io_wanted = 1;
91447636 1731 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
d7e50217 1732 }
91447636
A
1733 lck_mtx_unlock(cl_mtxp);
1734
d7e50217
A
1735 if (iostate.io_error)
1736 error = iostate.io_error;
1c79356b
A
1737
1738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1739 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1740
1741 return (error);
1742}
1743
b4c24cb9 1744
9bccf70c 1745static int
91447636 1746cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
0b4e3aa0 1747{
b4c24cb9 1748 upl_page_info_t *pl;
55e303ae 1749 addr64_t src_paddr;
0b4e3aa0
A
1750 upl_t upl;
1751 vm_offset_t upl_offset;
b4c24cb9 1752 int tail_size;
0b4e3aa0
A
1753 int io_size;
1754 int upl_size;
1755 int upl_needed_size;
1756 int pages_in_pl;
1757 int upl_flags;
1758 kern_return_t kret;
0b4e3aa0 1759 int error = 0;
cc9f6e38 1760 user_addr_t iov_base;
91447636
A
1761 int devblocksize;
1762 struct cl_writebehind *wbp;
0b4e3aa0 1763
91447636 1764 devblocksize = vp->v_mount->mnt_devblocksize;
0b4e3aa0
A
1765 /*
1766 * When we enter this routine, we know
1767 * -- the resid will not exceed iov_len
 1768	 * -- the vector target address is physically contiguous
1769 */
91447636 1770 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
0b4e3aa0 1771
91447636
A
1772 cluster_try_push(wbp, vp, newEOF, 0, 1);
1773
1774 lck_mtx_unlock(&wbp->cl_lockw);
1775 }
1776#if LP64_DEBUG
1777 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1778 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1779 }
1780#endif /* LP64_DEBUG */
1781
1782 // LP64todo - fix this!
cc9f6e38
A
1783 io_size = (int)uio_curriovlen(uio);
1784 iov_base = uio_curriovbase(uio);
1785
91447636 1786 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
0b4e3aa0
A
1787 upl_needed_size = upl_offset + io_size;
1788
1789 pages_in_pl = 0;
1790 upl_size = upl_needed_size;
9bccf70c 1791 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1792 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0 1793
91447636 1794 // LP64todo - fix this!
0b4e3aa0 1795 kret = vm_map_get_upl(current_map(),
cc9f6e38 1796 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
0b4e3aa0
A
1797 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1798
b4c24cb9
A
1799 if (kret != KERN_SUCCESS) {
1800 /*
1801 * cluster_phys_write: failed to get pagelist
1802 * note: return kret here
1803 */
0b4e3aa0 1804 return(EINVAL);
b4c24cb9 1805 }
0b4e3aa0
A
1806 /*
1807 * Consider the possibility that upl_size wasn't satisfied.
1808 * This is a failure in the physical memory case.
1809 */
b4c24cb9 1810 if (upl_size < upl_needed_size) {
91447636 1811 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
b4c24cb9
A
1812 return(EINVAL);
1813 }
1814 pl = ubc_upl_pageinfo(upl);
0b4e3aa0 1815
cc9f6e38 1816 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
0b4e3aa0 1817
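	/*
	 * if the file offset isn't aligned to the device block size,
	 * peel off the unaligned head a piece at a time via
	 * cluster_align_phys_io before issuing the main transfer
	 */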
b4c24cb9
A
1818 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1819 int head_size;
0b4e3aa0 1820
b4c24cb9 1821 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
0b4e3aa0 1822
b4c24cb9
A
1823 if (head_size > io_size)
1824 head_size = io_size;
1825
91447636 1826 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
b4c24cb9
A
1827
1828 if (error) {
1829 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1830
1831 return(EINVAL);
1832 }
1833 upl_offset += head_size;
1834 src_paddr += head_size;
1835 io_size -= head_size;
0b4e3aa0 1836 }
b4c24cb9
A
1837 tail_size = io_size & (devblocksize - 1);
1838 io_size -= tail_size;
1839
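	/*
	 * any partial device block left at the end is split off as tail_size;
	 * the device-block-aligned middle goes out as one synchronous
	 * cluster_io, and the tail is handled by cluster_align_phys_io below
	 */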
1840 if (io_size) {
1841 /*
1842 * issue a synchronous write to cluster_io
1843 */
1844 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
91447636 1845 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
b4c24cb9
A
1846 }
1847 if (error == 0) {
1848 /*
1849 * The cluster_io write completed successfully,
1850 * update the uio structure
1851 */
cc9f6e38
A
1852 uio_update(uio, (user_size_t)io_size);
1853
1854 src_paddr += io_size;
b4c24cb9
A
1855
1856 if (tail_size)
91447636 1857 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
b4c24cb9
A
1858 }
1859 /*
1860 * just release our hold on the physically contiguous
1861 * region without changing any state
1862 */
1863 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
1864
1865 return (error);
1866}
1867
b4c24cb9 1868
9bccf70c 1869static int
91447636 1870cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1c79356b
A
1871{
1872 upl_page_info_t *pl;
1873 upl_t upl;
91447636 1874 vm_offset_t upl_offset = 0;
1c79356b
A
1875 int upl_size;
1876 off_t upl_f_offset;
1877 int pages_in_upl;
1878 int start_offset;
1879 int xfer_resid;
1880 int io_size;
1c79356b
A
1881 int io_offset;
1882 int bytes_to_zero;
1883 int bytes_to_move;
1884 kern_return_t kret;
1885 int retval = 0;
91447636 1886 int io_resid;
1c79356b
A
1887 long long total_size;
1888 long long zero_cnt;
1889 off_t zero_off;
1890 long long zero_cnt1;
1891 off_t zero_off1;
91447636 1892 struct cl_extent cl;
55e303ae 1893 int intersection;
91447636 1894 struct cl_writebehind *wbp;
55e303ae 1895
91447636
A
1896 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1897 {
1898 if (wbp->cl_hasbeenpaged) {
1899 /*
1900 * this vnode had pages cleaned to it by
1901 * the pager which indicates that either
1902 * it's not very 'hot', or the system is
1903 * being overwhelmed by a lot of dirty
1904 * data being delayed in the VM cache...
1905 * in either event, we'll push our remaining
1906 * delayed data at this point... this will
1907 * be more efficient than paging out 1 page at
1908 * a time, and will also act as a throttle
1909 * by delaying this client from writing any
1910 * more data until all his delayed data has
 1911	 * at least been queued to the underlying driver.
1912 */
1913 if (wbp->cl_number || wbp->cl_scmap)
1914 cluster_push_EOF(vp, newEOF);
1c79356b 1915
91447636
A
1916 wbp->cl_hasbeenpaged = 0;
1917 }
1918 }
1c79356b
A
1919 if (uio) {
1920 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
91447636 1921 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1c79356b 1922
91447636
A
1923 // LP64todo - fix this
1924 io_resid = uio_resid(uio);
1c79356b
A
1925 } else {
1926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1927 0, 0, (int)oldEOF, (int)newEOF, 0);
1928
91447636 1929 io_resid = 0;
1c79356b
A
1930 }
1931 zero_cnt = 0;
1932 zero_cnt1 = 0;
91447636
A
1933 zero_off = 0;
1934 zero_off1 = 0;
1c79356b
A
1935
1936 if (flags & IO_HEADZEROFILL) {
1937 /*
1938 * some filesystems (HFS is one) don't support unallocated holes within a file...
1939 * so we zero fill the intervening space between the old EOF and the offset
1940 * where the next chunk of real data begins.... ftruncate will also use this
1941 * routine to zero fill to the new EOF when growing a file... in this case, the
1942 * uio structure will not be provided
1943 */
1944 if (uio) {
1945 if (headOff < uio->uio_offset) {
1946 zero_cnt = uio->uio_offset - headOff;
1947 zero_off = headOff;
1948 }
1949 } else if (headOff < newEOF) {
1950 zero_cnt = newEOF - headOff;
1951 zero_off = headOff;
1952 }
1953 }
1954 if (flags & IO_TAILZEROFILL) {
1955 if (uio) {
91447636
A
1956 // LP64todo - fix this
1957 zero_off1 = uio->uio_offset + uio_resid(uio);
1c79356b
A
1958
1959 if (zero_off1 < tailOff)
1960 zero_cnt1 = tailOff - zero_off1;
1961 }
1962 }
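	/*
	 * at this point zero_off/zero_cnt describe the head region to be
	 * zero filled (if any) and zero_off1/zero_cnt1 describe the tail
	 * region... the main loop below interleaves these with the copy
	 * of the caller's data
	 */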
55e303ae 1963 if (zero_cnt == 0 && uio == (struct uio *) 0) {
91447636
A
1964 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1965 retval, 0, 0, 0, 0);
1966 return (0);
55e303ae 1967 }
1c79356b 1968
91447636 1969 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1c79356b
A
1970 /*
1971 * for this iteration of the loop, figure out where our starting point is
1972 */
1973 if (zero_cnt) {
1974 start_offset = (int)(zero_off & PAGE_MASK_64);
1975 upl_f_offset = zero_off - start_offset;
91447636 1976 } else if (io_resid) {
1c79356b
A
1977 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1978 upl_f_offset = uio->uio_offset - start_offset;
1979 } else {
1980 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1981 upl_f_offset = zero_off1 - start_offset;
1982 }
1983 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1984 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1985
0b4e3aa0
A
1986 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1987 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1988
91447636 1989 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
55e303ae 1990
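	        /*
	         * for ordinary cached writes (no NOCACHE/SYNC/zero-fill), first try
	         * to copy directly into pages already resident in the ubc via
	         * cluster_copy_ubc_data... only the portion that misses needs a upl
	         */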
91447636 1991 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
55e303ae 1992 /*
91447636 1993 * assumption... total_size <= io_resid
55e303ae
A
1994 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1995 */
1996 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1997 total_size -= start_offset;
1998 xfer_resid = total_size;
1999
2000 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2001
2002 if (retval)
2003 break;
2004
91447636 2005 io_resid -= (total_size - xfer_resid);
55e303ae
A
2006 total_size = xfer_resid;
2007 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2008 upl_f_offset = uio->uio_offset - start_offset;
2009
2010 if (total_size == 0) {
2011 if (start_offset) {
2012 /*
2013 * the write did not finish on a page boundary
2014 * which will leave upl_f_offset pointing to the
2015 * beginning of the last page written instead of
2016 * the page beyond it... bump it in this case
2017 * so that the cluster code records the last page
2018 * written as dirty
2019 */
2020 upl_f_offset += PAGE_SIZE_64;
2021 }
2022 upl_size = 0;
2023
2024 goto check_cluster;
2025 }
2026 }
1c79356b
A
2027 /*
2028 * compute the size of the upl needed to encompass
2029 * the requested write... limit each call to cluster_io
0b4e3aa0
A
2030 * to the maximum UPL size... cluster_io will clip if
2031 * this exceeds the maximum io_size for the device,
2032 * make sure to account for
1c79356b
A
2033 * a starting offset that's not page aligned
2034 */
2035 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2036
0b4e3aa0
A
2037 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2038 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
2039
2040 pages_in_upl = upl_size / PAGE_SIZE;
2041 io_size = upl_size - start_offset;
2042
2043 if ((long long)io_size > total_size)
2044 io_size = total_size;
2045
55e303ae
A
2046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2047
1c79356b 2048
91447636
A
2049 /*
2050 * Gather the pages from the buffer cache.
2051 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2052 * that we intend to modify these pages.
2053 */
0b4e3aa0 2054 kret = ubc_create_upl(vp,
91447636
A
2055 upl_f_offset,
2056 upl_size,
2057 &upl,
2058 &pl,
2059 UPL_SET_LITE | UPL_WILL_MODIFY);
1c79356b
A
2060 if (kret != KERN_SUCCESS)
2061 panic("cluster_write: failed to get pagelist");
2062
55e303ae
A
2063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2064 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1c79356b
A
2065
2066 if (start_offset && !upl_valid_page(pl, 0)) {
0b4e3aa0 2067 int read_size;
1c79356b 2068
0b4e3aa0 2069 /*
1c79356b
A
2070 * we're starting in the middle of the first page of the upl
2071 * and the page isn't currently valid, so we're going to have
2072 * to read it in first... this is a synchronous operation
2073 */
2074 read_size = PAGE_SIZE;
2075
9bccf70c 2076 if ((upl_f_offset + read_size) > newEOF)
1c79356b 2077 read_size = newEOF - upl_f_offset;
9bccf70c 2078
91447636
A
2079 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2080 CL_READ, (buf_t)NULL, (struct clios *)NULL);
1c79356b 2081 if (retval) {
0b4e3aa0 2082 /*
1c79356b
A
 2083	 * we had an error during the read which caused us to abort
 2084	 * the current cluster_write request... before we do, we need
 2085	 * to release the rest of the pages in the upl without modifying
 2086	 * their state and mark the failed page in error
2087 */
0b4e3aa0 2088 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
91447636
A
2089
2090 if (upl_size > PAGE_SIZE)
2091 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2092
2093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 2094 (int)upl, 0, 0, retval, 0);
1c79356b
A
2095 break;
2096 }
2097 }
2098 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2099 /*
2100 * the last offset we're writing to in this upl does not end on a page
2101 * boundary... if it's not beyond the old EOF, then we'll also need to
2102 * pre-read this page in if it isn't already valid
2103 */
2104 upl_offset = upl_size - PAGE_SIZE;
2105
2106 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2107 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2108 int read_size;
2109
2110 read_size = PAGE_SIZE;
2111
9bccf70c 2112 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1c79356b 2113 read_size = newEOF - (upl_f_offset + upl_offset);
9bccf70c 2114
91447636
A
2115 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2116 CL_READ, (buf_t)NULL, (struct clios *)NULL);
1c79356b 2117 if (retval) {
0b4e3aa0 2118 /*
1c79356b 2119	 * we had an error during the read which caused us to abort
0b4e3aa0
A
2120 * the current cluster_write request... before we do, we
2121 * need to release the rest of the pages in the upl without
 2122	 * modifying their state and mark the failed page in error
1c79356b 2123 */
9bccf70c 2124 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
91447636
A
2125
2126 if (upl_size > PAGE_SIZE)
2127 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2128
2129 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 2130 (int)upl, 0, 0, retval, 0);
1c79356b
A
2131 break;
2132 }
2133 }
2134 }
1c79356b
A
2135 xfer_resid = io_size;
2136 io_offset = start_offset;
2137
2138 while (zero_cnt && xfer_resid) {
2139
2140 if (zero_cnt < (long long)xfer_resid)
2141 bytes_to_zero = zero_cnt;
2142 else
2143 bytes_to_zero = xfer_resid;
2144
9bccf70c 2145 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 2146 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 2147 } else {
9bccf70c
A
2148 int zero_pg_index;
2149
1c79356b 2150 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
9bccf70c
A
2151 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2152
2153 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 2154 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 2155
9bccf70c
A
2156 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2157 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 2158 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
2159 }
2160 }
2161 xfer_resid -= bytes_to_zero;
2162 zero_cnt -= bytes_to_zero;
2163 zero_off += bytes_to_zero;
2164 io_offset += bytes_to_zero;
2165 }
91447636
A
2166 if (xfer_resid && io_resid) {
2167 bytes_to_move = min(io_resid, xfer_resid);
1c79356b 2168
55e303ae 2169 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
9bccf70c 2170
1c79356b 2171 if (retval) {
9bccf70c
A
2172
2173 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2174
2175 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 2176 (int)upl, 0, 0, retval, 0);
1c79356b 2177 } else {
91447636 2178 io_resid -= bytes_to_move;
1c79356b
A
2179 xfer_resid -= bytes_to_move;
2180 io_offset += bytes_to_move;
2181 }
2182 }
2183 while (xfer_resid && zero_cnt1 && retval == 0) {
2184
2185 if (zero_cnt1 < (long long)xfer_resid)
2186 bytes_to_zero = zero_cnt1;
2187 else
2188 bytes_to_zero = xfer_resid;
2189
9bccf70c 2190 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 2191 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 2192 } else {
9bccf70c
A
2193 int zero_pg_index;
2194
1c79356b 2195 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
9bccf70c
A
2196 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2197
2198 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 2199 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
9bccf70c
A
2200 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2201 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 2202 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
2203 }
2204 }
2205 xfer_resid -= bytes_to_zero;
2206 zero_cnt1 -= bytes_to_zero;
2207 zero_off1 += bytes_to_zero;
2208 io_offset += bytes_to_zero;
2209 }
2210
2211 if (retval == 0) {
9bccf70c 2212 int cl_index;
1c79356b
A
2213 int can_delay;
2214
2215 io_size += start_offset;
2216
9bccf70c 2217 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1c79356b
A
2218 /*
2219 * if we're extending the file with this write
2220 * we'll zero fill the rest of the page so that
2221 * if the file gets extended again in such a way as to leave a
 2222	 * hole starting at this EOF, we'll have zeros in the correct spot
2223 */
55e303ae 2224 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1c79356b 2225 }
9bccf70c
A
2226 if (flags & IO_SYNC)
2227 /*
 2228	 * if the IO_SYNC flag is set then we need to
2229 * bypass any clusters and immediately issue
2230 * the I/O
2231 */
2232 goto issue_io;
55e303ae 2233check_cluster:
91447636
A
2234 /*
2235 * take the lock to protect our accesses
2236 * of the writebehind and sparse cluster state
2237 */
2238 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2239
55e303ae
A
2240 /*
2241 * calculate the last logical block number
2242 * that this delayed I/O encompassed
2243 */
91447636 2244 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
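					/*
					 * note that since upl_f_offset is page aligned and upl_size is
					 * page rounded, e_addr works out to the first page beyond this
					 * write... [b_addr, e_addr) bounds the pages being delayed
					 */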
55e303ae 2245
91447636 2246 if (wbp->cl_scmap) {
55e303ae 2247
91447636 2248 if ( !(flags & IO_NOCACHE)) {
55e303ae
A
2249 /*
2250 * we've fallen into the sparse
2251 * cluster method of delaying dirty pages
2252 * first, we need to release the upl if we hold one
2253 * since pages in it may be present in the sparse cluster map
2254 * and may span 2 separate buckets there... if they do and
2255 * we happen to have to flush a bucket to make room and it intersects
2256 * this upl, a deadlock may result on page BUSY
2257 */
2258 if (upl_size)
2259 ubc_upl_commit_range(upl, 0, upl_size,
2260 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2261
91447636
A
2262 sparse_cluster_add(wbp, vp, &cl, newEOF);
2263
2264 lck_mtx_unlock(&wbp->cl_lockw);
55e303ae
A
2265
2266 continue;
2267 }
2268 /*
2269 * must have done cached writes that fell into
2270 * the sparse cluster mechanism... we've switched
2271 * to uncached writes on the file, so go ahead
2272 * and push whatever's in the sparse map
2273 * and switch back to normal clustering
2274 *
2275 * see the comment above concerning a possible deadlock...
2276 */
2277 if (upl_size) {
2278 ubc_upl_commit_range(upl, 0, upl_size,
2279 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2280 /*
2281 * setting upl_size to 0 keeps us from committing a
2282 * second time in the start_new_cluster path
2283 */
2284 upl_size = 0;
2285 }
91447636 2286 sparse_cluster_push(wbp, vp, newEOF, 1);
55e303ae 2287
91447636 2288 wbp->cl_number = 0;
55e303ae
A
2289 /*
2290 * no clusters of either type present at this point
2291 * so just go directly to start_new_cluster since
2292 * we know we need to delay this I/O since we've
2293 * already released the pages back into the cache
2294 * to avoid the deadlock with sparse_cluster_push
2295 */
2296 goto start_new_cluster;
2297 }
2298 upl_offset = 0;
1c79356b 2299
91447636 2300 if (wbp->cl_number == 0)
9bccf70c
A
2301 /*
2302 * no clusters currently present
2303 */
2304 goto start_new_cluster;
1c79356b 2305
91447636 2306 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
1c79356b 2307 /*
55e303ae
A
2308 * check each cluster that we currently hold
2309 * try to merge some or all of this write into
2310 * one or more of the existing clusters... if
2311 * any portion of the write remains, start a
2312 * new cluster
1c79356b 2313 */
91447636 2314 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
9bccf70c
A
2315 /*
2316 * the current write starts at or after the current cluster
2317 */
91447636 2318 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
1c79356b
A
2319 /*
2320 * we have a write that fits entirely
2321 * within the existing cluster limits
2322 */
91447636 2323 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
1c79356b 2324 /*
9bccf70c 2325 * update our idea of where the cluster ends
1c79356b 2326 */
91447636 2327 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
9bccf70c 2328 break;
1c79356b 2329 }
91447636 2330 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
1c79356b
A
2331 /*
2332 * we have a write that starts in the middle of the current cluster
55e303ae
A
2333 * but extends beyond the cluster's limit... we know this because
2334 * of the previous checks
2335 * we'll extend the current cluster to the max
91447636 2336 * and update the b_addr for the current write to reflect that
55e303ae
A
2337 * the head of it was absorbed into this cluster...
2338 * note that we'll always have a leftover tail in this case since
 2339	 * full absorption would have occurred in the clause above
1c79356b 2340 */
91447636 2341 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
55e303ae
A
2342
2343 if (upl_size) {
91447636 2344 daddr64_t start_pg_in_upl;
55e303ae 2345
91447636 2346 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
55e303ae 2347
91447636
A
2348 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2349 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
55e303ae
A
2350
2351 ubc_upl_commit_range(upl, upl_offset, intersection,
2352 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2353 upl_f_offset += intersection;
2354 upl_offset += intersection;
2355 upl_size -= intersection;
2356 }
2357 }
91447636 2358 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
1c79356b
A
2359 }
2360 /*
55e303ae
A
2361 * we come here for the case where the current write starts
2362 * beyond the limit of the existing cluster or we have a leftover
 2363	 * tail after a partial absorption
9bccf70c
A
2364 *
2365 * in either case, we'll check the remaining clusters before
2366 * starting a new one
1c79356b 2367 */
9bccf70c 2368 } else {
1c79356b 2369 /*
55e303ae 2370 * the current write starts in front of the cluster we're currently considering
1c79356b 2371 */
91447636 2372 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
1c79356b 2373 /*
55e303ae
A
2374 * we can just merge the new request into
2375 * this cluster and leave it in the cache
2376 * since the resulting cluster is still
2377 * less than the maximum allowable size
1c79356b 2378 */
91447636 2379 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
1c79356b 2380
91447636 2381 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
9bccf70c
A
2382 /*
2383 * the current write completely
55e303ae
A
2384 * envelops the existing cluster and since
2385 * each write is limited to at most MAX_UPL_TRANSFER bytes
2386 * we can just use the start and last blocknos of the write
2387 * to generate the cluster limits
9bccf70c 2388 */
91447636 2389 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
9bccf70c
A
2390 }
2391 break;
1c79356b 2392 }
9bccf70c 2393
1c79356b 2394 /*
9bccf70c
A
2395 * if we were to combine this write with the current cluster
2396 * we would exceed the cluster size limit.... so,
2397 * let's see if there's any overlap of the new I/O with
55e303ae
A
2398 * the cluster we're currently considering... in fact, we'll
 2399	 * stretch the cluster out to its full limit and see if we
2400 * get an intersection with the current write
9bccf70c 2401 *
1c79356b 2402 */
91447636 2403 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
1c79356b 2404 /*
55e303ae
A
2405 * the current write extends into the proposed cluster
 2406	 * clip the length of the current write after first combining its
2407 * tail with the newly shaped cluster
1c79356b 2408 */
91447636 2409 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
55e303ae
A
2410
2411 if (upl_size) {
91447636 2412 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
55e303ae
A
2413
2414 if (intersection > upl_size)
2415 /*
2416 * because the current write may consist of a number of pages found in the cache
2417 * which are not part of the UPL, we may have an intersection that exceeds
2418 * the size of the UPL that is also part of this write
2419 */
2420 intersection = upl_size;
2421
2422 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2423 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2424 upl_size -= intersection;
2425 }
91447636 2426 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
55e303ae 2427 }
9bccf70c
A
2428 /*
2429 * if we get here, there was no way to merge
55e303ae
A
2430 * any portion of this write with this cluster
2431 * or we could only merge part of it which
2432 * will leave a tail...
9bccf70c
A
2433 * we'll check the remaining clusters before starting a new one
2434 */
1c79356b 2435 }
9bccf70c 2436 }
91447636 2437 if (cl_index < wbp->cl_number)
9bccf70c 2438 /*
55e303ae
A
2439 * we found an existing cluster(s) that we
2440 * could entirely merge this I/O into
9bccf70c
A
2441 */
2442 goto delay_io;
2443
91447636 2444 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
9bccf70c
A
2445 /*
2446 * we didn't find an existing cluster to
2447 * merge into, but there's room to start
1c79356b
A
2448 * a new one
2449 */
9bccf70c 2450 goto start_new_cluster;
1c79356b 2451
9bccf70c
A
2452 /*
 2453	 * no existing cluster to merge with and no
2454 * room to start a new one... we'll try
55e303ae
A
2455 * pushing one of the existing ones... if none of
2456 * them are able to be pushed, we'll switch
2457 * to the sparse cluster mechanism
91447636 2458 * cluster_try_push updates cl_number to the
55e303ae
A
2459 * number of remaining clusters... and
2460 * returns the number of currently unused clusters
9bccf70c 2461 */
91447636
A
2462 int ret_cluster_try_push = 0;
2463 /* if writes are not deferred, call cluster push immediately */
2464 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2465 if (flags & IO_NOCACHE)
2466 can_delay = 0;
2467 else
2468 can_delay = 1;
2469
2470 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2471 }
9bccf70c 2472
91447636
A
2473 /* execute following regardless writes are deferred or not */
2474 if (ret_cluster_try_push == 0) {
55e303ae
A
2475 /*
2476 * no more room in the normal cluster mechanism
2477 * so let's switch to the more expansive but expensive
2478 * sparse mechanism....
2479 * first, we need to release the upl if we hold one
2480 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2481 * and may span 2 separate buckets there... if they do and
2482 * we happen to have to flush a bucket to make room and it intersects
2483 * this upl, a deadlock may result on page BUSY
2484 */
2485 if (upl_size)
2486 ubc_upl_commit_range(upl, upl_offset, upl_size,
2487 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2488
91447636
A
2489 sparse_cluster_switch(wbp, vp, newEOF);
2490 sparse_cluster_add(wbp, vp, &cl, newEOF);
2491
2492 lck_mtx_unlock(&wbp->cl_lockw);
55e303ae
A
2493
2494 continue;
9bccf70c 2495 }
55e303ae
A
2496 /*
2497 * we pushed one cluster successfully, so we must be sequentially writing this file
2498 * otherwise, we would have failed and fallen into the sparse cluster support
2499 * so let's take the opportunity to push out additional clusters as long as we
2500 * remain below the throttle... this will give us better I/O locality if we're
 2501	 * in a copy loop (i.e. we won't jump back and forth between the read and write points)
2502 * however, we don't want to push so much out that the write throttle kicks in and
2503 * hangs this thread up until some of the I/O completes...
2504 */
91447636
A
2505 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2506 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2507 cluster_try_push(wbp, vp, newEOF, 0, 0);
2508 }
55e303ae 2509
9bccf70c 2510start_new_cluster:
91447636
A
2511 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2512 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
9bccf70c 2513
91447636
A
2514 if (flags & IO_NOCACHE)
2515 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2516 else
2517 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2518 wbp->cl_number++;
55e303ae
A
2519delay_io:
2520 if (upl_size)
2521 ubc_upl_commit_range(upl, upl_offset, upl_size,
2522 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
91447636
A
2523
2524 lck_mtx_unlock(&wbp->cl_lockw);
2525
9bccf70c
A
2526 continue;
2527issue_io:
2528 /*
91447636
A
2529 * we don't hold the vnode lock at this point
2530 *
 2531	 * because we had to ask for a UPL that provides currently non-present pages, the
2532 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2533 * upon committing it... this is not the behavior we want since it's possible for
2534 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
9bccf70c 2535 * in order to maintain some semblance of coherency with mapped writes
91447636
A
2536 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2537 * so that we correctly deal with a change in state of the hardware modify bit...
2538 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2539 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2540 * responsible for generating the correct sized I/O(s)
9bccf70c 2541 */
91447636
A
2542 ubc_upl_commit_range(upl, 0, upl_size,
2543 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b 2544
91447636 2545 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
1c79356b 2546
91447636 2547 retval = cluster_push_x(vp, &cl, newEOF, flags);
1c79356b
A
2548 }
2549 }
2550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
91447636 2551 retval, 0, io_resid, 0, 0);
1c79356b
A
2552
2553 return (retval);
2554}
2555
9bccf70c 2556int
91447636 2557cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
1c79356b 2558{
1c79356b 2559 int prev_resid;
91447636 2560 u_int clip_size;
1c79356b 2561 off_t max_io_size;
0b4e3aa0 2562 int upl_size;
0b4e3aa0
A
2563 int upl_flags;
2564 upl_t upl;
1c79356b 2565 int retval = 0;
91447636 2566 int flags;
1c79356b 2567
91447636 2568 flags = xflags;
1c79356b 2569
91447636
A
2570 if (vp->v_flag & VNOCACHE_DATA)
2571 flags |= IO_NOCACHE;
2572 if (vp->v_flag & VRAOFF)
2573 flags |= IO_RAOFF;
2574
2575 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2576 /*
2577 * go do a read through the cache if one of the following is true....
2578 * NOCACHE is not true
2579 * the uio request doesn't target USERSPACE
0b4e3aa0 2580 */
91447636
A
2581 return (cluster_read_x(vp, uio, filesize, flags));
2582 }
2583
2584#if LP64_DEBUG
2585 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2586 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2587 }
2588#endif /* LP64_DEBUG */
2589
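	/*
	 * uncached reads into user space are handled a vector at a time:
	 * physically contiguous targets go to cluster_phys_read, sub-page
	 * or misaligned requests fall back to the cached path, and
	 * page-aligned requests are sent through cluster_nocopy_read
	 */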
2590 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
cc9f6e38
A
2591 user_size_t iov_len;
2592 user_addr_t iov_base;
91447636 2593
0b4e3aa0 2594 /*
91447636
A
2595 * we know we have a resid, so this is safe
 2596	 * skip over any empty vectors
1c79356b 2597 */
cc9f6e38
A
2598 uio_update(uio, (user_size_t)0);
2599
2600 iov_len = uio_curriovlen(uio);
2601 iov_base = uio_curriovbase(uio);
91447636 2602
91447636
A
2603 upl_size = PAGE_SIZE;
2604 upl_flags = UPL_QUERY_OBJECT_TYPE;
2605
2606 // LP64todo - fix this!
2607 if ((vm_map_get_upl(current_map(),
cc9f6e38 2608 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
91447636
A
2609 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2610 /*
2611 * the user app must have passed in an invalid address
2612 */
2613 return (EFAULT);
2614 }
2615
2616 /*
2617 * We check every vector target but if it is physically
2618 * contiguous space, we skip the sanity checks.
1c79356b 2619 */
91447636
A
2620 if (upl_flags & UPL_PHYS_CONTIG) {
2621 retval = cluster_phys_read(vp, uio, filesize);
2622 }
2623 else if (uio_resid(uio) < PAGE_SIZE) {
2624 /*
 2625	 * we're here because we don't have a physically contiguous target buffer
2626 * go do a read through the cache if
2627 * the total xfer size is less than a page...
2628 */
2629 return (cluster_read_x(vp, uio, filesize, flags));
2630 }
2631 // LP64todo - fix this!
2632 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2633 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2634 /*
2635 * Bring the file offset read up to a pagesize boundary
2636 * this will also bring the base address to a page boundary
2637 * since they both are currently on the same offset within a page
2638 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2639 * so the computed clip_size must always be less than the current uio_resid
2640 */
2641 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2642
2643 /*
2644 * Fake the resid going into the cluster_read_x call
2645 * and restore it on the way out.
2646 */
2647 prev_resid = uio_resid(uio);
2648 // LP64todo - fix this
2649 uio_setresid(uio, clip_size);
2650
2651 retval = cluster_read_x(vp, uio, filesize, flags);
2652
2653 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2654 } else {
2655 /*
2656 * can't get both the file offset and the buffer offset aligned to a page boundary
2657 * so fire an I/O through the cache for this entire vector
2658 */
2659 // LP64todo - fix this!
2660 clip_size = iov_len;
2661 prev_resid = uio_resid(uio);
2662 uio_setresid(uio, clip_size);
2663
2664 retval = cluster_read_x(vp, uio, filesize, flags);
2665
2666 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2667 }
2668 } else {
2669 /*
2670 * If we come in here, we know the offset into
2671 * the file is on a pagesize boundary
2672 */
2673 max_io_size = filesize - uio->uio_offset;
2674 // LP64todo - fix this
2675 clip_size = uio_resid(uio);
2676 if (iov_len < clip_size)
2677 clip_size = iov_len;
2678 if (max_io_size < clip_size)
2679 clip_size = (int)max_io_size;
2680
2681 if (clip_size < PAGE_SIZE) {
2682 /*
2683 * Take care of the tail end of the read in this vector.
2684 */
2685 // LP64todo - fix this
2686 prev_resid = uio_resid(uio);
2687 uio_setresid(uio, clip_size);
1c79356b 2688
91447636
A
2689 retval = cluster_read_x(vp, uio, filesize, flags);
2690
2691 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2692 } else {
2693 /* round clip_size down to a multiple of pagesize */
2694 clip_size = clip_size & ~(PAGE_MASK);
2695 // LP64todo - fix this
2696 prev_resid = uio_resid(uio);
2697 uio_setresid(uio, clip_size);
2698
2699 retval = cluster_nocopy_read(vp, uio, filesize);
2700
2701 if ((retval==0) && uio_resid(uio))
2702 retval = cluster_read_x(vp, uio, filesize, flags);
2703
2704 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2705 }
2706 } /* end else */
2707 } /* end while */
1c79356b 2708
1c79356b
A
2709 return(retval);
2710}
2711
9bccf70c 2712static int
91447636 2713cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
1c79356b
A
2714{
2715 upl_page_info_t *pl;
2716 upl_t upl;
2717 vm_offset_t upl_offset;
2718 int upl_size;
2719 off_t upl_f_offset;
2720 int start_offset;
2721 int start_pg;
2722 int last_pg;
91447636 2723 int uio_last = 0;
1c79356b
A
2724 int pages_in_upl;
2725 off_t max_size;
55e303ae
A
2726 off_t last_ioread_offset;
2727 off_t last_request_offset;
2728 u_int size_of_prefetch;
91447636 2729 u_int io_size;
1c79356b 2730 kern_return_t kret;
1c79356b
A
2731 int error = 0;
2732 int retval = 0;
55e303ae
A
2733 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2734 u_int rd_ahead_enabled = 1;
2735 u_int prefetch_enabled = 1;
91447636
A
2736 struct cl_readahead * rap;
2737 struct clios iostate;
2738 struct cl_extent extent;
55e303ae
A
2739
2740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
91447636
A
2741 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2742
2743 // LP64todo - fix this
2744 last_request_offset = uio->uio_offset + uio_resid(uio);
55e303ae 2745
91447636
A
2746 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2747 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
55e303ae 2748 rd_ahead_enabled = 0;
91447636
A
2749 rap = NULL;
2750 } else {
2751 if (cluster_hard_throttle_on(vp)) {
2752 rd_ahead_enabled = 0;
2753 prefetch_enabled = 0;
55e303ae 2754
91447636
A
2755 max_rd_size = HARD_THROTTLE_MAXSIZE;
2756 }
2757 if ((rap = cluster_get_rap(vp)) == NULL)
2758 rd_ahead_enabled = 0;
55e303ae 2759 }
55e303ae
A
2760 if (last_request_offset > filesize)
2761 last_request_offset = filesize;
91447636
A
2762 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2763 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
55e303ae 2764
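	/*
	 * extent describes the request in page units... b_addr is the page
	 * containing the first byte requested, e_addr the page containing
	 * the last
	 */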
91447636 2765 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
55e303ae
A
2766 /*
2767 * determine if we already have a read-ahead in the pipe courtesy of the
 2768	 * last read system call that was issued...
 2769	 * if so, pick up its extent to determine where we should start
 2770	 * with respect to any read-ahead that might be necessary to
 2771	 * garner all the data needed to complete this read system call
2772 */
91447636 2773 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
1c79356b 2774
55e303ae
A
2775 if (last_ioread_offset < uio->uio_offset)
2776 last_ioread_offset = (off_t)0;
2777 else if (last_ioread_offset > last_request_offset)
2778 last_ioread_offset = last_request_offset;
2779 } else
2780 last_ioread_offset = (off_t)0;
1c79356b 2781
91447636 2782 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
1c79356b
A
2783 /*
2784 * compute the size of the upl needed to encompass
2785 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2786 * to the maximum UPL size... cluster_io will clip if
2787 * this exceeds the maximum io_size for the device,
2788 * make sure to account for
1c79356b
A
2789 * a starting offset that's not page aligned
2790 */
2791 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2792 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2793 max_size = filesize - uio->uio_offset;
2794
91447636
A
2795 // LP64todo - fix this!
2796 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2797 io_size = uio_resid(uio);
1c79356b
A
2798 else
2799 io_size = max_size;
9bccf70c 2800
91447636 2801 if (!(flags & IO_NOCACHE)) {
1c79356b 2802
55e303ae
A
2803 while (io_size) {
2804 u_int io_resid;
2805 u_int io_requested;
1c79356b 2806
55e303ae
A
2807 /*
2808 * if we keep finding the pages we need already in the cache, then
2809 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2810 * to determine that we have all the pages we need... once we miss in
 2811	 * the cache and have issued an I/O, then we'll assume that we're likely
2812 * to continue to miss in the cache and it's to our advantage to try and prefetch
2813 */
2814 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2815 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2816 /*
2817 * we've already issued I/O for this request and
2818 * there's still work to do and
2819 * our prefetch stream is running dry, so issue a
2820 * pre-fetch I/O... the I/O latency will overlap
2821 * with the copying of the data
2822 */
2823 if (size_of_prefetch > max_rd_size)
2824 size_of_prefetch = max_rd_size;
1c79356b 2825
91447636 2826 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
1c79356b 2827
55e303ae
A
2828 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2829
2830 if (last_ioread_offset > last_request_offset)
2831 last_ioread_offset = last_request_offset;
2832 }
2833 }
2834 /*
2835 * limit the size of the copy we're about to do so that
2836 * we can notice that our I/O pipe is running dry and
2837 * get the next I/O issued before it does go dry
2838 */
2839 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2840 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2841 else
2842 io_resid = io_size;
1c79356b 2843
55e303ae 2844 io_requested = io_resid;
1c79356b 2845
55e303ae 2846 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1c79356b 2847
55e303ae 2848 io_size -= (io_requested - io_resid);
1c79356b 2849
55e303ae
A
2850 if (retval || io_resid)
2851 /*
2852 * if we run into a real error or
2853 * a page that is not in the cache
2854 * we need to leave streaming mode
2855 */
2856 break;
2857
2858 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2859 /*
 2860	 * we've already finished the I/O for this read request
2861 * let's see if we should do a read-ahead
2862 */
91447636 2863 cluster_rd_ahead(vp, &extent, filesize, rap);
55e303ae 2864 }
1c79356b 2865 }
1c79356b
A
2866 if (retval)
2867 break;
1c79356b 2868 if (io_size == 0) {
91447636
A
2869 if (rap != NULL) {
2870 if (extent.e_addr < rap->cl_lastr)
2871 rap->cl_maxra = 0;
2872 rap->cl_lastr = extent.e_addr;
2873 }
1c79356b
A
2874 break;
2875 }
55e303ae
A
2876 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2877 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2878 max_size = filesize - uio->uio_offset;
1c79356b 2879 }
55e303ae
A
2880 if (io_size > max_rd_size)
2881 io_size = max_rd_size;
2882
1c79356b 2883 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
55e303ae
A
2884
2885 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2886 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
1c79356b
A
2887 pages_in_upl = upl_size / PAGE_SIZE;
2888
2889 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
9bccf70c 2890 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 2891
0b4e3aa0 2892 kret = ubc_create_upl(vp,
91447636
A
2893 upl_f_offset,
2894 upl_size,
2895 &upl,
2896 &pl,
2897 UPL_SET_LITE);
1c79356b
A
2898 if (kret != KERN_SUCCESS)
2899 panic("cluster_read: failed to get pagelist");
2900
1c79356b 2901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
9bccf70c 2902 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
2903
2904 /*
2905 * scan from the beginning of the upl looking for the first
2906 * non-valid page.... this will become the first page in
2907 * the request we're going to make to 'cluster_io'... if all
2908 * of the pages are valid, we won't call through to 'cluster_io'
2909 */
2910 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2911 if (!upl_valid_page(pl, start_pg))
2912 break;
2913 }
2914
2915 /*
2916 * scan from the starting invalid page looking for a valid
2917 * page before the end of the upl is reached, if we
2918 * find one, then it will be the last page of the request to
2919 * 'cluster_io'
2920 */
2921 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2922 if (upl_valid_page(pl, last_pg))
2923 break;
2924 }
55e303ae
A
2925 iostate.io_completed = 0;
2926 iostate.io_issued = 0;
2927 iostate.io_error = 0;
2928 iostate.io_wanted = 0;
1c79356b
A
2929
2930 if (start_pg < last_pg) {
2931 /*
2932 * we found a range of 'invalid' pages that must be filled
2933 * if the last page in this range is the last page of the file
2934 * we may have to clip the size of it to keep from reading past
2935 * the end of the last physical block associated with the file
2936 */
2937 upl_offset = start_pg * PAGE_SIZE;
2938 io_size = (last_pg - start_pg) * PAGE_SIZE;
2939
9bccf70c 2940 if ((upl_f_offset + upl_offset + io_size) > filesize)
1c79356b 2941 io_size = filesize - (upl_f_offset + upl_offset);
9bccf70c 2942
1c79356b 2943 /*
55e303ae 2944 * issue an asynchronous read to cluster_io
1c79356b
A
2945 */
2946
2947 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
91447636 2948 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
1c79356b
A
2949 }
2950 if (error == 0) {
2951 /*
2952 * if the read completed successfully, or there was no I/O request
55e303ae
A
 2953	 * issued, then copy the data into user land via 'cluster_copy_upl_data'
2954 * we'll first add on any 'valid'
1c79356b
A
2955 * pages that were present in the upl when we acquired it.
2956 */
2957 u_int val_size;
1c79356b
A
2958
2959 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2960 if (!upl_valid_page(pl, uio_last))
2961 break;
2962 }
2963 /*
2964 * compute size to transfer this round, if uio->uio_resid is
55e303ae 2965 * still non-zero after this attempt, we'll loop around and
1c79356b
A
2966 * set up for another I/O.
2967 */
2968 val_size = (uio_last * PAGE_SIZE) - start_offset;
2969
55e303ae 2970 if (val_size > max_size)
1c79356b
A
2971 val_size = max_size;
2972
91447636
A
2973 if (val_size > uio_resid(uio))
2974 // LP64todo - fix this
2975 val_size = uio_resid(uio);
1c79356b 2976
55e303ae
A
2977 if (last_ioread_offset == 0)
2978 last_ioread_offset = uio->uio_offset + val_size;
1c79356b 2979
55e303ae 2980 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
1c79356b 2981 /*
55e303ae
A
2982 * if there's still I/O left to do for this request, and...
2983 * we're not in hard throttle mode, then issue a
2984 * pre-fetch I/O... the I/O latency will overlap
1c79356b
A
2985 * with the copying of the data
2986 */
91447636 2987 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
1c79356b 2988
55e303ae
A
2989 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2990
2991 if (last_ioread_offset > last_request_offset)
2992 last_ioread_offset = last_request_offset;
1c79356b 2993
55e303ae
A
2994 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2995 /*
2996 * this transfer will finish this request, so...
2997 * let's try to read ahead if we're in
2998 * a sequential access pattern and we haven't
2999 * explicitly disabled it
3000 */
3001 if (rd_ahead_enabled)
91447636
A
3002 cluster_rd_ahead(vp, &extent, filesize, rap);
3003
3004 if (rap != NULL) {
3005 if (extent.e_addr < rap->cl_lastr)
3006 rap->cl_maxra = 0;
3007 rap->cl_lastr = extent.e_addr;
3008 }
9bccf70c 3009 }
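			/*
			 * before copying the data out of the upl, wait for any
			 * async I/O we issued against it to complete... the pages
			 * may still be in flight
			 */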
91447636
A
3010 lck_mtx_lock(cl_mtxp);
3011
55e303ae
A
3012 while (iostate.io_issued != iostate.io_completed) {
3013 iostate.io_wanted = 1;
91447636 3014 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
55e303ae 3015 }
91447636
A
3016 lck_mtx_unlock(cl_mtxp);
3017
55e303ae
A
3018 if (iostate.io_error)
3019 error = iostate.io_error;
9bccf70c 3020 else
55e303ae 3021 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
1c79356b
A
3022 }
3023 if (start_pg < last_pg) {
3024 /*
3025 * compute the range of pages that we actually issued an I/O for
3026 * and either commit them as valid if the I/O succeeded
3027 * or abort them if the I/O failed
3028 */
3029 io_size = (last_pg - start_pg) * PAGE_SIZE;
3030
3031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 3032 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b 3033
91447636 3034 if (error || (flags & IO_NOCACHE))
0b4e3aa0 3035 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
1c79356b
A
3036 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3037 else
0b4e3aa0 3038 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
55e303ae
A
3039 UPL_COMMIT_CLEAR_DIRTY |
3040 UPL_COMMIT_FREE_ON_EMPTY |
3041 UPL_COMMIT_INACTIVATE);
1c79356b
A
3042
3043 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 3044 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
3045 }
3046 if ((last_pg - start_pg) < pages_in_upl) {
3047 int cur_pg;
3048 int commit_flags;
3049
3050 /*
3051 * the set of pages that we issued an I/O for did not encompass
3052 * the entire upl... so just release these without modifying
55e303ae 3053 * their state
1c79356b
A
3054 */
3055 if (error)
9bccf70c 3056 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3057 else {
0b4e3aa0 3058 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 3059 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
1c79356b 3060
0b4e3aa0
A
3061 if (start_pg) {
3062 /*
3063 * we found some already valid pages at the beginning of
3064 * the upl commit these back to the inactive list with
3065 * reference cleared
3066 */
3067 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3068 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3069 | UPL_COMMIT_INACTIVATE;
1c79356b
A
3070
3071 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 3072 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b 3073
91447636 3074 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
0b4e3aa0
A
3075 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3076 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3077 else
0b4e3aa0
A
3078 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3079 PAGE_SIZE, commit_flags);
1c79356b
A
3080 }
3081 }
3082 if (last_pg < uio_last) {
0b4e3aa0
A
3083 /*
3084 * we found some already valid pages immediately after the
3085 * pages we issued I/O for, commit these back to the
3086 * inactive list with reference cleared
3087 */
3088 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3089 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3090 | UPL_COMMIT_INACTIVATE;
1c79356b
A
3091
3092 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 3093 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b 3094
91447636 3095 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
0b4e3aa0
A
3096 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3097 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3098 else
0b4e3aa0
A
3099 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3100 PAGE_SIZE, commit_flags);
1c79356b
A
3101 }
3102 }
3103 if (uio_last < pages_in_upl) {
0b4e3aa0
A
3104 /*
3105 * there were some invalid pages beyond the valid pages
3106 * that we didn't issue an I/O for, just release them
3107 * unchanged
1c79356b 3108 */
9bccf70c
A
3109 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3110 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
3111 }
3112
0b4e3aa0 3113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 3114 (int)upl, -1, -1, 0, 0);
1c79356b
A
3115 }
3116 }
3117 if (retval == 0)
3118 retval = error;
91447636
A
3119
3120 if ( uio_resid(uio) ) {
3121 if (cluster_hard_throttle_on(vp)) {
3122 rd_ahead_enabled = 0;
3123 prefetch_enabled = 0;
3124
3125 max_rd_size = HARD_THROTTLE_MAXSIZE;
3126 } else {
3127 if (rap != NULL)
3128 rd_ahead_enabled = 1;
3129 prefetch_enabled = 1;
3130
3131 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3132 }
3133 }
3134 }
3135 if (rap != NULL) {
3136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3137 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3138
3139 lck_mtx_unlock(&rap->cl_lockr);
3140 } else {
3141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3142 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
1c79356b
A
3143 }
3144
3145 return (retval);
3146}
3147
b4c24cb9 3148
9bccf70c 3149static int
91447636 3150cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
1c79356b
A
3151{
3152 upl_t upl;
3153 upl_page_info_t *pl;
1c79356b 3154 vm_offset_t upl_offset;
1c79356b
A
3155 off_t max_io_size;
3156 int io_size;
3157 int upl_size;
3158 int upl_needed_size;
3159 int pages_in_pl;
1c79356b
A
3160 int upl_flags;
3161 kern_return_t kret;
1c79356b
A
3162 int i;
3163 int force_data_sync;
1c79356b 3164 int retval = 0;
91447636
A
3165 int no_zero_fill = 0;
3166 int abort_flag = 0;
d7e50217 3167 struct clios iostate;
55e303ae
A
3168 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3169 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3170
1c79356b
A
3171
3172 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
91447636 3173 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
1c79356b
A
3174
3175 /*
3176 * When we enter this routine, we know
3177 * -- the offset into the file is on a pagesize boundary
3178 * -- the resid is a page multiple
3179 * -- the resid will not exceed iov_len
3180 */
3181
d7e50217
A
3182 iostate.io_completed = 0;
3183 iostate.io_issued = 0;
3184 iostate.io_error = 0;
3185 iostate.io_wanted = 0;
3186
91447636 3187 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
cc9f6e38 3188 user_addr_t iov_base;
1c79356b 3189
91447636
A
3190 if (cluster_hard_throttle_on(vp)) {
3191 max_rd_size = HARD_THROTTLE_MAXSIZE;
3192 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3193 } else {
3194 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
cc9f6e38 3195 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
91447636 3196 }
d7e50217 3197 max_io_size = filesize - uio->uio_offset;
0b4e3aa0 3198
91447636
A
3199 // LP64todo - fix this
3200 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
d7e50217
A
3201 io_size = max_io_size;
3202 else
91447636 3203 io_size = uio_resid(uio);
1c79356b 3204
d7e50217
A
3205 /*
3206 * First look for pages already in the cache
3207 * and move them to user space.
3208 */
55e303ae 3209 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
1c79356b 3210
d7e50217
A
3211 if (retval) {
3212 /*
3213 * we may have already spun some portion of this request
3214 * off as async requests... we need to wait for the I/O
3215 * to complete before returning
3216 */
3217 goto wait_for_reads;
0b4e3aa0 3218 }
d7e50217
A
3219 /*
3220 * If we are already finished with this read, then return
3221 */
3222 if (io_size == 0) {
3223 /*
3224 * we may have already spun some portion of this request
3225 * off as async requests... we need to wait for the I/O
3226 * to complete before returning
3227 */
3228 goto wait_for_reads;
3229 }
3230 max_io_size = io_size;
3231
55e303ae
A
3232 if (max_io_size > max_rd_size)
3233 max_io_size = max_rd_size;
3234
d7e50217 3235 io_size = 0;
1c79356b 3236
55e303ae
A
3237 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
3238
d7e50217
A
3239 if (io_size == 0)
3240 /*
3241 * we may have already spun some portion of this request
3242 * off as async requests... we need to wait for the I/O
3243 * to complete before returning
3244 */
3245 goto wait_for_reads;
cc9f6e38
A
3246
3247 iov_base = uio_curriovbase(uio);
1c79356b 3248
91447636 3249 // LP64todo - fix this!
cc9f6e38 3250 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
d7e50217 3251 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1c79356b 3252
d7e50217 3253 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
cc9f6e38 3254 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1c79356b 3255
91447636
A
3256 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3257 no_zero_fill = 1;
3258 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3259 } else {
3260 no_zero_fill = 0;
3261 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3262 }
d7e50217
A
3263 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3264 pages_in_pl = 0;
3265 upl_size = upl_needed_size;
55e303ae 3266 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1c79356b 3267
91447636
A
3268 if (no_zero_fill)
3269 upl_flags |= UPL_NOZEROFILL;
3270 if (force_data_sync)
3271 upl_flags |= UPL_FORCE_DATA_SYNC;
3272
3273 // LP64todo - fix this!
3274 kret = vm_map_create_upl(current_map(),
cc9f6e38 3275 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
91447636 3276 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
1c79356b 3277
d7e50217
A
3278 if (kret != KERN_SUCCESS) {
3279 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3280 (int)upl_offset, upl_size, io_size, kret, 0);
d7e50217
A
3281 /*
3282 * cluster_nocopy_read: failed to get pagelist
3283 *
3284 * we may have already spun some portion of this request
3285 * off as async requests... we need to wait for the I/O
3286 * to complete before returning
3287 */
3288 goto wait_for_reads;
3289 }
3290 pages_in_pl = upl_size / PAGE_SIZE;
3291 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 3292
d7e50217
A
3293 for (i = 0; i < pages_in_pl; i++) {
3294 if (!upl_valid_page(pl, i))
3295 break;
3296 }
3297 if (i == pages_in_pl)
3298 break;
0b4e3aa0 3299
91447636 3300 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
1c79356b 3301 }
d7e50217
A
3302 if (force_data_sync >= 3) {
3303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3304 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 3305
d7e50217
A
3306 goto wait_for_reads;
3307 }
3308 /*
3309 * Consider the possibility that upl_size wasn't satisfied.
3310 */
3311 if (upl_size != upl_needed_size)
3312 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 3313
d7e50217 3314 if (io_size == 0) {
91447636 3315 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
d7e50217
A
3316 goto wait_for_reads;
3317 }
3318 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3319 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 3320
d7e50217
A
3321 /*
3322 * request asynchronously so that we can overlap
3323 * the preparation of the next I/O
 3326 * if there are already too many outstanding reads,
3325 * wait until some have completed before issuing the next read
3326 */
91447636
A
3327 lck_mtx_lock(cl_mtxp);
3328
55e303ae 3329 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
d7e50217 3330 iostate.io_wanted = 1;
91447636 3331 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
d7e50217 3332 }
91447636
A
3333 lck_mtx_unlock(cl_mtxp);
3334
d7e50217
A
3335 if (iostate.io_error) {
3336 /*
3337 * one of the earlier reads we issued ran into a hard error
3338 * don't issue any more reads, cleanup the UPL
3339 * that was just created but not used, then
3340 * go wait for any other reads to complete before
3341 * returning the error to the caller
3342 */
91447636 3343 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
1c79356b 3344
d7e50217
A
3345 goto wait_for_reads;
3346 }
3347 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
55e303ae 3348 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
1c79356b 3349
91447636 3350 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
d7e50217 3351 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
91447636 3352 (buf_t)NULL, &iostate);
1c79356b 3353
d7e50217
A
3354 /*
3355 * update the uio structure
3356 */
cc9f6e38 3357 uio_update(uio, (user_size_t)io_size);
1c79356b 3358
d7e50217 3359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
91447636 3360 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
1c79356b
A
3361
3362 } /* end while */
3363
d7e50217
A
3364wait_for_reads:
3365 /*
3366 * make sure all async reads that are part of this stream
3367 * have completed before we return
3368 */
91447636
A
3369 lck_mtx_lock(cl_mtxp);
3370
d7e50217
A
3371 while (iostate.io_issued != iostate.io_completed) {
3372 iostate.io_wanted = 1;
91447636 3373 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
d7e50217 3374 }
91447636
A
3375 lck_mtx_unlock(cl_mtxp);
3376
d7e50217
A
3377 if (iostate.io_error)
3378 retval = iostate.io_error;
1c79356b
A
3379
3380 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
91447636 3381 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
1c79356b
A
3382
3383 return (retval);
3384}
3385
3386
9bccf70c 3387static int
91447636 3388cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
0b4e3aa0 3389{
b4c24cb9 3390 upl_page_info_t *pl;
0b4e3aa0
A
3391 upl_t upl;
3392 vm_offset_t upl_offset;
55e303ae 3393 addr64_t dst_paddr;
0b4e3aa0 3394 off_t max_size;
cc9f6e38
A
3395 int io_size;
3396 user_size_t iov_len;
3397 user_addr_t iov_base;
b4c24cb9 3398 int tail_size;
0b4e3aa0
A
3399 int upl_size;
3400 int upl_needed_size;
3401 int pages_in_pl;
3402 int upl_flags;
3403 kern_return_t kret;
b4c24cb9 3404 struct clios iostate;
0b4e3aa0 3405 int error;
91447636 3406 int devblocksize;
0b4e3aa0 3407
91447636 3408 devblocksize = vp->v_mount->mnt_devblocksize;
0b4e3aa0
A
3409 /*
3410 * When we enter this routine, we know
3411 * -- the resid will not exceed iov_len
3412 * -- the target address is physically contiguous
3413 */
3414
91447636
A
3415#if LP64_DEBUG
3416 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3417 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3418 }
3419#endif /* LP64_DEBUG */
3420
cc9f6e38
A
3421 iov_len = uio_curriovlen(uio);
3422 iov_base = uio_curriovbase(uio);
0b4e3aa0
A
3423
3424 max_size = filesize - uio->uio_offset;
3425
91447636
A
3426 // LP64todo - fix this!
3427 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3428 io_size = iov_len;
0b4e3aa0 3429 else
b4c24cb9 3430 io_size = max_size;
0b4e3aa0 3431
91447636
A
3432 // LP64todo - fix this!
3433 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
0b4e3aa0
A
3434 upl_needed_size = upl_offset + io_size;
3435
b4c24cb9 3436 error = 0;
0b4e3aa0
A
3437 pages_in_pl = 0;
3438 upl_size = upl_needed_size;
55e303ae 3439 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0
A
3440
3441 kret = vm_map_get_upl(current_map(),
cc9f6e38 3442 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
0b4e3aa0
A
3443 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3444
b4c24cb9
A
3445 if (kret != KERN_SUCCESS) {
3446 /*
3447 * cluster_phys_read: failed to get pagelist
3448 */
3449 return(EINVAL);
3450 }
3451 if (upl_size < upl_needed_size) {
3452 /*
3453 * The upl_size wasn't satisfied.
3454 */
3455 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3456
3457 return(EINVAL);
3458 }
3459 pl = ubc_upl_pageinfo(upl);
3460
cc9f6e38 3461 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
0b4e3aa0 3462
b4c24cb9
A
3463 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3464 int head_size;
3465
3466 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3467
3468 if (head_size > io_size)
3469 head_size = io_size;
3470
91447636 3471 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
b4c24cb9
A
3472
3473 if (error) {
3474 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3475
3476 return(EINVAL);
3477 }
3478 upl_offset += head_size;
3479 dst_paddr += head_size;
3480 io_size -= head_size;
3481 }
3482 tail_size = io_size & (devblocksize - 1);
3483 io_size -= tail_size;
3484
3485 iostate.io_completed = 0;
3486 iostate.io_issued = 0;
3487 iostate.io_error = 0;
3488 iostate.io_wanted = 0;
3489
3490 while (io_size && error == 0) {
3491 int xsize;
3492
3493 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3494 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3495 else
3496 xsize = io_size;
3497 /*
3498 * request asynchronously so that we can overlap
3499 * the preparation of the next I/O... we'll do
3500 * the commit after all the I/O has completed
3501 * since its all issued against the same UPL
3502 * if there are already too many outstanding reads
d7e50217 3503 * wait until some have completed before issuing the next
b4c24cb9 3504 */
91447636
A
3505 lck_mtx_lock(cl_mtxp);
3506
cc9f6e38 3507 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
b4c24cb9 3508 iostate.io_wanted = 1;
91447636 3509 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
b4c24cb9 3510 }
91447636 3511 lck_mtx_unlock(cl_mtxp);
b4c24cb9 3512
91447636 3513 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
b4c24cb9 3514 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
91447636 3515 (buf_t)NULL, &iostate);
b4c24cb9
A
3516 /*
3517 * The cluster_io read was issued successfully,
3518 * update the uio structure
3519 */
3520 if (error == 0) {
cc9f6e38
A
3521 uio_update(uio, (user_size_t)xsize);
3522
3523 dst_paddr += xsize;
3524 upl_offset += xsize;
3525 io_size -= xsize;
b4c24cb9
A
3526 }
3527 }
0b4e3aa0 3528 /*
d7e50217
A
3529 * make sure all async reads that are part of this stream
3530 * have completed before we proceed
0b4e3aa0 3531 */
91447636
A
3532 lck_mtx_lock(cl_mtxp);
3533
b4c24cb9
A
3534 while (iostate.io_issued != iostate.io_completed) {
3535 iostate.io_wanted = 1;
91447636 3536 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
b4c24cb9 3537 }
91447636
A
3538 lck_mtx_unlock(cl_mtxp);
3539
3540 if (iostate.io_error)
b4c24cb9 3541 error = iostate.io_error;
91447636 3542
b4c24cb9 3543 if (error == 0 && tail_size)
91447636 3544 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
0b4e3aa0
A
3545
3546 /*
b4c24cb9
A
3547 * just release our hold on the physically contiguous
3548 * region without changing any state
0b4e3aa0 3549 */
b4c24cb9 3550 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
3551
3552 return (error);
3553}
1c79356b 3554
b4c24cb9 3555
1c79356b
A
3556/*
3557 * generate advisory I/O's in the largest chunks possible
3558 * the completed pages will be released into the VM cache
3559 */
9bccf70c 3560int
91447636 3561advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
1c79356b 3562{
1c79356b
A
3563 upl_page_info_t *pl;
3564 upl_t upl;
3565 vm_offset_t upl_offset;
3566 int upl_size;
3567 off_t upl_f_offset;
3568 int start_offset;
3569 int start_pg;
3570 int last_pg;
3571 int pages_in_upl;
3572 off_t max_size;
3573 int io_size;
3574 kern_return_t kret;
3575 int retval = 0;
9bccf70c 3576 int issued_io;
55e303ae 3577 int skip_range;
1c79356b 3578
91447636 3579 if ( !UBCINFOEXISTS(vp))
1c79356b
A
3580 return(EINVAL);
3581
1c79356b 3582 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
91447636 3583 (int)f_offset, resid, (int)filesize, 0, 0);
1c79356b
A
3584
3585 while (resid && f_offset < filesize && retval == 0) {
3586 /*
3587 * compute the size of the upl needed to encompass
3588 * the requested read... limit each call to cluster_io
0b4e3aa0
A
3589 * to the maximum UPL size... cluster_io will clip if
 3590 * this exceeds the maximum io_size for the device;
3591 * make sure to account for
1c79356b
A
3592 * a starting offset that's not page aligned
3593 */
3594 start_offset = (int)(f_offset & PAGE_MASK_64);
3595 upl_f_offset = f_offset - (off_t)start_offset;
3596 max_size = filesize - f_offset;
3597
3598 if (resid < max_size)
3599 io_size = resid;
3600 else
3601 io_size = max_size;
3602
3603 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
3604 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3605 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
55e303ae
A
3606
3607 skip_range = 0;
3608 /*
3609 * return the number of contiguously present pages in the cache
3610 * starting at upl_f_offset within the file
3611 */
3612 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3613
3614 if (skip_range) {
3615 /*
3616 * skip over pages already present in the cache
3617 */
3618 io_size = skip_range - start_offset;
3619
3620 f_offset += io_size;
3621 resid -= io_size;
3622
3623 if (skip_range == upl_size)
3624 continue;
3625 /*
3626 * have to issue some real I/O
3627 * at this point, we know it's starting on a page boundary
3628 * because we've skipped over at least the first page in the request
3629 */
3630 start_offset = 0;
3631 upl_f_offset += skip_range;
3632 upl_size -= skip_range;
3633 }
1c79356b
A
3634 pages_in_upl = upl_size / PAGE_SIZE;
3635
55e303ae
A
3636 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3637 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3638
0b4e3aa0 3639 kret = ubc_create_upl(vp,
91447636
A
3640 upl_f_offset,
3641 upl_size,
3642 &upl,
3643 &pl,
3644 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
1c79356b 3645 if (kret != KERN_SUCCESS)
9bccf70c
A
3646 return(retval);
3647 issued_io = 0;
1c79356b
A
3648
3649 /*
9bccf70c
A
3650 * before we start marching forward, we must make sure we end on
3651 * a present page, otherwise we will be working with a freed
3652 * upl
1c79356b 3653 */
9bccf70c
A
3654 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3655 if (upl_page_present(pl, last_pg))
3656 break;
1c79356b 3657 }
9bccf70c 3658 pages_in_upl = last_pg + 1;
1c79356b 3659
1c79356b 3660
55e303ae 3661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
9bccf70c
A
3662 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3663
3664
3665 for (last_pg = 0; last_pg < pages_in_upl; ) {
1c79356b 3666 /*
9bccf70c
A
3667 * scan from the beginning of the upl looking for the first
3668 * page that is present.... this will become the first page in
3669 * the request we're going to make to 'cluster_io'... if all
3670 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 3671 */
9bccf70c
A
3672 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3673 if (upl_page_present(pl, start_pg))
3674 break;
1c79356b 3675 }
1c79356b 3676
1c79356b 3677 /*
9bccf70c
A
3678 * scan from the starting present page looking for an absent
3679 * page before the end of the upl is reached, if we
3680 * find one, then it will terminate the range of pages being
3681 * presented to 'cluster_io'
1c79356b 3682 */
9bccf70c
A
3683 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3684 if (!upl_page_present(pl, last_pg))
3685 break;
3686 }
3687
3688 if (last_pg > start_pg) {
3689 /*
3690 * we found a range of pages that must be filled
3691 * if the last page in this range is the last page of the file
3692 * we may have to clip the size of it to keep from reading past
3693 * the end of the last physical block associated with the file
3694 */
3695 upl_offset = start_pg * PAGE_SIZE;
3696 io_size = (last_pg - start_pg) * PAGE_SIZE;
3697
3698 if ((upl_f_offset + upl_offset + io_size) > filesize)
3699 io_size = filesize - (upl_f_offset + upl_offset);
3700
3701 /*
3702 * issue an asynchronous read to cluster_io
3703 */
91447636
A
3704 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3705 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
1c79356b 3706
9bccf70c
A
3707 issued_io = 1;
3708 }
1c79356b 3709 }
9bccf70c
A
3710 if (issued_io == 0)
3711 ubc_upl_abort(upl, 0);
3712
3713 io_size = upl_size - start_offset;
1c79356b
A
3714
3715 if (io_size > resid)
3716 io_size = resid;
3717 f_offset += io_size;
3718 resid -= io_size;
3719 }
9bccf70c 3720
1c79356b
A
3721 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3722 (int)f_offset, resid, retval, 0, 0);
3723
3724 return(retval);
3725}
3726
3727
9bccf70c 3728int
91447636 3729cluster_push(vnode_t vp, int flags)
9bccf70c 3730{
91447636
A
3731 int retval;
3732 struct cl_writebehind *wbp;
9bccf70c 3733
91447636
A
3734 if ( !UBCINFOEXISTS(vp)) {
3735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3736 return (0);
3737 }
3738 /* return if deferred write is set */
3739 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3740 return (0);
3741 }
3742 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3744 return (0);
3745 }
3746 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3747 lck_mtx_unlock(&wbp->cl_lockw);
9bccf70c 3748
91447636
A
3749 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3750 return(0);
3751 }
9bccf70c 3752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
91447636 3753 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
9bccf70c 3754
91447636
A
3755 if (wbp->cl_scmap) {
3756 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
9bccf70c 3757
55e303ae
A
3758 retval = 1;
3759 } else
91447636
A
3760 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3761
3762 lck_mtx_unlock(&wbp->cl_lockw);
3763
3764 if (flags & IO_SYNC)
3765 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
9bccf70c 3766
55e303ae 3767 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
91447636 3768 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
9bccf70c 3769
55e303ae
A
3770 return (retval);
3771}
9bccf70c 3772
9bccf70c 3773
91447636
A
3774__private_extern__ void
3775cluster_release(struct ubc_info *ubc)
55e303ae 3776{
91447636
A
3777 struct cl_writebehind *wbp;
3778 struct cl_readahead *rap;
3779
3780 if ((wbp = ubc->cl_wbehind)) {
9bccf70c 3781
91447636
A
3782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3783
3784 if (wbp->cl_scmap)
3785 vfs_drt_control(&(wbp->cl_scmap), 0);
3786 } else {
3787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3788 }
9bccf70c 3789
91447636 3790 rap = ubc->cl_rahead;
55e303ae 3791
91447636
A
3792 if (wbp != NULL) {
3793 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3794 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3795 }
3796 if ((rap = ubc->cl_rahead)) {
3797 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3798 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
55e303ae 3799 }
91447636
A
3800 ubc->cl_rahead = NULL;
3801 ubc->cl_wbehind = NULL;
3802
3803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3804}
3805
3806
3807static void
3808cluster_push_EOF(vnode_t vp, off_t EOF)
3809{
3810 struct cl_writebehind *wbp;
3811
3812 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3813
3814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3815 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3816
3817 if (wbp->cl_scmap)
3818 sparse_cluster_push(wbp, vp, EOF, 1);
3819 else
3820 cluster_try_push(wbp, vp, EOF, 0, 1);
3821
3822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3823 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3824
3825 lck_mtx_unlock(&wbp->cl_lockw);
9bccf70c
A
3826}
3827
3828
3829static int
91447636 3830cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
9bccf70c
A
3831{
3832 int cl_index;
3833 int cl_index1;
3834 int min_index;
3835 int cl_len;
55e303ae 3836 int cl_pushed = 0;
91447636 3837 struct cl_wextent l_clusters[MAX_CLUSTERS];
9bccf70c
A
3838
3839 /*
91447636
A
3840 * the write behind context exists and has
3841 * already been locked...
3842 *
9bccf70c 3843 * make a local 'sorted' copy of the clusters
91447636 3844 * and clear wbp->cl_number so that new clusters can
9bccf70c
A
3845 * be developed
3846 */
91447636
A
3847 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3848 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3849 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
9bccf70c
A
3850 continue;
3851 if (min_index == -1)
3852 min_index = cl_index1;
91447636 3853 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
9bccf70c
A
3854 min_index = cl_index1;
3855 }
3856 if (min_index == -1)
3857 break;
91447636
A
3858 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3859 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3860 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
9bccf70c 3861
91447636 3862 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
9bccf70c 3863 }
91447636
A
3864 wbp->cl_number = 0;
3865
3866 cl_len = cl_index;
9bccf70c 3867
55e303ae
A
3868 if (can_delay && cl_len == MAX_CLUSTERS) {
3869 int i;
3870
3871 /*
3872 * determine if we appear to be writing the file sequentially
3873 * if not, by returning without having pushed any clusters
3874 * we will cause this vnode to be pushed into the sparse cluster mechanism
3875 * used for managing more random I/O patterns
3876 *
3877 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3878 * that's why we're in try_push with can_delay true...
3879 *
3880 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3881 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
91447636
A
3882 * so we can just make a simple pass through, up to, but not including the last one...
3883 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
55e303ae
A
3884 * are sequential
3885 *
3886 * we let the last one be partial as long as it was adjacent to the previous one...
3887 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3888 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3889 */
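		/*
		 * Editor's worked example (not in the original source): the sorted
		 * copy passes this test only if every cluster except the last spans
		 * exactly MAX_UPL_TRANSFER pages and ends exactly where the next one
		 * begins (l_clusters[i].e_addr == l_clusters[i+1].b_addr); the last
		 * cluster may be short.  Any gap or short interior cluster takes the
		 * 'dont_try' path with nothing pushed, which is what steers the vnode
		 * into the sparse cluster mechanism described above.
		 */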
3890 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
91447636 3891 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
55e303ae 3892 goto dont_try;
91447636 3893 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
55e303ae
A
3894 goto dont_try;
3895 }
3896 }
91447636
A
3897 /*
3898 * drop the lock while we're firing off the I/Os...
3899 * this is safe since I'm working off of a private sorted copy
3900 * of the clusters, and I'm going to re-evaluate the public
3901 * state after I retake the lock
3902 */
3903 lck_mtx_unlock(&wbp->cl_lockw);
3904
55e303ae 3905 for (cl_index = 0; cl_index < cl_len; cl_index++) {
91447636
A
3906 int flags;
3907 struct cl_extent cl;
3908
9bccf70c 3909 /*
91447636 3910 * try to push each cluster in turn...
9bccf70c 3911 */
91447636
A
3912 if (l_clusters[cl_index].io_nocache)
3913 flags = IO_NOCACHE;
3914 else
3915 flags = 0;
3916 cl.b_addr = l_clusters[cl_index].b_addr;
3917 cl.e_addr = l_clusters[cl_index].e_addr;
9bccf70c 3918
91447636 3919 cluster_push_x(vp, &cl, EOF, flags);
9bccf70c 3920
91447636
A
3921 l_clusters[cl_index].b_addr = 0;
3922 l_clusters[cl_index].e_addr = 0;
3923
3924 cl_pushed++;
3925
3926 if (push_all == 0)
3927 break;
9bccf70c 3928 }
91447636
A
3929 lck_mtx_lock(&wbp->cl_lockw);
3930
55e303ae 3931dont_try:
9bccf70c
A
3932 if (cl_len > cl_pushed) {
3933 /*
3934 * we didn't push all of the clusters, so
3935 * lets try to merge them back in to the vnode
3936 */
91447636 3937 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
9bccf70c
A
3938 /*
3939 * we picked up some new clusters while we were trying to
91447636
A
3940 * push the old ones... this can happen because I've dropped
 3941 * the write behind lock... the sum of the
9bccf70c 3942 * leftovers plus the new cluster count exceeds our ability
55e303ae 3943 * to represent them, so switch to the sparse cluster mechanism
91447636
A
3944 *
3945 * collect the active public clusters...
9bccf70c 3946 */
91447636 3947 sparse_cluster_switch(wbp, vp, EOF);
55e303ae
A
3948
3949 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
91447636 3950 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
9bccf70c 3951 continue;
91447636
A
3952 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3953 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3954 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
9bccf70c 3955
55e303ae 3956 cl_index1++;
9bccf70c 3957 }
55e303ae
A
3958 /*
3959 * update the cluster count
3960 */
91447636 3961 wbp->cl_number = cl_index1;
55e303ae
A
3962
3963 /*
3964 * and collect the original clusters that were moved into the
3965 * local storage for sorting purposes
3966 */
91447636 3967 sparse_cluster_switch(wbp, vp, EOF);
55e303ae 3968
9bccf70c
A
3969 } else {
3970 /*
3971 * we've got room to merge the leftovers back in
3972 * just append them starting at the next 'hole'
91447636 3973 * represented by wbp->cl_number
9bccf70c 3974 */
91447636
A
3975 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3976 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
9bccf70c
A
3977 continue;
3978
91447636
A
3979 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3980 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3981 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
9bccf70c 3982
9bccf70c
A
3983 cl_index1++;
3984 }
3985 /*
3986 * update the cluster count
3987 */
91447636 3988 wbp->cl_number = cl_index1;
9bccf70c
A
3989 }
3990 }
91447636 3991 return(MAX_CLUSTERS - wbp->cl_number);
9bccf70c
A
3992}
3993
3994
3995
3996static int
91447636 3997cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
1c79356b 3998{
1c79356b
A
3999 upl_page_info_t *pl;
4000 upl_t upl;
4001 vm_offset_t upl_offset;
4002 int upl_size;
4003 off_t upl_f_offset;
4004 int pages_in_upl;
4005 int start_pg;
4006 int last_pg;
4007 int io_size;
4008 int io_flags;
55e303ae 4009 int upl_flags;
1c79356b 4010 int size;
91447636
A
4011 int error = 0;
4012 int retval;
1c79356b
A
4013 kern_return_t kret;
4014
4015
9bccf70c 4016 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
91447636 4017 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
9bccf70c 4018
91447636 4019 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
9bccf70c 4020 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 4021
91447636 4022 return (0);
9bccf70c 4023 }
1c79356b 4024 upl_size = pages_in_upl * PAGE_SIZE;
91447636 4025 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
1c79356b 4026
9bccf70c
A
4027 if (upl_f_offset + upl_size >= EOF) {
4028
4029 if (upl_f_offset >= EOF) {
4030 /*
4031 * must have truncated the file and missed
4032 * clearing a dangling cluster (i.e. it's completely
 4033 * beyond the new EOF)
4034 */
4035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4036
91447636 4037 return(0);
9bccf70c
A
4038 }
4039 size = EOF - upl_f_offset;
1c79356b 4040
55e303ae 4041 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
9bccf70c 4042 pages_in_upl = upl_size / PAGE_SIZE;
55e303ae 4043 } else
9bccf70c 4044 size = upl_size;
55e303ae
A
4045
4046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4047
91447636
A
4048 /*
4049 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4050 *
4051 * - only pages that are currently dirty are returned... these are the ones we need to clean
4052 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4053 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4054 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4055 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4056 *
4057 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4058 */
4059
4060 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
55e303ae
A
4061 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4062 else
4063 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4064
0b4e3aa0
A
4065 kret = ubc_create_upl(vp,
4066 upl_f_offset,
4067 upl_size,
4068 &upl,
9bccf70c 4069 &pl,
55e303ae 4070 upl_flags);
1c79356b
A
4071 if (kret != KERN_SUCCESS)
4072 panic("cluster_push: failed to get pagelist");
4073
55e303ae 4074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
9bccf70c 4075
55e303ae
A
4076 /*
4077 * since we only asked for the dirty pages back
4078 * it's possible that we may only get a few or even none, so...
4079 * before we start marching forward, we must make sure we know
4080 * where the last present page is in the UPL, otherwise we could
4081 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4082 * employed by commit_range and abort_range.
4083 */
4084 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4085 if (upl_page_present(pl, last_pg))
4086 break;
9bccf70c 4087 }
55e303ae 4088 pages_in_upl = last_pg + 1;
1c79356b 4089
55e303ae
A
4090 if (pages_in_upl == 0) {
4091 ubc_upl_abort(upl, 0);
1c79356b 4092
55e303ae 4093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
91447636 4094 return(0);
55e303ae
A
4095 }
4096
4097 for (last_pg = 0; last_pg < pages_in_upl; ) {
4098 /*
4099 * find the next dirty page in the UPL
4100 * this will become the first page in the
4101 * next I/O to generate
4102 */
1c79356b 4103 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
55e303ae 4104 if (upl_dirty_page(pl, start_pg))
1c79356b 4105 break;
55e303ae
A
4106 if (upl_page_present(pl, start_pg))
4107 /*
 4108 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
4109 * just release these unchanged since we're not going
4110 * to steal them or change their state
4111 */
4112 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 4113 }
55e303ae
A
4114 if (start_pg >= pages_in_upl)
4115 /*
4116 * done... no more dirty pages to push
4117 */
4118 break;
4119 if (start_pg > last_pg)
4120 /*
4121 * skipped over some non-dirty pages
4122 */
4123 size -= ((start_pg - last_pg) * PAGE_SIZE);
1c79356b 4124
55e303ae
A
4125 /*
4126 * find a range of dirty pages to write
4127 */
1c79356b 4128 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
55e303ae 4129 if (!upl_dirty_page(pl, last_pg))
1c79356b
A
4130 break;
4131 }
4132 upl_offset = start_pg * PAGE_SIZE;
4133
4134 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4135
91447636
A
4136 io_flags = CL_THROTTLE | CL_COMMIT;
4137
4138 if ( !(flags & IO_SYNC))
4139 io_flags |= CL_ASYNC;
4140
4141 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4142 io_flags, (buf_t)NULL, (struct clios *)NULL);
1c79356b 4143
91447636
A
4144 if (error == 0 && retval)
4145 error = retval;
1c79356b
A
4146
4147 size -= io_size;
4148 }
9bccf70c
A
4149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4150
91447636 4151 return(error);
1c79356b 4152}
b4c24cb9
A
4153
4154
91447636
A
4155/*
4156 * sparse_cluster_switch is called with the write behind lock held
4157 */
4158static void
4159sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
b4c24cb9 4160{
91447636 4161 int cl_index;
b4c24cb9 4162
91447636 4163 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
b4c24cb9 4164
91447636
A
4165 if (wbp->cl_scmap == NULL)
4166 wbp->cl_scdirty = 0;
4167
4168 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4169 int flags;
4170 struct cl_extent cl;
4171
4172 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
b4c24cb9 4173
91447636
A
4174 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4175 if (flags & UPL_POP_DIRTY) {
4176 cl.e_addr = cl.b_addr + 1;
b4c24cb9 4177
91447636
A
4178 sparse_cluster_add(wbp, vp, &cl, EOF);
4179 }
55e303ae
A
4180 }
4181 }
4182 }
91447636
A
4183 wbp->cl_number = 0;
4184
4185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
55e303ae
A
4186}
4187
4188
91447636
A
4189/*
4190 * sparse_cluster_push is called with the write behind lock held
4191 */
4192static void
4193sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
55e303ae 4194{
91447636
A
4195 struct cl_extent cl;
4196 off_t offset;
4197 u_int length;
55e303ae 4198
91447636 4199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
55e303ae
A
4200
4201 if (push_all)
91447636 4202 vfs_drt_control(&(wbp->cl_scmap), 1);
55e303ae
A
4203
4204 for (;;) {
91447636 4205 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
55e303ae 4206 break;
55e303ae 4207
91447636
A
4208 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4209 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4210
4211 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
55e303ae 4212
91447636 4213 cluster_push_x(vp, &cl, EOF, 0);
55e303ae
A
4214
4215 if (push_all == 0)
4216 break;
4217 }
91447636 4218 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
55e303ae
A
4219}
4220
4221
91447636
A
4222/*
4223 * sparse_cluster_add is called with the write behind lock held
4224 */
4225static void
4226sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
55e303ae 4227{
91447636
A
4228 u_int new_dirty;
4229 u_int length;
4230 off_t offset;
55e303ae 4231
91447636 4232 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
55e303ae 4233
91447636
A
4234 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4235 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
55e303ae 4236
91447636 4237 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
55e303ae
A
4238 /*
4239 * no room left in the map
4240 * only a partial update was done
4241 * push out some pages and try again
4242 */
91447636 4243 wbp->cl_scdirty += new_dirty;
55e303ae 4244
91447636 4245 sparse_cluster_push(wbp, vp, EOF, 0);
55e303ae
A
4246
4247 offset += (new_dirty * PAGE_SIZE_64);
4248 length -= (new_dirty * PAGE_SIZE);
4249 }
91447636 4250 wbp->cl_scdirty += new_dirty;
55e303ae 4251
91447636 4252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
55e303ae
A
4253}
4254
4255
4256static int
91447636 4257cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
55e303ae 4258{
55e303ae
A
4259 upl_page_info_t *pl;
4260 upl_t upl;
4261 addr64_t ubc_paddr;
4262 kern_return_t kret;
4263 int error = 0;
91447636
A
4264 int did_read = 0;
4265 int abort_flags;
4266 int upl_flags;
55e303ae 4267
91447636
A
4268 upl_flags = UPL_SET_LITE;
4269 if (! (flags & CL_READ)) {
4270 /*
4271 * "write" operation: let the UPL subsystem know
4272 * that we intend to modify the buffer cache pages
4273 * we're gathering.
4274 */
4275 upl_flags |= UPL_WILL_MODIFY;
4276 }
4277
55e303ae
A
4278 kret = ubc_create_upl(vp,
4279 uio->uio_offset & ~PAGE_MASK_64,
4280 PAGE_SIZE,
4281 &upl,
4282 &pl,
91447636 4283 upl_flags);
55e303ae
A
4284
4285 if (kret != KERN_SUCCESS)
4286 return(EINVAL);
4287
4288 if (!upl_valid_page(pl, 0)) {
4289 /*
4290 * issue a synchronous read to cluster_io
4291 */
91447636
A
4292 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4293 CL_READ, (buf_t)NULL, (struct clios *)NULL);
55e303ae 4294 if (error) {
b4c24cb9
A
4295 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4296
4297 return(error);
4298 }
91447636 4299 did_read = 1;
b4c24cb9 4300 }
55e303ae 4301 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
b4c24cb9 4302
55e303ae
A
4303/*
4304 * NOTE: There is no prototype for the following in BSD. It, and the definitions
 4305 * of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, will be found in
4306 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4307 * way to do so without exporting them to kexts as well.
4308 */
de355530 4309 if (flags & CL_READ)
55e303ae
A
4310// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4311 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
de355530 4312 else
4a249263
A
4313// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4314 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
55e303ae
A
4315
4316 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4317 /*
4318 * issue a synchronous write to cluster_io
4319 */
91447636
A
4320 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4321 0, (buf_t)NULL, (struct clios *)NULL);
de355530 4322 }
cc9f6e38
A
4323 if (error == 0)
4324 uio_update(uio, (user_size_t)xsize);
4325
91447636
A
4326 if (did_read)
4327 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4328 else
4329 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4330
4331 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
55e303ae
A
4332
4333 return (error);
4334}
4335
4336
4337
4338int
4339cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4340{
4341 int pg_offset;
4342 int pg_index;
4343 int csize;
4344 int segflg;
4345 int retval = 0;
4346 upl_page_info_t *pl;
55e303ae
A
4347
4348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
91447636 4349 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
55e303ae
A
4350
4351 segflg = uio->uio_segflg;
4352
4353 switch(segflg) {
4354
91447636
A
4355 case UIO_USERSPACE32:
4356 case UIO_USERISPACE32:
4357 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4358 break;
4359
55e303ae
A
4360 case UIO_USERSPACE:
4361 case UIO_USERISPACE:
4362 uio->uio_segflg = UIO_PHYS_USERSPACE;
4363 break;
4364
91447636
A
4365 case UIO_USERSPACE64:
4366 case UIO_USERISPACE64:
4367 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4368 break;
4369
4370 case UIO_SYSSPACE32:
4371 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4372 break;
4373
55e303ae
A
4374 case UIO_SYSSPACE:
4375 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4376 break;
91447636
A
4377
4378 case UIO_SYSSPACE64:
4379 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4380 break;
55e303ae
A
4381 }
4382 pl = ubc_upl_pageinfo(upl);
4383
4384 pg_index = upl_offset / PAGE_SIZE;
4385 pg_offset = upl_offset & PAGE_MASK;
4386 csize = min(PAGE_SIZE - pg_offset, xsize);
4387
4388 while (xsize && retval == 0) {
4389 addr64_t paddr;
4390
4391 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
de355530 4392
55e303ae
A
4393 retval = uiomove64(paddr, csize, uio);
4394
4395 pg_index += 1;
4396 pg_offset = 0;
4397 xsize -= csize;
4398 csize = min(PAGE_SIZE, xsize);
4399 }
4400 uio->uio_segflg = segflg;
4401
55e303ae 4402 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
91447636 4403 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
55e303ae
A
4404
4405 return (retval);
4406}
4407
4408
4409int
91447636 4410cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
55e303ae
A
4411{
4412 int segflg;
4413 int io_size;
4414 int xsize;
4415 int start_offset;
55e303ae
A
4416 int retval = 0;
4417 memory_object_control_t control;
55e303ae
A
4418
4419
4420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
91447636 4421 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
55e303ae
A
4422
4423 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4424 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
91447636 4426 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
55e303ae
A
4427
4428 return(0);
4429 }
55e303ae
A
4430 segflg = uio->uio_segflg;
4431
4432 switch(segflg) {
4433
91447636
A
4434 case UIO_USERSPACE32:
4435 case UIO_USERISPACE32:
4436 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4437 break;
4438
4439 case UIO_USERSPACE64:
4440 case UIO_USERISPACE64:
4441 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4442 break;
4443
4444 case UIO_SYSSPACE32:
4445 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4446 break;
4447
4448 case UIO_SYSSPACE64:
4449 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4450 break;
4451
55e303ae
A
4452 case UIO_USERSPACE:
4453 case UIO_USERISPACE:
4454 uio->uio_segflg = UIO_PHYS_USERSPACE;
4455 break;
4456
4457 case UIO_SYSSPACE:
4458 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4459 break;
4460 }
55e303ae 4461
91447636
A
4462 if ( (io_size = *io_resid) ) {
4463 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4464 xsize = uio_resid(uio);
55e303ae 4465
91447636
A
4466 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4467 uio, start_offset, io_size, mark_dirty);
4468 xsize -= uio_resid(uio);
4469 io_size -= xsize;
55e303ae
A
4470 }
4471 uio->uio_segflg = segflg;
4472 *io_resid = io_size;
4473
55e303ae 4474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
91447636 4475 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
55e303ae
A
4476
4477 return(retval);
4478}
4479
4480
4481int
91447636 4482is_file_clean(vnode_t vp, off_t filesize)
55e303ae
A
4483{
4484 off_t f_offset;
4485 int flags;
4486 int total_dirty = 0;
4487
4488 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4489 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4490 if (flags & UPL_POP_DIRTY) {
4491 total_dirty++;
4492 }
4493 }
4494 }
4495 if (total_dirty)
4496 return(EINVAL);
4497
4498 return (0);
4499}
4500
4501
4502
4503/*
4504 * Dirty region tracking/clustering mechanism.
4505 *
4506 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4507 * dirty regions within a larger space (file). It is primarily intended to
4508 * support clustering in large files with many dirty areas.
4509 *
4510 * The implementation assumes that the dirty regions are pages.
4511 *
4512 * To represent dirty pages within the file, we store bit vectors in a
4513 * variable-size circular hash.
4514 */
4515
4516/*
4517 * Bitvector size. This determines the number of pages we group in a
4518 * single hashtable entry. Each hashtable entry is aligned to this
4519 * size within the file.
4520 */
4521#define DRT_BITVECTOR_PAGES 256
4522
4523/*
4524 * File offset handling.
4525 *
4526 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 4527 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
4528 */
4529#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4530#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
4531
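/*
 * Editor's note (illustrative, not part of the original source): with
 * DRT_BITVECTOR_PAGES == 256 and 4KB pages, each hashtable entry covers a
 * 1MB-aligned window of the file.  DRT_ALIGN_ADDRESS() rounds an offset down
 * to the start of its window (e.g. 0x01234567 -> 0x01200000); the per-page
 * dirty state within the window lives in the entry's 256-bit vector.
 */
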
4532/*
4533 * Hashtable address field handling.
4534 *
4535 * The low-order bits of the hashtable address are used to conserve
4536 * space.
4537 *
4538 * DRT_HASH_COUNT_MASK must be large enough to store the range
4539 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4540 * to indicate that the bucket is actually unoccupied.
4541 */
4542#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4543#define DRT_HASH_SET_ADDRESS(scm, i, a) \
4544 do { \
4545 (scm)->scm_hashtable[(i)].dhe_control = \
4546 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4547 } while (0)
4548#define DRT_HASH_COUNT_MASK 0x1ff
4549#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4550#define DRT_HASH_SET_COUNT(scm, i, c) \
4551 do { \
4552 (scm)->scm_hashtable[(i)].dhe_control = \
4553 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4554 } while (0)
4555#define DRT_HASH_CLEAR(scm, i) \
4556 do { \
4557 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4558 } while (0)
4559#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4560#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4561#define DRT_HASH_COPY(oscm, oi, scm, i) \
4562 do { \
4563 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4564 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
 4565 } while (0)
4566
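/*
 * Editor's note (illustrative, not part of the original source): dhe_control
 * thus packs two fields into one 64-bit word.  The high bits hold the
 * DRT_ALIGN_ADDRESS()-aligned file offset of the region; the low nine bits
 * (DRT_HASH_COUNT_MASK == 0x1ff) hold the count of dirty pages, 0-256, with
 * the all-ones value reserved by DRT_HASH_VACATE()/DRT_HASH_VACANT() to mark
 * an unoccupied bucket.
 */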
4567
4568/*
4569 * Hash table moduli.
4570 *
4571 * Since the hashtable entry's size is dependent on the size of
4572 * the bitvector, and since the hashtable size is constrained to
4573 * both being prime and fitting within the desired allocation
4574 * size, these values need to be manually determined.
4575 *
 4576 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4577 *
4578 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4579 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4580 */
4581#define DRT_HASH_SMALL_MODULUS 23
4582#define DRT_HASH_LARGE_MODULUS 401
4583
4584#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4585#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
4586
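/*
 * Editor's note (illustrative, not part of the original source): each entry
 * is an 8-byte control word plus 256/32 == 8 32-bit bitvector words, i.e.
 * 40 bytes.  23 * 40 == 920 bytes fit in the 1024-byte small allocation and
 * 401 * 40 == 16040 bytes fit in the 16384-byte large allocation, which is
 * where the spare-byte figures above come from.
 */
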
4587/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4588
4589/*
4590 * Hashtable bitvector handling.
4591 *
4592 * Bitvector fields are 32 bits long.
4593 */
4594
4595#define DRT_HASH_SET_BIT(scm, i, bit) \
4596 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4597
4598#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4599 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4600
4601#define DRT_HASH_TEST_BIT(scm, i, bit) \
4602 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4603
4604#define DRT_BITVECTOR_CLEAR(scm, i) \
4605 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4606
4607#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4608 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4609 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4610 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4611
4612
4613
4614/*
4615 * Hashtable entry.
4616 */
4617struct vfs_drt_hashentry {
4618 u_int64_t dhe_control;
4619 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4620};
4621
4622/*
4623 * Dirty Region Tracking structure.
4624 *
4625 * The hashtable is allocated entirely inside the DRT structure.
4626 *
4627 * The hash is a simple circular prime modulus arrangement, the structure
4628 * is resized from small to large if it overflows.
4629 */
4630
4631struct vfs_drt_clustermap {
4632 u_int32_t scm_magic; /* sanity/detection */
4633#define DRT_SCM_MAGIC 0x12020003
4634 u_int32_t scm_modulus; /* current ring size */
4635 u_int32_t scm_buckets; /* number of occupied buckets */
4636 u_int32_t scm_lastclean; /* last entry we cleaned */
4637 u_int32_t scm_iskips; /* number of slot skips */
4638
4639 struct vfs_drt_hashentry scm_hashtable[0];
4640};
4641
4642
4643#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4644#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
4645
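/*
 * Editor's illustrative sketch (not part of the original source): how a byte
 * offset is decomposed into a hash bucket and a bit within that bucket's
 * bitvector.  This mirrors the arithmetic used by vfs_drt_search_index() and
 * vfs_drt_do_mark_pages() below, but skips the linear probing those routines
 * perform on hash collisions.  The helper name is invented; __unused keeps a
 * hypothetical build from warning about the uncalled example.
 */
static __unused int
vfs_drt_page_is_dirty_example(struct vfs_drt_clustermap *cmap, u_int64_t offset)
{
	u_int64_t base;
	int	  bucket;
	int	  bit;

	base   = DRT_ALIGN_ADDRESS(offset);	/* aligned start of the region covered by one entry */
	bucket = DRT_HASH(cmap, base);		/* circular hash: base % scm_modulus */
	bit    = (offset - base) / PAGE_SIZE;	/* page index within the region: 0..DRT_BITVECTOR_PAGES-1 */

	if (DRT_HASH_VACANT(cmap, bucket))
		return (0);
	if (DRT_HASH_GET_ADDRESS(cmap, bucket) != base)
		return (0);			/* another region hashed here; the real code probes onward */

	return (DRT_HASH_TEST_BIT(cmap, bucket, bit) ? 1 : 0);
}
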
4646/*
4647 * Debugging codes and arguments.
4648 */
4649#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4650#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4651#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4652#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4653#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4654 * dirty */
4655 /* 0, setcount */
4656 /* 1 (clean, no map) */
4657 /* 2 (map alloc fail) */
4658 /* 3, resid (partial) */
4659#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4660#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4661 * lastclean, iskips */
4662
4663
55e303ae
A
4664static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4665static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4666static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4667 u_int64_t offset, int *indexp);
4668static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4669 u_int64_t offset,
4670 int *indexp,
4671 int recursed);
4672static kern_return_t vfs_drt_do_mark_pages(
4673 void **cmapp,
4674 u_int64_t offset,
4675 u_int length,
4676 int *setcountp,
4677 int dirty);
4678static void vfs_drt_trace(
4679 struct vfs_drt_clustermap *cmap,
4680 int code,
4681 int arg1,
4682 int arg2,
4683 int arg3,
4684 int arg4);
4685
4686
4687/*
4688 * Allocate and initialise a sparse cluster map.
4689 *
4690 * Will allocate a new map, resize or compact an existing map.
4691 *
4692 * XXX we should probably have at least one intermediate map size,
4693 * as the 1:16 ratio seems a bit drastic.
4694 */
4695static kern_return_t
4696vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4697{
4698 struct vfs_drt_clustermap *cmap, *ocmap;
4699 kern_return_t kret;
4700 u_int64_t offset;
4701 int nsize, i, active_buckets, index, copycount;
4702
4703 ocmap = NULL;
4704 if (cmapp != NULL)
4705 ocmap = *cmapp;
4706
4707 /*
4708 * Decide on the size of the new map.
4709 */
4710 if (ocmap == NULL) {
4711 nsize = DRT_HASH_SMALL_MODULUS;
4712 } else {
4713 /* count the number of active buckets in the old map */
4714 active_buckets = 0;
4715 for (i = 0; i < ocmap->scm_modulus; i++) {
4716 if (!DRT_HASH_VACANT(ocmap, i) &&
4717 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4718 active_buckets++;
4719 }
4720 /*
4721 * If we're currently using the small allocation, check to
4722 * see whether we should grow to the large one.
4723 */
4724 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4725 /* if the ring is nearly full */
4726 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4727 nsize = DRT_HASH_LARGE_MODULUS;
4728 } else {
4729 nsize = DRT_HASH_SMALL_MODULUS;
4730 }
4731 } else {
4732 /* already using the large modulus */
4733 nsize = DRT_HASH_LARGE_MODULUS;
4734 /*
4735 * If the ring is completely full, there's
4736 * nothing useful for us to do. Behave as
4737 * though we had compacted into the new
4738 * array and return.
4739 */
4740 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4741 return(KERN_SUCCESS);
4742 }
4743 }
4744
4745 /*
4746 * Allocate and initialise the new map.
4747 */
4748
4749 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4750 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4751 if (kret != KERN_SUCCESS)
4752 return(kret);
4753 cmap->scm_magic = DRT_SCM_MAGIC;
4754 cmap->scm_modulus = nsize;
4755 cmap->scm_buckets = 0;
4756 cmap->scm_lastclean = 0;
4757 cmap->scm_iskips = 0;
4758 for (i = 0; i < cmap->scm_modulus; i++) {
4759 DRT_HASH_CLEAR(cmap, i);
4760 DRT_HASH_VACATE(cmap, i);
4761 DRT_BITVECTOR_CLEAR(cmap, i);
4762 }
4763
4764 /*
4765 * If there's an old map, re-hash entries from it into the new map.
4766 */
4767 copycount = 0;
4768 if (ocmap != NULL) {
4769 for (i = 0; i < ocmap->scm_modulus; i++) {
4770 /* skip empty buckets */
4771 if (DRT_HASH_VACANT(ocmap, i) ||
4772 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4773 continue;
4774 /* get new index */
4775 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4776 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4777 if (kret != KERN_SUCCESS) {
4778 /* XXX need to bail out gracefully here */
4779 panic("vfs_drt: new cluster map mysteriously too small");
4780 }
4781 /* copy */
4782 DRT_HASH_COPY(ocmap, i, cmap, index);
4783 copycount++;
4784 }
4785 }
4786
4787 /* log what we've done */
4788 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4789
4790 /*
4791 * It's important to ensure that *cmapp always points to
4792 * a valid map, so we must overwrite it before freeing
4793 * the old map.
4794 */
4795 *cmapp = cmap;
4796 if (ocmap != NULL) {
4797 /* emit stats into trace buffer */
4798 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4799 ocmap->scm_modulus,
4800 ocmap->scm_buckets,
4801 ocmap->scm_lastclean,
4802 ocmap->scm_iskips);
4803
4804 vfs_drt_free_map(ocmap);
4805 }
4806 return(KERN_SUCCESS);
4807}
4808
4809
4810/*
4811 * Free a sparse cluster map.
4812 */
4813static kern_return_t
4814vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4815{
55e303ae
A
4816 kmem_free(kernel_map, (vm_offset_t)cmap,
4817 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4818 return(KERN_SUCCESS);
4819}
4820
4821
4822/*
4823 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4824 */
4825static kern_return_t
4826vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4827{
91447636 4828 int index, i;
55e303ae
A
4829
4830 offset = DRT_ALIGN_ADDRESS(offset);
4831 index = DRT_HASH(cmap, offset);
4832
4833 /* traverse the hashtable */
4834 for (i = 0; i < cmap->scm_modulus; i++) {
4835
4836 /*
4837 * If the slot is vacant, we can stop.
4838 */
4839 if (DRT_HASH_VACANT(cmap, index))
4840 break;
4841
4842 /*
4843 * If the address matches our offset, we have success.
4844 */
4845 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4846 *indexp = index;
4847 return(KERN_SUCCESS);
4848 }
4849
4850 /*
4851 * Move to the next slot, try again.
4852 */
4853 index = DRT_HASH_NEXT(cmap, index);
4854 }
4855 /*
4856 * It's not there.
4857 */
4858 return(KERN_FAILURE);
4859}
4860
4861/*
4862 * Find the hashtable slot for the supplied offset. If we haven't allocated
4863 * one yet, allocate one and populate the address field. Note that it will
4864 * not have a nonzero page count and thus will still technically be free, so
4865 * in the case where we are called to clean pages, the slot will remain free.
4866 */
4867static kern_return_t
4868vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4869{
4870 struct vfs_drt_clustermap *cmap;
4871 kern_return_t kret;
4872 int index, i;
4873
4874 cmap = *cmapp;
4875
4876 /* look for an existing entry */
4877 kret = vfs_drt_search_index(cmap, offset, indexp);
4878 if (kret == KERN_SUCCESS)
4879 return(kret);
4880
4881 /* need to allocate an entry */
4882 offset = DRT_ALIGN_ADDRESS(offset);
4883 index = DRT_HASH(cmap, offset);
4884
4885 /* scan from the index forwards looking for a vacant slot */
4886 for (i = 0; i < cmap->scm_modulus; i++) {
4887 /* slot vacant? */
4888 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4889 cmap->scm_buckets++;
4890 if (index < cmap->scm_lastclean)
4891 cmap->scm_lastclean = index;
4892 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4893 DRT_HASH_SET_COUNT(cmap, index, 0);
4894 DRT_BITVECTOR_CLEAR(cmap, index);
4895 *indexp = index;
4896 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4897 return(KERN_SUCCESS);
4898 }
4899 cmap->scm_iskips += i;
4900 index = DRT_HASH_NEXT(cmap, index);
4901 }
4902
4903 /*
4904 * We haven't found a vacant slot, so the map is full. If we're not
4905 * already recursed, try reallocating/compacting it.
4906 */
4907 if (recursed)
4908 return(KERN_FAILURE);
4909 kret = vfs_drt_alloc_map(cmapp);
4910 if (kret == KERN_SUCCESS) {
4911 /* now try to insert again */
4912 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4913 }
4914 return(kret);
4915}
4916
4917/*
4918 * Implementation of set dirty/clean.
4919 *
4920 * In the 'clean' case, not finding a map is OK.
4921 */
4922static kern_return_t
4923vfs_drt_do_mark_pages(
4924 void **private,
4925 u_int64_t offset,
4926 u_int length,
4927 int *setcountp,
4928 int dirty)
4929{
4930 struct vfs_drt_clustermap *cmap, **cmapp;
4931 kern_return_t kret;
4932 int i, index, pgoff, pgcount, setcount, ecount;
4933
4934 cmapp = (struct vfs_drt_clustermap **)private;
4935 cmap = *cmapp;
4936
4937 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4938
4939 if (setcountp != NULL)
4940 *setcountp = 0;
4941
4942 /* allocate a cluster map if we don't already have one */
4943 if (cmap == NULL) {
4944 /* no cluster map, nothing to clean */
4945 if (!dirty) {
4946 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4947 return(KERN_SUCCESS);
4948 }
4949 kret = vfs_drt_alloc_map(cmapp);
4950 if (kret != KERN_SUCCESS) {
4951 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4952 return(kret);
4953 }
4954 }
4955 setcount = 0;
4956
4957 /*
4958 * Iterate over the length of the region.
4959 */
4960 while (length > 0) {
4961 /*
4962 * Get the hashtable index for this offset.
4963 *
4964 * XXX this will add blank entries if we are clearing a range
4965 * that hasn't been dirtied.
4966 */
4967 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4968 cmap = *cmapp; /* may have changed! */
4969 /* this may be a partial-success return */
4970 if (kret != KERN_SUCCESS) {
4971 if (setcountp != NULL)
4972 *setcountp = setcount;
4973 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4974
4975 return(kret);
4976 }
4977
4978 /*
4979 * Work out how many pages we're modifying in this
4980 * hashtable entry.
4981 */
4982 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4983 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4984
4985 /*
4986 * Iterate over pages, dirty/clearing as we go.
4987 */
4988 ecount = DRT_HASH_GET_COUNT(cmap, index);
4989 for (i = 0; i < pgcount; i++) {
4990 if (dirty) {
4991 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4992 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4993 ecount++;
4994 setcount++;
4995 }
4996 } else {
4997 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4998 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4999 ecount--;
5000 setcount++;
5001 }
5002 }
5003 }
5004 DRT_HASH_SET_COUNT(cmap, index, ecount);
91447636 5005
5006 offset += pgcount * PAGE_SIZE;
5007 length -= pgcount * PAGE_SIZE;
5008 }
5009 if (setcountp != NULL)
5010 *setcountp = setcount;
5011
5012 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5013
5014 return(KERN_SUCCESS);
5015}
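/*
 * Worked example of the pgoff/pgcount arithmetic above, assuming
 * PAGE_SIZE == 4096 and DRT_BITVECTOR_PAGES == 256 (i.e. each hash entry
 * covers a 1 MB window); both values are illustrative assumptions, not
 * taken from this file.
 *
 * Marking offset = 0x1FE000, length = 0x8000 (8 pages) dirty:
 *
 *   pass 1:  DRT_ALIGN_ADDRESS(0x1FE000) = 0x100000
 *            pgoff   = (0x1FE000 - 0x100000) / 4096 = 254
 *            pgcount = min(8, 256 - 254)            = 2
 *            -> bits 254 and 255 set in the first entry,
 *               offset advances to 0x200000, length drops to 0x6000
 *
 *   pass 2:  DRT_ALIGN_ADDRESS(0x200000) = 0x200000
 *            pgoff   = 0
 *            pgcount = min(6, 256 - 0)              = 6
 *            -> bits 0..5 set in the next entry, length reaches 0
 */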
5016
5017/*
5018 * Mark a set of pages as dirty/clean.
5019 *
5020 * This is a public interface.
5021 *
5022 * cmapp
5023 * Pointer to storage suitable for holding a pointer. Note that
5024 * this must either be NULL or a value set by this function.
5025 *
5026 * size
5027 * Current file size in bytes.
5028 *
5029 * offset
5030 * Offset of the first page to be marked as dirty, in bytes. Must be
5031 * page-aligned.
5032 *
5033 * length
5034 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5035 *
5036 * setcountp
5037 * Number of pages newly marked dirty by this call (optional).
5038 *
5039 * Returns KERN_SUCCESS if all the pages were successfully marked.
5040 */
5041static kern_return_t
5042vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5043{
5044 /* XXX size unused, drop from interface */
5045 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5046}
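/*
 * Illustrative sketch of driving the interface above (guarded by #if 0,
 * not built).  'scmap' stands in for the caller's opaque per-file pointer
 * and example_mark_dirty is a hypothetical helper; the pointer starts out
 * NULL and is (re)set by vfs_drt_mark_pages() as the map is allocated or
 * grown.
 */
#if 0
static void
example_mark_dirty(void **scmap, off_t offset, u_int length)
{
	int		new_dirty;
	kern_return_t	kret;

	/* offset must be page-aligned, length a multiple of PAGE_SIZE */
	kret = vfs_drt_mark_pages(scmap, offset, length, &new_dirty);

	if (kret == KERN_SUCCESS) {
		/* new_dirty counts pages that were not already marked */
	}
}
#endif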
5047
91447636 5048#if 0
5049static kern_return_t
5050vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5051{
5052 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5053}
91447636 5054#endif
5055
5056/*
5057 * Get a cluster of dirty pages.
5058 *
5059 * This is a public interface.
5060 *
5061 * cmapp
5062 * Pointer to storage managed by drt_mark_pages. Note that this must
5063 * be NULL or a value set by drt_mark_pages.
5064 *
5065 * offsetp
5066 * Returns the byte offset into the file of the first page in the cluster.
5067 *
5068 * lengthp
5069 * Returns the length in bytes of the cluster of dirty pages.
5070 *
5071 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
 5072 * are no dirty pages meeting the minimum size criteria. Private storage will
 5073 * be released if there are no more dirty pages left in the map.
5074 *
5075 */
5076static kern_return_t
5077vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5078{
5079 struct vfs_drt_clustermap *cmap;
5080 u_int64_t offset;
5081 u_int length;
5082 int index, i, j, fs, ls;
5083
5084 /* sanity */
5085 if ((cmapp == NULL) || (*cmapp == NULL))
5086 return(KERN_FAILURE);
5087 cmap = *cmapp;
5088
5089 /* walk the hashtable */
5090 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5091 index = DRT_HASH(cmap, offset);
5092
5093 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5094 continue;
5095
5096 /* scan the bitfield for a string of bits */
5097 fs = -1;
5098
5099 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5100 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5101 fs = i;
5102 break;
5103 }
5104 }
5105 if (fs == -1) {
5106 /* didn't find any bits set */
5107 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5108 }
5109 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5110 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5111 break;
5112 }
5113
5114 /* compute offset and length, mark pages clean */
5115 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5116 length = ls * PAGE_SIZE;
5117 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5118 cmap->scm_lastclean = index;
5119
5120 /* return successful */
5121 *offsetp = (off_t)offset;
5122 *lengthp = length;
5123
5124 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5125 return(KERN_SUCCESS);
5126 }
5127 /*
 5128 * We didn't find anything... the hashtable is empty.
 5129 * Emit stats into the trace buffer and
 5130 * then free it.
5131 */
5132 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5133 cmap->scm_modulus,
5134 cmap->scm_buckets,
5135 cmap->scm_lastclean,
5136 cmap->scm_iskips);
5137
5138 vfs_drt_free_map(cmap);
5139 *cmapp = NULL;
5140
5141 return(KERN_FAILURE);
5142}
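/*
 * Illustrative drain loop for the interface above (guarded by #if 0, not
 * built).  Each successful call hands back one contiguous run of dirty
 * pages and clears it from the map; KERN_FAILURE means no dirty pages
 * remain -- if a map existed it has been freed and *scmap reset to NULL.
 * example_push_now is a hypothetical callback standing in for whatever
 * actually writes the pages out.
 */
#if 0
static void
example_drain_dirty(void **scmap, void (*example_push_now)(off_t offset, u_int length))
{
	off_t	offset;
	u_int	length;

	while (vfs_drt_get_cluster(scmap, &offset, &length) == KERN_SUCCESS) {
		/* offset/length describe one run of contiguous dirty pages */
		example_push_now(offset, length);
	}
	/* here the private storage has already been released */
}
#endif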
5143
5144
5145static kern_return_t
5146vfs_drt_control(void **cmapp, int op_type)
5147{
5148 struct vfs_drt_clustermap *cmap;
5149
5150 /* sanity */
5151 if ((cmapp == NULL) || (*cmapp == NULL))
5152 return(KERN_FAILURE);
5153 cmap = *cmapp;
5154
5155 switch (op_type) {
5156 case 0:
5157 /* emit stats into trace buffer */
5158 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5159 cmap->scm_modulus,
5160 cmap->scm_buckets,
5161 cmap->scm_lastclean,
5162 cmap->scm_iskips);
5163
5164 vfs_drt_free_map(cmap);
5165 *cmapp = NULL;
5166 break;
5167
5168 case 1:
5169 cmap->scm_lastclean = 0;
5170 break;
5171 }
5172 return(KERN_SUCCESS);
5173}
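/*
 * Illustrative teardown call for vfs_drt_control() above (guarded by
 * #if 0, not built).  Op 0 dumps the map's statistics to the trace
 * buffer, frees it and NULLs the caller's pointer; op 1 only rewinds
 * scm_lastclean to 0.  example_drop_map is a hypothetical helper.
 */
#if 0
static void
example_drop_map(void **scmap)
{
	if (scmap != NULL && *scmap != NULL)
		(void) vfs_drt_control(scmap, 0);	/* on return, *scmap == NULL */
}
#endif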
5174
5175
5176
5177/*
5178 * Emit a summary of the state of the clustermap into the trace buffer
5179 * along with some caller-provided data.
5180 */
91447636 5181#if KDEBUG
55e303ae 5182static void
91447636 5183vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5184{
5185 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5186}
5187#else
5188static void
5189vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5190 __unused int arg1, __unused int arg2, __unused int arg3,
5191 __unused int arg4)
5192{
5193}
5194#endif
55e303ae 5195
91447636 5196#if 0
5197/*
5198 * Perform basic sanity check on the hash entry summary count
5199 * vs. the actual bits set in the entry.
5200 */
5201static void
5202vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5203{
5204 int index, i;
5205 int bits_on;
5206
5207 for (index = 0; index < cmap->scm_modulus; index++) {
5208 if (DRT_HASH_VACANT(cmap, index))
5209 continue;
5210
5211 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5212 if (DRT_HASH_TEST_BIT(cmap, index, i))
5213 bits_on++;
5214 }
5215 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5216 panic("bits_on = %d, index = %d\n", bits_on, index);
5217 }
b4c24cb9 5218}
91447636 5219#endif