[apple/xnu.git] / bsd / vfs / vfs_cluster.c (xnu-792.13.8)
1/*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
31/*
32 * Copyright (c) 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgement:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * 4. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 *
63 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
64 */
65
66#include <sys/param.h>
67#include <sys/proc_internal.h>
68#include <sys/buf_internal.h>
69#include <sys/mount_internal.h>
70#include <sys/vnode_internal.h>
71#include <sys/trace.h>
72#include <sys/malloc.h>
73#include <sys/time.h>
74#include <sys/kernel.h>
75#include <sys/resourcevar.h>
76#include <sys/uio_internal.h>
77#include <libkern/libkern.h>
78#include <machine/machine_routines.h>
79
80#include <sys/ubc_internal.h>
81
82#include <mach/mach_types.h>
83#include <mach/memory_object_types.h>
84#include <mach/vm_map.h>
85#include <mach/upl.h>
86
87#include <vm/vm_kern.h>
88#include <vm/vm_map.h>
89#include <vm/vm_pageout.h>
90
91#include <sys/kdebug.h>
92
93
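/*
 * flag values for the 'flags' argument to cluster_io()
 */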
94#define CL_READ 0x01
95#define CL_ASYNC 0x02
96#define CL_COMMIT 0x04
97#define CL_PAGEOUT 0x10
98#define CL_AGE 0x20
99#define CL_DUMP 0x40
100#define CL_NOZERO 0x80
101#define CL_PAGEIN 0x100
102#define CL_DEV_MEMORY 0x200
103#define CL_PRESERVE 0x400
104#define CL_THROTTLE 0x800
105#define CL_KEEPCACHED 0x1000
106
107
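/*
 * per-stream state shared between the issuer of a set of async
 * cluster I/Os and cluster_iodone()... updates and waits on this
 * structure are serialized by cl_mtxp
 */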
108struct clios {
109 u_int io_completed; /* amount of io that has currently completed */
110 u_int io_issued; /* amount of io that was successfully issued */
111 int io_error; /* error code of first error encountered */
112 int io_wanted; /* someone is sleeping waiting for a change in state */
113};
114
115static lck_grp_t *cl_mtx_grp;
116static lck_attr_t *cl_mtx_attr;
117static lck_grp_attr_t *cl_mtx_grp_attr;
118static lck_mtx_t *cl_mtxp;
119
120
121static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
122 int flags, buf_t real_bp, struct clios *iostate);
123static int cluster_iodone(buf_t bp, void *dummy);
124static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
125static int cluster_hard_throttle_on(vnode_t vp);
126
127static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
128static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
129 off_t headOff, off_t tailOff, int flags);
130static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
131static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
132static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
133static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
134static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
135
136static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
137
138static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
139static void cluster_push_EOF(vnode_t vp, off_t EOF);
140
141static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
142
143static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
144static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
145static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
146
147static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
148static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
149static kern_return_t vfs_drt_control(void **cmapp, int op_type);
150
151int is_file_clean(vnode_t, off_t);
152
153/*
154 * throttle the number of async writes that
155 * can be outstanding on a single vnode
156 * before we issue a synchronous write
157 */
158#define HARD_THROTTLE_MAXCNT 0
159#define HARD_THROTTLE_MAXSIZE (64 * 1024)
160
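/*
 * state examined by cluster_hard_throttle_on() to decide whether
 * I/O against the root device should currently be constrained
 */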
161int hard_throttle_on_root = 0;
162struct timeval priority_IO_timestamp_for_root;
163
164
165void
166cluster_init(void) {
167 /*
168 * allocate lock group attribute and group
169 */
170 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
171 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
172
173 /*
174 * allocate the lock attribute
175 */
176 cl_mtx_attr = lck_attr_alloc_init();
177
178 /*
 179 * allocate and initialize the mutex used to protect updates and waits
180 * on the cluster_io context
181 */
182 cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
183
184 if (cl_mtxp == NULL)
185 panic("cluster_init: failed to allocate cl_mtxp");
186}
187
188
189
190#define CLW_ALLOCATE 0x01
191#define CLW_RETURNLOCKED 0x02
192/*
193 * if the read ahead context doesn't yet exist,
194 * allocate and initialize it...
195 * the vnode lock serializes multiple callers
196 * during the actual assignment... first one
197 * to grab the lock wins... the other callers
198 * will release the now unnecessary storage
199 *
200 * once the context is present, try to grab (but don't block on)
201 * the lock associated with it... if someone
 202 * else currently owns it, then the read
203 * will run without read-ahead. this allows
204 * multiple readers to run in parallel and
205 * since there's only 1 read ahead context,
206 * there's no real loss in only allowing 1
207 * reader to have read-ahead enabled.
208 */
209static struct cl_readahead *
210cluster_get_rap(vnode_t vp)
211{
212 struct ubc_info *ubc;
213 struct cl_readahead *rap;
214
215 ubc = vp->v_ubcinfo;
216
217 if ((rap = ubc->cl_rahead) == NULL) {
218 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
219
220 bzero(rap, sizeof *rap);
221 rap->cl_lastr = -1;
222 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
223
224 vnode_lock(vp);
225
226 if (ubc->cl_rahead == NULL)
227 ubc->cl_rahead = rap;
228 else {
229 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
230 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
231 rap = ubc->cl_rahead;
232 }
233 vnode_unlock(vp);
234 }
235 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
236 return(rap);
237
238 return ((struct cl_readahead *)NULL);
239}
240
241
242/*
243 * if the write behind context doesn't yet exist,
244 * and CLW_ALLOCATE is specified, allocate and initialize it...
245 * the vnode lock serializes multiple callers
246 * during the actual assignment... first one
247 * to grab the lock wins... the other callers
248 * will release the now unnecessary storage
249 *
250 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
251 * the lock associated with the write behind context before
252 * returning
253 */
254
255static struct cl_writebehind *
256cluster_get_wbp(vnode_t vp, int flags)
257{
258 struct ubc_info *ubc;
259 struct cl_writebehind *wbp;
260
261 ubc = vp->v_ubcinfo;
262
263 if ((wbp = ubc->cl_wbehind) == NULL) {
264
265 if ( !(flags & CLW_ALLOCATE))
266 return ((struct cl_writebehind *)NULL);
267
268 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
269
270 bzero(wbp, sizeof *wbp);
271 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
272
273 vnode_lock(vp);
274
275 if (ubc->cl_wbehind == NULL)
276 ubc->cl_wbehind = wbp;
277 else {
278 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
279 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
280 wbp = ubc->cl_wbehind;
281 }
282 vnode_unlock(vp);
283 }
284 if (flags & CLW_RETURNLOCKED)
285 lck_mtx_lock(&wbp->cl_lockw);
286
287 return (wbp);
288}
289
290
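/*
 * return non-zero if I/O to the root device should be throttled...
 * that's the case when hard_throttle_on_root is set, or when a
 * priority I/O against the root was timestamped within the last 200 ms
 */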
291static int
292cluster_hard_throttle_on(vnode_t vp)
293{
294 static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
295
296 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
297 struct timeval elapsed;
298
299 if (hard_throttle_on_root)
300 return(1);
301
302 microuptime(&elapsed);
303 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
304
305 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
306 return(1);
307 }
308 return(0);
309}
310
311
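/*
 * completion routine for each component buf of a cluster transaction...
 * once every buf in the chain is marked B_DONE, roll up the error and
 * resid state, wake any waiter on the clios stream, and commit or abort
 * the associated upl as requested by the issuer
 */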
312static int
313cluster_iodone(buf_t bp, __unused void *dummy)
314{
315 int b_flags;
316 int error;
317 int total_size;
318 int total_resid;
319 int upl_offset;
320 int zero_offset;
321 upl_t upl;
322 buf_t cbp;
323 buf_t cbp_head;
324 buf_t cbp_next;
325 buf_t real_bp;
326 struct clios *iostate;
327 int commit_size;
328 int pg_offset;
329
330 cbp_head = (buf_t)(bp->b_trans_head);
331
332 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
333 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
334
335 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
336 /*
337 * all I/O requests that are part of this transaction
338 * have to complete before we can process it
339 */
340 if ( !(cbp->b_flags & B_DONE)) {
341
342 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
343 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
344
345 return 0;
346 }
347 }
348 error = 0;
349 total_size = 0;
350 total_resid = 0;
351
352 cbp = cbp_head;
353 upl_offset = cbp->b_uploffset;
354 upl = cbp->b_upl;
355 b_flags = cbp->b_flags;
356 real_bp = cbp->b_real_bp;
357 zero_offset= cbp->b_validend;
358 iostate = (struct clios *)cbp->b_iostate;
359
360 if (real_bp)
361 real_bp->b_dev = cbp->b_dev;
362
363 while (cbp) {
364 if ((cbp->b_flags & B_ERROR) && error == 0)
365 error = cbp->b_error;
366
367 total_resid += cbp->b_resid;
368 total_size += cbp->b_bcount;
369
370 cbp_next = cbp->b_trans_next;
371
372 free_io_buf(cbp);
373
374 cbp = cbp_next;
375 }
376 if (zero_offset)
377 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
378
379 if (iostate) {
380 int need_wakeup = 0;
381
382 /*
 383 * someone has issued multiple I/Os asynchronously
384 * and is waiting for them to complete (streaming)
385 */
386 lck_mtx_lock(cl_mtxp);
387
388 if (error && iostate->io_error == 0)
389 iostate->io_error = error;
390
391 iostate->io_completed += total_size;
392
393 if (iostate->io_wanted) {
394 /*
395 * someone is waiting for the state of
396 * this io stream to change
397 */
398 iostate->io_wanted = 0;
399 need_wakeup = 1;
400 }
401 lck_mtx_unlock(cl_mtxp);
402
403 if (need_wakeup)
404 wakeup((caddr_t)&iostate->io_wanted);
405 }
406 if ((b_flags & B_NEED_IODONE) && real_bp) {
407 if (error) {
408 real_bp->b_flags |= B_ERROR;
409 real_bp->b_error = error;
410 }
411 real_bp->b_resid = total_resid;
412
413 buf_biodone(real_bp);
414 }
415 if (error == 0 && total_resid)
416 error = EIO;
417
418 if (b_flags & B_COMMIT_UPL) {
419 pg_offset = upl_offset & PAGE_MASK;
420 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
421
422 if (error || (b_flags & B_NOCACHE)) {
423 int upl_abort_code;
424 int page_in = 0;
425 int page_out = 0;
426
427 if (b_flags & B_PAGEIO) {
428 if (b_flags & B_READ)
429 page_in = 1;
430 else
431 page_out = 1;
432 }
433 if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */
434 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
435 else if (page_out && (error != ENXIO)) /* transient error */
436 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
437 else if (page_in)
438 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
439 else
440 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
441
442 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
443 upl_abort_code);
444
445 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
446 (int)upl, upl_offset - pg_offset, commit_size,
447 0x80000000|upl_abort_code, 0);
448
449 } else {
450 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
451
452 if ((b_flags & B_PHYS) && (b_flags & B_READ))
453 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
454
455 if (b_flags & B_AGE)
456 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
457
458 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
459 upl_commit_flags);
460
461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
462 (int)upl, upl_offset - pg_offset, commit_size,
463 upl_commit_flags, 0);
464 }
465 } else {
466 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
467 (int)upl, upl_offset, 0, error, 0);
468 }
469
470 return (error);
471}
472
473
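/*
 * zero 'size' bytes of the upl starting at 'upl_offset'... use the
 * buffer's kernel mapping when one exists, otherwise zero the upl's
 * physical pages directly via bzero_phys()
 */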
474void
475cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
476{
477 upl_page_info_t *pl;
478
479 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
480 upl_offset, size, (int)bp, 0, 0);
481
482 if (bp == NULL || bp->b_datap == 0) {
483
484 pl = ubc_upl_pageinfo(upl);
485
486 while (size) {
487 int page_offset;
488 int page_index;
489 addr64_t zero_addr;
490 int zero_cnt;
491
492 page_index = upl_offset / PAGE_SIZE;
493 page_offset = upl_offset & PAGE_MASK;
494
495 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
496 zero_cnt = min(PAGE_SIZE - page_offset, size);
497
498 bzero_phys(zero_addr, zero_cnt);
499
500 size -= zero_cnt;
501 upl_offset += zero_cnt;
502 }
503 } else
504 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
505
506 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
507 upl_offset, size, 0, 0, 0);
508}
509
510
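/*
 * carve the requested range into one or more bufs based on the extents
 * returned by VNOP_BLOCKMAP and the device's transfer limits, chain them
 * into transactions and issue them via VNOP_STRATEGY... synchronous
 * callers wait here and run cluster_iodone() in-line
 */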
511static int
512cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
513 int flags, buf_t real_bp, struct clios *iostate)
514{
515 buf_t cbp;
516 u_int size;
517 u_int io_size;
518 int io_flags;
519 int bmap_flags;
520 int error = 0;
521 int retval = 0;
522 buf_t cbp_head = NULL;
523 buf_t cbp_tail = NULL;
524 int trans_count = 0;
525 u_int pg_count;
526 int pg_offset;
527 u_int max_iosize;
528 u_int max_vectors;
529 int priv;
530 int zero_offset = 0;
531 int async_throttle = 0;
532 mount_t mp;
533
534 mp = vp->v_mount;
535
536 if (mp->mnt_devblocksize > 1) {
537 /*
538 * round the requested size up so that this I/O ends on a
539 * page boundary in case this is a 'write'... if the filesystem
 540 * has blocks allocated to back the page beyond the EOF, we want to
 541 * make sure to write out the zeros that are sitting beyond the EOF,
 542 * so that if the filesystem doesn't explicitly zero this area when
 543 * a hole is created via an lseek/write beyond the current EOF,
 544 * it will still return zeros when it's read back from the disk. If the
545 * physical allocation doesn't extend for the whole page, we'll
546 * only write/read from the disk up to the end of this allocation
547 * via the extent info returned from the VNOP_BLOCKMAP call.
548 */
549 pg_offset = upl_offset & PAGE_MASK;
550
551 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
552 } else {
553 /*
554 * anyone advertising a blocksize of 1 byte probably
 555 * can't deal with us rounding up the request size...
556 * AFP is one such filesystem/device
557 */
558 size = non_rounded_size;
559 }
560 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
561 (int)f_offset, size, upl_offset, flags, 0);
562
563 if (flags & CL_READ) {
564 io_flags = (B_READ);
565 bmap_flags = VNODE_READ;
566
567 max_iosize = mp->mnt_maxreadcnt;
568 max_vectors = mp->mnt_segreadcnt;
569 } else {
570 io_flags = 0;
571 bmap_flags = VNODE_WRITE;
572
573 max_iosize = mp->mnt_maxwritecnt;
574 max_vectors = mp->mnt_segwritecnt;
575 }
576 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
577
578 /*
579 * make sure the maximum iosize is a
580 * multiple of the page size
581 */
582 max_iosize &= ~PAGE_MASK;
583
584 if (flags & CL_THROTTLE) {
585 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
586 if (max_iosize > HARD_THROTTLE_MAXSIZE)
587 max_iosize = HARD_THROTTLE_MAXSIZE;
588 async_throttle = HARD_THROTTLE_MAXCNT;
589 } else
590 async_throttle = VNODE_ASYNC_THROTTLE;
591 }
592 if (flags & CL_AGE)
593 io_flags |= B_AGE;
594 if (flags & CL_DUMP)
595 io_flags |= B_NOCACHE;
596 if (flags & (CL_PAGEIN | CL_PAGEOUT))
597 io_flags |= B_PAGEIO;
598 if (flags & CL_COMMIT)
599 io_flags |= B_COMMIT_UPL;
600 if (flags & CL_PRESERVE)
601 io_flags |= B_PHYS;
602 if (flags & CL_KEEPCACHED)
603 io_flags |= B_CACHE;
604
605 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
606 /*
 607 * we are going to end up
 608 * with a page that we can't complete (the file size wasn't a multiple
 609 * of PAGE_SIZE and we're trying to read to the end of the file), so
 610 * we'll go ahead and zero out the portion of the page we can't
 611 * read in from the file
612 */
613 zero_offset = upl_offset + non_rounded_size;
614 }
615 while (size) {
616 int pg_resid;
617 daddr64_t blkno;
618 daddr64_t lblkno;
619
620 if (size > max_iosize)
621 io_size = max_iosize;
622 else
623 io_size = size;
624
625 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
626 break;
627 }
628 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
629 real_bp->b_blkno = blkno;
630
631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
632 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
633
634 if (io_size == 0) {
635 /*
636 * vnop_blockmap didn't return an error... however, it did
637 * return an extent size of 0 which means we can't
638 * make forward progress on this I/O... a hole in the
 639 * file would be returned as a blkno of -1 with a non-zero io_size...
 640 * a real extent is returned with a blkno != -1 and a non-zero io_size
641 */
642 error = EINVAL;
643 break;
644 }
645 if ( !(flags & CL_READ) && blkno == -1) {
646 off_t e_offset;
647
648 /*
649 * we're writing into a 'hole'
650 */
651 if (flags & CL_PAGEOUT) {
652 /*
653 * if we got here via cluster_pageout
 654 * then just error the request and return...
 655 * the 'hole' should already have been covered
656 */
657 error = EINVAL;
658 break;
659 }
660 if ( !(flags & CL_COMMIT)) {
661 /*
662 * currently writes always request the commit to happen
663 * as part of the io completion... however, if the CL_COMMIT
 664 * flag isn't specified, then we can't issue the abort_range
 665 * since the call site is going to abort or commit the same upl...
666 * in this case we can only return an error
667 */
668 error = EINVAL;
669 break;
670 }
671 /*
672 * we can get here if the cluster code happens to
673 * pick up a page that was dirtied via mmap vs
674 * a 'write' and the page targets a 'hole'...
675 * i.e. the writes to the cluster were sparse
676 * and the file was being written for the first time
677 *
678 * we can also get here if the filesystem supports
679 * 'holes' that are less than PAGE_SIZE.... because
680 * we can't know if the range in the page that covers
681 * the 'hole' has been dirtied via an mmap or not,
682 * we have to assume the worst and try to push the
683 * entire page to storage.
684 *
685 * Try paging out the page individually before
686 * giving up entirely and dumping it (the pageout
 687 * path will ensure that the zero extent accounting
688 * has been taken care of before we get back into cluster_io)
689 */
690 ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
691
692 e_offset = round_page_64(f_offset + 1);
693
694 if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
695 error = EINVAL;
696 break;
697 }
698 io_size = e_offset - f_offset;
699
700 f_offset += io_size;
701 upl_offset += io_size;
702
703 if (size >= io_size)
704 size -= io_size;
705 else
706 size = 0;
707 /*
708 * keep track of how much of the original request
709 * that we've actually completed... non_rounded_size
710 * may go negative due to us rounding the request
711 * to a page size multiple (i.e. size > non_rounded_size)
712 */
713 non_rounded_size -= io_size;
714
715 if (non_rounded_size <= 0) {
716 /*
717 * we've transferred all of the data in the original
718 * request, but we were unable to complete the tail
719 * of the last page because the file didn't have
720 * an allocation to back that portion... this is ok.
721 */
722 size = 0;
723 }
724 continue;
725 }
726 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
727 /*
728 * we have now figured out how much I/O we can do - this is in 'io_size'
729 * pg_offset is the starting point in the first page for the I/O
730 * pg_count is the number of full and partial pages that 'io_size' encompasses
731 */
732 pg_offset = upl_offset & PAGE_MASK;
733
734 if (flags & CL_DEV_MEMORY) {
735 /*
736 * currently, can't deal with reading 'holes' in file
737 */
738 if (blkno == -1) {
739 error = EINVAL;
740 break;
741 }
742 /*
743 * treat physical requests as one 'giant' page
744 */
745 pg_count = 1;
746 } else
747 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
748
749 if ((flags & CL_READ) && blkno == -1) {
750 int bytes_to_zero;
751
752 /*
753 * if we're reading and blkno == -1, then we've got a
754 * 'hole' in the file that we need to deal with by zeroing
755 * out the affected area in the upl
756 */
757 if (zero_offset && io_size == size) {
758 /*
759 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 760 * then 'zero_offset' will be non-zero
 761 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
 762 * (indicated by the io_size finishing off the I/O request for this UPL)
 763 * then we're not going to issue an I/O for the
764 * last page in this upl... we need to zero both the hole and the tail
765 * of the page beyond the EOF, since the delayed zero-fill won't kick in
766 */
767 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
768
769 zero_offset = 0;
770 } else
771 bytes_to_zero = io_size;
772
773 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
774
775 if (cbp_head)
776 /*
777 * if there is a current I/O chain pending
778 * then the first page of the group we just zero'd
779 * will be handled by the I/O completion if the zero
780 * fill started in the middle of the page
781 */
782 pg_count = (io_size - pg_offset) / PAGE_SIZE;
783 else {
784 /*
785 * no pending I/O to pick up that first page
786 * so, we have to make sure it gets committed
787 * here.
788 * set the pg_offset to 0 so that the upl_commit_range
789 * starts with this page
790 */
791 pg_count = (io_size + pg_offset) / PAGE_SIZE;
792 pg_offset = 0;
793 }
794 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
795 /*
796 * if we're done with the request for this UPL
797 * then we have to make sure to commit the last page
798 * even if we only partially zero-filled it
799 */
800 pg_count++;
801
802 if (pg_count) {
803 if (pg_offset)
804 pg_resid = PAGE_SIZE - pg_offset;
805 else
806 pg_resid = 0;
807
808 if (flags & CL_COMMIT)
809 ubc_upl_commit_range(upl,
810 (upl_offset + pg_resid) & ~PAGE_MASK,
811 pg_count * PAGE_SIZE,
812 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
813 }
814 upl_offset += io_size;
815 f_offset += io_size;
816 size -= io_size;
817 /*
818 * keep track of how much of the original request
819 * that we've actually completed... non_rounded_size
820 * may go negative due to us rounding the request
821 * to a page size multiple (i.e. size > non_rounded_size)
822 */
823 non_rounded_size -= io_size;
824
825 if (non_rounded_size <= 0) {
826 /*
827 * we've transferred all of the data in the original
828 * request, but we were unable to complete the tail
829 * of the last page because the file didn't have
830 * an allocation to back that portion... this is ok.
831 */
832 size = 0;
833 }
834 if (cbp_head && pg_count)
835 goto start_io;
836 continue;
837
838 }
839 if (pg_count > max_vectors) {
840 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
841 io_size = PAGE_SIZE - pg_offset;
842 pg_count = 1;
843 } else {
844 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
845 pg_count = max_vectors;
846 }
847 }
848
849 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
850 /*
851 * if we're not targeting a virtual device i.e. a disk image
852 * it's safe to dip into the reserve pool since real devices
853 * can complete this I/O request without requiring additional
854 * bufs from the alloc_io_buf pool
855 */
856 priv = 1;
857 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
858 /*
859 * Throttle the speculative IO
860 */
861 priv = 0;
862 else
863 priv = 1;
864
865 cbp = alloc_io_buf(vp, priv);
866
867 if (flags & CL_PAGEOUT) {
868 u_int i;
869
870 for (i = 0; i < pg_count; i++) {
871 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
872 panic("BUSY bp found in cluster_io");
873 }
874 }
875 if (flags & CL_ASYNC) {
876 if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
877 panic("buf_setcallback failed\n");
878 }
879 cbp->b_flags |= io_flags;
880
881 cbp->b_lblkno = lblkno;
882 cbp->b_blkno = blkno;
883 cbp->b_bcount = io_size;
884
885 if (buf_setupl(cbp, upl, upl_offset))
886 panic("buf_setupl failed\n");
887
888 cbp->b_trans_next = (buf_t)NULL;
889
890 if ((cbp->b_iostate = (void *)iostate))
891 /*
892 * caller wants to track the state of this
893 * io... bump the amount issued against this stream
894 */
895 iostate->io_issued += io_size;
896
897 if (flags & CL_READ) {
898 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
899 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
900 }
901 else {
902 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
903 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
904 }
905
906 if (cbp_head) {
907 cbp_tail->b_trans_next = cbp;
908 cbp_tail = cbp;
909 } else {
910 cbp_head = cbp;
911 cbp_tail = cbp;
912 }
913 (buf_t)(cbp->b_trans_head) = cbp_head;
914 trans_count++;
915
916 upl_offset += io_size;
917 f_offset += io_size;
918 size -= io_size;
919 /*
920 * keep track of how much of the original request
921 * that we've actually completed... non_rounded_size
922 * may go negative due to us rounding the request
923 * to a page size multiple (i.e. size > non_rounded_size)
924 */
925 non_rounded_size -= io_size;
926
927 if (non_rounded_size <= 0) {
928 /*
929 * we've transferred all of the data in the original
930 * request, but we were unable to complete the tail
931 * of the last page because the file didn't have
932 * an allocation to back that portion... this is ok.
933 */
934 size = 0;
935 }
936 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
937 /*
938 * if we have no more I/O to issue or
939 * the current I/O we've prepared fully
940 * completes the last page in this request
941 * and it's either an ASYNC request or
942 * we've already accumulated more than 8 I/O's into
943 * this transaction and it's not an I/O directed to
944 * special DEVICE memory
945 * then go ahead and issue the I/O
946 */
947start_io:
948 if (real_bp) {
949 cbp_head->b_flags |= B_NEED_IODONE;
950 cbp_head->b_real_bp = real_bp;
951 } else
952 cbp_head->b_real_bp = (buf_t)NULL;
953
954 if (size == 0) {
955 /*
 956 * we're about to issue the last I/O for this upl...
 957 * if this was a read to the eof and the eof doesn't
 958 * finish on a page boundary, then we need to zero-fill
959 * the rest of the page....
960 */
961 cbp_head->b_validend = zero_offset;
962 } else
963 cbp_head->b_validend = 0;
964
965 if (flags & CL_THROTTLE)
966 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
967
968 for (cbp = cbp_head; cbp;) {
969 buf_t cbp_next;
970
971 if ( !(io_flags & B_READ))
972 vnode_startwrite(vp);
973
974 cbp_next = cbp->b_trans_next;
975
976 (void) VNOP_STRATEGY(cbp);
977 cbp = cbp_next;
978 }
979 if ( !(flags & CL_ASYNC)) {
980 int dummy;
981
982 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
983 buf_biowait(cbp);
984
985 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
986 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
987 error = 0; /* drop the error */
988 else {
989 if (retval == 0)
990 retval = error;
991 error = 0;
992 }
993 }
994 }
995 cbp_head = (buf_t)NULL;
996 cbp_tail = (buf_t)NULL;
997
998 trans_count = 0;
999 }
1000 }
1001 if (error) {
1002 int abort_size;
1003
1004 io_size = 0;
1005
1006 for (cbp = cbp_head; cbp;) {
1007 buf_t cbp_next;
1008
1009 upl_offset -= cbp->b_bcount;
1010 size += cbp->b_bcount;
1011 io_size += cbp->b_bcount;
1012
1013 cbp_next = cbp->b_trans_next;
1014 free_io_buf(cbp);
1015 cbp = cbp_next;
1016 }
1017 if (iostate) {
1018 int need_wakeup = 0;
1019
1020 /*
 1021 * update the error condition for this stream...
 1022 * since we never really issued the io,
 1023 * just go ahead and adjust the issued count back
1024 */
1025 lck_mtx_lock(cl_mtxp);
1026
1027 if (iostate->io_error == 0)
1028 iostate->io_error = error;
1029 iostate->io_issued -= io_size;
1030
1031 if (iostate->io_wanted) {
1032 /*
1033 * someone is waiting for the state of
1034 * this io stream to change
1035 */
1036 iostate->io_wanted = 0;
 1037 need_wakeup = 1;
1038 }
1039 lck_mtx_unlock(cl_mtxp);
1040
1041 if (need_wakeup)
1042 wakeup((caddr_t)&iostate->io_wanted);
1043 }
1044 pg_offset = upl_offset & PAGE_MASK;
1045 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1046
1047 if (flags & CL_COMMIT) {
1048 int upl_abort_code;
1049
1050 if (flags & CL_PRESERVE) {
1051 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1052 UPL_COMMIT_FREE_ON_EMPTY);
1053 } else {
1054 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1055 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1056 else if (flags & CL_PAGEIN)
1057 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1058 else
1059 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1060
1061 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1062 upl_abort_code);
1063 }
1064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1065 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1066 }
1067 if (real_bp) {
1068 real_bp->b_flags |= B_ERROR;
1069 real_bp->b_error = error;
1070
1071 buf_biodone(real_bp);
1072 }
1073 if (retval == 0)
1074 retval = error;
1075 }
1076 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1077 (int)f_offset, size, upl_offset, retval, 0);
1078
1079 return (retval);
1080}
1081
1082
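/*
 * issue an advisory read of up to MAX_UPL_TRANSFER pages starting at
 * f_offset (clipped to the EOF) and return the number of pages spanned
 */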
1083static int
1084cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1085{
1086 int pages_in_prefetch;
1087
1088 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1089 (int)f_offset, size, (int)filesize, 0, 0);
1090
1091 if (f_offset >= filesize) {
1092 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1093 (int)f_offset, 0, 0, 0, 0);
1094 return(0);
1095 }
1096 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1097 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1098 else
1099 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1100
1101 if ((off_t)size > (filesize - f_offset))
1102 size = filesize - f_offset;
1103 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1104
1105 advisory_read(vp, filesize, f_offset, size);
1106
1107 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1108 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1109
1110 return (pages_in_prefetch);
1111}
1112
1113
1114
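/*
 * drive the sequential read-ahead state... if the current read doesn't
 * extend the previously observed pattern, reset the window; otherwise
 * grow cl_ralen (up to MAX_UPL_TRANSFER pages) and kick off the next
 * prefetch via cluster_rd_prefetch()
 */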
1115static void
1116cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1117{
1118 daddr64_t r_addr;
1119 off_t f_offset;
1120 int size_of_prefetch;
1121
1122
1123 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1124 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1125
1126 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1127 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1128 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1129 return;
1130 }
1131 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1132 (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1133 rap->cl_ralen = 0;
1134 rap->cl_maxra = 0;
1135
1136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1137 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1138
1139 return;
1140 }
1141 if (extent->e_addr < rap->cl_maxra) {
1142 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1143
1144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1145 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1146 return;
1147 }
1148 }
1149 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1150 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1151
1152 size_of_prefetch = 0;
1153
1154 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1155
1156 if (size_of_prefetch) {
1157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1158 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1159 return;
1160 }
1161 if (f_offset < filesize) {
1162 daddr64_t read_size;
1163
1164 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1165
1166 read_size = (extent->e_addr + 1) - extent->b_addr;
1167
1168 if (read_size > rap->cl_ralen) {
1169 if (read_size > MAX_UPL_TRANSFER)
1170 rap->cl_ralen = MAX_UPL_TRANSFER;
1171 else
1172 rap->cl_ralen = read_size;
1173 }
1174 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1175
1176 if (size_of_prefetch)
1177 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1178 }
1179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1180 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1181}
1182
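/*
 * VM pageout entry point... validate and clip the request against the
 * EOF, abort any pages beyond it, then hand the rest to cluster_io()
 * with CL_PAGEOUT (CL_THROTTLE is added unless the target is a virtual
 * device, to avoid stalling the pageout thread)
 */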
1183int
1184cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1185 int size, off_t filesize, int flags)
1186{
1187 int io_size;
1188 int rounded_size;
1189 off_t max_size;
1190 int local_flags;
1191 struct cl_writebehind *wbp;
1192
1193 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1194 /*
1195 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1196 * then we don't want to enforce this throttle... if we do, we can
1197 * potentially deadlock since we're stalling the pageout thread at a time
1198 * when the disk image might need additional memory (which won't be available
1199 * if the pageout thread can't run)... instead we'll just depend on the throttle
1200 * that the pageout thread now has in place to deal with external files
1201 */
1202 local_flags = CL_PAGEOUT;
1203 else
1204 local_flags = CL_PAGEOUT | CL_THROTTLE;
1205
1206 if ((flags & UPL_IOSYNC) == 0)
1207 local_flags |= CL_ASYNC;
1208 if ((flags & UPL_NOCOMMIT) == 0)
1209 local_flags |= CL_COMMIT;
1210 if ((flags & UPL_KEEPCACHED))
1211 local_flags |= CL_KEEPCACHED;
1212
1213
1214 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1215 (int)f_offset, size, (int)filesize, local_flags, 0);
1216
1217 /*
1218 * If they didn't specify any I/O, then we are done...
1219 * we can't issue an abort because we don't know how
1220 * big the upl really is
1221 */
1222 if (size <= 0)
1223 return (EINVAL);
1224
1225 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1226 if (local_flags & CL_COMMIT)
1227 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1228 return (EROFS);
1229 }
1230 /*
 1231 * can't page-out from a negative offset
1232 * or if we're starting beyond the EOF
1233 * or if the file offset isn't page aligned
1234 * or the size requested isn't a multiple of PAGE_SIZE
1235 */
1236 if (f_offset < 0 || f_offset >= filesize ||
1237 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1238 if (local_flags & CL_COMMIT)
1239 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1240 return (EINVAL);
1241 }
1242 max_size = filesize - f_offset;
1243
1244 if (size < max_size)
1245 io_size = size;
1246 else
1247 io_size = max_size;
1248
1249 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1250
1251 if (size > rounded_size) {
1252 if (local_flags & CL_COMMIT)
1253 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1254 UPL_ABORT_FREE_ON_EMPTY);
1255 }
1256 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1257 wbp->cl_hasbeenpaged = 1;
1258
1259 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1260 local_flags, (buf_t)NULL, (struct clios *)NULL));
1261}
1262
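/*
 * VM pagein entry point... validate and clip the request, issue it via
 * cluster_io() with CL_READ | CL_PAGEIN, then let cluster_rd_ahead()
 * extend the read if the access pattern looks sequential
 */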
1263int
1264cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1265 int size, off_t filesize, int flags)
1266{
1267 u_int io_size;
1268 int rounded_size;
1269 off_t max_size;
1270 int retval;
1271 int local_flags = 0;
1272
1273 if (upl == NULL || size < 0)
1274 panic("cluster_pagein: NULL upl passed in");
1275
1276 if ((flags & UPL_IOSYNC) == 0)
1277 local_flags |= CL_ASYNC;
1278 if ((flags & UPL_NOCOMMIT) == 0)
1279 local_flags |= CL_COMMIT;
1280
1281
1282 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1283 (int)f_offset, size, (int)filesize, local_flags, 0);
1284
1285 /*
1286 * can't page-in from a negative offset
1287 * or if we're starting beyond the EOF
1288 * or if the file offset isn't page aligned
1289 * or the size requested isn't a multiple of PAGE_SIZE
1290 */
1291 if (f_offset < 0 || f_offset >= filesize ||
1292 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1293 if (local_flags & CL_COMMIT)
1294 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1295 return (EINVAL);
1296 }
1297 max_size = filesize - f_offset;
1298
1299 if (size < max_size)
1300 io_size = size;
1301 else
1302 io_size = max_size;
1303
1304 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1305
1306 if (size > rounded_size && (local_flags & CL_COMMIT))
1307 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1308 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1309
1310 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1311 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1312
1313 if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1314 struct cl_readahead *rap;
1315
1316 rap = cluster_get_rap(vp);
1317
1318 if (rap != NULL) {
1319 struct cl_extent extent;
1320
1321 extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1322 extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1323
1324 if (rounded_size == PAGE_SIZE) {
1325 /*
 1326 * we haven't read the last page of the file yet
1327 * so let's try to read ahead if we're in
1328 * a sequential access pattern
1329 */
1330 cluster_rd_ahead(vp, &extent, filesize, rap);
1331 }
1332 rap->cl_lastr = extent.e_addr;
1333
1334 lck_mtx_unlock(&rap->cl_lockr);
1335 }
1336 }
1337 return (retval);
1338}
1339
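/*
 * issue a conventional buf as a single async cluster_io request
 * using the upl already attached to the buf
 */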
1340int
1341cluster_bp(buf_t bp)
1342{
1343 off_t f_offset;
1344 int flags;
1345
1346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1347 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1348
1349 if (bp->b_flags & B_READ)
1350 flags = CL_ASYNC | CL_READ;
1351 else
1352 flags = CL_ASYNC;
1353
1354 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1355
1356 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1357}
1358
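/*
 * top-level write entry point... each uio vector is dispatched to the
 * cached path (cluster_write_x), the physically contiguous path
 * (cluster_phys_write) or the direct path (cluster_nocopy_write),
 * depending on IO_NOCACHE, the vector's alignment and its size
 */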
1359int
1360cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1361{
1362 int prev_resid;
1363 u_int clip_size;
1364 off_t max_io_size;
1365 int upl_size;
1366 int upl_flags;
1367 upl_t upl;
1368 int retval = 0;
1369 int flags;
1370
1371 flags = xflags;
1372
1373 if (vp->v_flag & VNOCACHE_DATA)
1374 flags |= IO_NOCACHE;
1375
1376 if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1377 /*
1378 * go do a write through the cache if one of the following is true....
1379 * NOCACHE is not true
1380 * there is no uio structure or it doesn't target USERSPACE
1381 */
1382 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1383 }
1384
1385#if LP64_DEBUG
1386 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1387 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1388 }
1389#endif /* LP64_DEBUG */
1390
1391 while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1392 user_size_t iov_len;
1393 user_addr_t iov_base;
1394
1395 /*
 1396 * we know we have a resid, so this is safe...
 1397 * skip over any empty vectors
1398 */
1399 uio_update(uio, (user_size_t)0);
1400
1401 iov_len = uio_curriovlen(uio);
1402 iov_base = uio_curriovbase(uio);
1403
1404 upl_size = PAGE_SIZE;
1405 upl_flags = UPL_QUERY_OBJECT_TYPE;
1406
1407 // LP64todo - fix this!
1408 if ((vm_map_get_upl(current_map(),
1409 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1410 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1411 /*
1412 * the user app must have passed in an invalid address
1413 */
1414 return (EFAULT);
1415 }
1416
1417 /*
1418 * We check every vector target but if it is physically
1419 * contiguous space, we skip the sanity checks.
1420 */
1421 if (upl_flags & UPL_PHYS_CONTIG) {
1422 int zflags;
1423
1424 zflags = flags & ~IO_TAILZEROFILL;
1425 zflags |= IO_HEADZEROFILL;
1426
1427 if (flags & IO_HEADZEROFILL) {
1428 /*
1429 * in case we have additional vectors, we don't want to do this again
1430 */
1431 flags &= ~IO_HEADZEROFILL;
1432
1433 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1434 return(retval);
1435 }
1436 retval = cluster_phys_write(vp, uio, newEOF);
1437
1438 if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1439 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1440 }
1441 }
1442 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1443 /*
 1444 * we're here because we don't have a physically contiguous target buffer...
 1445 * go do a write through the cache if one of the following is true....
1446 * the total xfer size is less than a page...
1447 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1448 */
1449 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1450 }
1451 // LP64todo - fix this!
1452 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1453 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1454 /*
 1455 * Bring the file offset of the write up to a pagesize boundary
1456 * this will also bring the base address to a page boundary
1457 * since they both are currently on the same offset within a page
1458 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1459 * so the computed clip_size must always be less than the current uio_resid
1460 */
1461 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1462
1463 /*
1464 * Fake the resid going into the cluster_write_x call
1465 * and restore it on the way out.
1466 */
1467 // LP64todo - fix this
1468 prev_resid = uio_resid(uio);
1469 uio_setresid(uio, clip_size);
1470
1471 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1472
1473 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1474 } else {
1475 /*
1476 * can't get both the file offset and the buffer offset aligned to a page boundary
1477 * so fire an I/O through the cache for this entire vector
1478 */
1479 // LP64todo - fix this
1480 clip_size = iov_len;
1481 // LP64todo - fix this
1482 prev_resid = uio_resid(uio);
1483 uio_setresid(uio, clip_size);
1484
1485 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1486
1487 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1488 }
1489 } else {
1490 /*
1491 * If we come in here, we know the offset into
1492 * the file is on a pagesize boundary and the
1493 * target buffer address is also on a page boundary
1494 */
1495 max_io_size = newEOF - uio->uio_offset;
1496 // LP64todo - fix this
1497 clip_size = uio_resid(uio);
1498 if (iov_len < clip_size)
1499 // LP64todo - fix this!
1500 clip_size = iov_len;
1501 if (max_io_size < clip_size)
1502 clip_size = max_io_size;
1503
1504 if (clip_size < PAGE_SIZE) {
1505 /*
1506 * Take care of tail end of write in this vector
1507 */
1508 // LP64todo - fix this
1509 prev_resid = uio_resid(uio);
1510 uio_setresid(uio, clip_size);
1511
1512 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1513
1514 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1515 } else {
1516 /* round clip_size down to a multiple of pagesize */
1517 clip_size = clip_size & ~(PAGE_MASK);
1518 // LP64todo - fix this
1519 prev_resid = uio_resid(uio);
1520 uio_setresid(uio, clip_size);
1521
1522 retval = cluster_nocopy_write(vp, uio, newEOF);
1523
1524 if ((retval == 0) && uio_resid(uio))
1525 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1526
1527 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1528 }
1529 } /* end else */
1530 } /* end while */
1531
1532 return(retval);
1533}
1534
1535
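/*
 * direct (uncached) write path... wire the user buffer via
 * vm_map_get_upl(), dump any cached pages overlapping the range, and
 * stream CL_ASYNC | CL_PRESERVE I/Os, bounding the amount outstanding
 * through the clios state before returning
 */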
1536static int
1537cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1538{
1539 upl_t upl;
1540 upl_page_info_t *pl;
1541 vm_offset_t upl_offset;
1542 int io_size;
1543 int io_flag;
1544 int upl_size;
1545 int upl_needed_size;
1546 int pages_in_pl;
1547 int upl_flags;
1548 kern_return_t kret;
1549 int i;
1550 int force_data_sync;
1551 int error = 0;
1552 struct clios iostate;
1553 struct cl_writebehind *wbp;
1554
1555
1556 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1557 (int)uio->uio_offset, (int)uio_resid(uio),
1558 (int)newEOF, 0, 0);
1559
1560 /*
1561 * When we enter this routine, we know
1562 * -- the offset into the file is on a pagesize boundary
1563 * -- the resid is a page multiple
1564 * -- the resid will not exceed iov_len
1565 */
1566
1567 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1568
1569 cluster_try_push(wbp, vp, newEOF, 0, 1);
1570
1571 lck_mtx_unlock(&wbp->cl_lockw);
1572 }
1573 iostate.io_completed = 0;
1574 iostate.io_issued = 0;
1575 iostate.io_error = 0;
1576 iostate.io_wanted = 0;
1577
1578 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1579 user_addr_t iov_base;
1580
1581 io_size = uio_resid(uio);
1582
1583 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1584 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1585
1586 iov_base = uio_curriovbase(uio);
1587
1588 // LP64todo - fix this!
1589 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
1590
1591 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1592
1593 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1594 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1595
1596 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1597 pages_in_pl = 0;
1598 upl_size = upl_needed_size;
1599 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1600 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1601
1602 // LP64todo - fix this!
1603 kret = vm_map_get_upl(current_map(),
1604 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1605 &upl_size,
1606 &upl,
1607 NULL,
1608 &pages_in_pl,
1609 &upl_flags,
1610 force_data_sync);
1611
1612 if (kret != KERN_SUCCESS) {
1613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1614 0, 0, 0, kret, 0);
1615 /*
1616 * cluster_nocopy_write: failed to get pagelist
1617 *
1618 * we may have already spun some portion of this request
1619 * off as async requests... we need to wait for the I/O
1620 * to complete before returning
1621 */
1622 goto wait_for_writes;
1623 }
1624 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1625 pages_in_pl = upl_size / PAGE_SIZE;
1626
1627 for (i = 0; i < pages_in_pl; i++) {
1628 if (!upl_valid_page(pl, i))
1629 break;
1630 }
1631 if (i == pages_in_pl)
1632 break;
1633
1634 /*
1635 * didn't get all the pages back that we
1636 * needed... release this upl and try again
1637 */
1638 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1639 UPL_ABORT_FREE_ON_EMPTY);
1640 }
1641 if (force_data_sync >= 3) {
1642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1643 i, pages_in_pl, upl_size, kret, 0);
1644 /*
1645 * for some reason, we couldn't acquire a hold on all
1646 * the pages needed in the user's address space
1647 *
1648 * we may have already spun some portion of this request
1649 * off as async requests... we need to wait for the I/O
1650 * to complete before returning
1651 */
1652 goto wait_for_writes;
1653 }
1654
1655 /*
1656 * Consider the possibility that upl_size wasn't satisfied.
1657 */
1658 if (upl_size != upl_needed_size)
1659 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1660
1661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1662 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1663
1664 if (io_size == 0) {
1665 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1666 UPL_ABORT_FREE_ON_EMPTY);
1667 /*
1668 * we may have already spun some portion of this request
1669 * off as async requests... we need to wait for the I/O
1670 * to complete before returning
1671 */
1672 goto wait_for_writes;
1673 }
1674 /*
1675 * Now look for pages already in the cache
1676 * and throw them away.
1677 * uio->uio_offset is page aligned within the file
1678 * io_size is a multiple of PAGE_SIZE
1679 */
1680 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1681
1682 /*
 1683 * we want to push out these writes asynchronously so that we can overlap
 1684 * the preparation of the next I/O...
 1685 * if there are already too many outstanding writes,
 1686 * wait until some complete before issuing the next
1687 */
1688 lck_mtx_lock(cl_mtxp);
1689
1690 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1691 iostate.io_wanted = 1;
1692 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1693 }
1694 lck_mtx_unlock(cl_mtxp);
1695
1696 if (iostate.io_error) {
1697 /*
 1698 * one of the earlier writes we issued ran into a hard error...
 1699 * don't issue any more writes, clean up the UPL
1700 * that was just created but not used, then
1701 * go wait for all writes that are part of this stream
1702 * to complete before returning the error to the caller
1703 */
1704 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1705 UPL_ABORT_FREE_ON_EMPTY);
1706
1707 goto wait_for_writes;
1708 }
1709 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1710
1711 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1712 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1713
1714 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1715 io_size, io_flag, (buf_t)NULL, &iostate);
1716
1717 uio_update(uio, (user_size_t)io_size);
1718
1719 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1720 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1721
1722 } /* end while */
1723
1724wait_for_writes:
1725 /*
1726 * make sure all async writes issued as part of this stream
1727 * have completed before we return
1728 */
1729 lck_mtx_lock(cl_mtxp);
1730
1731 while (iostate.io_issued != iostate.io_completed) {
1732 iostate.io_wanted = 1;
1733 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1734 }
1735 lck_mtx_unlock(cl_mtxp);
1736
1737 if (iostate.io_error)
1738 error = iostate.io_error;
1739
1740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1741 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1742
1743 return (error);
1744}
1745
1746
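/*
 * write path for a physically contiguous user buffer... the portions
 * not aligned to the device block size go through cluster_align_phys_io()
 * and the aligned middle is issued as a single CL_DEV_MEMORY request
 */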
1747static int
1748cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1749{
1750 upl_page_info_t *pl;
1751 addr64_t src_paddr;
1752 upl_t upl;
1753 vm_offset_t upl_offset;
1754 int tail_size;
1755 int io_size;
1756 int upl_size;
1757 int upl_needed_size;
1758 int pages_in_pl;
1759 int upl_flags;
1760 kern_return_t kret;
1761 int error = 0;
1762 user_addr_t iov_base;
1763 int devblocksize;
1764 struct cl_writebehind *wbp;
1765
1766 devblocksize = vp->v_mount->mnt_devblocksize;
1767 /*
1768 * When we enter this routine, we know
1769 * -- the resid will not exceed iov_len
 1770 * -- the vector target address is physically contiguous
1771 */
1772 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1773
1774 cluster_try_push(wbp, vp, newEOF, 0, 1);
1775
1776 lck_mtx_unlock(&wbp->cl_lockw);
1777 }
1778#if LP64_DEBUG
1779 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1780 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1781 }
1782#endif /* LP64_DEBUG */
1783
1784 // LP64todo - fix this!
1785 io_size = (int)uio_curriovlen(uio);
1786 iov_base = uio_curriovbase(uio);
1787
1788 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1789 upl_needed_size = upl_offset + io_size;
1790
1791 pages_in_pl = 0;
1792 upl_size = upl_needed_size;
1793 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1794 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1795
1796 // LP64todo - fix this!
1797 kret = vm_map_get_upl(current_map(),
1798 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1799 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1800
1801 if (kret != KERN_SUCCESS) {
1802 /*
1803 * cluster_phys_write: failed to get pagelist
1804 * note: return kret here
1805 */
1806 return(EINVAL);
1807 }
1808 /*
1809 * Consider the possibility that upl_size wasn't satisfied.
1810 * This is a failure in the physical memory case.
1811 */
1812 if (upl_size < upl_needed_size) {
1813 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1814 return(EINVAL);
1815 }
1816 pl = ubc_upl_pageinfo(upl);
1817
1818 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
1819
1820 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1821 int head_size;
1822
1823 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1824
1825 if (head_size > io_size)
1826 head_size = io_size;
1827
1828 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1829
1830 if (error) {
1831 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1832
1833 return(EINVAL);
1834 }
1835 upl_offset += head_size;
1836 src_paddr += head_size;
1837 io_size -= head_size;
1838 }
1839 tail_size = io_size & (devblocksize - 1);
1840 io_size -= tail_size;
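	/*
	 * for illustration, with a 512-byte devblocksize, a file offset of
	 * 1000 and an io_size of 2000: the loop above peels off a 24-byte
	 * head (512 - (1000 & 511)) to bring the offset to 1024 and leave
	 * io_size at 1976; tail_size then comes out to 1976 & 511 = 440,
	 * and the remaining 1536 bytes (3 full device blocks) are issued
	 * through the single cluster_io call below, the head having gone
	 * through cluster_align_phys_io above and the tail following the
	 * same path afterwards
	 */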
1841
1842 if (io_size) {
1843 /*
1844 * issue a synchronous write to cluster_io
1845 */
1846 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1847 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1848 }
1849 if (error == 0) {
1850 /*
1851 * The cluster_io write completed successfully,
1852 * update the uio structure
1853 */
1854 uio_update(uio, (user_size_t)io_size);
1855
1856 src_paddr += io_size;
1857
1858 if (tail_size)
1859 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1860 }
1861 /*
1862 * just release our hold on the physically contiguous
1863 * region without changing any state
1864 */
1865 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1866
1867 return (error);
1868}
1869
1870
1871static int
1872cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1873{
1874 upl_page_info_t *pl;
1875 upl_t upl;
1876 vm_offset_t upl_offset = 0;
1877 int upl_size;
1878 off_t upl_f_offset;
1879 int pages_in_upl;
1880 int start_offset;
1881 int xfer_resid;
1882 int io_size;
1883 int io_offset;
1884 int bytes_to_zero;
1885 int bytes_to_move;
1886 kern_return_t kret;
1887 int retval = 0;
1888 int io_resid;
1889 long long total_size;
1890 long long zero_cnt;
1891 off_t zero_off;
1892 long long zero_cnt1;
1893 off_t zero_off1;
1894 struct cl_extent cl;
1895 int intersection;
1896 struct cl_writebehind *wbp;
1897
1898 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1899 {
1900 if (wbp->cl_hasbeenpaged) {
1901 /*
1902 * this vnode had pages cleaned to it by
1903 * the pager which indicates that either
1904 * it's not very 'hot', or the system is
1905 * being overwhelmed by a lot of dirty
1906 * data being delayed in the VM cache...
1907 * in either event, we'll push our remaining
1908 * delayed data at this point... this will
1909 * be more efficient than paging out 1 page at
1910 * a time, and will also act as a throttle
1911 * by delaying this client from writing any
1912	 * more data until all of its delayed data has
1913	 * at least been queued to the underlying driver.
1914 */
1915 if (wbp->cl_number || wbp->cl_scmap)
1916 cluster_push_EOF(vp, newEOF);
1917
1918 wbp->cl_hasbeenpaged = 0;
1919 }
1920 }
1921 if (uio) {
1922 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1923 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1924
1925 // LP64todo - fix this
1926 io_resid = uio_resid(uio);
1927 } else {
1928 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1929 0, 0, (int)oldEOF, (int)newEOF, 0);
1930
1931 io_resid = 0;
1932 }
1933 zero_cnt = 0;
1934 zero_cnt1 = 0;
1935 zero_off = 0;
1936 zero_off1 = 0;
1937
1938 if (flags & IO_HEADZEROFILL) {
1939 /*
1940 * some filesystems (HFS is one) don't support unallocated holes within a file...
1941 * so we zero fill the intervening space between the old EOF and the offset
1942 * where the next chunk of real data begins.... ftruncate will also use this
1943 * routine to zero fill to the new EOF when growing a file... in this case, the
1944 * uio structure will not be provided
1945 */
1946 if (uio) {
1947 if (headOff < uio->uio_offset) {
1948 zero_cnt = uio->uio_offset - headOff;
1949 zero_off = headOff;
1950 }
1951 } else if (headOff < newEOF) {
1952 zero_cnt = newEOF - headOff;
1953 zero_off = headOff;
1954 }
1955 }
1956 if (flags & IO_TAILZEROFILL) {
1957 if (uio) {
1958 // LP64todo - fix this
1959 zero_off1 = uio->uio_offset + uio_resid(uio);
1960
1961 if (zero_off1 < tailOff)
1962 zero_cnt1 = tailOff - zero_off1;
1963 }
1964 }
1965 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1967 retval, 0, 0, 0, 0);
1968 return (0);
1969 }
1970
1971 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1972 /*
1973 * for this iteration of the loop, figure out where our starting point is
1974 */
1975 if (zero_cnt) {
1976 start_offset = (int)(zero_off & PAGE_MASK_64);
1977 upl_f_offset = zero_off - start_offset;
1978 } else if (io_resid) {
1979 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1980 upl_f_offset = uio->uio_offset - start_offset;
1981 } else {
1982 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1983 upl_f_offset = zero_off1 - start_offset;
1984 }
1985 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1986 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1987
1988 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1989 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1990
1991 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1992
1993 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1994 /*
1995 * assumption... total_size <= io_resid
1996	 * because IO_HEADZEROFILL and IO_TAILZEROFILL are not set
1997 */
1998 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1999 total_size -= start_offset;
2000 xfer_resid = total_size;
2001
2002 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2003
2004 if (retval)
2005 break;
2006
2007 io_resid -= (total_size - xfer_resid);
2008 total_size = xfer_resid;
2009 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2010 upl_f_offset = uio->uio_offset - start_offset;
2011
2012 if (total_size == 0) {
2013 if (start_offset) {
2014 /*
2015 * the write did not finish on a page boundary
2016 * which will leave upl_f_offset pointing to the
2017 * beginning of the last page written instead of
2018 * the page beyond it... bump it in this case
2019 * so that the cluster code records the last page
2020 * written as dirty
2021 */
2022 upl_f_offset += PAGE_SIZE_64;
2023 }
2024 upl_size = 0;
2025
2026 goto check_cluster;
2027 }
2028 }
2029 /*
2030 * compute the size of the upl needed to encompass
2031 * the requested write... limit each call to cluster_io
2032 * to the maximum UPL size... cluster_io will clip if
2033 * this exceeds the maximum io_size for the device,
2034 * make sure to account for
2035 * a starting offset that's not page aligned
2036 */
2037 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
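	        /*
	         * e.g. with 4 KB pages, a start_offset of 0x200 and 0x2700 bytes
	         * left to transfer round up to a 3 page (0x3000 byte) upl_size
	         */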
2038
2039 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2040 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2041
2042 pages_in_upl = upl_size / PAGE_SIZE;
2043 io_size = upl_size - start_offset;
2044
2045 if ((long long)io_size > total_size)
2046 io_size = total_size;
2047
2048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2049
2050
2051 /*
2052 * Gather the pages from the buffer cache.
2053 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2054 * that we intend to modify these pages.
2055 */
2056 kret = ubc_create_upl(vp,
2057 upl_f_offset,
2058 upl_size,
2059 &upl,
2060 &pl,
2061 UPL_SET_LITE | UPL_WILL_MODIFY);
2062 if (kret != KERN_SUCCESS)
2063 panic("cluster_write: failed to get pagelist");
2064
2065 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2066 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2067
2068 if (start_offset && !upl_valid_page(pl, 0)) {
2069 int read_size;
2070
2071 /*
2072 * we're starting in the middle of the first page of the upl
2073 * and the page isn't currently valid, so we're going to have
2074 * to read it in first... this is a synchronous operation
2075 */
2076 read_size = PAGE_SIZE;
2077
2078 if ((upl_f_offset + read_size) > newEOF)
2079 read_size = newEOF - upl_f_offset;
2080
2081 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2082 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2083 if (retval) {
2084 /*
2085 * we had an error during the read which causes us to abort
2086 * the current cluster_write request... before we do, we need
2087 * to release the rest of the pages in the upl without modifying
2088	         * their state, and mark the failed page in error
2089 */
2090 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2091
2092 if (upl_size > PAGE_SIZE)
2093 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2094
2095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2096 (int)upl, 0, 0, retval, 0);
2097 break;
2098 }
2099 }
2100 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2101 /*
2102 * the last offset we're writing to in this upl does not end on a page
2103 * boundary... if it's not beyond the old EOF, then we'll also need to
2104 * pre-read this page in if it isn't already valid
2105 */
2106 upl_offset = upl_size - PAGE_SIZE;
2107
2108 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2109 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2110 int read_size;
2111
2112 read_size = PAGE_SIZE;
2113
2114 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2115 read_size = newEOF - (upl_f_offset + upl_offset);
2116
2117 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2118 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2119 if (retval) {
2120 /*
2121 * we had an error during the read which causes us to abort
2122 * the current cluster_write request... before we do, we
2123 * need to release the rest of the pages in the upl without
2124	                 * modifying their state, and mark the failed page in error
2125 */
2126 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2127
2128 if (upl_size > PAGE_SIZE)
2129 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2130
2131 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2132 (int)upl, 0, 0, retval, 0);
2133 break;
2134 }
2135 }
2136 }
2137 xfer_resid = io_size;
2138 io_offset = start_offset;
2139
2140 while (zero_cnt && xfer_resid) {
2141
2142 if (zero_cnt < (long long)xfer_resid)
2143 bytes_to_zero = zero_cnt;
2144 else
2145 bytes_to_zero = xfer_resid;
2146
2147 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2148 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2149 } else {
2150 int zero_pg_index;
2151
2152 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2153 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2154
2155 if ( !upl_valid_page(pl, zero_pg_index)) {
2156 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2157
2158 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2159 !upl_dirty_page(pl, zero_pg_index)) {
2160 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2161 }
2162 }
2163 xfer_resid -= bytes_to_zero;
2164 zero_cnt -= bytes_to_zero;
2165 zero_off += bytes_to_zero;
2166 io_offset += bytes_to_zero;
2167 }
2168 if (xfer_resid && io_resid) {
2169 bytes_to_move = min(io_resid, xfer_resid);
2170
2171 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2172
2173 if (retval) {
2174
2175 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2176
2177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2178 (int)upl, 0, 0, retval, 0);
2179 } else {
2180 io_resid -= bytes_to_move;
2181 xfer_resid -= bytes_to_move;
2182 io_offset += bytes_to_move;
2183 }
2184 }
2185 while (xfer_resid && zero_cnt1 && retval == 0) {
2186
2187 if (zero_cnt1 < (long long)xfer_resid)
2188 bytes_to_zero = zero_cnt1;
2189 else
2190 bytes_to_zero = xfer_resid;
2191
2192 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2193 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2194 } else {
2195 int zero_pg_index;
2196
2197 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2198 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2199
2200 if ( !upl_valid_page(pl, zero_pg_index)) {
2201 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2202 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2203 !upl_dirty_page(pl, zero_pg_index)) {
2204 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2205 }
2206 }
2207 xfer_resid -= bytes_to_zero;
2208 zero_cnt1 -= bytes_to_zero;
2209 zero_off1 += bytes_to_zero;
2210 io_offset += bytes_to_zero;
2211 }
2212
2213 if (retval == 0) {
2214 int cl_index;
2215 int can_delay;
2216
2217 io_size += start_offset;
2218
2219 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2220 /*
2221 * if we're extending the file with this write
2222 * we'll zero fill the rest of the page so that
2223 * if the file gets extended again in such a way as to leave a
2224 * hole starting at this EOF, we'll have zero's in the correct spot
2225 */
2226 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2227 }
2228 if (flags & IO_SYNC)
2229 /*
2230	                 * if the IO_SYNC flag is set then we need to
2231 * bypass any clusters and immediately issue
2232 * the I/O
2233 */
2234 goto issue_io;
2235check_cluster:
2236 /*
2237 * take the lock to protect our accesses
2238 * of the writebehind and sparse cluster state
2239 */
2240 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2241
2242 /*
2243 * calculate the last logical block number
2244 * that this delayed I/O encompassed
2245 */
2246 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
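			/*
			 * b_addr/e_addr are file-relative page indices: e.g. with
			 * upl_f_offset = 0x6000, upl_size = 0x3000 and 4 KB pages,
			 * b_addr = 6 and e_addr = 9, i.e. this pass covers file
			 * pages 6 through 8
			 */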
2247
2248 if (wbp->cl_scmap) {
2249
2250 if ( !(flags & IO_NOCACHE)) {
2251 /*
2252 * we've fallen into the sparse
2253 * cluster method of delaying dirty pages
2254 * first, we need to release the upl if we hold one
2255 * since pages in it may be present in the sparse cluster map
2256 * and may span 2 separate buckets there... if they do and
2257 * we happen to have to flush a bucket to make room and it intersects
2258 * this upl, a deadlock may result on page BUSY
2259 */
2260 if (upl_size)
2261 ubc_upl_commit_range(upl, 0, upl_size,
2262 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2263
2264 sparse_cluster_add(wbp, vp, &cl, newEOF);
2265
2266 lck_mtx_unlock(&wbp->cl_lockw);
2267
2268 continue;
2269 }
2270 /*
2271 * must have done cached writes that fell into
2272 * the sparse cluster mechanism... we've switched
2273 * to uncached writes on the file, so go ahead
2274 * and push whatever's in the sparse map
2275 * and switch back to normal clustering
2276 *
2277 * see the comment above concerning a possible deadlock...
2278 */
2279 if (upl_size) {
2280 ubc_upl_commit_range(upl, 0, upl_size,
2281 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2282 /*
2283 * setting upl_size to 0 keeps us from committing a
2284 * second time in the start_new_cluster path
2285 */
2286 upl_size = 0;
2287 }
2288 sparse_cluster_push(wbp, vp, newEOF, 1);
2289
2290 wbp->cl_number = 0;
2291 /*
2292 * no clusters of either type present at this point
2293 * so just go directly to start_new_cluster since
2294 * we know we need to delay this I/O since we've
2295 * already released the pages back into the cache
2296 * to avoid the deadlock with sparse_cluster_push
2297 */
2298 goto start_new_cluster;
2299 }
2300 upl_offset = 0;
2301
2302 if (wbp->cl_number == 0)
2303 /*
2304 * no clusters currently present
2305 */
2306 goto start_new_cluster;
2307
2308 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2309 /*
2310 * check each cluster that we currently hold
2311 * try to merge some or all of this write into
2312 * one or more of the existing clusters... if
2313 * any portion of the write remains, start a
2314 * new cluster
2315 */
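			        /*
			         * the merge cases handled below, in the order they appear:
			         *  1) the write starts at or after the cluster start and ends
			         *     within the cluster's maximum window... just extend
			         *     e_addr if needed and stop looking
			         *  2) the write starts inside the window but runs past it...
			         *     grow the cluster to its maximum size, commit the
			         *     overlapping portion of the upl, and keep looking for a
			         *     home for the leftover tail
			         *  3) the write starts before the cluster and the combined
			         *     span still fits within the maximum window... pull
			         *     b_addr back (and push e_addr out if the write envelops
			         *     the cluster) and stop looking
			         *  4) the write starts before the cluster and the combined
			         *     span is too large... stretch the cluster backwards to
			         *     its maximum window, absorb the overlapping tail of the
			         *     write, and keep looking for a home for the leftover head
			         */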
2316 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2317 /*
2318 * the current write starts at or after the current cluster
2319 */
2320 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2321 /*
2322 * we have a write that fits entirely
2323 * within the existing cluster limits
2324 */
2325 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2326 /*
2327 * update our idea of where the cluster ends
2328 */
2329 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2330 break;
2331 }
2332 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2333 /*
2334 * we have a write that starts in the middle of the current cluster
2335 * but extends beyond the cluster's limit... we know this because
2336 * of the previous checks
2337 * we'll extend the current cluster to the max
2338 * and update the b_addr for the current write to reflect that
2339 * the head of it was absorbed into this cluster...
2340 * note that we'll always have a leftover tail in this case since
2341	                                 * full absorption would have occurred in the clause above
2342 */
2343 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2344
2345 if (upl_size) {
2346 daddr64_t start_pg_in_upl;
2347
2348 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2349
2350 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2351 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2352
2353 ubc_upl_commit_range(upl, upl_offset, intersection,
2354 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2355 upl_f_offset += intersection;
2356 upl_offset += intersection;
2357 upl_size -= intersection;
2358 }
2359 }
2360 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2361 }
2362 /*
2363 * we come here for the case where the current write starts
2364 * beyond the limit of the existing cluster or we have a leftover
2365	                         * tail after a partial absorption
2366 *
2367 * in either case, we'll check the remaining clusters before
2368 * starting a new one
2369 */
2370 } else {
2371 /*
2372 * the current write starts in front of the cluster we're currently considering
2373 */
2374 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2375 /*
2376 * we can just merge the new request into
2377 * this cluster and leave it in the cache
2378 * since the resulting cluster is still
2379 * less than the maximum allowable size
2380 */
2381 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2382
2383 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2384 /*
2385 * the current write completely
2386 * envelops the existing cluster and since
2387 * each write is limited to at most MAX_UPL_TRANSFER bytes
2388 * we can just use the start and last blocknos of the write
2389 * to generate the cluster limits
2390 */
2391 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2392 }
2393 break;
2394 }
2395
2396 /*
2397 * if we were to combine this write with the current cluster
2398 * we would exceed the cluster size limit.... so,
2399 * let's see if there's any overlap of the new I/O with
2400 * the cluster we're currently considering... in fact, we'll
2401	                         * stretch the cluster out to its full limit and see if we
2402 * get an intersection with the current write
2403 *
2404 */
2405 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2406 /*
2407 * the current write extends into the proposed cluster
2408	                                 * clip the length of the current write after first combining its
2409 * tail with the newly shaped cluster
2410 */
2411 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2412
2413 if (upl_size) {
2414 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2415
2416 if (intersection > upl_size)
2417 /*
2418 * because the current write may consist of a number of pages found in the cache
2419 * which are not part of the UPL, we may have an intersection that exceeds
2420 * the size of the UPL that is also part of this write
2421 */
2422 intersection = upl_size;
2423
2424 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2425 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2426 upl_size -= intersection;
2427 }
2428 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2429 }
2430 /*
2431 * if we get here, there was no way to merge
2432 * any portion of this write with this cluster
2433 * or we could only merge part of it which
2434 * will leave a tail...
2435 * we'll check the remaining clusters before starting a new one
2436 */
2437 }
2438 }
2439 if (cl_index < wbp->cl_number)
2440 /*
2441 * we found an existing cluster(s) that we
2442 * could entirely merge this I/O into
2443 */
2444 goto delay_io;
2445
2446 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2447 /*
2448 * we didn't find an existing cluster to
2449 * merge into, but there's room to start
2450 * a new one
2451 */
2452 goto start_new_cluster;
2453
2454 /*
2455	                 * no existing cluster to merge with and no
2456 * room to start a new one... we'll try
2457 * pushing one of the existing ones... if none of
2458 * them are able to be pushed, we'll switch
2459 * to the sparse cluster mechanism
2460 * cluster_try_push updates cl_number to the
2461 * number of remaining clusters... and
2462 * returns the number of currently unused clusters
2463 */
2464 int ret_cluster_try_push = 0;
2465 /* if writes are not deferred, call cluster push immediately */
2466 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2467 if (flags & IO_NOCACHE)
2468 can_delay = 0;
2469 else
2470 can_delay = 1;
2471
2472 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2473 }
2474
2475	        /* execute the following regardless of whether writes are deferred */
2476 if (ret_cluster_try_push == 0) {
2477 /*
2478 * no more room in the normal cluster mechanism
2479 * so let's switch to the more expansive but expensive
2480 * sparse mechanism....
2481 * first, we need to release the upl if we hold one
2482 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2483 * and may span 2 separate buckets there... if they do and
2484 * we happen to have to flush a bucket to make room and it intersects
2485 * this upl, a deadlock may result on page BUSY
2486 */
2487 if (upl_size)
2488 ubc_upl_commit_range(upl, upl_offset, upl_size,
2489 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2490
2491 sparse_cluster_switch(wbp, vp, newEOF);
2492 sparse_cluster_add(wbp, vp, &cl, newEOF);
2493
2494 lck_mtx_unlock(&wbp->cl_lockw);
2495
2496 continue;
2497 }
2498 /*
2499 * we pushed one cluster successfully, so we must be sequentially writing this file
2500 * otherwise, we would have failed and fallen into the sparse cluster support
2501 * so let's take the opportunity to push out additional clusters as long as we
2502 * remain below the throttle... this will give us better I/O locality if we're
2503	         * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
2504 * however, we don't want to push so much out that the write throttle kicks in and
2505 * hangs this thread up until some of the I/O completes...
2506 */
2507 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2508 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2509 cluster_try_push(wbp, vp, newEOF, 0, 0);
2510 }
2511
2512start_new_cluster:
2513 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2514 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2515
2516 if (flags & IO_NOCACHE)
2517 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2518 else
2519 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2520 wbp->cl_number++;
2521delay_io:
2522 if (upl_size)
2523 ubc_upl_commit_range(upl, upl_offset, upl_size,
2524 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2525
2526 lck_mtx_unlock(&wbp->cl_lockw);
2527
2528 continue;
2529issue_io:
2530 /*
2531 * we don't hold the vnode lock at this point
2532 *
2533	         * because we had to ask for a UPL that provides currently non-present pages, the
2534 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2535 * upon committing it... this is not the behavior we want since it's possible for
2536 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2537 * in order to maintain some semblance of coherency with mapped writes
2538 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2539 * so that we correctly deal with a change in state of the hardware modify bit...
2540 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2541 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2542 * responsible for generating the correct sized I/O(s)
2543 */
2544 ubc_upl_commit_range(upl, 0, upl_size,
2545 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2546
2547 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2548
2549 retval = cluster_push_x(vp, &cl, newEOF, flags);
2550 }
2551 }
2552 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2553 retval, 0, io_resid, 0, 0);
2554
2555 return (retval);
2556}
2557
2558int
2559cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2560{
2561 int prev_resid;
2562 u_int clip_size;
2563 off_t max_io_size;
2564 int upl_size;
2565 int upl_flags;
2566 upl_t upl;
2567 int retval = 0;
2568 int flags;
2569
2570 flags = xflags;
2571
2572 if (vp->v_flag & VNOCACHE_DATA)
2573 flags |= IO_NOCACHE;
2574 if (vp->v_flag & VRAOFF)
2575 flags |= IO_RAOFF;
2576
2577 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2578 /*
2579 * go do a read through the cache if one of the following is true....
2580 * NOCACHE is not true
2581 * the uio request doesn't target USERSPACE
2582 */
2583 return (cluster_read_x(vp, uio, filesize, flags));
2584 }
2585
2586#if LP64_DEBUG
2587 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2588 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2589 }
2590#endif /* LP64_DEBUG */
2591
2592 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2593 user_size_t iov_len;
2594 user_addr_t iov_base;
2595
2596 /*
2597 * we know we have a resid, so this is safe
2598	         * skip over any empty vectors
2599 */
2600 uio_update(uio, (user_size_t)0);
2601
2602 iov_len = uio_curriovlen(uio);
2603 iov_base = uio_curriovbase(uio);
2604
2605 upl_size = PAGE_SIZE;
2606 upl_flags = UPL_QUERY_OBJECT_TYPE;
2607
2608 // LP64todo - fix this!
2609 if ((vm_map_get_upl(current_map(),
2610 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2611 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2612 /*
2613 * the user app must have passed in an invalid address
2614 */
2615 return (EFAULT);
2616 }
2617
2618 /*
2619 * We check every vector target but if it is physically
2620 * contiguous space, we skip the sanity checks.
2621 */
2622 if (upl_flags & UPL_PHYS_CONTIG) {
2623 retval = cluster_phys_read(vp, uio, filesize);
2624 }
2625 else if (uio_resid(uio) < PAGE_SIZE) {
2626 /*
2627	                 * we're here because we don't have a physically contiguous target buffer
2628 * go do a read through the cache if
2629 * the total xfer size is less than a page...
2630 */
2631 return (cluster_read_x(vp, uio, filesize, flags));
2632 }
2633 // LP64todo - fix this!
2634 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2635 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2636 /*
2637 * Bring the file offset read up to a pagesize boundary
2638 * this will also bring the base address to a page boundary
2639 * since they both are currently on the same offset within a page
2640 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2641 * so the computed clip_size must always be less than the current uio_resid
2642 */
2643 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
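				/*
				 * e.g. a file offset of 0x1a00 yields a clip_size of 0x600,
				 * so this pass through the cache carries both the file
				 * offset and (since it shares the same offset within its
				 * page) the buffer address up to the next page boundary
				 */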
2644
2645 /*
2646 * Fake the resid going into the cluster_read_x call
2647 * and restore it on the way out.
2648 */
2649 prev_resid = uio_resid(uio);
2650 // LP64todo - fix this
2651 uio_setresid(uio, clip_size);
2652
2653 retval = cluster_read_x(vp, uio, filesize, flags);
2654
2655 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2656 } else {
2657 /*
2658 * can't get both the file offset and the buffer offset aligned to a page boundary
2659 * so fire an I/O through the cache for this entire vector
2660 */
2661 // LP64todo - fix this!
2662 clip_size = iov_len;
2663 prev_resid = uio_resid(uio);
2664 uio_setresid(uio, clip_size);
2665
2666 retval = cluster_read_x(vp, uio, filesize, flags);
2667
2668 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2669 }
2670 } else {
2671 /*
2672 * If we come in here, we know the offset into
2673 * the file is on a pagesize boundary
2674 */
2675 max_io_size = filesize - uio->uio_offset;
2676 // LP64todo - fix this
2677 clip_size = uio_resid(uio);
2678 if (iov_len < clip_size)
2679 clip_size = iov_len;
2680 if (max_io_size < clip_size)
2681 clip_size = (int)max_io_size;
2682
2683 if (clip_size < PAGE_SIZE) {
2684 /*
2685 * Take care of the tail end of the read in this vector.
2686 */
2687 // LP64todo - fix this
2688 prev_resid = uio_resid(uio);
2689 uio_setresid(uio, clip_size);
2690
2691 retval = cluster_read_x(vp, uio, filesize, flags);
2692
2693 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2694 } else {
2695 /* round clip_size down to a multiple of pagesize */
2696 clip_size = clip_size & ~(PAGE_MASK);
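				/*
				 * e.g. a 0x2500-byte resid gets clipped to 0x2000 for the
				 * nocopy path; whatever that leaves behind (here, up to
				 * 0x500 bytes) is restored below and picked up by a later
				 * pass through the enclosing while loop
				 */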
2697 // LP64todo - fix this
2698 prev_resid = uio_resid(uio);
2699 uio_setresid(uio, clip_size);
2700
2701 retval = cluster_nocopy_read(vp, uio, filesize);
2702
2703 if ((retval==0) && uio_resid(uio))
2704 retval = cluster_read_x(vp, uio, filesize, flags);
2705
2706 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2707 }
2708 } /* end else */
2709 } /* end while */
2710
2711 return(retval);
2712}
2713
2714static int
2715cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2716{
2717 upl_page_info_t *pl;
2718 upl_t upl;
2719 vm_offset_t upl_offset;
2720 int upl_size;
2721 off_t upl_f_offset;
2722 int start_offset;
2723 int start_pg;
2724 int last_pg;
2725 int uio_last = 0;
2726 int pages_in_upl;
2727 off_t max_size;
2728 off_t last_ioread_offset;
2729 off_t last_request_offset;
2730 u_int size_of_prefetch;
2731 u_int io_size;
2732 kern_return_t kret;
2733 int error = 0;
2734 int retval = 0;
2735 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2736 u_int rd_ahead_enabled = 1;
2737 u_int prefetch_enabled = 1;
2738 struct cl_readahead * rap;
2739 struct clios iostate;
2740 struct cl_extent extent;
2741
2742 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2743 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2744
2745 // LP64todo - fix this
2746 last_request_offset = uio->uio_offset + uio_resid(uio);
2747
2748 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2749 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2750 rd_ahead_enabled = 0;
2751 rap = NULL;
2752 } else {
2753 if (cluster_hard_throttle_on(vp)) {
2754 rd_ahead_enabled = 0;
2755 prefetch_enabled = 0;
2756
2757 max_rd_size = HARD_THROTTLE_MAXSIZE;
2758 }
2759 if ((rap = cluster_get_rap(vp)) == NULL)
2760 rd_ahead_enabled = 0;
2761 }
2762 if (last_request_offset > filesize)
2763 last_request_offset = filesize;
2764 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2765 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
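	/*
	 * b_addr/e_addr are the first and last file page indices touched by
	 * this request: e.g. an offset of 0x1400 with a 0x3000-byte resid
	 * (and 4 KB pages) gives b_addr = 1 and e_addr = 4
	 */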
2766
2767 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2768 /*
2769 * determine if we already have a read-ahead in the pipe courtesy of the
2770	         * last read system call that was issued...
2771	         * if so, pick up its extent to determine where we should start
2772 * with respect to any read-ahead that might be necessary to
2773	         * garner all the data needed to complete this read system call
2774 */
2775 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2776
2777 if (last_ioread_offset < uio->uio_offset)
2778 last_ioread_offset = (off_t)0;
2779 else if (last_ioread_offset > last_request_offset)
2780 last_ioread_offset = last_request_offset;
2781 } else
2782 last_ioread_offset = (off_t)0;
2783
2784 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2785 /*
2786 * compute the size of the upl needed to encompass
2787 * the requested read... limit each call to cluster_io
2788 * to the maximum UPL size... cluster_io will clip if
2789 * this exceeds the maximum io_size for the device,
2790 * make sure to account for
2791 * a starting offset that's not page aligned
2792 */
2793 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2794 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2795 max_size = filesize - uio->uio_offset;
2796
2797 // LP64todo - fix this!
2798 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2799 io_size = uio_resid(uio);
2800 else
2801 io_size = max_size;
2802
2803 if (!(flags & IO_NOCACHE)) {
2804
2805 while (io_size) {
2806 u_int io_resid;
2807 u_int io_requested;
2808
2809 /*
2810 * if we keep finding the pages we need already in the cache, then
2811 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2812 * to determine that we have all the pages we need... once we miss in
2813	                         * the cache and have issued an I/O, then we'll assume that we're likely
2814 * to continue to miss in the cache and it's to our advantage to try and prefetch
2815 */
2816 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2817 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2818 /*
2819 * we've already issued I/O for this request and
2820 * there's still work to do and
2821 * our prefetch stream is running dry, so issue a
2822 * pre-fetch I/O... the I/O latency will overlap
2823 * with the copying of the data
2824 */
2825 if (size_of_prefetch > max_rd_size)
2826 size_of_prefetch = max_rd_size;
2827
2828 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2829
2830 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2831
2832 if (last_ioread_offset > last_request_offset)
2833 last_ioread_offset = last_request_offset;
2834 }
2835 }
2836 /*
2837 * limit the size of the copy we're about to do so that
2838 * we can notice that our I/O pipe is running dry and
2839 * get the next I/O issued before it does go dry
2840 */
2841 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2842 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2843 else
2844 io_resid = io_size;
2845
2846 io_requested = io_resid;
2847
2848 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2849
2850 io_size -= (io_requested - io_resid);
2851
2852 if (retval || io_resid)
2853 /*
2854 * if we run into a real error or
2855 * a page that is not in the cache
2856 * we need to leave streaming mode
2857 */
2858 break;
2859
2860 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2861 /*
2862	                         * we've already finished the I/O for this read request
2863 * let's see if we should do a read-ahead
2864 */
2865 cluster_rd_ahead(vp, &extent, filesize, rap);
2866 }
2867 }
2868 if (retval)
2869 break;
2870 if (io_size == 0) {
2871 if (rap != NULL) {
2872 if (extent.e_addr < rap->cl_lastr)
2873 rap->cl_maxra = 0;
2874 rap->cl_lastr = extent.e_addr;
2875 }
2876 break;
2877 }
2878 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2879 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2880 max_size = filesize - uio->uio_offset;
2881 }
2882 if (io_size > max_rd_size)
2883 io_size = max_rd_size;
2884
2885 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2886
2887 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2888 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2889 pages_in_upl = upl_size / PAGE_SIZE;
2890
2891 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2892 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2893
2894 kret = ubc_create_upl(vp,
2895 upl_f_offset,
2896 upl_size,
2897 &upl,
2898 &pl,
2899 UPL_SET_LITE);
2900 if (kret != KERN_SUCCESS)
2901 panic("cluster_read: failed to get pagelist");
2902
2903 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2904 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2905
2906 /*
2907 * scan from the beginning of the upl looking for the first
2908 * non-valid page.... this will become the first page in
2909 * the request we're going to make to 'cluster_io'... if all
2910 * of the pages are valid, we won't call through to 'cluster_io'
2911 */
2912 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2913 if (!upl_valid_page(pl, start_pg))
2914 break;
2915 }
2916
2917 /*
2918 * scan from the starting invalid page looking for a valid
2919 * page before the end of the upl is reached, if we
2920 * find one, then it will be the last page of the request to
2921 * 'cluster_io'
2922 */
2923 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2924 if (upl_valid_page(pl, last_pg))
2925 break;
2926 }
2927 iostate.io_completed = 0;
2928 iostate.io_issued = 0;
2929 iostate.io_error = 0;
2930 iostate.io_wanted = 0;
2931
2932 if (start_pg < last_pg) {
2933 /*
2934 * we found a range of 'invalid' pages that must be filled
2935 * if the last page in this range is the last page of the file
2936 * we may have to clip the size of it to keep from reading past
2937 * the end of the last physical block associated with the file
2938 */
2939 upl_offset = start_pg * PAGE_SIZE;
2940 io_size = (last_pg - start_pg) * PAGE_SIZE;
2941
2942 if ((upl_f_offset + upl_offset + io_size) > filesize)
2943 io_size = filesize - (upl_f_offset + upl_offset);
2944
2945 /*
2946 * issue an asynchronous read to cluster_io
2947 */
2948
2949 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2950 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2951 }
2952 if (error == 0) {
2953 /*
2954 * if the read completed successfully, or there was no I/O request
2955	         * issued, then copy the data into user land via 'cluster_copy_upl_data'
2956 * we'll first add on any 'valid'
2957 * pages that were present in the upl when we acquired it.
2958 */
2959 u_int val_size;
2960
2961 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2962 if (!upl_valid_page(pl, uio_last))
2963 break;
2964 }
2965 /*
2966 * compute size to transfer this round, if uio->uio_resid is
2967 * still non-zero after this attempt, we'll loop around and
2968 * set up for another I/O.
2969 */
2970 val_size = (uio_last * PAGE_SIZE) - start_offset;
2971
2972 if (val_size > max_size)
2973 val_size = max_size;
2974
2975 if (val_size > uio_resid(uio))
2976 // LP64todo - fix this
2977 val_size = uio_resid(uio);
2978
2979 if (last_ioread_offset == 0)
2980 last_ioread_offset = uio->uio_offset + val_size;
2981
2982 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2983 /*
2984 * if there's still I/O left to do for this request, and...
2985 * we're not in hard throttle mode, then issue a
2986 * pre-fetch I/O... the I/O latency will overlap
2987 * with the copying of the data
2988 */
2989 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2990
2991 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2992
2993 if (last_ioread_offset > last_request_offset)
2994 last_ioread_offset = last_request_offset;
2995
2996 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2997 /*
2998 * this transfer will finish this request, so...
2999 * let's try to read ahead if we're in
3000 * a sequential access pattern and we haven't
3001 * explicitly disabled it
3002 */
3003 if (rd_ahead_enabled)
3004 cluster_rd_ahead(vp, &extent, filesize, rap);
3005
3006 if (rap != NULL) {
3007 if (extent.e_addr < rap->cl_lastr)
3008 rap->cl_maxra = 0;
3009 rap->cl_lastr = extent.e_addr;
3010 }
3011 }
3012 lck_mtx_lock(cl_mtxp);
3013
3014 while (iostate.io_issued != iostate.io_completed) {
3015 iostate.io_wanted = 1;
3016 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3017 }
3018 lck_mtx_unlock(cl_mtxp);
3019
3020 if (iostate.io_error)
3021 error = iostate.io_error;
3022 else
3023 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3024 }
3025 if (start_pg < last_pg) {
3026 /*
3027 * compute the range of pages that we actually issued an I/O for
3028 * and either commit them as valid if the I/O succeeded
3029 * or abort them if the I/O failed
3030 */
3031 io_size = (last_pg - start_pg) * PAGE_SIZE;
3032
3033 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3034 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3035
3036 if (error || (flags & IO_NOCACHE))
3037 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3038 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3039 else
3040 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3041 UPL_COMMIT_CLEAR_DIRTY |
3042 UPL_COMMIT_FREE_ON_EMPTY |
3043 UPL_COMMIT_INACTIVATE);
3044
3045 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3046 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3047 }
3048 if ((last_pg - start_pg) < pages_in_upl) {
3049 int cur_pg;
3050 int commit_flags;
3051
3052 /*
3053 * the set of pages that we issued an I/O for did not encompass
3054 * the entire upl... so just release these without modifying
3055 * their state
3056 */
3057 if (error)
3058 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3059 else {
3060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3061 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3062
3063 if (start_pg) {
3064 /*
3065 * we found some already valid pages at the beginning of
3066	                                 * the upl... commit these back to the inactive list with
3067	                                 * the reference bit cleared
3068 */
3069 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3070 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3071 | UPL_COMMIT_INACTIVATE;
3072
3073 if (upl_dirty_page(pl, cur_pg))
3074 commit_flags |= UPL_COMMIT_SET_DIRTY;
3075
3076 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3077 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3078 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3079 else
3080 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3081 PAGE_SIZE, commit_flags);
3082 }
3083 }
3084 if (last_pg < uio_last) {
3085 /*
3086 * we found some already valid pages immediately after the
3087 * pages we issued I/O for, commit these back to the
3088	                                 * inactive list with the reference bit cleared
3089 */
3090 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3091 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3092 | UPL_COMMIT_INACTIVATE;
3093
3094 if (upl_dirty_page(pl, cur_pg))
3095 commit_flags |= UPL_COMMIT_SET_DIRTY;
3096
3097 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3098 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3099 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3100 else
3101 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3102 PAGE_SIZE, commit_flags);
3103 }
3104 }
3105 if (uio_last < pages_in_upl) {
3106 /*
3107 * there were some invalid pages beyond the valid pages
3108 * that we didn't issue an I/O for, just release them
3109 * unchanged
3110 */
3111 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3112 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3113 }
3114
3115 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3116 (int)upl, -1, -1, 0, 0);
3117 }
3118 }
3119 if (retval == 0)
3120 retval = error;
3121
3122 if ( uio_resid(uio) ) {
3123 if (cluster_hard_throttle_on(vp)) {
3124 rd_ahead_enabled = 0;
3125 prefetch_enabled = 0;
3126
3127 max_rd_size = HARD_THROTTLE_MAXSIZE;
3128 } else {
3129 if (rap != NULL)
3130 rd_ahead_enabled = 1;
3131 prefetch_enabled = 1;
3132
3133 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3134 }
3135 }
3136 }
3137 if (rap != NULL) {
3138 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3139 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3140
3141 lck_mtx_unlock(&rap->cl_lockr);
3142 } else {
3143 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3144 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3145 }
3146
3147 return (retval);
3148}
3149
3150
3151static int
3152cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3153{
3154 upl_t upl;
3155 upl_page_info_t *pl;
3156 vm_offset_t upl_offset;
3157 off_t max_io_size;
3158 int io_size;
3159 int upl_size;
3160 int upl_needed_size;
3161 int pages_in_pl;
3162 int upl_flags;
3163 kern_return_t kret;
3164 int i;
3165 int force_data_sync;
3166 int retval = 0;
3167 int no_zero_fill = 0;
3168 int abort_flag = 0;
3169 struct clios iostate;
3170 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3171 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3172
3173
3174 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3175 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3176
3177 /*
3178 * When we enter this routine, we know
3179 * -- the offset into the file is on a pagesize boundary
3180 * -- the resid is a page multiple
3181 * -- the resid will not exceed iov_len
3182 */
3183
3184 iostate.io_completed = 0;
3185 iostate.io_issued = 0;
3186 iostate.io_error = 0;
3187 iostate.io_wanted = 0;
3188
3189 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3190 user_addr_t iov_base;
3191
3192 if (cluster_hard_throttle_on(vp)) {
3193 max_rd_size = HARD_THROTTLE_MAXSIZE;
3194 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3195 } else {
3196 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3197 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
3198 }
3199 max_io_size = filesize - uio->uio_offset;
3200
3201 // LP64todo - fix this
3202 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3203 io_size = max_io_size;
3204 else
3205 io_size = uio_resid(uio);
3206
3207 /*
3208 * First look for pages already in the cache
3209 * and move them to user space.
3210 */
3211 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3212
3213 if (retval) {
3214 /*
3215 * we may have already spun some portion of this request
3216 * off as async requests... we need to wait for the I/O
3217 * to complete before returning
3218 */
3219 goto wait_for_reads;
3220 }
3221 /*
3222 * If we are already finished with this read, then return
3223 */
3224 if (io_size == 0) {
3225 /*
3226 * we may have already spun some portion of this request
3227 * off as async requests... we need to wait for the I/O
3228 * to complete before returning
3229 */
3230 goto wait_for_reads;
3231 }
3232 max_io_size = io_size;
3233
3234 if (max_io_size > max_rd_size)
3235 max_io_size = max_rd_size;
3236
3237 io_size = 0;
3238
3239 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
3240
3241 if (io_size == 0)
3242 /*
3243 * we may have already spun some portion of this request
3244 * off as async requests... we need to wait for the I/O
3245 * to complete before returning
3246 */
3247 goto wait_for_reads;
3248
3249 iov_base = uio_curriovbase(uio);
3250
3251 // LP64todo - fix this!
3252 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3253 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3254
3255 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3256 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3257
3258 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3259 no_zero_fill = 1;
3260 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3261 } else {
3262 no_zero_fill = 0;
3263 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3264 }
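		/*
		 * when both the buffer offset and the transfer size are exact page
		 * multiples, every page in the upl will be completely overwritten
		 * by the read, so UPL_NOZEROFILL lets us skip zero-filling any
		 * freshly created pages and an abort can simply dump them
		 */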
3265 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3266 pages_in_pl = 0;
3267 upl_size = upl_needed_size;
3268 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3269
3270 if (no_zero_fill)
3271 upl_flags |= UPL_NOZEROFILL;
3272 if (force_data_sync)
3273 upl_flags |= UPL_FORCE_DATA_SYNC;
3274
3275 // LP64todo - fix this!
3276 kret = vm_map_create_upl(current_map(),
3277 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3278 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3279
3280 if (kret != KERN_SUCCESS) {
3281 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3282 (int)upl_offset, upl_size, io_size, kret, 0);
3283 /*
3284 * cluster_nocopy_read: failed to get pagelist
3285 *
3286 * we may have already spun some portion of this request
3287 * off as async requests... we need to wait for the I/O
3288 * to complete before returning
3289 */
3290 goto wait_for_reads;
3291 }
3292 pages_in_pl = upl_size / PAGE_SIZE;
3293 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3294
3295 for (i = 0; i < pages_in_pl; i++) {
3296 if (!upl_valid_page(pl, i))
3297 break;
3298 }
3299 if (i == pages_in_pl)
3300 break;
3301
3302 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3303 }
3304 if (force_data_sync >= 3) {
3305 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3306 (int)upl_offset, upl_size, io_size, kret, 0);
3307
3308 goto wait_for_reads;
3309 }
3310 /*
3311 * Consider the possibility that upl_size wasn't satisfied.
3312 */
3313 if (upl_size != upl_needed_size)
3314 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
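		/*
		 * e.g. if only 0x5000 of a requested 0x8000-byte upl came back and
		 * upl_offset is 0x200, io_size drops to 0x4000... only whole pages
		 * that were actually wired get transferred this round
		 */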
3315
3316 if (io_size == 0) {
3317 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3318 goto wait_for_reads;
3319 }
3320 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3321 (int)upl_offset, upl_size, io_size, kret, 0);
3322
3323 /*
3324 * request asynchronously so that we can overlap
3325 * the preparation of the next I/O
3326 * if there are already too many outstanding reads
3327 * wait until some have completed before issuing the next read
3328 */
3329 lck_mtx_lock(cl_mtxp);
3330
3331 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3332 iostate.io_wanted = 1;
3333 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3334 }
3335 lck_mtx_unlock(cl_mtxp);
3336
3337 if (iostate.io_error) {
3338 /*
3339 * one of the earlier reads we issued ran into a hard error
3340 * don't issue any more reads, cleanup the UPL
3341 * that was just created but not used, then
3342 * go wait for any other reads to complete before
3343 * returning the error to the caller
3344 */
3345 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3346
3347 goto wait_for_reads;
3348 }
3349 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3350 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3351
3352 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3353 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3354 (buf_t)NULL, &iostate);
3355
3356 /*
3357 * update the uio structure
3358 */
3359 uio_update(uio, (user_size_t)io_size);
3360
3361 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3362 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3363
3364 } /* end while */
3365
3366wait_for_reads:
3367 /*
3368 * make sure all async reads that are part of this stream
3369 * have completed before we return
3370 */
3371 lck_mtx_lock(cl_mtxp);
3372
3373 while (iostate.io_issued != iostate.io_completed) {
3374 iostate.io_wanted = 1;
3375 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3376 }
3377 lck_mtx_unlock(cl_mtxp);
3378
3379 if (iostate.io_error)
3380 retval = iostate.io_error;
3381
3382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3383 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3384
3385 return (retval);
3386}
3387
3388
3389static int
3390cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3391{
3392 upl_page_info_t *pl;
3393 upl_t upl;
3394 vm_offset_t upl_offset;
3395 addr64_t dst_paddr;
3396 off_t max_size;
3397 int io_size;
3398 user_size_t iov_len;
3399 user_addr_t iov_base;
3400 int tail_size;
3401 int upl_size;
3402 int upl_needed_size;
3403 int pages_in_pl;
3404 int upl_flags;
3405 kern_return_t kret;
3406 struct clios iostate;
3407 int error;
3408 int devblocksize;
3409
3410 devblocksize = vp->v_mount->mnt_devblocksize;
3411 /*
3412 * When we enter this routine, we know
3413 * -- the resid will not exceed iov_len
3414 * -- the target address is physically contiguous
3415 */
3416
3417#if LP64_DEBUG
3418 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3419 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3420 }
3421#endif /* LP64_DEBUG */
3422
3423 iov_len = uio_curriovlen(uio);
3424 iov_base = uio_curriovbase(uio);
3425
3426 max_size = filesize - uio->uio_offset;
3427
3428 // LP64todo - fix this!
3429 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3430 io_size = iov_len;
3431 else
3432 io_size = max_size;
3433
3434 // LP64todo - fix this!
3435 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3436 upl_needed_size = upl_offset + io_size;
3437
3438 error = 0;
3439 pages_in_pl = 0;
3440 upl_size = upl_needed_size;
3441 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3442
3443 kret = vm_map_get_upl(current_map(),
3444 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3445 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3446
3447 if (kret != KERN_SUCCESS) {
3448 /*
3449 * cluster_phys_read: failed to get pagelist
3450 */
3451 return(EINVAL);
3452 }
3453 if (upl_size < upl_needed_size) {
3454 /*
3455 * The upl_size wasn't satisfied.
3456 */
3457 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3458
3459 return(EINVAL);
3460 }
3461 pl = ubc_upl_pageinfo(upl);
3462
3463 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3464
3465 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3466 int head_size;
3467
3468 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3469
3470 if (head_size > io_size)
3471 head_size = io_size;
3472
3473 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3474
3475 if (error) {
3476 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3477
3478 return(EINVAL);
3479 }
3480 upl_offset += head_size;
3481 dst_paddr += head_size;
3482 io_size -= head_size;
3483 }
3484 tail_size = io_size & (devblocksize - 1);
3485 io_size -= tail_size;
3486
3487 iostate.io_completed = 0;
3488 iostate.io_issued = 0;
3489 iostate.io_error = 0;
3490 iostate.io_wanted = 0;
3491
3492 while (io_size && error == 0) {
3493 int xsize;
3494
3495 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3496 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3497 else
3498 xsize = io_size;
3499 /*
3500 * request asynchronously so that we can overlap
3501 * the preparation of the next I/O... we'll do
3502 * the commit after all the I/O has completed
3503	         * since it's all issued against the same UPL
3504 * if there are already too many outstanding reads
3505 * wait until some have completed before issuing the next
3506 */
3507 lck_mtx_lock(cl_mtxp);
3508
3509 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3510 iostate.io_wanted = 1;
3511 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3512 }
3513 lck_mtx_unlock(cl_mtxp);
3514
3515 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3516 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3517 (buf_t)NULL, &iostate);
3518 /*
3519 * The cluster_io read was issued successfully,
3520 * update the uio structure
3521 */
3522 if (error == 0) {
3523 uio_update(uio, (user_size_t)xsize);
3524
3525 dst_paddr += xsize;
3526 upl_offset += xsize;
3527 io_size -= xsize;
3528 }
3529 }
3530 /*
3531 * make sure all async reads that are part of this stream
3532 * have completed before we proceed
3533 */
3534 lck_mtx_lock(cl_mtxp);
3535
3536 while (iostate.io_issued != iostate.io_completed) {
3537 iostate.io_wanted = 1;
3538 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3539 }
3540 lck_mtx_unlock(cl_mtxp);
3541
3542 if (iostate.io_error)
3543 error = iostate.io_error;
3544
3545 if (error == 0 && tail_size)
3546 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3547
3548 /*
3549 * just release our hold on the physically contiguous
3550 * region without changing any state
3551 */
3552 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3553
3554 return (error);
3555}
3556
3557
3558/*
3559 * generate advisory I/O's in the largest chunks possible
3560 * the completed pages will be released into the VM cache
3561 */
3562int
3563advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3564{
3565 upl_page_info_t *pl;
3566 upl_t upl;
3567 vm_offset_t upl_offset;
3568 int upl_size;
3569 off_t upl_f_offset;
3570 int start_offset;
3571 int start_pg;
3572 int last_pg;
3573 int pages_in_upl;
3574 off_t max_size;
3575 int io_size;
3576 kern_return_t kret;
3577 int retval = 0;
3578 int issued_io;
3579 int skip_range;
3580
3581 if ( !UBCINFOEXISTS(vp))
3582 return(EINVAL);
3583
3584 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3585 (int)f_offset, resid, (int)filesize, 0, 0);
3586
3587 while (resid && f_offset < filesize && retval == 0) {
3588 /*
3589 * compute the size of the upl needed to encompass
3590 * the requested read... limit each call to cluster_io
3591 * to the maximum UPL size... cluster_io will clip if
3592	 * this exceeds the maximum io_size for the device...
3593 * make sure to account for
3594 * a starting offset that's not page aligned
3595 */
3596 start_offset = (int)(f_offset & PAGE_MASK_64);
3597 upl_f_offset = f_offset - (off_t)start_offset;
3598 max_size = filesize - f_offset;
3599
3600 if (resid < max_size)
3601 io_size = resid;
3602 else
3603 io_size = max_size;
3604
3605 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3606 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3607 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3608
3609 skip_range = 0;
3610 /*
3611	 * skip_range returns the number of contiguously present bytes
3612	 * in the cache starting at upl_f_offset within the file
3613 */
3614 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3615
3616 if (skip_range) {
3617 /*
3618 * skip over pages already present in the cache
3619 */
3620 io_size = skip_range - start_offset;
3621
3622 f_offset += io_size;
3623 resid -= io_size;
3624
3625 if (skip_range == upl_size)
3626 continue;
3627 /*
3628	 * have to issue some real I/O...
3629 * at this point, we know it's starting on a page boundary
3630 * because we've skipped over at least the first page in the request
3631 */
3632 start_offset = 0;
3633 upl_f_offset += skip_range;
3634 upl_size -= skip_range;
3635 }
3636 pages_in_upl = upl_size / PAGE_SIZE;
3637
3638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3639 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3640
3641 kret = ubc_create_upl(vp,
3642 upl_f_offset,
3643 upl_size,
3644 &upl,
3645 &pl,
3646 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3647 if (kret != KERN_SUCCESS)
3648 return(retval);
3649 issued_io = 0;
3650
3651 /*
3652 * before we start marching forward, we must make sure we end on
3653 * a present page, otherwise we will be working with a freed
3654 * upl
3655 */
3656 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3657 if (upl_page_present(pl, last_pg))
3658 break;
3659 }
3660 pages_in_upl = last_pg + 1;
3661
3662
3663 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3664 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3665
3666
3667 for (last_pg = 0; last_pg < pages_in_upl; ) {
3668 /*
3669 * scan from the beginning of the upl looking for the first
3670	 * page that is present... this will become the first page in
3671 * the request we're going to make to 'cluster_io'... if all
3672 * of the pages are absent, we won't call through to 'cluster_io'
3673 */
3674 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3675 if (upl_page_present(pl, start_pg))
3676 break;
3677 }
3678
3679 /*
3680 * scan from the starting present page looking for an absent
3681 * page before the end of the upl is reached, if we
3682 * find one, then it will terminate the range of pages being
3683 * presented to 'cluster_io'
3684 */
3685 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3686 if (!upl_page_present(pl, last_pg))
3687 break;
3688 }
3689
3690 if (last_pg > start_pg) {
3691 /*
3692 * we found a range of pages that must be filled
3693 * if the last page in this range is the last page of the file
3694 * we may have to clip the size of it to keep from reading past
3695 * the end of the last physical block associated with the file
3696 */
3697 upl_offset = start_pg * PAGE_SIZE;
3698 io_size = (last_pg - start_pg) * PAGE_SIZE;
3699
3700 if ((upl_f_offset + upl_offset + io_size) > filesize)
3701 io_size = filesize - (upl_f_offset + upl_offset);
3702
3703 /*
3704 * issue an asynchronous read to cluster_io
3705 */
3706 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3707 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3708
3709 issued_io = 1;
3710 }
3711 }
3712 if (issued_io == 0)
3713 ubc_upl_abort(upl, 0);
3714
3715 io_size = upl_size - start_offset;
3716
3717 if (io_size > resid)
3718 io_size = resid;
3719 f_offset += io_size;
3720 resid -= io_size;
3721 }
3722
3723 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3724 (int)f_offset, resid, retval, 0, 0);
3725
3726 return(retval);
3727}
3728
3729
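/*
 * cluster_push() flushes any delayed writes currently being held for the
 * vnode... if the vnode has been switched to the sparse (dirty region)
 * mechanism the whole map is pushed, otherwise the cl_clusters array is
 * pushed via cluster_try_push... it returns 0 when there was nothing to
 * push (no ubc info, deferred writes in effect, or no clusters)... with
 * IO_SYNC set we also wait for all writes on the vnode to drain
 *
 * a hypothetical caller (e.g. a filesystem's fsync path) might do no
 * more than:
 *
 *	(void) cluster_push(vp, IO_SYNC);
 *
 * to force everything the cluster layer is holding for 'vp' out to disk
 */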
3730int
3731cluster_push(vnode_t vp, int flags)
3732{
3733 int retval;
3734 struct cl_writebehind *wbp;
3735
3736 if ( !UBCINFOEXISTS(vp)) {
3737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3738 return (0);
3739 }
3740 /* return if deferred write is set */
3741 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3742 return (0);
3743 }
3744 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3745 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3746 return (0);
3747 }
3748 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3749 lck_mtx_unlock(&wbp->cl_lockw);
3750
3751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3752 return(0);
3753 }
3754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3755 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3756
3757 if (wbp->cl_scmap) {
3758 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3759
3760 retval = 1;
3761 } else
3762 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3763
3764 lck_mtx_unlock(&wbp->cl_lockw);
3765
3766 if (flags & IO_SYNC)
3767 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3768
3769 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3770 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3771
3772 return (retval);
3773}
3774
3775
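/*
 * cluster_release() tears down the per-vnode cluster state when the
 * ubc_info is being reclaimed... any sparse cluster map is released via
 * vfs_drt_control(..., 0), the write behind and read ahead contexts have
 * their locks destroyed and their storage returned to the zone allocator,
 * and the ubc_info pointers are cleared
 */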
3776__private_extern__ void
3777cluster_release(struct ubc_info *ubc)
3778{
3779 struct cl_writebehind *wbp;
3780 struct cl_readahead *rap;
3781
3782 if ((wbp = ubc->cl_wbehind)) {
3783
3784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3785
3786 if (wbp->cl_scmap)
3787 vfs_drt_control(&(wbp->cl_scmap), 0);
3788 } else {
3789 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3790 }
3791
3792 rap = ubc->cl_rahead;
3793
3794 if (wbp != NULL) {
3795 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3796 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3797 }
3798 if ((rap = ubc->cl_rahead)) {
3799 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3800 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3801 }
3802 ubc->cl_rahead = NULL;
3803 ubc->cl_wbehind = NULL;
3804
3805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3806}
3807
3808
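/*
 * cluster_push_EOF() pushes all delayed writes for the vnode out to the
 * supplied EOF... unlike cluster_push() it will allocate the write behind
 * context if one doesn't already exist (CLW_ALLOCATE), and it always
 * pushes everything (push_all == 1) before dropping the lock
 */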
3809static void
3810cluster_push_EOF(vnode_t vp, off_t EOF)
3811{
3812 struct cl_writebehind *wbp;
3813
3814 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3815
3816 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3817 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3818
3819 if (wbp->cl_scmap)
3820 sparse_cluster_push(wbp, vp, EOF, 1);
3821 else
3822 cluster_try_push(wbp, vp, EOF, 0, 1);
3823
3824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3825 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3826
3827 lck_mtx_unlock(&wbp->cl_lockw);
3828}
3829
3830
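/*
 * cluster_try_push() is entered with the write behind lock held... it
 * makes a locally sorted copy of the vnode's clusters, zeroes
 * wbp->cl_number so that new clusters can form, and then pushes the
 * copies with the lock dropped...
 *
 * can_delay - when set and all MAX_CLUSTERS slots are in use, clusters
 *             are only pushed if the write pattern looks sequential...
 *             otherwise nothing is pushed, which causes the caller to
 *             switch this vnode over to the sparse cluster mechanism
 * push_all  - when clear, only the lowest offset cluster is pushed
 *
 * the return value is the number of free cluster slots remaining
 * (MAX_CLUSTERS - wbp->cl_number) after any leftovers are merged back in
 */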
3831static int
3832cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3833{
3834 int cl_index;
3835 int cl_index1;
3836 int min_index;
3837 int cl_len;
3838 int cl_pushed = 0;
3839 struct cl_wextent l_clusters[MAX_CLUSTERS];
3840
3841 /*
3842 * the write behind context exists and has
3843 * already been locked...
3844 *
3845 * make a local 'sorted' copy of the clusters
3846 * and clear wbp->cl_number so that new clusters can
3847 * be developed
3848 */
3849 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3850 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3851 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3852 continue;
3853 if (min_index == -1)
3854 min_index = cl_index1;
3855 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3856 min_index = cl_index1;
3857 }
3858 if (min_index == -1)
3859 break;
3860 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3861 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3862 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3863
3864 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3865 }
3866 wbp->cl_number = 0;
3867
3868 cl_len = cl_index;
3869
3870 if (can_delay && cl_len == MAX_CLUSTERS) {
3871 int i;
3872
3873 /*
3874 * determine if we appear to be writing the file sequentially
3875 * if not, by returning without having pushed any clusters
3876 * we will cause this vnode to be pushed into the sparse cluster mechanism
3877 * used for managing more random I/O patterns
3878 *
3879 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3880 * that's why we're in try_push with can_delay true...
3881 *
3882 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3883	 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above
3884 * so we can just make a simple pass through, up to, but not including the last one...
3885 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3886 * are sequential
3887 *
3888 * we let the last one be partial as long as it was adjacent to the previous one...
3889 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3890 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3891 */
3892 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3893 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3894 goto dont_try;
3895 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3896 goto dont_try;
3897 }
3898 }
3899 /*
3900 * drop the lock while we're firing off the I/Os...
3901 * this is safe since I'm working off of a private sorted copy
3902 * of the clusters, and I'm going to re-evaluate the public
3903 * state after I retake the lock
3904 */
3905 lck_mtx_unlock(&wbp->cl_lockw);
3906
3907 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3908 int flags;
3909 struct cl_extent cl;
3910
3911 /*
3912 * try to push each cluster in turn...
3913 */
3914 if (l_clusters[cl_index].io_nocache)
3915 flags = IO_NOCACHE;
3916 else
3917 flags = 0;
3918 cl.b_addr = l_clusters[cl_index].b_addr;
3919 cl.e_addr = l_clusters[cl_index].e_addr;
3920
3921 cluster_push_x(vp, &cl, EOF, flags);
3922
3923 l_clusters[cl_index].b_addr = 0;
3924 l_clusters[cl_index].e_addr = 0;
3925
3926 cl_pushed++;
3927
3928 if (push_all == 0)
3929 break;
3930 }
3931 lck_mtx_lock(&wbp->cl_lockw);
3932
3933dont_try:
3934 if (cl_len > cl_pushed) {
3935 /*
3936 * we didn't push all of the clusters, so
3937	 * let's try to merge them back into the vnode
3938 */
3939 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3940 /*
3941 * we picked up some new clusters while we were trying to
3942 * push the old ones... this can happen because I've dropped
3943	 * the write behind lock... the sum of the
3944 * leftovers plus the new cluster count exceeds our ability
3945 * to represent them, so switch to the sparse cluster mechanism
3946 *
3947 * collect the active public clusters...
3948 */
3949 sparse_cluster_switch(wbp, vp, EOF);
3950
3951 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3952 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3953 continue;
3954 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3955 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3956 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3957
3958 cl_index1++;
3959 }
3960 /*
3961 * update the cluster count
3962 */
3963 wbp->cl_number = cl_index1;
3964
3965 /*
3966 * and collect the original clusters that were moved into the
3967 * local storage for sorting purposes
3968 */
3969 sparse_cluster_switch(wbp, vp, EOF);
3970
3971 } else {
3972 /*
3973 * we've got room to merge the leftovers back in
3974 * just append them starting at the next 'hole'
3975 * represented by wbp->cl_number
3976 */
3977 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3978 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3979 continue;
3980
3981 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3982 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3983 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3984
3985 cl_index1++;
3986 }
3987 /*
3988 * update the cluster count
3989 */
3990 wbp->cl_number = cl_index1;
3991 }
3992 }
3993 return(MAX_CLUSTERS - wbp->cl_number);
3994}
3995
3996
3997
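/*
 * cluster_push_x() writes the dirty pages spanned by the page extent
 * [cl->b_addr, cl->e_addr) out to disk, clipping the request at EOF...
 * only pages that are actually dirty are gathered (UPL_RET_ONLY_DIRTY),
 * and each run of dirty pages is issued as a separate cluster_io
 * request, asynchronously unless IO_SYNC was passed in 'flags'
 */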
3998static int
3999cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
4000{
4001 upl_page_info_t *pl;
4002 upl_t upl;
4003 vm_offset_t upl_offset;
4004 int upl_size;
4005 off_t upl_f_offset;
4006 int pages_in_upl;
4007 int start_pg;
4008 int last_pg;
4009 int io_size;
4010 int io_flags;
4011 int upl_flags;
4012 int size;
4013 int error = 0;
4014 int retval;
4015 kern_return_t kret;
4016
4017
4018 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4019 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4020
4021 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4022 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4023
4024 return (0);
4025 }
4026 upl_size = pages_in_upl * PAGE_SIZE;
4027 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4028
4029 if (upl_f_offset + upl_size >= EOF) {
4030
4031 if (upl_f_offset >= EOF) {
4032 /*
4033 * must have truncated the file and missed
4034 * clearing a dangling cluster (i.e. it's completely
4035	 * beyond the new EOF)
4036 */
4037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4038
4039 return(0);
4040 }
4041 size = EOF - upl_f_offset;
4042
4043 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4044 pages_in_upl = upl_size / PAGE_SIZE;
4045 } else
4046 size = upl_size;
4047
4048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4049
4050 /*
4051 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4052 *
4053 * - only pages that are currently dirty are returned... these are the ones we need to clean
4054 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4055 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4056 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4057 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4058 *
4059 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4060 */
4061
4062 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4063 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4064 else
4065 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4066
4067 kret = ubc_create_upl(vp,
4068 upl_f_offset,
4069 upl_size,
4070 &upl,
4071 &pl,
4072 upl_flags);
4073 if (kret != KERN_SUCCESS)
4074 panic("cluster_push: failed to get pagelist");
4075
4076 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4077
4078 /*
4079 * since we only asked for the dirty pages back
4080 * it's possible that we may only get a few or even none, so...
4081 * before we start marching forward, we must make sure we know
4082 * where the last present page is in the UPL, otherwise we could
4083 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4084 * employed by commit_range and abort_range.
4085 */
4086 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4087 if (upl_page_present(pl, last_pg))
4088 break;
4089 }
4090 pages_in_upl = last_pg + 1;
4091
4092 if (pages_in_upl == 0) {
4093 ubc_upl_abort(upl, 0);
4094
4095 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4096 return(0);
4097 }
4098
4099 for (last_pg = 0; last_pg < pages_in_upl; ) {
4100 /*
4101 * find the next dirty page in the UPL
4102 * this will become the first page in the
4103 * next I/O to generate
4104 */
4105 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4106 if (upl_dirty_page(pl, start_pg))
4107 break;
4108 if (upl_page_present(pl, start_pg))
4109 /*
4110	 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
4111 * just release these unchanged since we're not going
4112 * to steal them or change their state
4113 */
4114 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4115 }
4116 if (start_pg >= pages_in_upl)
4117 /*
4118 * done... no more dirty pages to push
4119 */
4120 break;
4121 if (start_pg > last_pg)
4122 /*
4123 * skipped over some non-dirty pages
4124 */
4125 size -= ((start_pg - last_pg) * PAGE_SIZE);
4126
4127 /*
4128 * find a range of dirty pages to write
4129 */
4130 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4131 if (!upl_dirty_page(pl, last_pg))
4132 break;
4133 }
4134 upl_offset = start_pg * PAGE_SIZE;
4135
4136 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4137
4138 io_flags = CL_THROTTLE | CL_COMMIT;
4139
4140 if ( !(flags & IO_SYNC))
4141 io_flags |= CL_ASYNC;
4142
4143 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4144 io_flags, (buf_t)NULL, (struct clios *)NULL);
4145
4146 if (error == 0 && retval)
4147 error = retval;
4148
4149 size -= io_size;
4150 }
4151 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4152
4153 return(error);
4154}
4155
4156
4157/*
4158 * sparse_cluster_switch is called with the write behind lock held
4159 */
4160static void
4161sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4162{
4163 int cl_index;
4164
4165 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4166
4167 if (wbp->cl_scmap == NULL)
4168 wbp->cl_scdirty = 0;
4169
4170 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4171 int flags;
4172 struct cl_extent cl;
4173
4174 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4175
4176 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4177 if (flags & UPL_POP_DIRTY) {
4178 cl.e_addr = cl.b_addr + 1;
4179
4180 sparse_cluster_add(wbp, vp, &cl, EOF);
4181 }
4182 }
4183 }
4184 }
4185 wbp->cl_number = 0;
4186
4187 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4188}
4189
4190
4191/*
4192 * sparse_cluster_push is called with the write behind lock held
4193 */
4194static void
4195sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4196{
4197 struct cl_extent cl;
4198 off_t offset;
4199 u_int length;
4200
4201 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4202
4203 if (push_all)
4204 vfs_drt_control(&(wbp->cl_scmap), 1);
4205
4206 for (;;) {
4207 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4208 break;
4209
4210 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4211 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4212
4213 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4214
4215 cluster_push_x(vp, &cl, EOF, 0);
4216
4217 if (push_all == 0)
4218 break;
4219 }
4220 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4221}
4222
4223
4224/*
4225 * sparse_cluster_add is called with the write behind lock held
4226 */
4227static void
4228sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4229{
4230 u_int new_dirty;
4231 u_int length;
4232 off_t offset;
4233
4234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4235
4236 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4237 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4238
4239 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4240 /*
4241 * no room left in the map
4242 * only a partial update was done
4243 * push out some pages and try again
4244 */
4245 wbp->cl_scdirty += new_dirty;
4246
4247 sparse_cluster_push(wbp, vp, EOF, 0);
4248
4249 offset += (new_dirty * PAGE_SIZE_64);
4250 length -= (new_dirty * PAGE_SIZE);
4251 }
4252 wbp->cl_scdirty += new_dirty;
4253
4254 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4255}
4256
4257
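/*
 * cluster_align_phys_io() handles the head or tail of a physically
 * contiguous transfer that isn't aligned to the device block size... the
 * partial block is bounced through the corresponding buffer cache page
 * (read in first if it isn't already valid), copied to or from the
 * caller's physical address with copypv(), and pushed back out when this
 * is a write or the page was already valid and dirty
 */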
4258static int
4259cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4260{
4261 upl_page_info_t *pl;
4262 upl_t upl;
4263 addr64_t ubc_paddr;
4264 kern_return_t kret;
4265 int error = 0;
4266 int did_read = 0;
4267 int abort_flags;
4268 int upl_flags;
4269
4270 upl_flags = UPL_SET_LITE;
4271 if (! (flags & CL_READ)) {
4272 /*
4273 * "write" operation: let the UPL subsystem know
4274 * that we intend to modify the buffer cache pages
4275 * we're gathering.
4276 */
4277 upl_flags |= UPL_WILL_MODIFY;
4278 }
4279
4280 kret = ubc_create_upl(vp,
4281 uio->uio_offset & ~PAGE_MASK_64,
4282 PAGE_SIZE,
4283 &upl,
4284 &pl,
4285 upl_flags);
4286
4287 if (kret != KERN_SUCCESS)
4288 return(EINVAL);
4289
4290 if (!upl_valid_page(pl, 0)) {
4291 /*
4292 * issue a synchronous read to cluster_io
4293 */
4294 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4295 CL_READ, (buf_t)NULL, (struct clios *)NULL);
4296 if (error) {
4297 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4298
4299 return(error);
4300 }
4301 did_read = 1;
4302 }
4303 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4304
4305/*
4306 * NOTE: There is no prototype for the following in BSD. It, and the definitions
4307 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4308 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4309 * way to do so without exporting them to kexts as well.
4310 */
4311 if (flags & CL_READ)
4312// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4313 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
4314 else
4315// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4316 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
4317
4318 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4319 /*
4320 * issue a synchronous write to cluster_io
4321 */
4322 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4323 0, (buf_t)NULL, (struct clios *)NULL);
4324 }
4325 if (error == 0)
4326 uio_update(uio, (user_size_t)xsize);
4327
4328 if (did_read)
4329 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4330 else
4331 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4332
4333 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4334
4335 return (error);
4336}
4337
4338
4339
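/*
 * cluster_copy_upl_data() moves 'xsize' bytes between the caller's uio
 * and the pages described by the upl, starting at upl_offset... the uio
 * segment flag is temporarily switched to its UIO_PHYS_* equivalent so
 * that uiomove64() can be driven with the physical address of each page,
 * and is restored before returning
 */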
4340int
4341cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4342{
4343 int pg_offset;
4344 int pg_index;
4345 int csize;
4346 int segflg;
4347 int retval = 0;
4348 upl_page_info_t *pl;
4349
4350 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4351 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4352
4353 segflg = uio->uio_segflg;
4354
4355 switch(segflg) {
4356
4357 case UIO_USERSPACE32:
4358 case UIO_USERISPACE32:
4359 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4360 break;
4361
4362 case UIO_USERSPACE:
4363 case UIO_USERISPACE:
4364 uio->uio_segflg = UIO_PHYS_USERSPACE;
4365 break;
4366
4367 case UIO_USERSPACE64:
4368 case UIO_USERISPACE64:
4369 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4370 break;
4371
4372 case UIO_SYSSPACE32:
4373 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4374 break;
4375
4376 case UIO_SYSSPACE:
4377 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4378 break;
4379
4380 case UIO_SYSSPACE64:
4381 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4382 break;
4383 }
4384 pl = ubc_upl_pageinfo(upl);
4385
4386 pg_index = upl_offset / PAGE_SIZE;
4387 pg_offset = upl_offset & PAGE_MASK;
4388 csize = min(PAGE_SIZE - pg_offset, xsize);
4389
4390 while (xsize && retval == 0) {
4391 addr64_t paddr;
4392
4393 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4394
4395 retval = uiomove64(paddr, csize, uio);
4396
4397 pg_index += 1;
4398 pg_offset = 0;
4399 xsize -= csize;
4400 csize = min(PAGE_SIZE, xsize);
4401 }
4402 uio->uio_segflg = segflg;
4403
4404 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4405 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4406
4407 return (retval);
4408}
4409
4410
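/*
 * cluster_copy_ubc_data() tries to satisfy a transfer directly from pages
 * already resident in the ubc by way of memory_object_control_uiomove()...
 * on return *io_resid holds however much of the request could not be
 * copied this way, and mark_dirty says whether the pages touched should
 * be left dirty (i.e. this is the write case)
 *
 * a hypothetical caller wanting to peel off whatever is already cached
 * before falling back to real I/O might do roughly:
 *
 *	io_resid = (int)uio_resid(uio);
 *	retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
 *	if (retval == 0 && io_resid)
 *		retval = cluster_read(vp, uio, filesize, 0);
 */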
4411int
4412cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4413{
4414 int segflg;
4415 int io_size;
4416 int xsize;
4417 int start_offset;
4418 int retval = 0;
4419 memory_object_control_t control;
4420
4421
4422 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4423 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4424
4425 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4426 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4427 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4428 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4429
4430 return(0);
4431 }
4432 segflg = uio->uio_segflg;
4433
4434 switch(segflg) {
4435
4436 case UIO_USERSPACE32:
4437 case UIO_USERISPACE32:
4438 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4439 break;
4440
4441 case UIO_USERSPACE64:
4442 case UIO_USERISPACE64:
4443 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4444 break;
4445
4446 case UIO_SYSSPACE32:
4447 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4448 break;
4449
4450 case UIO_SYSSPACE64:
4451 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4452 break;
4453
4454 case UIO_USERSPACE:
4455 case UIO_USERISPACE:
4456 uio->uio_segflg = UIO_PHYS_USERSPACE;
4457 break;
4458
4459 case UIO_SYSSPACE:
4460 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4461 break;
4462 }
4463
4464 if ( (io_size = *io_resid) ) {
4465 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4466 xsize = uio_resid(uio);
4467
4468 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4469 uio, start_offset, io_size, mark_dirty);
4470 xsize -= uio_resid(uio);
4471 io_size -= xsize;
4472 }
4473 uio->uio_segflg = segflg;
4474 *io_resid = io_size;
4475
4476 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4477 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4478
4479 return(retval);
4480}
4481
4482
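/*
 * is_file_clean() walks the file a page at a time and asks the VM (via
 * ubc_page_op) whether each resident page is dirty... it returns EINVAL
 * if any dirty page is found, 0 if the file appears clean
 */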
4483int
4484is_file_clean(vnode_t vp, off_t filesize)
4485{
4486 off_t f_offset;
4487 int flags;
4488 int total_dirty = 0;
4489
4490 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4491 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4492 if (flags & UPL_POP_DIRTY) {
4493 total_dirty++;
4494 }
4495 }
4496 }
4497 if (total_dirty)
4498 return(EINVAL);
4499
4500 return (0);
4501}
4502
4503
4504
4505/*
4506 * Dirty region tracking/clustering mechanism.
4507 *
4508 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4509 * dirty regions within a larger space (file). It is primarily intended to
4510 * support clustering in large files with many dirty areas.
4511 *
4512 * The implementation assumes that dirty regions are tracked at page granularity.
4513 *
4514 * To represent dirty pages within the file, we store bit vectors in a
4515 * variable-size circular hash.
4516 */
4517
4518/*
4519 * Bitvector size. This determines the number of pages we group in a
4520 * single hashtable entry. Each hashtable entry is aligned to this
4521 * size within the file.
4522 */
4523#define DRT_BITVECTOR_PAGES 256
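/*
 * with the 4K pages implied by the 1MB DRT_ADDRESS_MASK below, each
 * hashtable entry therefore covers 256 * 4096 bytes = 1MB of the file
 */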
4524
4525/*
4526 * File offset handling.
4527 *
4528 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4529 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1), i.e. ~((256 * 4096) - 1)
4530 */
4531#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4532#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
4533
4534/*
4535 * Hashtable address field handling.
4536 *
4537 * The low-order bits of the hashtable address field are used to hold
4538 * the entry's page count, in order to conserve space.
4539 *
4540 * DRT_HASH_COUNT_MASK must be large enough to store the range
4541 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4542 * to indicate that the bucket is actually unoccupied.
4543 */
4544#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4545#define DRT_HASH_SET_ADDRESS(scm, i, a) \
4546 do { \
4547 (scm)->scm_hashtable[(i)].dhe_control = \
4548 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4549 } while (0)
4550#define DRT_HASH_COUNT_MASK 0x1ff
4551#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4552#define DRT_HASH_SET_COUNT(scm, i, c) \
4553 do { \
4554 (scm)->scm_hashtable[(i)].dhe_control = \
4555 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4556 } while (0)
4557#define DRT_HASH_CLEAR(scm, i) \
4558 do { \
4559 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4560 } while (0)
4561#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4562#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4563#define DRT_HASH_COPY(oscm, oi, scm, i) \
4564 do { \
4565 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4566 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4567	 } while (0)
4568
4569
4570/*
4571 * Hash table moduli.
4572 *
4573 * Since the hashtable entry's size is dependent on the size of
4574 * the bitvector, and since the hashtable size is constrained to
4575 * both being prime and fitting within the desired allocation
4576 * size, these values need to be manually determined.
4577 *
4578 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4579 *
4580 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4581 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4582 */
4583#define DRT_HASH_SMALL_MODULUS 23
4584#define DRT_HASH_LARGE_MODULUS 401
4585
4586#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4587#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
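/*
 * worked out: each entry is an 8 byte dhe_control plus a
 * (256 / 32) * 4 = 32 byte bitvector, i.e. 40 bytes... 23 * 40 = 920
 * and 401 * 40 = 16040 bytes of table respectively, which is where the
 * spare byte counts above come from (the spare space also has to hold
 * the struct vfs_drt_clustermap header that precedes the table)
 */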
4588
4589/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4590
4591/*
4592 * Hashtable bitvector handling.
4593 *
4594 * Bitvector fields are 32 bits long.
4595 */
4596
4597#define DRT_HASH_SET_BIT(scm, i, bit) \
4598 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4599
4600#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4601 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4602
4603#define DRT_HASH_TEST_BIT(scm, i, bit) \
4604 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4605
4606#define DRT_BITVECTOR_CLEAR(scm, i) \
4607 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4608
4609#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4610 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4611 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4612 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4613
4614
4615
4616/*
4617 * Hashtable entry.
4618 */
4619struct vfs_drt_hashentry {
4620 u_int64_t dhe_control;
4621 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4622};
4623
4624/*
4625 * Dirty Region Tracking structure.
4626 *
4627 * The hashtable is allocated entirely inside the DRT structure.
4628 *
4629 * The hash is a simple circular prime modulus arrangement, the structure
4630 * is resized from small to large if it overflows.
4631 */
4632
4633struct vfs_drt_clustermap {
4634 u_int32_t scm_magic; /* sanity/detection */
4635#define DRT_SCM_MAGIC 0x12020003
4636 u_int32_t scm_modulus; /* current ring size */
4637 u_int32_t scm_buckets; /* number of occupied buckets */
4638 u_int32_t scm_lastclean; /* last entry we cleaned */
4639 u_int32_t scm_iskips; /* number of slot skips */
4640
4641 struct vfs_drt_hashentry scm_hashtable[0];
4642};
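/*
 * scm_hashtable is a zero-length array... the table itself occupies the
 * remainder of the DRT_SMALL_ALLOCATION or DRT_LARGE_ALLOCATION block
 * handed back by kmem_alloc() in vfs_drt_alloc_map()
 */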
4643
4644
4645#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4646#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
4647
4648/*
4649 * Debugging codes and arguments.
4650 */
4651#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4652#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4653#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4654#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4655#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4656 * dirty */
4657 /* 0, setcount */
4658 /* 1 (clean, no map) */
4659 /* 2 (map alloc fail) */
4660 /* 3, resid (partial) */
4661#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4662#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4663 * lastclean, iskips */
4664
4665
4666static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4667static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4668static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4669 u_int64_t offset, int *indexp);
4670static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4671 u_int64_t offset,
4672 int *indexp,
4673 int recursed);
4674static kern_return_t vfs_drt_do_mark_pages(
4675 void **cmapp,
4676 u_int64_t offset,
4677 u_int length,
4678 int *setcountp,
4679 int dirty);
4680static void vfs_drt_trace(
4681 struct vfs_drt_clustermap *cmap,
4682 int code,
4683 int arg1,
4684 int arg2,
4685 int arg3,
4686 int arg4);
4687
4688
4689/*
4690 * Allocate and initialise a sparse cluster map.
4691 *
4692 * Will allocate a new map, resize or compact an existing map.
4693 *
4694 * XXX we should probably have at least one intermediate map size,
4695 * as the 1:16 ratio seems a bit drastic.
4696 */
4697static kern_return_t
4698vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4699{
4700 struct vfs_drt_clustermap *cmap, *ocmap;
4701 kern_return_t kret;
4702 u_int64_t offset;
4703 int nsize, i, active_buckets, index, copycount;
4704
4705 ocmap = NULL;
4706 if (cmapp != NULL)
4707 ocmap = *cmapp;
4708
4709 /*
4710 * Decide on the size of the new map.
4711 */
4712 if (ocmap == NULL) {
4713 nsize = DRT_HASH_SMALL_MODULUS;
4714 } else {
4715 /* count the number of active buckets in the old map */
4716 active_buckets = 0;
4717 for (i = 0; i < ocmap->scm_modulus; i++) {
4718 if (!DRT_HASH_VACANT(ocmap, i) &&
4719 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4720 active_buckets++;
4721 }
4722 /*
4723 * If we're currently using the small allocation, check to
4724 * see whether we should grow to the large one.
4725 */
4726 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4727 /* if the ring is nearly full */
4728 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4729 nsize = DRT_HASH_LARGE_MODULUS;
4730 } else {
4731 nsize = DRT_HASH_SMALL_MODULUS;
4732 }
4733 } else {
4734 /* already using the large modulus */
4735 nsize = DRT_HASH_LARGE_MODULUS;
4736 /*
4737 * If the ring is completely full, there's
4738 * nothing useful for us to do. Behave as
4739 * though we had compacted into the new
4740 * array and return.
4741 */
4742 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4743 return(KERN_SUCCESS);
4744 }
4745 }
4746
4747 /*
4748 * Allocate and initialise the new map.
4749 */
4750
4751 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4752 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4753 if (kret != KERN_SUCCESS)
4754 return(kret);
4755 cmap->scm_magic = DRT_SCM_MAGIC;
4756 cmap->scm_modulus = nsize;
4757 cmap->scm_buckets = 0;
4758 cmap->scm_lastclean = 0;
4759 cmap->scm_iskips = 0;
4760 for (i = 0; i < cmap->scm_modulus; i++) {
4761 DRT_HASH_CLEAR(cmap, i);
4762 DRT_HASH_VACATE(cmap, i);
4763 DRT_BITVECTOR_CLEAR(cmap, i);
4764 }
4765
4766 /*
4767 * If there's an old map, re-hash entries from it into the new map.
4768 */
4769 copycount = 0;
4770 if (ocmap != NULL) {
4771 for (i = 0; i < ocmap->scm_modulus; i++) {
4772 /* skip empty buckets */
4773 if (DRT_HASH_VACANT(ocmap, i) ||
4774 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4775 continue;
4776 /* get new index */
4777 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4778 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4779 if (kret != KERN_SUCCESS) {
4780 /* XXX need to bail out gracefully here */
4781 panic("vfs_drt: new cluster map mysteriously too small");
4782 }
4783 /* copy */
4784 DRT_HASH_COPY(ocmap, i, cmap, index);
4785 copycount++;
4786 }
4787 }
4788
4789 /* log what we've done */
4790 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4791
4792 /*
4793 * It's important to ensure that *cmapp always points to
4794 * a valid map, so we must overwrite it before freeing
4795 * the old map.
4796 */
4797 *cmapp = cmap;
4798 if (ocmap != NULL) {
4799 /* emit stats into trace buffer */
4800 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4801 ocmap->scm_modulus,
4802 ocmap->scm_buckets,
4803 ocmap->scm_lastclean,
4804 ocmap->scm_iskips);
4805
4806 vfs_drt_free_map(ocmap);
4807 }
4808 return(KERN_SUCCESS);
4809}
4810
4811
4812/*
4813 * Free a sparse cluster map.
4814 */
4815static kern_return_t
4816vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4817{
4818 kmem_free(kernel_map, (vm_offset_t)cmap,
4819 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4820 return(KERN_SUCCESS);
4821}
4822
4823
4824/*
4825 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4826 */
4827static kern_return_t
4828vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4829{
4830 int index, i;
4831
4832 offset = DRT_ALIGN_ADDRESS(offset);
4833 index = DRT_HASH(cmap, offset);
4834
4835 /* traverse the hashtable */
4836 for (i = 0; i < cmap->scm_modulus; i++) {
4837
4838 /*
4839 * If the slot is vacant, we can stop.
4840 */
4841 if (DRT_HASH_VACANT(cmap, index))
4842 break;
4843
4844 /*
4845 * If the address matches our offset, we have success.
4846 */
4847 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4848 *indexp = index;
4849 return(KERN_SUCCESS);
4850 }
4851
4852 /*
4853 * Move to the next slot, try again.
4854 */
4855 index = DRT_HASH_NEXT(cmap, index);
4856 }
4857 /*
4858 * It's not there.
4859 */
4860 return(KERN_FAILURE);
4861}
4862
4863/*
4864 * Find the hashtable slot for the supplied offset. If we haven't allocated
4865 * one yet, allocate one and populate the address field. Note that the new
4866 * entry starts with a zero page count and is thus still technically free, so
4867 * in the case where we are called to clean pages, the slot will remain free.
4868 */
4869static kern_return_t
4870vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4871{
4872 struct vfs_drt_clustermap *cmap;
4873 kern_return_t kret;
4874 int index, i;
4875
4876 cmap = *cmapp;
4877
4878 /* look for an existing entry */
4879 kret = vfs_drt_search_index(cmap, offset, indexp);
4880 if (kret == KERN_SUCCESS)
4881 return(kret);
4882
4883 /* need to allocate an entry */
4884 offset = DRT_ALIGN_ADDRESS(offset);
4885 index = DRT_HASH(cmap, offset);
4886
4887 /* scan from the index forwards looking for a vacant slot */
4888 for (i = 0; i < cmap->scm_modulus; i++) {
4889 /* slot vacant? */
4890 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4891 cmap->scm_buckets++;
4892 if (index < cmap->scm_lastclean)
4893 cmap->scm_lastclean = index;
4894 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4895 DRT_HASH_SET_COUNT(cmap, index, 0);
4896 DRT_BITVECTOR_CLEAR(cmap, index);
4897 *indexp = index;
4898 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4899 return(KERN_SUCCESS);
4900 }
4901 cmap->scm_iskips += i;
4902 index = DRT_HASH_NEXT(cmap, index);
4903 }
4904
4905 /*
4906 * We haven't found a vacant slot, so the map is full. If we're not
4907 * already recursed, try reallocating/compacting it.
4908 */
4909 if (recursed)
4910 return(KERN_FAILURE);
4911 kret = vfs_drt_alloc_map(cmapp);
4912 if (kret == KERN_SUCCESS) {
4913 /* now try to insert again */
4914 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4915 }
4916 return(kret);
4917}
4918
4919/*
4920 * Implementation of set dirty/clean.
4921 *
4922 * In the 'clean' case, not finding a map is OK.
4923 */
4924static kern_return_t
4925vfs_drt_do_mark_pages(
4926 void **private,
4927 u_int64_t offset,
4928 u_int length,
4929 int *setcountp,
4930 int dirty)
4931{
4932 struct vfs_drt_clustermap *cmap, **cmapp;
4933 kern_return_t kret;
4934 int i, index, pgoff, pgcount, setcount, ecount;
4935
4936 cmapp = (struct vfs_drt_clustermap **)private;
4937 cmap = *cmapp;
4938
4939 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4940
4941 if (setcountp != NULL)
4942 *setcountp = 0;
4943
4944 /* allocate a cluster map if we don't already have one */
4945 if (cmap == NULL) {
4946 /* no cluster map, nothing to clean */
4947 if (!dirty) {
4948 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4949 return(KERN_SUCCESS);
4950 }
4951 kret = vfs_drt_alloc_map(cmapp);
4952 if (kret != KERN_SUCCESS) {
4953 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4954 return(kret);
4955 }
4956 }
4957 setcount = 0;
4958
4959 /*
4960 * Iterate over the length of the region.
4961 */
4962 while (length > 0) {
4963 /*
4964 * Get the hashtable index for this offset.
4965 *
4966 * XXX this will add blank entries if we are clearing a range
4967 * that hasn't been dirtied.
4968 */
4969 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4970 cmap = *cmapp; /* may have changed! */
4971 /* this may be a partial-success return */
4972 if (kret != KERN_SUCCESS) {
4973 if (setcountp != NULL)
4974 *setcountp = setcount;
4975 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4976
4977 return(kret);
4978 }
4979
4980 /*
4981 * Work out how many pages we're modifying in this
4982 * hashtable entry.
4983 */
4984 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4985 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4986
4987 /*
4988 * Iterate over pages, dirty/clearing as we go.
4989 */
4990 ecount = DRT_HASH_GET_COUNT(cmap, index);
4991 for (i = 0; i < pgcount; i++) {
4992 if (dirty) {
4993 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4994 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4995 ecount++;
4996 setcount++;
4997 }
4998 } else {
4999 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5000 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5001 ecount--;
5002 setcount++;
5003 }
5004 }
5005 }
5006 DRT_HASH_SET_COUNT(cmap, index, ecount);
5007
5008 offset += pgcount * PAGE_SIZE;
5009 length -= pgcount * PAGE_SIZE;
5010 }
5011 if (setcountp != NULL)
5012 *setcountp = setcount;
5013
5014 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5015
5016 return(KERN_SUCCESS);
5017}
5018
5019/*
5020 * Mark a set of pages as dirty/clean.
5021 *
5022 * This is a public interface.
5023 *
5024 * cmapp
5025 * Pointer to storage suitable for holding a pointer. Note that
5026 * this must either be NULL or a value set by this function.
5027 *
5028 * size
5029 * Current file size in bytes (historical; no longer part of this interface).
5030 *
5031 * offset
5032 * Offset of the first page to be marked as dirty, in bytes. Must be
5033 * page-aligned.
5034 *
5035 * length
5036 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5037 *
5038 * setcountp
5039 * Number of pages newly marked dirty by this call (optional).
5040 *
5041 * Returns KERN_SUCCESS if all the pages were successfully marked.
5042 */
5043static kern_return_t
5044vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5045{
5046 /* XXX size unused, drop from interface */
5047 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5048}
5049
5050#if 0
5051static kern_return_t
5052vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5053{
5054 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5055}
5056#endif
5057
5058/*
5059 * Get a cluster of dirty pages.
5060 *
5061 * This is a public interface.
5062 *
5063 * cmapp
5064 * Pointer to storage managed by drt_mark_pages. Note that this must
5065 * be NULL or a value set by drt_mark_pages.
5066 *
5067 * offsetp
5068 * Returns the byte offset into the file of the first page in the cluster.
5069 *
5070 * lengthp
5071 * Returns the length in bytes of the cluster of dirty pages.
5072 *
5073 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
5074 * are no dirty pages meeting the minimum size criteria. Private storage will
5075 * be released if there are no more dirty pages left in the map.
5076 *
5077 */
5078static kern_return_t
5079vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5080{
5081 struct vfs_drt_clustermap *cmap;
5082 u_int64_t offset;
5083 u_int length;
5084 int index, i, j, fs, ls;
5085
5086 /* sanity */
5087 if ((cmapp == NULL) || (*cmapp == NULL))
5088 return(KERN_FAILURE);
5089 cmap = *cmapp;
5090
5091 /* walk the hashtable */
5092 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5093 index = DRT_HASH(cmap, offset);
5094
5095 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5096 continue;
5097
5098 /* scan the bitfield for a string of bits */
5099 fs = -1;
5100
5101 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5102 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5103 fs = i;
5104 break;
5105 }
5106 }
5107 if (fs == -1) {
5108 /* didn't find any bits set */
5109 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5110 }
5111 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5112 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5113 break;
5114 }
5115
5116 /* compute offset and length, mark pages clean */
5117 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5118 length = ls * PAGE_SIZE;
5119 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5120 cmap->scm_lastclean = index;
5121
5122 /* return successful */
5123 *offsetp = (off_t)offset;
5124 *lengthp = length;
5125
5126 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5127 return(KERN_SUCCESS);
5128 }
5129 /*
5130	 * We didn't find anything... the hashtable is empty...
5131	 * emit stats into the trace buffer and
5132	 * then free it
5133 */
5134 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5135 cmap->scm_modulus,
5136 cmap->scm_buckets,
5137 cmap->scm_lastclean,
5138 cmap->scm_iskips);
5139
5140 vfs_drt_free_map(cmap);
5141 *cmapp = NULL;
5142
5143 return(KERN_FAILURE);
5144}
5145
5146
5147static kern_return_t
5148vfs_drt_control(void **cmapp, int op_type)
5149{
5150 struct vfs_drt_clustermap *cmap;
5151
5152 /* sanity */
5153 if ((cmapp == NULL) || (*cmapp == NULL))
5154 return(KERN_FAILURE);
5155 cmap = *cmapp;
5156
5157 switch (op_type) {
5158 case 0:
5159 /* emit stats into trace buffer */
5160 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5161 cmap->scm_modulus,
5162 cmap->scm_buckets,
5163 cmap->scm_lastclean,
5164 cmap->scm_iskips);
5165
5166 vfs_drt_free_map(cmap);
5167 *cmapp = NULL;
5168 break;
5169
5170 case 1:
5171 cmap->scm_lastclean = 0;
5172 break;
5173 }
5174 return(KERN_SUCCESS);
5175}
5176
5177
5178
5179/*
5180 * Emit a summary of the state of the clustermap into the trace buffer
5181 * along with some caller-provided data.
5182 */
5183#if KDEBUG
5184static void
5185vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5186{
5187 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5188}
5189#else
5190static void
5191vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5192 __unused int arg1, __unused int arg2, __unused int arg3,
5193 __unused int arg4)
5194{
5195}
5196#endif
5197
5198#if 0
5199/*
5200 * Perform basic sanity check on the hash entry summary count
5201 * vs. the actual bits set in the entry.
5202 */
5203static void
5204vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5205{
5206 int index, i;
5207 int bits_on;
5208
5209 for (index = 0; index < cmap->scm_modulus; index++) {
5210 if (DRT_HASH_VACANT(cmap, index))
5211 continue;
5212
5213 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5214 if (DRT_HASH_TEST_BIT(cmap, index, i))
5215 bits_on++;
5216 }
5217 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5218 panic("bits_on = %d, index = %d\n", bits_on, index);
5219 }
5220}
5221#endif