/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/uio_internal.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc_internal.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/upl.h>

#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>


#define CL_READ		0x01
#define CL_ASYNC	0x02
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_DUMP		0x40
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200
#define CL_PRESERVE	0x400
#define CL_THROTTLE	0x800
#define CL_KEEPCACHED	0x1000


struct clios {
	u_int  io_completed;       /* amount of io that has currently completed */
	u_int  io_issued;          /* amount of io that was successfully issued */
	int    io_error;           /* error code of first error encountered */
	int    io_wanted;          /* someone is sleeping waiting for a change in state */
};
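
/*
 * Usage of the clios state (as implemented below): cluster_io() adds each
 * buffer's size to io_issued when it queues an I/O against a stream, and
 * cluster_iodone() adds the completed size to io_completed.  A thread that
 * wants to wait for the stream to drain sets io_wanted and msleep()s on the
 * address of io_wanted; all updates and waits are serialized by the single
 * global cl_mtxp mutex allocated in cluster_init().
 */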

static lck_grp_t	*cl_mtx_grp;
static lck_attr_t	*cl_mtx_attr;
static lck_grp_attr_t	*cl_mtx_grp_attr;
static lck_mtx_t	*cl_mtxp;


static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
		      int flags, buf_t real_bp, struct clios *iostate);
static int cluster_iodone(buf_t bp, void *dummy);
static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
static int cluster_hard_throttle_on(vnode_t vp);

static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
			   off_t headOff, off_t tailOff, int flags);
static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);

static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);

static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
static void cluster_push_EOF(vnode_t vp, off_t EOF);

static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);

static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int is_file_clean(vnode_t, off_t);

/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define HARD_THROTTLE_MAXCNT	0
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;


void
cluster_init(void) {
	/*
	 * allocate lock group attribute and group
	 */
	cl_mtx_grp_attr = lck_grp_attr_alloc_init();
	//lck_grp_attr_setstat(cl_mtx_grp_attr);
	cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);

	/*
	 * allocate the lock attribute
	 */
	cl_mtx_attr = lck_attr_alloc_init();
	//lck_attr_setdebug(clf_mtx_attr);

	/*
	 * allocate and initialize mutex's used to protect updates and waits
	 * on the cluster_io context
	 */
	cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);

	if (cl_mtxp == NULL)
		panic("cluster_init: failed to allocate cl_mtxp");
}
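
/*
 * cluster_init() performs one-time setup of the lock group, lock attribute
 * and the single cl_mtxp mutex that protects every struct clios in the
 * system; it is expected to run once, early in kernel startup, before any
 * of the cluster I/O entry points below are used.
 */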


#define CLW_ALLOCATE		0x01
#define CLW_RETURNLOCKED	0x02
/*
 * if the read ahead context doesn't yet exist,
 * allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * once the context is present, try to grab (but don't block on)
 * the lock associated with it... if someone
 * else currently owns it, then the read
 * will run without read-ahead.  this allows
 * multiple readers to run in parallel and
 * since there's only 1 read ahead context,
 * there's no real loss in only allowing 1
 * reader to have read-ahead enabled.
 */
static struct cl_readahead *
cluster_get_rap(vnode_t vp)
{
	struct ubc_info		*ubc;
	struct cl_readahead	*rap;

	ubc = vp->v_ubcinfo;

	if ((rap = ubc->cl_rahead) == NULL) {
		MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);

		bzero(rap, sizeof *rap);
		rap->cl_lastr = -1;
		lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_rahead == NULL)
			ubc->cl_rahead = rap;
		else {
			lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
			FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
			rap = ubc->cl_rahead;
		}
		vnode_unlock(vp);
	}
	if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
		return(rap);

	return ((struct cl_readahead *)NULL);
}
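
/*
 * Note on the return convention: when cluster_get_rap() returns a non-NULL
 * pointer the caller holds cl_lockr and must drop it with
 * lck_mtx_unlock(&rap->cl_lockr) when it is done (see cluster_pagein()
 * below); a NULL return simply means the read proceeds without read-ahead.
 */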


/*
 * if the write behind context doesn't yet exist,
 * and CLW_ALLOCATE is specified, allocate and initialize it...
 * the vnode lock serializes multiple callers
 * during the actual assignment... first one
 * to grab the lock wins... the other callers
 * will release the now unnecessary storage
 *
 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
 * the lock associated with the write behind context before
 * returning
 */

static struct cl_writebehind *
cluster_get_wbp(vnode_t vp, int flags)
{
	struct ubc_info		*ubc;
	struct cl_writebehind	*wbp;

	ubc = vp->v_ubcinfo;

	if ((wbp = ubc->cl_wbehind) == NULL) {

		if ( !(flags & CLW_ALLOCATE))
			return ((struct cl_writebehind *)NULL);

		MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);

		bzero(wbp, sizeof *wbp);
		lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);

		vnode_lock(vp);

		if (ubc->cl_wbehind == NULL)
			ubc->cl_wbehind = wbp;
		else {
			lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
			FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
			wbp = ubc->cl_wbehind;
		}
		vnode_unlock(vp);
	}
	if (flags & CLW_RETURNLOCKED)
		lck_mtx_lock(&wbp->cl_lockw);

	return (wbp);
}
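
/*
 * Typical call patterns for cluster_get_wbp(), taken from the routines
 * in this file:
 *
 *	wbp = cluster_get_wbp(vp, 0);			probe only: NULL if no
 *							write-behind context exists
 *	wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED);	existing context returned
 *							with cl_lockw held (or NULL)
 *	wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
 *							create if needed, return locked
 *
 * A caller that passes CLW_RETURNLOCKED and gets a non-NULL result must
 * later drop the lock with lck_mtx_unlock(&wbp->cl_lockw).
 */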


static int
cluster_hard_throttle_on(vnode_t vp)
{
	static struct timeval hard_throttle_maxelapsed = { 0, 200000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		microuptime(&elapsed);
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}
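
/*
 * cluster_hard_throttle_on() returns 1 only for vnodes on the root device,
 * and only while hard_throttle_on_root is set or within 200ms (see
 * hard_throttle_maxelapsed above) of the last timestamp recorded in
 * priority_IO_timestamp_for_root; in every other case it returns 0 and the
 * normal async throttle applies.
 */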


static int
cluster_iodone(buf_t bp, __unused void *dummy)
{
	int	b_flags;
	int	error;
	int	total_size;
	int	total_resid;
	int	upl_offset;
	int	zero_offset;
	upl_t	upl;
	buf_t	cbp;
	buf_t	cbp_head;
	buf_t	cbp_next;
	buf_t	real_bp;
	struct clios *iostate;
	int	commit_size;
	int	pg_offset;

	cbp_head = (buf_t)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_upl;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	if (real_bp)
		real_bp->b_dev = cbp->b_dev;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if (iostate) {
		int need_wakeup = 0;

		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		lck_mtx_lock(cl_mtxp);

		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			need_wakeup = 1;
		}
		lck_mtx_unlock(cl_mtxp);

		if (need_wakeup)
			wakeup((caddr_t)&iostate->io_wanted);
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		buf_biodone(real_bp);
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			int upl_abort_code;
			int page_in  = 0;
			int page_out = 0;

			if (b_flags & B_PAGEIO) {
				if (b_flags & B_READ)
					page_in  = 1;
				else
					page_out = 1;
			}
			if (b_flags & B_CACHE)			/* leave pages in the cache unchanged on error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (page_out && (error != ENXIO))	/* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (page_in)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);

		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ((b_flags & B_PHYS) && (b_flags & B_READ))
				upl_commit_flags |= UPL_COMMIT_SET_DIRTY;

			if (b_flags & B_AGE)
				upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);
	}

	return (error);
}
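
/*
 * Completion ordering note: cluster_iodone() is invoked for every buf in a
 * transaction, but it only does the real work (freeing the chain, zero
 * filling the EOF tail, committing or aborting the upl, waking iostate
 * waiters) once every buf linked through b_trans_next has B_DONE set; the
 * earlier invocations simply return 0.
 */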


void
cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
{
	upl_page_info_t *pl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_datap == 0) {

		pl = ubc_upl_pageinfo(upl);

		while (size) {
			int	 page_offset;
			int	 page_index;
			addr64_t zero_addr;
			int	 zero_cnt;

			page_index  = upl_offset / PAGE_SIZE;
			page_offset = upl_offset & PAGE_MASK;

			zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
			zero_cnt  = min(PAGE_SIZE - page_offset, size);

			bzero_phys(zero_addr, zero_cnt);

			size       -= zero_cnt;
			upl_offset += zero_cnt;
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}
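
/*
 * Worked example for cluster_zero(): with upl_offset = 0x1c00 and
 * size = 0x900, the loop zeroes 0x400 bytes to the end of the first page
 * and then 0x500 bytes at the start of the next page, using bzero_phys()
 * on the physical pages backing the upl when no kernel mapping (bp) is
 * available.
 */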


static int
cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
	   int flags, buf_t real_bp, struct clios *iostate)
{
	buf_t	cbp;
	u_int	size;
	u_int	io_size;
	int	io_flags;
	int	bmap_flags;
	int	error = 0;
	int	retval = 0;
	buf_t	cbp_head = NULL;
	buf_t	cbp_tail = NULL;
	int	trans_count = 0;
	u_int	pg_count;
	int	pg_offset;
	u_int	max_iosize;
	u_int	max_vectors;
	int	priv;
	int	zero_offset = 0;
	int	async_throttle = 0;
	mount_t	mp;

	mp = vp->v_mount;

	if (mp->mnt_devblocksize > 1) {
		/*
		 * round the requested size up so that this I/O ends on a
		 * page boundary in case this is a 'write'... if the filesystem
		 * has blocks allocated to back the page beyond the EOF, we want to
		 * make sure to write out the zeros that are sitting beyond the EOF
		 * so that in case the filesystem doesn't explicitly zero this area
		 * if a hole is created via a lseek/write beyond the current EOF,
		 * it will return zeros when it's read back from the disk.  If the
		 * physical allocation doesn't extend for the whole page, we'll
		 * only write/read from the disk up to the end of this allocation
		 * via the extent info returned from the VNOP_BLOCKMAP call.
		 */
		pg_offset = upl_offset & PAGE_MASK;

		size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
	} else {
		/*
		 * anyone advertising a blocksize of 1 byte probably
		 * can't deal with us rounding up the request size
		 * AFP is one such filesystem/device
		 */
		size = non_rounded_size;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
		io_flags   = (B_READ);
		bmap_flags = VNODE_READ;

		max_iosize  = mp->mnt_maxreadcnt;
		max_vectors = mp->mnt_segreadcnt;
	} else {
		io_flags   = 0;
		bmap_flags = VNODE_WRITE;

		max_iosize  = mp->mnt_maxwritecnt;
		max_vectors = mp->mnt_segwritecnt;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);

	/*
	 * make sure the maximum iosize is a
	 * multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = VNODE_ASYNC_THROTTLE;
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & CL_DUMP)
		io_flags |= B_NOCACHE;
	if (flags & (CL_PAGEIN | CL_PAGEOUT))
		io_flags |= B_PAGEIO;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;
	if (flags & CL_KEEPCACHED)
		io_flags |= B_CACHE;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		int	  pg_resid;
		daddr64_t blkno;
		daddr64_t lblkno;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
			break;
		}
		if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
			real_bp->b_blkno = blkno;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

		if (io_size == 0) {
			/*
			 * vnop_blockmap didn't return an error... however, it did
			 * return an extent size of 0 which means we can't
			 * make forward progress on this I/O... a hole in the
			 * file would be returned as a blkno of -1 with a non-zero io_size
			 * a real extent is returned with a blkno != -1 and a non-zero io_size
			 */
			error = EINVAL;
			break;
		}
		if ( !(flags & CL_READ) && blkno == -1) {
			off_t e_offset;

			/*
			 * we're writing into a 'hole'
			 */
			if (flags & CL_PAGEOUT) {
				/*
				 * if we got here via cluster_pageout
				 * then just error the request and return
				 * the 'hole' should already have been covered
				 */
				error = EINVAL;
				break;
			}
			if ( !(flags & CL_COMMIT)) {
				/*
				 * currently writes always request the commit to happen
				 * as part of the io completion... however, if the CL_COMMIT
				 * flag isn't specified, then we can't issue the abort_range
				 * since the call site is going to abort or commit the same upl..
				 * in this case we can only return an error
				 */
				error = EINVAL;
				break;
			}
			/*
			 * we can get here if the cluster code happens to
			 * pick up a page that was dirtied via mmap vs
			 * a 'write' and the page targets a 'hole'...
			 * i.e. the writes to the cluster were sparse
			 * and the file was being written for the first time
			 *
			 * we can also get here if the filesystem supports
			 * 'holes' that are less than PAGE_SIZE.... because
			 * we can't know if the range in the page that covers
			 * the 'hole' has been dirtied via an mmap or not,
			 * we have to assume the worst and try to push the
			 * entire page to storage.
			 *
			 * Try paging out the page individually before
			 * giving up entirely and dumping it (the pageout
			 * path will insure that the zero extent accounting
			 * has been taken care of before we get back into cluster_io)
			 */
			ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);

			e_offset = round_page_64(f_offset + 1);

			if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
				error = EINVAL;
				break;
			}
			io_size = e_offset - f_offset;

			f_offset   += io_size;
			upl_offset += io_size;

			if (size >= io_size)
				size -= io_size;
			else
				size = 0;
			/*
			 * keep track of how much of the original request
			 * we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			continue;
		}
		lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * currently, can't deal with reading 'holes' in file
			 */
			if (blkno == -1) {
				error = EINVAL;
				break;
			}
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && blkno == -1) {
			int bytes_to_zero;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (zero_offset && io_size == size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head)
				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				pg_count = (io_size - pg_offset) / PAGE_SIZE;
			else {
				/*
				 * no pending I/O to pick up that first page
				 * so, we have to make sure it gets committed
				 * here.
				 * set the pg_offset to 0 so that the upl_commit_range
				 * starts with this page
				 */
				pg_count = (io_size + pg_offset) / PAGE_SIZE;
				pg_offset = 0;
			}
			if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
				/*
				 * if we're done with the request for this UPL
				 * then we have to make sure to commit the last page
				 * even if we only partially zero-filled it
				 */
				pg_count++;

			if (pg_count) {
				if (pg_offset)
					pg_resid = PAGE_SIZE - pg_offset;
				else
					pg_resid = 0;

				if (flags & CL_COMMIT)
					ubc_upl_commit_range(upl,
							     (upl_offset + pg_resid) & ~PAGE_MASK,
							     pg_count * PAGE_SIZE,
							     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;
			/*
			 * keep track of how much of the original request
			 * we've actually completed... non_rounded_size
			 * may go negative due to us rounding the request
			 * to a page size multiple (i.e. size > non_rounded_size)
			 */
			non_rounded_size -= io_size;

			if (non_rounded_size <= 0) {
				/*
				 * we've transferred all of the data in the original
				 * request, but we were unable to complete the tail
				 * of the last page because the file didn't have
				 * an allocation to back that portion... this is ok.
				 */
				size = 0;
			}
			if (cbp_head && pg_count)
				goto start_io;
			continue;

		}
		if (pg_count > max_vectors) {
			if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
				io_size = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;
				pg_count = max_vectors;
			}
		}

		if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (flags & CL_PAGEOUT) {
			u_int i;

			for (i = 0; i < pg_count; i++) {
				if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
					panic("BUSY bp found in cluster_io");
			}
		}
		if (flags & CL_ASYNC) {
			if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
				panic("buf_setcallback failed\n");
		}
		cbp->b_flags |= io_flags;

		cbp->b_lblkno = lblkno;
		cbp->b_blkno  = blkno;
		cbp->b_bcount = io_size;

		if (buf_setupl(cbp, upl, upl_offset))
			panic("buf_setupl failed\n");

		cbp->b_trans_next = (buf_t)NULL;

		if ((cbp->b_iostate = (void *)iostate))
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ) {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}
		else {
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
		}

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;
		}
		(buf_t)(cbp->b_trans_head) = cbp_head;
		trans_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;
		/*
		 * keep track of how much of the original request
		 * we've actually completed... non_rounded_size
		 * may go negative due to us rounding the request
		 * to a page size multiple (i.e. size > non_rounded_size)
		 */
		non_rounded_size -= io_size;

		if (non_rounded_size <= 0) {
			/*
			 * we've transferred all of the data in the original
			 * request, but we were unable to complete the tail
			 * of the last page because the file didn't have
			 * an allocation to back that portion... this is ok.
			 */
			size = 0;
		}
		if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
			/*
			 * if we have no more I/O to issue or
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction and it's not an I/O directed to
			 * special DEVICE memory
			 * then go ahead and issue the I/O
			 */
start_io:
			if (real_bp) {
				cbp_head->b_flags |= B_NEED_IODONE;
				cbp_head->b_real_bp = real_bp;
			} else
				cbp_head->b_real_bp = (buf_t)NULL;

			if (size == 0) {
				/*
				 * we're about to issue the last I/O for this upl
				 * if this was a read to the eof and the eof doesn't
				 * finish on a page boundary, then we need to zero-fill
				 * the rest of the page....
				 */
				cbp_head->b_validend = zero_offset;
			} else
				cbp_head->b_validend = 0;

			if (flags & CL_THROTTLE)
				(void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");

			for (cbp = cbp_head; cbp;) {
				buf_t cbp_next;

				if ( !(io_flags & B_READ))
					vnode_startwrite(vp);

				cbp_next = cbp->b_trans_next;

				(void) VNOP_STRATEGY(cbp);

				cbp = cbp_next;
			}
			if ( !(flags & CL_ASYNC)) {
				int dummy;

				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					buf_biowait(cbp);

				if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
					if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
						error = 0;	/* drop the error */
					else {
						if (retval == 0)
							retval = error;
						error = 0;
					}
				}
			}
			cbp_head = (buf_t)NULL;
			cbp_tail = (buf_t)NULL;

			trans_count = 0;
		}
	}
	if (error) {
		int abort_size;

		io_size = 0;

		for (cbp = cbp_head; cbp;) {
			buf_t cbp_next;

			upl_offset -= cbp->b_bcount;
			size       += cbp->b_bcount;
			io_size    += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}
		if (iostate) {
			int need_wakeup = 0;

			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			lck_mtx_lock(cl_mtxp);

			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				need_wakeup = 1;
			}
			lck_mtx_unlock(cl_mtxp);

			if (need_wakeup)
				wakeup((caddr_t)&iostate->io_wanted);
		}
		pg_offset  = upl_offset & PAGE_MASK;
		abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & CL_COMMIT) {
			int upl_abort_code;

			if (flags & CL_PRESERVE) {
				ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
						     UPL_COMMIT_FREE_ON_EMPTY);
			} else {
				if ((flags & CL_PAGEOUT) && (error != ENXIO))	/* transient error */
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
				else if (flags & CL_PAGEIN)
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
				else
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

				ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
						    upl_abort_code);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
		}
		if (real_bp) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;

			buf_biodone(real_bp);
		}
		if (retval == 0)
			retval = error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}
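
/*
 * Size-rounding example for cluster_io(): with upl_offset = 0x600 and
 * non_rounded_size = 0x1300 (and a devblocksize > 1), pg_offset is 0x600
 * and size becomes ((0x1900 + 0xfff) & ~0xfff) - 0x600 = 0x1a00, so that
 * upl_offset + size (0x2000) ends on a page boundary; non_rounded_size is
 * what actually gets charged against the caller's request as the loop
 * above issues I/O.
 */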


static int
cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
{
	int pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	else
		size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}
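
/*
 * cluster_rd_prefetch() clips the requested prefetch to MAX_UPL_TRANSFER
 * pages and to the end of the file, hands the range to advisory_read(),
 * and reports back how many pages were covered so the caller can advance
 * its cl_maxra bookkeeping.
 */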



static void
cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
{
	daddr64_t r_addr;
	off_t	  f_offset;
	int	  size_of_prefetch;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);

	if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
		return;
	}
	if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
				    (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
		rap->cl_ralen = 0;
		rap->cl_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);

		return;
	}
	if (extent->e_addr < rap->cl_maxra) {
		if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
			return;
		}
	}
	r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
	f_offset = (off_t)(r_addr * PAGE_SIZE_64);

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		daddr64_t read_size;

		rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;

		read_size = (extent->e_addr + 1) - extent->b_addr;

		if (read_size > rap->cl_ralen) {
			if (read_size > MAX_UPL_TRANSFER)
				rap->cl_ralen = MAX_UPL_TRANSFER;
			else
				rap->cl_ralen = read_size;
		}
		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);

		if (size_of_prefetch)
			rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
}
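
/*
 * The read-ahead window above (rap->cl_ralen) starts at one page and
 * doubles on each sequential hit, capped at MAX_UPL_TRANSFER pages; any
 * non-sequential access resets both cl_ralen and cl_maxra to 0, which
 * effectively turns read-ahead off until a new sequential pattern appears.
 */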

int
cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
		int size, off_t filesize, int flags)
{
	int	io_size;
	int	rounded_size;
	off_t	max_size;
	int	local_flags;
	struct cl_writebehind *wbp;

	if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
		/*
		 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
		 * then we don't want to enforce this throttle... if we do, we can
		 * potentially deadlock since we're stalling the pageout thread at a time
		 * when the disk image might need additional memory (which won't be available
		 * if the pageout thread can't run)... instead we'll just depend on the throttle
		 * that the pageout thread now has in place to deal with external files
		 */
		local_flags = CL_PAGEOUT;
	else
		local_flags = CL_PAGEOUT | CL_THROTTLE;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;
	if ((flags & UPL_KEEPCACHED))
		local_flags |= CL_KEEPCACHED;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-out to a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
		wbp->cl_hasbeenpaged = 1;

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
			   local_flags, (buf_t)NULL, (struct clios *)NULL));
}
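
/*
 * Illustrative only: a file system's vnop_pageout handler would typically
 * forward to this routine along the lines of
 *
 *	error = cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *				ap->a_size, filesize, ap->a_flags);
 *
 * where 'ap' is its struct vnop_pageout_args and 'filesize' is the current
 * EOF as known to that file system; the exact argument plumbing is the
 * caller's, not something this file dictates.
 */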

int
cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
	       int size, off_t filesize, int flags)
{
	u_int	io_size;
	int	rounded_size;
	off_t	max_size;
	int	retval;
	int	local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
			    local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);

	if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
		struct cl_readahead *rap;

		rap = cluster_get_rap(vp);

		if (rap != NULL) {
			struct cl_extent extent;

			extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
			extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

			if (rounded_size == PAGE_SIZE) {
				/*
				 * we haven't read the last page of the file yet
				 * so let's try to read ahead if we're in
				 * a sequential access pattern
				 */
				cluster_rd_ahead(vp, &extent, filesize, rap);
			}
			rap->cl_lastr = extent.e_addr;

			lck_mtx_unlock(&rap->cl_lockr);
		}
	}
	return (retval);
}
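
/*
 * Illustrative only: the pagein path is normally reached from a file
 * system's vnop_pagein handler, e.g.
 *
 *	error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *			       ap->a_size, filesize, ap->a_flags);
 *
 * with 'ap' being its struct vnop_pagein_args; the read-ahead kicked off
 * above is what keeps sequential faults on a mapped file streaming.
 */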

int
cluster_bp(buf_t bp)
{
	off_t	f_offset;
	int	flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
}

int
cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
{
	int	prev_resid;
	u_int	clip_size;
	off_t	max_io_size;
	int	upl_size;
	int	upl_flags;
	upl_t	upl;
	int	retval = 0;
	int	flags;

	flags = xflags;

	if (vp->v_flag & VNOCACHE_DATA)
		flags |= IO_NOCACHE;

	if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
		/*
		 * go do a write through the cache if one of the following is true....
		 *   NOCACHE is not true
		 *   there is no uio structure or it doesn't target USERSPACE
		 */
		return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
	}

#if LP64_DEBUG
	if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
		panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
	}
#endif /* LP64_DEBUG */

	while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
		user_size_t iov_len;
		user_addr_t iov_base;

		/*
		 * we know we have a resid, so this is safe
		 * skip over any empty vectors
		 */
		uio_update(uio, (user_size_t)0);

		iov_len  = uio_curriovlen(uio);
		iov_base = uio_curriovbase(uio);

		upl_size  = PAGE_SIZE;
		upl_flags = UPL_QUERY_OBJECT_TYPE;

		// LP64todo - fix this!
		if ((vm_map_get_upl(current_map(),
				    (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
				    &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
			/*
			 * the user app must have passed in an invalid address
			 */
			return (EFAULT);
		}

		/*
		 * We check every vector target but if it is physically
		 * contiguous space, we skip the sanity checks.
		 */
		if (upl_flags & UPL_PHYS_CONTIG) {
			int zflags;

			zflags = flags & ~IO_TAILZEROFILL;
			zflags |= IO_HEADZEROFILL;

			if (flags & IO_HEADZEROFILL) {
				/*
				 * in case we have additional vectors, we don't want to do this again
				 */
				flags &= ~IO_HEADZEROFILL;

				if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
					return(retval);
			}
			retval = cluster_phys_write(vp, uio, newEOF);

			if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
				return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
			}
		}
		else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
			/*
			 * we're here because we don't have a physically contiguous target buffer
			 * go do a write through the cache if one of the following is true....
			 *   the total xfer size is less than a page...
			 *   we're being asked to ZEROFILL either the head or the tail of the I/O...
			 */
			return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
		}
		// LP64todo - fix this!
		else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
			if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
				/*
				 * Bring the file offset write up to a pagesize boundary
				 * this will also bring the base address to a page boundary
				 * since they both are currently on the same offset within a page
				 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
				 * so the computed clip_size must always be less than the current uio_resid
				 */
				clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));

				/*
				 * Fake the resid going into the cluster_write_x call
				 * and restore it on the way out.
				 */
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			} else {
				/*
				 * can't get both the file offset and the buffer offset aligned to a page boundary
				 * so fire an I/O through the cache for this entire vector
				 */
				// LP64todo - fix this
				clip_size = iov_len;
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			}
		} else {
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary and the
			 * target buffer address is also on a page boundary
			 */
			max_io_size = newEOF - uio->uio_offset;
			// LP64todo - fix this
			clip_size = uio_resid(uio);
			if (iov_len < clip_size)
				// LP64todo - fix this!
				clip_size = iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE) {
				/*
				 * Take care of tail end of write in this vector
				 */
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			} else {
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				// LP64todo - fix this
				prev_resid = uio_resid(uio);
				uio_setresid(uio, clip_size);

				retval = cluster_nocopy_write(vp, uio, newEOF);

				if ((retval == 0) && uio_resid(uio))
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);

				uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
			}
		} /* end else */
	} /* end while */

	return(retval);
}
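
/*
 * To summarize the dispatch above: cluster_write() falls back to the
 * buffered path (cluster_write_x) whenever caching is requested, the uio
 * doesn't target user space, the transfer is smaller than a page, head or
 * tail zero-fill is needed, or the file offset and buffer address can't
 * both be brought to a page boundary; physically contiguous user buffers
 * go through cluster_phys_write(), and everything else takes the zero-copy
 * cluster_nocopy_write() path a page-aligned chunk at a time.
 */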
1529
b4c24cb9 1530
9bccf70c 1531static int
91447636 1532cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1c79356b
A
1533{
1534 upl_t upl;
1535 upl_page_info_t *pl;
1c79356b 1536 vm_offset_t upl_offset;
1c79356b 1537 int io_size;
d7e50217 1538 int io_flag;
1c79356b
A
1539 int upl_size;
1540 int upl_needed_size;
1541 int pages_in_pl;
1542 int upl_flags;
1543 kern_return_t kret;
1c79356b
A
1544 int i;
1545 int force_data_sync;
1546 int error = 0;
d7e50217 1547 struct clios iostate;
91447636 1548 struct cl_writebehind *wbp;
cc9f6e38 1549
1c79356b
A
1550
1551 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
91447636
A
1552 (int)uio->uio_offset, (int)uio_resid(uio),
1553 (int)newEOF, 0, 0);
1c79356b
A
1554
1555 /*
1556 * When we enter this routine, we know
1557 * -- the offset into the file is on a pagesize boundary
1558 * -- the resid is a page multiple
1559 * -- the resid will not exceed iov_len
1560 */
91447636
A
1561
1562 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1c79356b 1563
91447636
A
1564 cluster_try_push(wbp, vp, newEOF, 0, 1);
1565
1566 lck_mtx_unlock(&wbp->cl_lockw);
1567 }
d7e50217
A
1568 iostate.io_completed = 0;
1569 iostate.io_issued = 0;
1570 iostate.io_error = 0;
1571 iostate.io_wanted = 0;
1572
91447636 1573 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
cc9f6e38
A
1574 user_addr_t iov_base;
1575
91447636 1576 io_size = uio_resid(uio);
1c79356b 1577
d7e50217
A
1578 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1579 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1580
cc9f6e38
A
1581 iov_base = uio_curriovbase(uio);
1582
91447636 1583 // LP64todo - fix this!
cc9f6e38 1584 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
91447636 1585
d7e50217
A
1586 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1587
1588 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
cc9f6e38 1589 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
d7e50217
A
1590
1591 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1592 pages_in_pl = 0;
1593 upl_size = upl_needed_size;
1594 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1595 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
d7e50217 1596
91447636 1597 // LP64todo - fix this!
d7e50217 1598 kret = vm_map_get_upl(current_map(),
cc9f6e38 1599 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
d7e50217
A
1600 &upl_size,
1601 &upl,
1602 NULL,
1603 &pages_in_pl,
1604 &upl_flags,
1605 force_data_sync);
1606
1607 if (kret != KERN_SUCCESS) {
1608 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1609 0, 0, 0, kret, 0);
d7e50217
A
1610 /*
1611 * cluster_nocopy_write: failed to get pagelist
1612 *
1613 * we may have already spun some portion of this request
1614 * off as async requests... we need to wait for the I/O
1615 * to complete before returning
1616 */
1617 goto wait_for_writes;
1618 }
1619 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1620 pages_in_pl = upl_size / PAGE_SIZE;
1c79356b 1621
d7e50217
A
1622 for (i = 0; i < pages_in_pl; i++) {
1623 if (!upl_valid_page(pl, i))
1624 break;
1625 }
1626 if (i == pages_in_pl)
1627 break;
1c79356b 1628
d7e50217
A
1629 /*
1630 * didn't get all the pages back that we
1631 * needed... release this upl and try again
1632 */
1633 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1634 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1635 }
d7e50217
A
1636 if (force_data_sync >= 3) {
1637 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1638 i, pages_in_pl, upl_size, kret, 0);
d7e50217
A
1639 /*
1640 * for some reason, we couldn't acquire a hold on all
1641 * the pages needed in the user's address space
1642 *
1643 * we may have already spun some portion of this request
1644 * off as async requests... we need to wait for the I/O
1645 * to complete before returning
1646 */
1647 goto wait_for_writes;
1c79356b 1648 }
0b4e3aa0 1649
d7e50217
A
1650 /*
1651 * Consider the possibility that upl_size wasn't satisfied.
1652 */
1653 if (upl_size != upl_needed_size)
1654 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 1655
d7e50217 1656 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
cc9f6e38 1657 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1c79356b 1658
d7e50217
A
1659 if (io_size == 0) {
1660 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1661 UPL_ABORT_FREE_ON_EMPTY);
d7e50217
A
1662 /*
1663 * we may have already spun some portion of this request
1664 * off as async requests... we need to wait for the I/O
1665 * to complete before returning
1666 */
1667 goto wait_for_writes;
1668 }
1669 /*
1670 * Now look for pages already in the cache
1671 * and throw them away.
55e303ae
A
1672 * uio->uio_offset is page aligned within the file
1673 * io_size is a multiple of PAGE_SIZE
d7e50217 1674 */
55e303ae 1675 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1c79356b 1676
d7e50217
A
1677 /*
1678 * we want push out these writes asynchronously so that we can overlap
1679 * the preparation of the next I/O
1680 * if there are already too many outstanding writes
1681 * wait until some complete before issuing the next
1682 */
91447636
A
1683 lck_mtx_lock(cl_mtxp);
1684
d7e50217
A
1685 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1686 iostate.io_wanted = 1;
91447636 1687 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
d7e50217 1688 }
91447636
A
1689 lck_mtx_unlock(cl_mtxp);
1690
d7e50217
A
1691 if (iostate.io_error) {
1692 /*
1693 * one of the earlier writes we issued ran into a hard error
1694 * don't issue any more writes, cleanup the UPL
1695 * that was just created but not used, then
1696 * go wait for all writes that are part of this stream
1697 * to complete before returning the error to the caller
1698 */
1699 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1700 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1701
d7e50217
A
1702 goto wait_for_writes;
1703 }
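	        /*
	         * issue this I/O asynchronously... cluster_io will commit the
	         * UPL pages when the I/O completes and will honor the async
	         * write throttle
	         */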
55e303ae 1704 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1c79356b 1705
d7e50217
A
1706 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1707 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1c79356b 1708
d7e50217 1709 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
91447636 1710 io_size, io_flag, (buf_t)NULL, &iostate);
7b1edb79 1711
cc9f6e38 1712 uio_update(uio, (user_size_t)io_size);
1c79356b 1713
d7e50217 1714 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
91447636 1715 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1c79356b
A
1716
1717 } /* end while */
1718
d7e50217
A
1719wait_for_writes:
1720 /*
1721 * make sure all async writes issued as part of this stream
1722 * have completed before we return
1723 */
91447636
A
1724 lck_mtx_lock(cl_mtxp);
1725
d7e50217
A
1726 while (iostate.io_issued != iostate.io_completed) {
1727 iostate.io_wanted = 1;
91447636 1728 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
d7e50217 1729 }
91447636
A
1730 lck_mtx_unlock(cl_mtxp);
1731
d7e50217
A
1732 if (iostate.io_error)
1733 error = iostate.io_error;
1c79356b
A
1734
1735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1736 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1737
1738 return (error);
1739}
1740
b4c24cb9 1741
9bccf70c 1742static int
91447636 1743cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
0b4e3aa0 1744{
b4c24cb9 1745 upl_page_info_t *pl;
55e303ae 1746 addr64_t src_paddr;
0b4e3aa0
A
1747 upl_t upl;
1748 vm_offset_t upl_offset;
b4c24cb9 1749 int tail_size;
0b4e3aa0
A
1750 int io_size;
1751 int upl_size;
1752 int upl_needed_size;
1753 int pages_in_pl;
1754 int upl_flags;
1755 kern_return_t kret;
0b4e3aa0 1756 int error = 0;
cc9f6e38 1757 user_addr_t iov_base;
91447636
A
1758 int devblocksize;
1759 struct cl_writebehind *wbp;
0b4e3aa0 1760
91447636 1761 devblocksize = vp->v_mount->mnt_devblocksize;
0b4e3aa0
A
1762 /*
1763 * When we enter this routine, we know
1764 * -- the resid will not exceed iov_len
 1765 * -- the vector target address is physically contiguous
1766 */
91447636 1767 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
0b4e3aa0 1768
91447636
A
1769 cluster_try_push(wbp, vp, newEOF, 0, 1);
1770
1771 lck_mtx_unlock(&wbp->cl_lockw);
1772 }
1773#if LP64_DEBUG
1774 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1775 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1776 }
1777#endif /* LP64_DEBUG */
1778
1779 // LP64todo - fix this!
cc9f6e38
A
1780 io_size = (int)uio_curriovlen(uio);
1781 iov_base = uio_curriovbase(uio);
1782
91447636 1783 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
0b4e3aa0
A
1784 upl_needed_size = upl_offset + io_size;
1785
1786 pages_in_pl = 0;
1787 upl_size = upl_needed_size;
9bccf70c 1788 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1789 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0 1790
91447636 1791 // LP64todo - fix this!
0b4e3aa0 1792 kret = vm_map_get_upl(current_map(),
cc9f6e38 1793 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
0b4e3aa0
A
1794 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1795
b4c24cb9
A
1796 if (kret != KERN_SUCCESS) {
1797 /*
1798 * cluster_phys_write: failed to get pagelist
1799 * note: return kret here
1800 */
0b4e3aa0 1801 return(EINVAL);
b4c24cb9 1802 }
0b4e3aa0
A
1803 /*
1804 * Consider the possibility that upl_size wasn't satisfied.
1805 * This is a failure in the physical memory case.
1806 */
b4c24cb9 1807 if (upl_size < upl_needed_size) {
91447636 1808 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
b4c24cb9
A
1809 return(EINVAL);
1810 }
1811 pl = ubc_upl_pageinfo(upl);
0b4e3aa0 1812
cc9f6e38 1813 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
0b4e3aa0 1814
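	/*
	 * if the file offset isn't aligned to the device block size (or the
	 * remaining transfer is smaller than a device block), peel off the
	 * unaligned head with cluster_align_phys_io before issuing the
	 * block-aligned body of the write below
	 */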
b4c24cb9
A
1815 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1816 int head_size;
0b4e3aa0 1817
b4c24cb9 1818 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
0b4e3aa0 1819
b4c24cb9
A
1820 if (head_size > io_size)
1821 head_size = io_size;
1822
91447636 1823 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
b4c24cb9
A
1824
1825 if (error) {
1826 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1827
1828 return(EINVAL);
1829 }
1830 upl_offset += head_size;
1831 src_paddr += head_size;
1832 io_size -= head_size;
0b4e3aa0 1833 }
b4c24cb9
A
1834 tail_size = io_size & (devblocksize - 1);
1835 io_size -= tail_size;
1836
1837 if (io_size) {
1838 /*
1839 * issue a synchronous write to cluster_io
1840 */
1841 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
91447636 1842 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
b4c24cb9
A
1843 }
1844 if (error == 0) {
1845 /*
1846 * The cluster_io write completed successfully,
1847 * update the uio structure
1848 */
cc9f6e38
A
1849 uio_update(uio, (user_size_t)io_size);
1850
1851 src_paddr += io_size;
b4c24cb9
A
1852
1853 if (tail_size)
91447636 1854 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
b4c24cb9
A
1855 }
1856 /*
1857 * just release our hold on the physically contiguous
1858 * region without changing any state
1859 */
1860 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
1861
1862 return (error);
1863}
1864
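/*
 * A minimal stand-alone sketch (not part of vfs_cluster.c) of the head/body/tail
 * split that cluster_phys_write performs above when a physically contiguous write
 * is not aligned to the device block size. The helper name split_io, the 512-byte
 * block size and the sample offsets are assumptions chosen purely for illustration.
 */
#include <stdio.h>

static void
split_io(long long offset, int io_size, int devblocksize)
{
	int head_size = 0;
	int tail_size;

	/* leading fragment that runs up to the next device block boundary */
	if ((offset & (devblocksize - 1)) && io_size) {
		head_size = devblocksize - (int)(offset & (devblocksize - 1));
		if (head_size > io_size)
			head_size = io_size;
		io_size -= head_size;
	}
	/* trailing fragment that doesn't fill a whole device block */
	tail_size = io_size & (devblocksize - 1);
	io_size -= tail_size;

	printf("head %d  block-aligned body %d  tail %d\n",
	    head_size, io_size, tail_size);
}

int
main(void)
{
	split_io(1000LL, 4096, 512);	/* head 24  block-aligned body 3584  tail 488 */
	return 0;
}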
b4c24cb9 1865
9bccf70c 1866static int
91447636 1867cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1c79356b
A
1868{
1869 upl_page_info_t *pl;
1870 upl_t upl;
91447636 1871 vm_offset_t upl_offset = 0;
1c79356b
A
1872 int upl_size;
1873 off_t upl_f_offset;
1874 int pages_in_upl;
1875 int start_offset;
1876 int xfer_resid;
1877 int io_size;
1c79356b
A
1878 int io_offset;
1879 int bytes_to_zero;
1880 int bytes_to_move;
1881 kern_return_t kret;
1882 int retval = 0;
91447636 1883 int io_resid;
1c79356b
A
1884 long long total_size;
1885 long long zero_cnt;
1886 off_t zero_off;
1887 long long zero_cnt1;
1888 off_t zero_off1;
91447636 1889 struct cl_extent cl;
55e303ae 1890 int intersection;
91447636 1891 struct cl_writebehind *wbp;
55e303ae 1892
91447636
A
1893 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1894 {
1895 if (wbp->cl_hasbeenpaged) {
1896 /*
1897 * this vnode had pages cleaned to it by
1898 * the pager which indicates that either
1899 * it's not very 'hot', or the system is
1900 * being overwhelmed by a lot of dirty
1901 * data being delayed in the VM cache...
1902 * in either event, we'll push our remaining
1903 * delayed data at this point... this will
1904 * be more efficient than paging out 1 page at
1905 * a time, and will also act as a throttle
1906 * by delaying this client from writing any
 1907 * more data until all its delayed data has
 1908 * at least been queued to the underlying driver.
1909 */
1910 if (wbp->cl_number || wbp->cl_scmap)
1911 cluster_push_EOF(vp, newEOF);
1c79356b 1912
91447636
A
1913 wbp->cl_hasbeenpaged = 0;
1914 }
1915 }
1c79356b
A
1916 if (uio) {
1917 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
91447636 1918 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1c79356b 1919
91447636
A
1920 // LP64todo - fix this
1921 io_resid = uio_resid(uio);
1c79356b
A
1922 } else {
1923 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1924 0, 0, (int)oldEOF, (int)newEOF, 0);
1925
91447636 1926 io_resid = 0;
1c79356b
A
1927 }
1928 zero_cnt = 0;
1929 zero_cnt1 = 0;
91447636
A
1930 zero_off = 0;
1931 zero_off1 = 0;
1c79356b
A
1932
1933 if (flags & IO_HEADZEROFILL) {
1934 /*
1935 * some filesystems (HFS is one) don't support unallocated holes within a file...
1936 * so we zero fill the intervening space between the old EOF and the offset
1937 * where the next chunk of real data begins.... ftruncate will also use this
1938 * routine to zero fill to the new EOF when growing a file... in this case, the
1939 * uio structure will not be provided
1940 */
1941 if (uio) {
1942 if (headOff < uio->uio_offset) {
1943 zero_cnt = uio->uio_offset - headOff;
1944 zero_off = headOff;
1945 }
1946 } else if (headOff < newEOF) {
1947 zero_cnt = newEOF - headOff;
1948 zero_off = headOff;
1949 }
1950 }
1951 if (flags & IO_TAILZEROFILL) {
1952 if (uio) {
91447636
A
1953 // LP64todo - fix this
1954 zero_off1 = uio->uio_offset + uio_resid(uio);
1c79356b
A
1955
1956 if (zero_off1 < tailOff)
1957 zero_cnt1 = tailOff - zero_off1;
1958 }
1959 }
55e303ae 1960 if (zero_cnt == 0 && uio == (struct uio *) 0) {
91447636
A
1961 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1962 retval, 0, 0, 0, 0);
1963 return (0);
55e303ae 1964 }
1c79356b 1965
91447636 1966 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1c79356b
A
1967 /*
1968 * for this iteration of the loop, figure out where our starting point is
1969 */
1970 if (zero_cnt) {
1971 start_offset = (int)(zero_off & PAGE_MASK_64);
1972 upl_f_offset = zero_off - start_offset;
91447636 1973 } else if (io_resid) {
1c79356b
A
1974 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1975 upl_f_offset = uio->uio_offset - start_offset;
1976 } else {
1977 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1978 upl_f_offset = zero_off1 - start_offset;
1979 }
1980 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1981 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1982
0b4e3aa0
A
1983 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1984 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1985
91447636 1986 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
55e303ae 1987
91447636 1988 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
55e303ae 1989 /*
91447636 1990 * assumption... total_size <= io_resid
55e303ae
A
1991 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1992 */
1993 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1994 total_size -= start_offset;
1995 xfer_resid = total_size;
1996
1997 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1998
1999 if (retval)
2000 break;
2001
91447636 2002 io_resid -= (total_size - xfer_resid);
55e303ae
A
2003 total_size = xfer_resid;
2004 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2005 upl_f_offset = uio->uio_offset - start_offset;
2006
2007 if (total_size == 0) {
2008 if (start_offset) {
2009 /*
2010 * the write did not finish on a page boundary
2011 * which will leave upl_f_offset pointing to the
2012 * beginning of the last page written instead of
2013 * the page beyond it... bump it in this case
2014 * so that the cluster code records the last page
2015 * written as dirty
2016 */
2017 upl_f_offset += PAGE_SIZE_64;
2018 }
2019 upl_size = 0;
2020
2021 goto check_cluster;
2022 }
2023 }
1c79356b
A
2024 /*
2025 * compute the size of the upl needed to encompass
2026 * the requested write... limit each call to cluster_io
0b4e3aa0
A
2027 * to the maximum UPL size... cluster_io will clip if
2028 * this exceeds the maximum io_size for the device,
 2029 * and make sure to account for
1c79356b
A
2030 * a starting offset that's not page aligned
2031 */
2032 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2033
0b4e3aa0
A
2034 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2035 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
2036
2037 pages_in_upl = upl_size / PAGE_SIZE;
2038 io_size = upl_size - start_offset;
2039
2040 if ((long long)io_size > total_size)
2041 io_size = total_size;
2042
55e303ae
A
2043 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2044
1c79356b 2045
91447636
A
2046 /*
2047 * Gather the pages from the buffer cache.
2048 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2049 * that we intend to modify these pages.
2050 */
0b4e3aa0 2051 kret = ubc_create_upl(vp,
91447636
A
2052 upl_f_offset,
2053 upl_size,
2054 &upl,
2055 &pl,
2056 UPL_SET_LITE | UPL_WILL_MODIFY);
1c79356b
A
2057 if (kret != KERN_SUCCESS)
2058 panic("cluster_write: failed to get pagelist");
2059
55e303ae
A
2060 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2061 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1c79356b
A
2062
2063 if (start_offset && !upl_valid_page(pl, 0)) {
0b4e3aa0 2064 int read_size;
1c79356b 2065
0b4e3aa0 2066 /*
1c79356b
A
2067 * we're starting in the middle of the first page of the upl
2068 * and the page isn't currently valid, so we're going to have
2069 * to read it in first... this is a synchronous operation
2070 */
2071 read_size = PAGE_SIZE;
2072
9bccf70c 2073 if ((upl_f_offset + read_size) > newEOF)
1c79356b 2074 read_size = newEOF - upl_f_offset;
9bccf70c 2075
91447636
A
2076 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2077 CL_READ, (buf_t)NULL, (struct clios *)NULL);
1c79356b 2078 if (retval) {
0b4e3aa0 2079 /*
1c79356b
A
2080 * we had an error during the read which causes us to abort
2081 * the current cluster_write request... before we do, we need
2082 * to release the rest of the pages in the upl without modifying
 2083 * their state and mark the failed page in error
2084 */
0b4e3aa0 2085 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
91447636
A
2086
2087 if (upl_size > PAGE_SIZE)
2088 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2089
2090 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 2091 (int)upl, 0, 0, retval, 0);
1c79356b
A
2092 break;
2093 }
2094 }
2095 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2096 /*
2097 * the last offset we're writing to in this upl does not end on a page
2098 * boundary... if it's not beyond the old EOF, then we'll also need to
2099 * pre-read this page in if it isn't already valid
2100 */
2101 upl_offset = upl_size - PAGE_SIZE;
2102
2103 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2104 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2105 int read_size;
2106
2107 read_size = PAGE_SIZE;
2108
9bccf70c 2109 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1c79356b 2110 read_size = newEOF - (upl_f_offset + upl_offset);
9bccf70c 2111
91447636
A
2112 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2113 CL_READ, (buf_t)NULL, (struct clios *)NULL);
1c79356b 2114 if (retval) {
0b4e3aa0 2115 /*
1c79356b 2116 * we had an error during the read which causes us to abort
0b4e3aa0
A
2117 * the current cluster_write request... before we do, we
2118 * need to release the rest of the pages in the upl without
 2119 * modifying their state and mark the failed page in error
1c79356b 2120 */
9bccf70c 2121 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
91447636
A
2122
2123 if (upl_size > PAGE_SIZE)
2124 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2125
2126 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 2127 (int)upl, 0, 0, retval, 0);
1c79356b
A
2128 break;
2129 }
2130 }
2131 }
1c79356b
A
2132 xfer_resid = io_size;
2133 io_offset = start_offset;
2134
2135 while (zero_cnt && xfer_resid) {
2136
2137 if (zero_cnt < (long long)xfer_resid)
2138 bytes_to_zero = zero_cnt;
2139 else
2140 bytes_to_zero = xfer_resid;
2141
9bccf70c 2142 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 2143 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 2144 } else {
9bccf70c
A
2145 int zero_pg_index;
2146
1c79356b 2147 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
9bccf70c
A
2148 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2149
2150 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 2151 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 2152
9bccf70c
A
2153 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2154 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 2155 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
2156 }
2157 }
2158 xfer_resid -= bytes_to_zero;
2159 zero_cnt -= bytes_to_zero;
2160 zero_off += bytes_to_zero;
2161 io_offset += bytes_to_zero;
2162 }
91447636
A
2163 if (xfer_resid && io_resid) {
2164 bytes_to_move = min(io_resid, xfer_resid);
1c79356b 2165
55e303ae 2166 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
9bccf70c 2167
1c79356b 2168 if (retval) {
9bccf70c
A
2169
2170 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2171
2172 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 2173 (int)upl, 0, 0, retval, 0);
1c79356b 2174 } else {
91447636 2175 io_resid -= bytes_to_move;
1c79356b
A
2176 xfer_resid -= bytes_to_move;
2177 io_offset += bytes_to_move;
2178 }
2179 }
2180 while (xfer_resid && zero_cnt1 && retval == 0) {
2181
2182 if (zero_cnt1 < (long long)xfer_resid)
2183 bytes_to_zero = zero_cnt1;
2184 else
2185 bytes_to_zero = xfer_resid;
2186
9bccf70c 2187 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 2188 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 2189 } else {
9bccf70c
A
2190 int zero_pg_index;
2191
1c79356b 2192 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
9bccf70c
A
2193 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2194
2195 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 2196 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
9bccf70c
A
2197 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2198 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 2199 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
2200 }
2201 }
2202 xfer_resid -= bytes_to_zero;
2203 zero_cnt1 -= bytes_to_zero;
2204 zero_off1 += bytes_to_zero;
2205 io_offset += bytes_to_zero;
2206 }
2207
2208 if (retval == 0) {
9bccf70c 2209 int cl_index;
1c79356b
A
2210 int can_delay;
2211
2212 io_size += start_offset;
2213
9bccf70c 2214 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1c79356b
A
2215 /*
2216 * if we're extending the file with this write
2217 * we'll zero fill the rest of the page so that
2218 * if the file gets extended again in such a way as to leave a
 2219 * hole starting at this EOF, we'll have zeros in the correct spot
2220 */
55e303ae 2221 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1c79356b 2222 }
9bccf70c
A
2223 if (flags & IO_SYNC)
2224 /*
 2225 * if the IO_SYNC flag is set then we need to
2226 * bypass any clusters and immediately issue
2227 * the I/O
2228 */
2229 goto issue_io;
55e303ae 2230check_cluster:
91447636
A
2231 /*
2232 * take the lock to protect our accesses
2233 * of the writebehind and sparse cluster state
2234 */
2235 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2236
55e303ae
A
2237 /*
2238 * calculate the last logical block number
2239 * that this delayed I/O encompassed
2240 */
91447636 2241 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
55e303ae 2242
91447636 2243 if (wbp->cl_scmap) {
55e303ae 2244
91447636 2245 if ( !(flags & IO_NOCACHE)) {
55e303ae
A
2246 /*
2247 * we've fallen into the sparse
2248 * cluster method of delaying dirty pages
2249 * first, we need to release the upl if we hold one
2250 * since pages in it may be present in the sparse cluster map
2251 * and may span 2 separate buckets there... if they do and
2252 * we happen to have to flush a bucket to make room and it intersects
2253 * this upl, a deadlock may result on page BUSY
2254 */
2255 if (upl_size)
2256 ubc_upl_commit_range(upl, 0, upl_size,
2257 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2258
91447636
A
2259 sparse_cluster_add(wbp, vp, &cl, newEOF);
2260
2261 lck_mtx_unlock(&wbp->cl_lockw);
55e303ae
A
2262
2263 continue;
2264 }
2265 /*
2266 * must have done cached writes that fell into
2267 * the sparse cluster mechanism... we've switched
2268 * to uncached writes on the file, so go ahead
2269 * and push whatever's in the sparse map
2270 * and switch back to normal clustering
2271 *
2272 * see the comment above concerning a possible deadlock...
2273 */
2274 if (upl_size) {
2275 ubc_upl_commit_range(upl, 0, upl_size,
2276 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2277 /*
2278 * setting upl_size to 0 keeps us from committing a
2279 * second time in the start_new_cluster path
2280 */
2281 upl_size = 0;
2282 }
91447636 2283 sparse_cluster_push(wbp, vp, newEOF, 1);
55e303ae 2284
91447636 2285 wbp->cl_number = 0;
55e303ae
A
2286 /*
2287 * no clusters of either type present at this point
2288 * so just go directly to start_new_cluster since
2289 * we know we need to delay this I/O since we've
2290 * already released the pages back into the cache
2291 * to avoid the deadlock with sparse_cluster_push
2292 */
2293 goto start_new_cluster;
2294 }
2295 upl_offset = 0;
1c79356b 2296
91447636 2297 if (wbp->cl_number == 0)
9bccf70c
A
2298 /*
2299 * no clusters currently present
2300 */
2301 goto start_new_cluster;
1c79356b 2302
91447636 2303 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
1c79356b 2304 /*
55e303ae
A
2305 * check each cluster that we currently hold
2306 * try to merge some or all of this write into
2307 * one or more of the existing clusters... if
2308 * any portion of the write remains, start a
2309 * new cluster
1c79356b 2310 */
91447636 2311 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
9bccf70c
A
2312 /*
2313 * the current write starts at or after the current cluster
2314 */
91447636 2315 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
1c79356b
A
2316 /*
2317 * we have a write that fits entirely
2318 * within the existing cluster limits
2319 */
91447636 2320 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
1c79356b 2321 /*
9bccf70c 2322 * update our idea of where the cluster ends
1c79356b 2323 */
91447636 2324 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
9bccf70c 2325 break;
1c79356b 2326 }
91447636 2327 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
1c79356b
A
2328 /*
2329 * we have a write that starts in the middle of the current cluster
55e303ae
A
2330 * but extends beyond the cluster's limit... we know this because
2331 * of the previous checks
2332 * we'll extend the current cluster to the max
91447636 2333 * and update the b_addr for the current write to reflect that
55e303ae
A
2334 * the head of it was absorbed into this cluster...
2335 * note that we'll always have a leftover tail in this case since
 2336 * full absorption would have occurred in the clause above
1c79356b 2337 */
91447636 2338 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
55e303ae
A
2339
2340 if (upl_size) {
91447636 2341 daddr64_t start_pg_in_upl;
55e303ae 2342
91447636 2343 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
55e303ae 2344
91447636
A
2345 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2346 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
55e303ae
A
2347
2348 ubc_upl_commit_range(upl, upl_offset, intersection,
2349 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2350 upl_f_offset += intersection;
2351 upl_offset += intersection;
2352 upl_size -= intersection;
2353 }
2354 }
91447636 2355 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
1c79356b
A
2356 }
2357 /*
55e303ae
A
2358 * we come here for the case where the current write starts
2359 * beyond the limit of the existing cluster or we have a leftover
 2360 * tail after a partial absorption
9bccf70c
A
2361 *
2362 * in either case, we'll check the remaining clusters before
2363 * starting a new one
1c79356b 2364 */
9bccf70c 2365 } else {
1c79356b 2366 /*
55e303ae 2367 * the current write starts in front of the cluster we're currently considering
1c79356b 2368 */
91447636 2369 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
1c79356b 2370 /*
55e303ae
A
2371 * we can just merge the new request into
2372 * this cluster and leave it in the cache
2373 * since the resulting cluster is still
2374 * less than the maximum allowable size
1c79356b 2375 */
91447636 2376 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
1c79356b 2377
91447636 2378 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
9bccf70c
A
2379 /*
2380 * the current write completely
55e303ae
A
2381 * envelops the existing cluster and since
2382 * each write is limited to at most MAX_UPL_TRANSFER bytes
2383 * we can just use the start and last blocknos of the write
2384 * to generate the cluster limits
9bccf70c 2385 */
91447636 2386 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
9bccf70c
A
2387 }
2388 break;
1c79356b 2389 }
9bccf70c 2390
1c79356b 2391 /*
9bccf70c
A
2392 * if we were to combine this write with the current cluster
2393 * we would exceed the cluster size limit.... so,
2394 * let's see if there's any overlap of the new I/O with
55e303ae
A
2395 * the cluster we're currently considering... in fact, we'll
 2396 * stretch the cluster out to its full limit and see if we
2397 * get an intersection with the current write
9bccf70c 2398 *
1c79356b 2399 */
91447636 2400 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
1c79356b 2401 /*
55e303ae
A
2402 * the current write extends into the proposed cluster
 2403 * clip the length of the current write after first combining its
2404 * tail with the newly shaped cluster
1c79356b 2405 */
91447636 2406 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
55e303ae
A
2407
2408 if (upl_size) {
91447636 2409 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
55e303ae
A
2410
2411 if (intersection > upl_size)
2412 /*
2413 * because the current write may consist of a number of pages found in the cache
2414 * which are not part of the UPL, we may have an intersection that exceeds
2415 * the size of the UPL that is also part of this write
2416 */
2417 intersection = upl_size;
2418
2419 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2420 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2421 upl_size -= intersection;
2422 }
91447636 2423 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
55e303ae 2424 }
9bccf70c
A
2425 /*
2426 * if we get here, there was no way to merge
55e303ae
A
2427 * any portion of this write with this cluster
2428 * or we could only merge part of it which
2429 * will leave a tail...
9bccf70c
A
2430 * we'll check the remaining clusters before starting a new one
2431 */
1c79356b 2432 }
9bccf70c 2433 }
91447636 2434 if (cl_index < wbp->cl_number)
9bccf70c 2435 /*
55e303ae
A
2436 * we found an existing cluster(s) that we
2437 * could entirely merge this I/O into
9bccf70c
A
2438 */
2439 goto delay_io;
2440
91447636 2441 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
9bccf70c
A
2442 /*
2443 * we didn't find an existing cluster to
2444 * merge into, but there's room to start
1c79356b
A
2445 * a new one
2446 */
9bccf70c 2447 goto start_new_cluster;
1c79356b 2448
9bccf70c
A
2449 /*
 2450 * no existing cluster to merge with and no
2451 * room to start a new one... we'll try
55e303ae
A
2452 * pushing one of the existing ones... if none of
2453 * them are able to be pushed, we'll switch
2454 * to the sparse cluster mechanism
91447636 2455 * cluster_try_push updates cl_number to the
55e303ae
A
2456 * number of remaining clusters... and
2457 * returns the number of currently unused clusters
9bccf70c 2458 */
91447636
A
2459 int ret_cluster_try_push = 0;
2460 /* if writes are not deferred, call cluster push immediately */
2461 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2462 if (flags & IO_NOCACHE)
2463 can_delay = 0;
2464 else
2465 can_delay = 1;
2466
2467 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2468 }
9bccf70c 2469
91447636
A
 2470 /* execute the following regardless of whether writes are deferred or not */
2471 if (ret_cluster_try_push == 0) {
55e303ae
A
2472 /*
2473 * no more room in the normal cluster mechanism
2474 * so let's switch to the more expansive but expensive
2475 * sparse mechanism....
2476 * first, we need to release the upl if we hold one
2477 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2478 * and may span 2 separate buckets there... if they do and
2479 * we happen to have to flush a bucket to make room and it intersects
2480 * this upl, a deadlock may result on page BUSY
2481 */
2482 if (upl_size)
2483 ubc_upl_commit_range(upl, upl_offset, upl_size,
2484 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2485
91447636
A
2486 sparse_cluster_switch(wbp, vp, newEOF);
2487 sparse_cluster_add(wbp, vp, &cl, newEOF);
2488
2489 lck_mtx_unlock(&wbp->cl_lockw);
55e303ae
A
2490
2491 continue;
9bccf70c 2492 }
55e303ae
A
2493 /*
2494 * we pushed one cluster successfully, so we must be sequentially writing this file
2495 * otherwise, we would have failed and fallen into the sparse cluster support
2496 * so let's take the opportunity to push out additional clusters as long as we
2497 * remain below the throttle... this will give us better I/O locality if we're
 2498 * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
2499 * however, we don't want to push so much out that the write throttle kicks in and
2500 * hangs this thread up until some of the I/O completes...
2501 */
91447636
A
2502 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2503 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2504 cluster_try_push(wbp, vp, newEOF, 0, 0);
2505 }
55e303ae 2506
9bccf70c 2507start_new_cluster:
91447636
A
2508 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2509 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
9bccf70c 2510
91447636
A
2511 if (flags & IO_NOCACHE)
2512 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2513 else
2514 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2515 wbp->cl_number++;
55e303ae
A
2516delay_io:
2517 if (upl_size)
2518 ubc_upl_commit_range(upl, upl_offset, upl_size,
2519 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
91447636
A
2520
2521 lck_mtx_unlock(&wbp->cl_lockw);
2522
9bccf70c
A
2523 continue;
2524issue_io:
2525 /*
91447636
A
2526 * we don't hold the vnode lock at this point
2527 *
 2528 * because we had to ask for a UPL that provides currently non-present pages, the
2529 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2530 * upon committing it... this is not the behavior we want since it's possible for
2531 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
9bccf70c 2532 * in order to maintain some semblance of coherency with mapped writes
91447636
A
2533 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2534 * so that we correctly deal with a change in state of the hardware modify bit...
2535 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2536 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2537 * responsible for generating the correct sized I/O(s)
9bccf70c 2538 */
91447636
A
2539 ubc_upl_commit_range(upl, 0, upl_size,
2540 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b 2541
91447636 2542 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
1c79356b 2543
91447636 2544 retval = cluster_push_x(vp, &cl, newEOF, flags);
1c79356b
A
2545 }
2546 }
2547 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
91447636 2548 retval, 0, io_resid, 0, 0);
1c79356b
A
2549
2550 return (retval);
2551}
2552
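/*
 * A minimal stand-alone sketch (not part of vfs_cluster.c) of the first merge test
 * cluster_write_x applies above: a write whose starting page lies at or beyond an
 * existing cluster's start can be absorbed only if its ending page also falls within
 * the cluster's maximum span. The struct name, the 32-page limit standing in for
 * MAX_UPL_TRANSFER and the sample extents are illustrative assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

#define CL_MAX_PAGES 32			/* illustrative stand-in for MAX_UPL_TRANSFER */

struct wr_extent {
	long long b_addr;		/* first page of the write */
	long long e_addr;		/* page just past the end of the write */
};

static bool
fits_in_cluster(const struct wr_extent *cl, const struct wr_extent *wr)
{
	/* starts at or after the cluster and ends within its maximum span */
	return (wr->b_addr >= cl->b_addr &&
	    wr->e_addr <= cl->b_addr + CL_MAX_PAGES);
}

int
main(void)
{
	struct wr_extent cl = { 100, 110 };	/* existing dirty cluster */
	struct wr_extent w1 = { 105, 120 };	/* absorbed: 120 <= 100 + 32 */
	struct wr_extent w2 = { 105, 140 };	/* too large: 140 > 100 + 32 */

	printf("%d %d\n", fits_in_cluster(&cl, &w1), fits_in_cluster(&cl, &w2));
	return 0;
}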
9bccf70c 2553int
91447636 2554cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
1c79356b 2555{
1c79356b 2556 int prev_resid;
91447636 2557 u_int clip_size;
1c79356b 2558 off_t max_io_size;
0b4e3aa0 2559 int upl_size;
0b4e3aa0
A
2560 int upl_flags;
2561 upl_t upl;
1c79356b 2562 int retval = 0;
91447636 2563 int flags;
1c79356b 2564
91447636 2565 flags = xflags;
1c79356b 2566
91447636
A
2567 if (vp->v_flag & VNOCACHE_DATA)
2568 flags |= IO_NOCACHE;
2569 if (vp->v_flag & VRAOFF)
2570 flags |= IO_RAOFF;
2571
2572 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2573 /*
2574 * go do a read through the cache if one of the following is true....
2575 * NOCACHE is not true
2576 * the uio request doesn't target USERSPACE
0b4e3aa0 2577 */
91447636
A
2578 return (cluster_read_x(vp, uio, filesize, flags));
2579 }
2580
2581#if LP64_DEBUG
2582 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2583 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2584 }
2585#endif /* LP64_DEBUG */
2586
2587 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
cc9f6e38
A
2588 user_size_t iov_len;
2589 user_addr_t iov_base;
91447636 2590
0b4e3aa0 2591 /*
91447636
A
2592 * we know we have a resid, so this is safe
 2593 * skip over any empty vectors
1c79356b 2594 */
cc9f6e38
A
2595 uio_update(uio, (user_size_t)0);
2596
2597 iov_len = uio_curriovlen(uio);
2598 iov_base = uio_curriovbase(uio);
91447636 2599
91447636
A
2600 upl_size = PAGE_SIZE;
2601 upl_flags = UPL_QUERY_OBJECT_TYPE;
2602
2603 // LP64todo - fix this!
2604 if ((vm_map_get_upl(current_map(),
cc9f6e38 2605 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
91447636
A
2606 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2607 /*
2608 * the user app must have passed in an invalid address
2609 */
2610 return (EFAULT);
2611 }
2612
2613 /*
2614 * We check every vector target but if it is physically
2615 * contiguous space, we skip the sanity checks.
1c79356b 2616 */
91447636
A
2617 if (upl_flags & UPL_PHYS_CONTIG) {
2618 retval = cluster_phys_read(vp, uio, filesize);
2619 }
2620 else if (uio_resid(uio) < PAGE_SIZE) {
2621 /*
 2622 * we're here because we don't have a physically contiguous target buffer
2623 * go do a read through the cache if
2624 * the total xfer size is less than a page...
2625 */
2626 return (cluster_read_x(vp, uio, filesize, flags));
2627 }
2628 // LP64todo - fix this!
2629 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2630 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2631 /*
2632 * Bring the file offset read up to a pagesize boundary
2633 * this will also bring the base address to a page boundary
2634 * since they both are currently on the same offset within a page
2635 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2636 * so the computed clip_size must always be less than the current uio_resid
2637 */
2638 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2639
2640 /*
2641 * Fake the resid going into the cluster_read_x call
2642 * and restore it on the way out.
2643 */
2644 prev_resid = uio_resid(uio);
2645 // LP64todo - fix this
2646 uio_setresid(uio, clip_size);
2647
2648 retval = cluster_read_x(vp, uio, filesize, flags);
2649
2650 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2651 } else {
2652 /*
2653 * can't get both the file offset and the buffer offset aligned to a page boundary
2654 * so fire an I/O through the cache for this entire vector
2655 */
2656 // LP64todo - fix this!
2657 clip_size = iov_len;
2658 prev_resid = uio_resid(uio);
2659 uio_setresid(uio, clip_size);
2660
2661 retval = cluster_read_x(vp, uio, filesize, flags);
2662
2663 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2664 }
2665 } else {
2666 /*
2667 * If we come in here, we know the offset into
2668 * the file is on a pagesize boundary
2669 */
2670 max_io_size = filesize - uio->uio_offset;
2671 // LP64todo - fix this
2672 clip_size = uio_resid(uio);
2673 if (iov_len < clip_size)
2674 clip_size = iov_len;
2675 if (max_io_size < clip_size)
2676 clip_size = (int)max_io_size;
2677
2678 if (clip_size < PAGE_SIZE) {
2679 /*
2680 * Take care of the tail end of the read in this vector.
2681 */
2682 // LP64todo - fix this
2683 prev_resid = uio_resid(uio);
2684 uio_setresid(uio, clip_size);
1c79356b 2685
91447636
A
2686 retval = cluster_read_x(vp, uio, filesize, flags);
2687
2688 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2689 } else {
2690 /* round clip_size down to a multiple of pagesize */
2691 clip_size = clip_size & ~(PAGE_MASK);
2692 // LP64todo - fix this
2693 prev_resid = uio_resid(uio);
2694 uio_setresid(uio, clip_size);
2695
2696 retval = cluster_nocopy_read(vp, uio, filesize);
2697
2698 if ((retval==0) && uio_resid(uio))
2699 retval = cluster_read_x(vp, uio, filesize, flags);
2700
2701 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2702 }
2703 } /* end else */
2704 } /* end while */
1c79356b 2705
1c79356b
A
2706 return(retval);
2707}
2708
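/*
 * A stand-alone sketch (not part of vfs_cluster.c) of the clipping rule cluster_read
 * uses above when the file offset and the user buffer share the same offset within a
 * page: reading only up to the file offset's next page boundary leaves both page
 * aligned for the nocopy path on the following pass. The sample values are
 * illustrative assumptions.
 */
#include <stdio.h>

#define PAGE_SZ		4096
#define PAGE_MSK	(PAGE_SZ - 1)

int
main(void)
{
	long long file_off = 0x12345;			/* 0x345 bytes into its page */
	unsigned long long buf = 0x70000345ULL;		/* same 0x345 in-page offset */
	int clip_size;

	/* bring the file offset up to a page boundary; since the buffer shares
	 * the same in-page offset, it becomes page aligned at the same time */
	clip_size = PAGE_SZ - (int)(file_off & PAGE_MSK);

	printf("clip %d -> file 0x%llx  buf 0x%llx\n", clip_size,
	    (unsigned long long)(file_off + clip_size), buf + clip_size);
	return 0;
}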
9bccf70c 2709static int
91447636 2710cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
1c79356b
A
2711{
2712 upl_page_info_t *pl;
2713 upl_t upl;
2714 vm_offset_t upl_offset;
2715 int upl_size;
2716 off_t upl_f_offset;
2717 int start_offset;
2718 int start_pg;
2719 int last_pg;
91447636 2720 int uio_last = 0;
1c79356b
A
2721 int pages_in_upl;
2722 off_t max_size;
55e303ae
A
2723 off_t last_ioread_offset;
2724 off_t last_request_offset;
2725 u_int size_of_prefetch;
91447636 2726 u_int io_size;
1c79356b 2727 kern_return_t kret;
1c79356b
A
2728 int error = 0;
2729 int retval = 0;
55e303ae
A
2730 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2731 u_int rd_ahead_enabled = 1;
2732 u_int prefetch_enabled = 1;
91447636
A
2733 struct cl_readahead * rap;
2734 struct clios iostate;
2735 struct cl_extent extent;
55e303ae
A
2736
2737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
91447636
A
2738 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2739
2740 // LP64todo - fix this
2741 last_request_offset = uio->uio_offset + uio_resid(uio);
55e303ae 2742
91447636
A
2743 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2744 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
55e303ae 2745 rd_ahead_enabled = 0;
91447636
A
2746 rap = NULL;
2747 } else {
2748 if (cluster_hard_throttle_on(vp)) {
2749 rd_ahead_enabled = 0;
2750 prefetch_enabled = 0;
55e303ae 2751
91447636
A
2752 max_rd_size = HARD_THROTTLE_MAXSIZE;
2753 }
2754 if ((rap = cluster_get_rap(vp)) == NULL)
2755 rd_ahead_enabled = 0;
55e303ae 2756 }
55e303ae
A
2757 if (last_request_offset > filesize)
2758 last_request_offset = filesize;
91447636
A
2759 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2760 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
55e303ae 2761
91447636 2762 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
55e303ae
A
2763 /*
2764 * determine if we already have a read-ahead in the pipe courtesy of the
 2765 * last read system call that was issued...
 2766 * if so, pick up its extent to determine where we should start
 2767 * with respect to any read-ahead that might be necessary to
 2768 * garner all the data needed to complete this read system call
2769 */
91447636 2770 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
1c79356b 2771
55e303ae
A
2772 if (last_ioread_offset < uio->uio_offset)
2773 last_ioread_offset = (off_t)0;
2774 else if (last_ioread_offset > last_request_offset)
2775 last_ioread_offset = last_request_offset;
2776 } else
2777 last_ioread_offset = (off_t)0;
1c79356b 2778
91447636 2779 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
1c79356b
A
2780 /*
2781 * compute the size of the upl needed to encompass
2782 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2783 * to the maximum UPL size... cluster_io will clip if
2784 * this exceeds the maximum io_size for the device,
 2785 * and make sure to account for
1c79356b
A
2786 * a starting offset that's not page aligned
2787 */
2788 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2789 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2790 max_size = filesize - uio->uio_offset;
2791
91447636
A
2792 // LP64todo - fix this!
2793 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2794 io_size = uio_resid(uio);
1c79356b
A
2795 else
2796 io_size = max_size;
9bccf70c 2797
91447636 2798 if (!(flags & IO_NOCACHE)) {
1c79356b 2799
55e303ae
A
2800 while (io_size) {
2801 u_int io_resid;
2802 u_int io_requested;
1c79356b 2803
55e303ae
A
2804 /*
2805 * if we keep finding the pages we need already in the cache, then
2806 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2807 * to determine that we have all the pages we need... once we miss in
 2808 * the cache and have issued an I/O, then we'll assume that we're likely
2809 * to continue to miss in the cache and it's to our advantage to try and prefetch
2810 */
2811 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2812 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2813 /*
2814 * we've already issued I/O for this request and
2815 * there's still work to do and
2816 * our prefetch stream is running dry, so issue a
2817 * pre-fetch I/O... the I/O latency will overlap
2818 * with the copying of the data
2819 */
2820 if (size_of_prefetch > max_rd_size)
2821 size_of_prefetch = max_rd_size;
1c79356b 2822
91447636 2823 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
1c79356b 2824
55e303ae
A
2825 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2826
2827 if (last_ioread_offset > last_request_offset)
2828 last_ioread_offset = last_request_offset;
2829 }
2830 }
2831 /*
2832 * limit the size of the copy we're about to do so that
2833 * we can notice that our I/O pipe is running dry and
2834 * get the next I/O issued before it does go dry
2835 */
2836 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2837 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2838 else
2839 io_resid = io_size;
1c79356b 2840
55e303ae 2841 io_requested = io_resid;
1c79356b 2842
55e303ae 2843 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1c79356b 2844
55e303ae 2845 io_size -= (io_requested - io_resid);
1c79356b 2846
55e303ae
A
2847 if (retval || io_resid)
2848 /*
2849 * if we run into a real error or
2850 * a page that is not in the cache
2851 * we need to leave streaming mode
2852 */
2853 break;
2854
2855 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2856 /*
 2857 * we've already finished the I/O for this read request
2858 * let's see if we should do a read-ahead
2859 */
91447636 2860 cluster_rd_ahead(vp, &extent, filesize, rap);
55e303ae 2861 }
1c79356b 2862 }
1c79356b
A
2863 if (retval)
2864 break;
1c79356b 2865 if (io_size == 0) {
91447636
A
2866 if (rap != NULL) {
2867 if (extent.e_addr < rap->cl_lastr)
2868 rap->cl_maxra = 0;
2869 rap->cl_lastr = extent.e_addr;
2870 }
1c79356b
A
2871 break;
2872 }
55e303ae
A
2873 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2874 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2875 max_size = filesize - uio->uio_offset;
1c79356b 2876 }
55e303ae
A
2877 if (io_size > max_rd_size)
2878 io_size = max_rd_size;
2879
1c79356b 2880 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
55e303ae
A
2881
2882 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2883 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
1c79356b
A
2884 pages_in_upl = upl_size / PAGE_SIZE;
2885
2886 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
9bccf70c 2887 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 2888
0b4e3aa0 2889 kret = ubc_create_upl(vp,
91447636
A
2890 upl_f_offset,
2891 upl_size,
2892 &upl,
2893 &pl,
2894 UPL_SET_LITE);
1c79356b
A
2895 if (kret != KERN_SUCCESS)
2896 panic("cluster_read: failed to get pagelist");
2897
1c79356b 2898 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
9bccf70c 2899 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
2900
2901 /*
2902 * scan from the beginning of the upl looking for the first
2903 * non-valid page.... this will become the first page in
2904 * the request we're going to make to 'cluster_io'... if all
2905 * of the pages are valid, we won't call through to 'cluster_io'
2906 */
2907 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2908 if (!upl_valid_page(pl, start_pg))
2909 break;
2910 }
2911
2912 /*
2913 * scan from the starting invalid page looking for a valid
2914 * page before the end of the upl is reached, if we
2915 * find one, then it will be the last page of the request to
2916 * 'cluster_io'
2917 */
2918 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2919 if (upl_valid_page(pl, last_pg))
2920 break;
2921 }
55e303ae
A
2922 iostate.io_completed = 0;
2923 iostate.io_issued = 0;
2924 iostate.io_error = 0;
2925 iostate.io_wanted = 0;
1c79356b
A
2926
2927 if (start_pg < last_pg) {
2928 /*
2929 * we found a range of 'invalid' pages that must be filled
2930 * if the last page in this range is the last page of the file
2931 * we may have to clip the size of it to keep from reading past
2932 * the end of the last physical block associated with the file
2933 */
2934 upl_offset = start_pg * PAGE_SIZE;
2935 io_size = (last_pg - start_pg) * PAGE_SIZE;
2936
9bccf70c 2937 if ((upl_f_offset + upl_offset + io_size) > filesize)
1c79356b 2938 io_size = filesize - (upl_f_offset + upl_offset);
9bccf70c 2939
1c79356b 2940 /*
55e303ae 2941 * issue an asynchronous read to cluster_io
1c79356b
A
2942 */
2943
2944 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
91447636 2945 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
1c79356b
A
2946 }
2947 if (error == 0) {
2948 /*
2949 * if the read completed successfully, or there was no I/O request
55e303ae
A
 2950 * issued, then copy the data into user land via 'cluster_copy_upl_data'
2951 * we'll first add on any 'valid'
1c79356b
A
2952 * pages that were present in the upl when we acquired it.
2953 */
2954 u_int val_size;
1c79356b
A
2955
2956 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2957 if (!upl_valid_page(pl, uio_last))
2958 break;
2959 }
2960 /*
2961 * compute size to transfer this round, if uio->uio_resid is
55e303ae 2962 * still non-zero after this attempt, we'll loop around and
1c79356b
A
2963 * set up for another I/O.
2964 */
2965 val_size = (uio_last * PAGE_SIZE) - start_offset;
2966
55e303ae 2967 if (val_size > max_size)
1c79356b
A
2968 val_size = max_size;
2969
91447636
A
2970 if (val_size > uio_resid(uio))
2971 // LP64todo - fix this
2972 val_size = uio_resid(uio);
1c79356b 2973
55e303ae
A
2974 if (last_ioread_offset == 0)
2975 last_ioread_offset = uio->uio_offset + val_size;
1c79356b 2976
55e303ae 2977 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
1c79356b 2978 /*
55e303ae
A
2979 * if there's still I/O left to do for this request, and...
2980 * we're not in hard throttle mode, then issue a
2981 * pre-fetch I/O... the I/O latency will overlap
1c79356b
A
2982 * with the copying of the data
2983 */
91447636 2984 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
1c79356b 2985
55e303ae
A
2986 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2987
2988 if (last_ioread_offset > last_request_offset)
2989 last_ioread_offset = last_request_offset;
1c79356b 2990
55e303ae
A
2991 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2992 /*
2993 * this transfer will finish this request, so...
2994 * let's try to read ahead if we're in
2995 * a sequential access pattern and we haven't
2996 * explicitly disabled it
2997 */
2998 if (rd_ahead_enabled)
91447636
A
2999 cluster_rd_ahead(vp, &extent, filesize, rap);
3000
3001 if (rap != NULL) {
3002 if (extent.e_addr < rap->cl_lastr)
3003 rap->cl_maxra = 0;
3004 rap->cl_lastr = extent.e_addr;
3005 }
9bccf70c 3006 }
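			/*
			 * wait for all of the async reads issued above to
			 * complete before we copy the valid data out of the upl
			 */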
91447636
A
3007 lck_mtx_lock(cl_mtxp);
3008
55e303ae
A
3009 while (iostate.io_issued != iostate.io_completed) {
3010 iostate.io_wanted = 1;
91447636 3011 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
55e303ae 3012 }
91447636
A
3013 lck_mtx_unlock(cl_mtxp);
3014
55e303ae
A
3015 if (iostate.io_error)
3016 error = iostate.io_error;
9bccf70c 3017 else
55e303ae 3018 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
1c79356b
A
3019 }
3020 if (start_pg < last_pg) {
3021 /*
3022 * compute the range of pages that we actually issued an I/O for
3023 * and either commit them as valid if the I/O succeeded
3024 * or abort them if the I/O failed
3025 */
3026 io_size = (last_pg - start_pg) * PAGE_SIZE;
3027
3028 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 3029 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b 3030
91447636 3031 if (error || (flags & IO_NOCACHE))
0b4e3aa0 3032 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
1c79356b
A
3033 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3034 else
0b4e3aa0 3035 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
55e303ae
A
3036 UPL_COMMIT_CLEAR_DIRTY |
3037 UPL_COMMIT_FREE_ON_EMPTY |
3038 UPL_COMMIT_INACTIVATE);
1c79356b
A
3039
3040 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 3041 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
3042 }
3043 if ((last_pg - start_pg) < pages_in_upl) {
3044 int cur_pg;
3045 int commit_flags;
3046
3047 /*
3048 * the set of pages that we issued an I/O for did not encompass
3049 * the entire upl... so just release these without modifying
55e303ae 3050 * their state
1c79356b
A
3051 */
3052 if (error)
9bccf70c 3053 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3054 else {
0b4e3aa0 3055 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 3056 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
1c79356b 3057
0b4e3aa0
A
3058 if (start_pg) {
3059 /*
3060 * we found some already valid pages at the beginning of
3061 * the upl commit these back to the inactive list with
3062 * reference cleared
3063 */
3064 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3065 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3066 | UPL_COMMIT_INACTIVATE;
1c79356b
A
3067
3068 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 3069 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b 3070
91447636 3071 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
0b4e3aa0
A
3072 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3073 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3074 else
0b4e3aa0
A
3075 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3076 PAGE_SIZE, commit_flags);
1c79356b
A
3077 }
3078 }
3079 if (last_pg < uio_last) {
0b4e3aa0
A
3080 /*
3081 * we found some already valid pages immediately after the
3082 * pages we issued I/O for, commit these back to the
3083 * inactive list with reference cleared
3084 */
3085 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3086 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3087 | UPL_COMMIT_INACTIVATE;
1c79356b
A
3088
3089 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 3090 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b 3091
91447636 3092 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
0b4e3aa0
A
3093 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3094 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3095 else
0b4e3aa0
A
3096 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3097 PAGE_SIZE, commit_flags);
1c79356b
A
3098 }
3099 }
3100 if (uio_last < pages_in_upl) {
0b4e3aa0
A
3101 /*
3102 * there were some invalid pages beyond the valid pages
3103 * that we didn't issue an I/O for, just release them
3104 * unchanged
1c79356b 3105 */
9bccf70c
A
3106 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3107 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
3108 }
3109
0b4e3aa0 3110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 3111 (int)upl, -1, -1, 0, 0);
1c79356b
A
3112 }
3113 }
3114 if (retval == 0)
3115 retval = error;
91447636
A
3116
3117 if ( uio_resid(uio) ) {
3118 if (cluster_hard_throttle_on(vp)) {
3119 rd_ahead_enabled = 0;
3120 prefetch_enabled = 0;
3121
3122 max_rd_size = HARD_THROTTLE_MAXSIZE;
3123 } else {
3124 if (rap != NULL)
3125 rd_ahead_enabled = 1;
3126 prefetch_enabled = 1;
3127
3128 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3129 }
3130 }
3131 }
3132 if (rap != NULL) {
3133 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3134 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3135
3136 lck_mtx_unlock(&rap->cl_lockr);
3137 } else {
3138 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3139 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
1c79356b
A
3140 }
3141
3142 return (retval);
3143}
3144
b4c24cb9 3145
9bccf70c 3146static int
91447636 3147cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
1c79356b
A
3148{
3149 upl_t upl;
3150 upl_page_info_t *pl;
1c79356b 3151 vm_offset_t upl_offset;
1c79356b
A
3152 off_t max_io_size;
3153 int io_size;
3154 int upl_size;
3155 int upl_needed_size;
3156 int pages_in_pl;
1c79356b
A
3157 int upl_flags;
3158 kern_return_t kret;
1c79356b
A
3159 int i;
3160 int force_data_sync;
1c79356b 3161 int retval = 0;
91447636
A
3162 int no_zero_fill = 0;
3163 int abort_flag = 0;
d7e50217 3164 struct clios iostate;
55e303ae
A
3165 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3166 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3167
1c79356b
A
3168
3169 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
91447636 3170 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
1c79356b
A
3171
3172 /*
3173 * When we enter this routine, we know
3174 * -- the offset into the file is on a pagesize boundary
3175 * -- the resid is a page multiple
3176 * -- the resid will not exceed iov_len
3177 */
3178
d7e50217
A
3179 iostate.io_completed = 0;
3180 iostate.io_issued = 0;
3181 iostate.io_error = 0;
3182 iostate.io_wanted = 0;
3183
91447636 3184 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
cc9f6e38 3185 user_addr_t iov_base;
1c79356b 3186
91447636
A
3187 if (cluster_hard_throttle_on(vp)) {
3188 max_rd_size = HARD_THROTTLE_MAXSIZE;
3189 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3190 } else {
3191 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
cc9f6e38 3192 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
91447636 3193 }
d7e50217 3194 max_io_size = filesize - uio->uio_offset;
0b4e3aa0 3195
91447636
A
3196 // LP64todo - fix this
3197 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
d7e50217
A
3198 io_size = max_io_size;
3199 else
91447636 3200 io_size = uio_resid(uio);
1c79356b 3201
d7e50217
A
3202 /*
3203 * First look for pages already in the cache
3204 * and move them to user space.
3205 */
55e303ae 3206 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
1c79356b 3207
d7e50217
A
3208 if (retval) {
3209 /*
3210 * we may have already spun some portion of this request
3211 * off as async requests... we need to wait for the I/O
3212 * to complete before returning
3213 */
3214 goto wait_for_reads;
0b4e3aa0 3215 }
d7e50217
A
3216 /*
3217 * If we are already finished with this read, then return
3218 */
3219 if (io_size == 0) {
3220 /*
3221 * we may have already spun some portion of this request
3222 * off as async requests... we need to wait for the I/O
3223 * to complete before returning
3224 */
3225 goto wait_for_reads;
3226 }
3227 max_io_size = io_size;
3228
55e303ae
A
3229 if (max_io_size > max_rd_size)
3230 max_io_size = max_rd_size;
3231
d7e50217 3232 io_size = 0;
1c79356b 3233
55e303ae
A
3234 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
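	        /*
	         * ubc_range_op() with UPL_ROP_ABSENT reports back, in io_size, how
	         * much of the range (in bytes) starting at uio->uio_offset is not
	         * already resident in the cache; the direct read is trimmed to that
	         * run, since resident pages were already serviced by the
	         * cluster_copy_ubc_data() pass above
	         */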
3235
d7e50217
A
3236 if (io_size == 0)
3237 /*
3238 * we may have already spun some portion of this request
3239 * off as async requests... we need to wait for the I/O
3240 * to complete before returning
3241 */
3242 goto wait_for_reads;
cc9f6e38
A
3243
3244 iov_base = uio_curriovbase(uio);
1c79356b 3245
91447636 3246 // LP64todo - fix this!
cc9f6e38 3247 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
d7e50217 3248 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1c79356b 3249
d7e50217 3250 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
cc9f6e38 3251 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1c79356b 3252
91447636
A
3253 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3254 no_zero_fill = 1;
3255 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3256 } else {
3257 no_zero_fill = 0;
3258 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3259 }
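	        /*
	         * a fully page-aligned request means every page in the upl will be
	         * completely overwritten by the device read, so the pages need not
	         * be zero-filled first (UPL_NOZEROFILL) and can simply be dumped
	         * (UPL_ABORT_DUMP_PAGES) if the transfer is aborted
	         */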
d7e50217
A
3260 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3261 pages_in_pl = 0;
3262 upl_size = upl_needed_size;
55e303ae 3263 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1c79356b 3264
91447636
A
3265 if (no_zero_fill)
3266 upl_flags |= UPL_NOZEROFILL;
3267 if (force_data_sync)
3268 upl_flags |= UPL_FORCE_DATA_SYNC;
3269
3270 // LP64todo - fix this!
3271 kret = vm_map_create_upl(current_map(),
cc9f6e38 3272 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
91447636 3273 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
1c79356b 3274
d7e50217
A
3275 if (kret != KERN_SUCCESS) {
3276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3277 (int)upl_offset, upl_size, io_size, kret, 0);
d7e50217
A
3278 /*
3279 * cluster_nocopy_read: failed to get pagelist
3280 *
3281 * we may have already spun some portion of this request
3282 * off as async requests... we need to wait for the I/O
3283 * to complete before returning
3284 */
3285 goto wait_for_reads;
3286 }
3287 pages_in_pl = upl_size / PAGE_SIZE;
3288 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 3289
d7e50217
A
3290 for (i = 0; i < pages_in_pl; i++) {
3291 if (!upl_valid_page(pl, i))
3292 break;
3293 }
3294 if (i == pages_in_pl)
3295 break;
0b4e3aa0 3296
91447636 3297 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
1c79356b 3298 }
d7e50217
A
3299 if (force_data_sync >= 3) {
3300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3301 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 3302
d7e50217
A
3303 goto wait_for_reads;
3304 }
3305 /*
3306 * Consider the possibility that upl_size wasn't satisfied.
3307 */
3308 if (upl_size != upl_needed_size)
3309 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 3310
d7e50217 3311 if (io_size == 0) {
91447636 3312 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
d7e50217
A
3313 goto wait_for_reads;
3314 }
3315 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3316 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 3317
d7e50217
A
3318 /*
3319 * request asynchronously so that we can overlap
 3320 * the preparation of the next I/O...
 3321 * if there are already too many outstanding reads,
 3322 * wait until some have completed before issuing the next read
3323 */
91447636
A
3324 lck_mtx_lock(cl_mtxp);
3325
55e303ae 3326 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
d7e50217 3327 iostate.io_wanted = 1;
91447636 3328 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
d7e50217 3329 }
91447636
A
3330 lck_mtx_unlock(cl_mtxp);
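	        /*
	         * (io_issued - io_completed) is the number of bytes of read I/O
	         * still in flight for this request; the completion path wakes us
	         * via io_wanted as reads finish, so this wait simply bounds the
	         * amount of outstanding read-ahead at max_rd_ahead
	         */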
3331
d7e50217
A
3332 if (iostate.io_error) {
3333 /*
3334 * one of the earlier reads we issued ran into a hard error
3335 * don't issue any more reads, cleanup the UPL
3336 * that was just created but not used, then
3337 * go wait for any other reads to complete before
3338 * returning the error to the caller
3339 */
91447636 3340 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
1c79356b 3341
d7e50217
A
3342 goto wait_for_reads;
3343 }
3344 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
55e303ae 3345 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
1c79356b 3346
91447636 3347 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
d7e50217 3348 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
91447636 3349 (buf_t)NULL, &iostate);
1c79356b 3350
d7e50217
A
3351 /*
3352 * update the uio structure
3353 */
cc9f6e38 3354 uio_update(uio, (user_size_t)io_size);
1c79356b 3355
d7e50217 3356 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
91447636 3357 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
1c79356b
A
3358
3359 } /* end while */
3360
d7e50217
A
3361wait_for_reads:
3362 /*
3363 * make sure all async reads that are part of this stream
3364 * have completed before we return
3365 */
91447636
A
3366 lck_mtx_lock(cl_mtxp);
3367
d7e50217
A
3368 while (iostate.io_issued != iostate.io_completed) {
3369 iostate.io_wanted = 1;
91447636 3370 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
d7e50217 3371 }
91447636
A
3372 lck_mtx_unlock(cl_mtxp);
3373
d7e50217
A
3374 if (iostate.io_error)
3375 retval = iostate.io_error;
1c79356b
A
3376
3377 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
91447636 3378 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
1c79356b
A
3379
3380 return (retval);
3381}
3382
3383
9bccf70c 3384static int
91447636 3385cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
0b4e3aa0 3386{
b4c24cb9 3387 upl_page_info_t *pl;
0b4e3aa0
A
3388 upl_t upl;
3389 vm_offset_t upl_offset;
55e303ae 3390 addr64_t dst_paddr;
0b4e3aa0 3391 off_t max_size;
cc9f6e38
A
3392 int io_size;
3393 user_size_t iov_len;
3394 user_addr_t iov_base;
b4c24cb9 3395 int tail_size;
0b4e3aa0
A
3396 int upl_size;
3397 int upl_needed_size;
3398 int pages_in_pl;
3399 int upl_flags;
3400 kern_return_t kret;
b4c24cb9 3401 struct clios iostate;
0b4e3aa0 3402 int error;
91447636 3403 int devblocksize;
0b4e3aa0 3404
91447636 3405 devblocksize = vp->v_mount->mnt_devblocksize;
0b4e3aa0
A
3406 /*
3407 * When we enter this routine, we know
3408 * -- the resid will not exceed iov_len
3409 * -- the target address is physically contiguous
3410 */
3411
91447636
A
3412#if LP64_DEBUG
3413 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3414 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3415 }
3416#endif /* LP64_DEBUG */
3417
cc9f6e38
A
3418 iov_len = uio_curriovlen(uio);
3419 iov_base = uio_curriovbase(uio);
0b4e3aa0
A
3420
3421 max_size = filesize - uio->uio_offset;
3422
91447636
A
3423 // LP64todo - fix this!
3424 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3425 io_size = iov_len;
0b4e3aa0 3426 else
b4c24cb9 3427 io_size = max_size;
0b4e3aa0 3428
91447636
A
3429 // LP64todo - fix this!
3430 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
0b4e3aa0
A
3431 upl_needed_size = upl_offset + io_size;
3432
b4c24cb9 3433 error = 0;
0b4e3aa0
A
3434 pages_in_pl = 0;
3435 upl_size = upl_needed_size;
55e303ae 3436 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0
A
3437
3438 kret = vm_map_get_upl(current_map(),
cc9f6e38 3439 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
0b4e3aa0
A
3440 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3441
b4c24cb9
A
3442 if (kret != KERN_SUCCESS) {
3443 /*
3444 * cluster_phys_read: failed to get pagelist
3445 */
3446 return(EINVAL);
3447 }
3448 if (upl_size < upl_needed_size) {
3449 /*
3450 * The upl_size wasn't satisfied.
3451 */
3452 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3453
3454 return(EINVAL);
3455 }
3456 pl = ubc_upl_pageinfo(upl);
3457
cc9f6e38 3458 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
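	/*
	 * upl_phys_page() returns a physical page number; shifting it left by 12
	 * converts it to a byte address (assuming 4 KB pages), and adding
	 * upl_offset gives the physical address of the user buffer within that page
	 */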
0b4e3aa0 3459
b4c24cb9
A
3460 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3461 int head_size;
3462
3463 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3464
3465 if (head_size > io_size)
3466 head_size = io_size;
3467
91447636 3468 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
b4c24cb9
A
3469
3470 if (error) {
3471 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3472
3473 return(EINVAL);
3474 }
3475 upl_offset += head_size;
3476 dst_paddr += head_size;
3477 io_size -= head_size;
3478 }
3479 tail_size = io_size & (devblocksize - 1);
3480 io_size -= tail_size;
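	/*
	 * worked example with hypothetical values: for devblocksize == 512 and
	 * uio->uio_offset == 1000, head_size == 512 - (1000 & 511) == 24, so 24
	 * bytes are copied through cluster_align_phys_io() to reach a device
	 * block boundary; any remainder smaller than a block (tail_size) is
	 * handled the same way after the aligned middle section is issued
	 */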
3481
3482 iostate.io_completed = 0;
3483 iostate.io_issued = 0;
3484 iostate.io_error = 0;
3485 iostate.io_wanted = 0;
3486
3487 while (io_size && error == 0) {
3488 int xsize;
3489
3490 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3491 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3492 else
3493 xsize = io_size;
3494 /*
3495 * request asynchronously so that we can overlap
3496 * the preparation of the next I/O... we'll do
3497 * the commit after all the I/O has completed
 3498 * since it's all issued against the same UPL...
 3499 * if there are already too many outstanding reads,
d7e50217 3500 * wait until some have completed before issuing the next
b4c24cb9 3501 */
91447636
A
3502 lck_mtx_lock(cl_mtxp);
3503
cc9f6e38 3504 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
b4c24cb9 3505 iostate.io_wanted = 1;
91447636 3506 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
b4c24cb9 3507 }
91447636 3508 lck_mtx_unlock(cl_mtxp);
b4c24cb9 3509
91447636 3510 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
b4c24cb9 3511 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
91447636 3512 (buf_t)NULL, &iostate);
b4c24cb9
A
3513 /*
3514 * The cluster_io read was issued successfully,
3515 * update the uio structure
3516 */
3517 if (error == 0) {
cc9f6e38
A
3518 uio_update(uio, (user_size_t)xsize);
3519
3520 dst_paddr += xsize;
3521 upl_offset += xsize;
3522 io_size -= xsize;
b4c24cb9
A
3523 }
3524 }
0b4e3aa0 3525 /*
d7e50217
A
3526 * make sure all async reads that are part of this stream
3527 * have completed before we proceed
0b4e3aa0 3528 */
91447636
A
3529 lck_mtx_lock(cl_mtxp);
3530
b4c24cb9
A
3531 while (iostate.io_issued != iostate.io_completed) {
3532 iostate.io_wanted = 1;
91447636 3533 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
b4c24cb9 3534 }
91447636
A
3535 lck_mtx_unlock(cl_mtxp);
3536
3537 if (iostate.io_error)
b4c24cb9 3538 error = iostate.io_error;
91447636 3539
b4c24cb9 3540 if (error == 0 && tail_size)
91447636 3541 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
0b4e3aa0
A
3542
3543 /*
b4c24cb9
A
3544 * just release our hold on the physically contiguous
3545 * region without changing any state
0b4e3aa0 3546 */
b4c24cb9 3547 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
3548
3549 return (error);
3550}
1c79356b 3551
b4c24cb9 3552
1c79356b
A
3553/*
3554 * generate advisory I/O's in the largest chunks possible
3555 * the completed pages will be released into the VM cache
3556 */
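/*
 * a sketch of a typical (hypothetical) caller: a filesystem handling an
 * F_RDADVISE-style read-advisory request might hand the hint straight
 * through, e.g.
 *
 *	struct radvisory *ra = ...;	// offset/count supplied by the user
 *	off_t filesize = ...;		// current EOF for the vnode
 *
 *	error = advisory_read(vp, filesize, ra->ra_offset, ra->ra_count);
 *
 * the field names above are illustrative; the only contract is the
 * (vnode, filesize, offset, resid) argument list below
 */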
9bccf70c 3557int
91447636 3558advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
1c79356b 3559{
1c79356b
A
3560 upl_page_info_t *pl;
3561 upl_t upl;
3562 vm_offset_t upl_offset;
3563 int upl_size;
3564 off_t upl_f_offset;
3565 int start_offset;
3566 int start_pg;
3567 int last_pg;
3568 int pages_in_upl;
3569 off_t max_size;
3570 int io_size;
3571 kern_return_t kret;
3572 int retval = 0;
9bccf70c 3573 int issued_io;
55e303ae 3574 int skip_range;
1c79356b 3575
91447636 3576 if ( !UBCINFOEXISTS(vp))
1c79356b
A
3577 return(EINVAL);
3578
1c79356b 3579 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
91447636 3580 (int)f_offset, resid, (int)filesize, 0, 0);
1c79356b
A
3581
3582 while (resid && f_offset < filesize && retval == 0) {
3583 /*
3584 * compute the size of the upl needed to encompass
3585 * the requested read... limit each call to cluster_io
0b4e3aa0
A
3586 * to the maximum UPL size... cluster_io will clip if
 3587 * this exceeds the maximum io_size for the device;
 3588 * also make sure to account for
1c79356b
A
3589 * a starting offset that's not page aligned
3590 */
3591 start_offset = (int)(f_offset & PAGE_MASK_64);
3592 upl_f_offset = f_offset - (off_t)start_offset;
3593 max_size = filesize - f_offset;
3594
3595 if (resid < max_size)
3596 io_size = resid;
3597 else
3598 io_size = max_size;
3599
3600 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
3601 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3602 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
55e303ae
A
3603
3604 skip_range = 0;
3605 /*
3606 * return the number of contiguously present pages in the cache
3607 * starting at upl_f_offset within the file
3608 */
3609 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3610
3611 if (skip_range) {
3612 /*
3613 * skip over pages already present in the cache
3614 */
3615 io_size = skip_range - start_offset;
3616
3617 f_offset += io_size;
3618 resid -= io_size;
3619
3620 if (skip_range == upl_size)
3621 continue;
3622 /*
3623 * have to issue some real I/O
3624 * at this point, we know it's starting on a page boundary
3625 * because we've skipped over at least the first page in the request
3626 */
3627 start_offset = 0;
3628 upl_f_offset += skip_range;
3629 upl_size -= skip_range;
3630 }
1c79356b
A
3631 pages_in_upl = upl_size / PAGE_SIZE;
3632
55e303ae
A
3633 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3634 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3635
0b4e3aa0 3636 kret = ubc_create_upl(vp,
91447636
A
3637 upl_f_offset,
3638 upl_size,
3639 &upl,
3640 &pl,
3641 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
1c79356b 3642 if (kret != KERN_SUCCESS)
9bccf70c
A
3643 return(retval);
3644 issued_io = 0;
1c79356b
A
3645
3646 /*
9bccf70c
A
3647 * before we start marching forward, we must make sure we end on
3648 * a present page, otherwise we will be working with a freed
3649 * upl
1c79356b 3650 */
9bccf70c
A
3651 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3652 if (upl_page_present(pl, last_pg))
3653 break;
1c79356b 3654 }
9bccf70c 3655 pages_in_upl = last_pg + 1;
1c79356b 3656
1c79356b 3657
55e303ae 3658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
9bccf70c
A
3659 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3660
3661
3662 for (last_pg = 0; last_pg < pages_in_upl; ) {
1c79356b 3663 /*
9bccf70c
A
3664 * scan from the beginning of the upl looking for the first
3665 * page that is present.... this will become the first page in
3666 * the request we're going to make to 'cluster_io'... if all
3667 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 3668 */
9bccf70c
A
3669 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3670 if (upl_page_present(pl, start_pg))
3671 break;
1c79356b 3672 }
1c79356b 3673
1c79356b 3674 /*
9bccf70c
A
3675 * scan from the starting present page looking for an absent
3676 * page before the end of the upl is reached, if we
3677 * find one, then it will terminate the range of pages being
3678 * presented to 'cluster_io'
1c79356b 3679 */
9bccf70c
A
3680 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3681 if (!upl_page_present(pl, last_pg))
3682 break;
3683 }
3684
3685 if (last_pg > start_pg) {
3686 /*
3687 * we found a range of pages that must be filled
3688 * if the last page in this range is the last page of the file
3689 * we may have to clip the size of it to keep from reading past
3690 * the end of the last physical block associated with the file
3691 */
3692 upl_offset = start_pg * PAGE_SIZE;
3693 io_size = (last_pg - start_pg) * PAGE_SIZE;
3694
3695 if ((upl_f_offset + upl_offset + io_size) > filesize)
3696 io_size = filesize - (upl_f_offset + upl_offset);
3697
3698 /*
3699 * issue an asynchronous read to cluster_io
3700 */
91447636
A
3701 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3702 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
1c79356b 3703
9bccf70c
A
3704 issued_io = 1;
3705 }
1c79356b 3706 }
9bccf70c
A
3707 if (issued_io == 0)
3708 ubc_upl_abort(upl, 0);
3709
3710 io_size = upl_size - start_offset;
1c79356b
A
3711
3712 if (io_size > resid)
3713 io_size = resid;
3714 f_offset += io_size;
3715 resid -= io_size;
3716 }
9bccf70c 3717
1c79356b
A
3718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3719 (int)f_offset, resid, retval, 0, 0);
3720
3721 return(retval);
3722}
3723
3724
9bccf70c 3725int
91447636 3726cluster_push(vnode_t vp, int flags)
9bccf70c 3727{
91447636
A
3728 int retval;
3729 struct cl_writebehind *wbp;
9bccf70c 3730
91447636
A
3731 if ( !UBCINFOEXISTS(vp)) {
3732 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3733 return (0);
3734 }
3735 /* return if deferred write is set */
3736 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3737 return (0);
3738 }
3739 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3741 return (0);
3742 }
3743 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3744 lck_mtx_unlock(&wbp->cl_lockw);
9bccf70c 3745
91447636
A
3746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3747 return(0);
3748 }
9bccf70c 3749 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
91447636 3750 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
9bccf70c 3751
91447636
A
3752 if (wbp->cl_scmap) {
3753 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
9bccf70c 3754
55e303ae
A
3755 retval = 1;
3756 } else
91447636
A
3757 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3758
3759 lck_mtx_unlock(&wbp->cl_lockw);
3760
3761 if (flags & IO_SYNC)
3762 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
9bccf70c 3763
55e303ae 3764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
91447636 3765 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
9bccf70c 3766
55e303ae
A
3767 return (retval);
3768}
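/*
 * a minimal sketch of how a filesystem might use cluster_push (hypothetical
 * caller, not taken from this file): flush any delayed-write clusters before
 * synchronously updating metadata in its fsync path
 *
 *	static int
 *	example_fsync(vnode_t vp)
 *	{
 *		cluster_push(vp, IO_SYNC);		// wait for the pushed data
 *		return (example_update_metadata(vp));	// hypothetical helper
 *	}
 */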
9bccf70c 3769
9bccf70c 3770
91447636
A
3771__private_extern__ void
3772cluster_release(struct ubc_info *ubc)
55e303ae 3773{
91447636
A
3774 struct cl_writebehind *wbp;
3775 struct cl_readahead *rap;
3776
3777 if ((wbp = ubc->cl_wbehind)) {
9bccf70c 3778
91447636
A
3779 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3780
3781 if (wbp->cl_scmap)
3782 vfs_drt_control(&(wbp->cl_scmap), 0);
3783 } else {
3784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3785 }
9bccf70c 3786
91447636 3787 rap = ubc->cl_rahead;
55e303ae 3788
91447636
A
3789 if (wbp != NULL) {
3790 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3791 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3792 }
3793 if ((rap = ubc->cl_rahead)) {
3794 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3795 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
55e303ae 3796 }
91447636
A
3797 ubc->cl_rahead = NULL;
3798 ubc->cl_wbehind = NULL;
3799
3800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3801}
3802
3803
3804static void
3805cluster_push_EOF(vnode_t vp, off_t EOF)
3806{
3807 struct cl_writebehind *wbp;
3808
3809 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3810
3811 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3812 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3813
3814 if (wbp->cl_scmap)
3815 sparse_cluster_push(wbp, vp, EOF, 1);
3816 else
3817 cluster_try_push(wbp, vp, EOF, 0, 1);
3818
3819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3820 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3821
3822 lck_mtx_unlock(&wbp->cl_lockw);
9bccf70c
A
3823}
3824
3825
3826static int
91447636 3827cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
9bccf70c
A
3828{
3829 int cl_index;
3830 int cl_index1;
3831 int min_index;
3832 int cl_len;
55e303ae 3833 int cl_pushed = 0;
91447636 3834 struct cl_wextent l_clusters[MAX_CLUSTERS];
9bccf70c
A
3835
3836 /*
91447636
A
3837 * the write behind context exists and has
3838 * already been locked...
3839 *
9bccf70c 3840 * make a local 'sorted' copy of the clusters
91447636 3841 * and clear wbp->cl_number so that new clusters can
9bccf70c
A
3842 * be developed
3843 */
91447636
A
3844 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3845 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3846 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
9bccf70c
A
3847 continue;
3848 if (min_index == -1)
3849 min_index = cl_index1;
91447636 3850 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
9bccf70c
A
3851 min_index = cl_index1;
3852 }
3853 if (min_index == -1)
3854 break;
91447636
A
3855 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3856 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3857 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
9bccf70c 3858
91447636 3859 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
9bccf70c 3860 }
91447636
A
3861 wbp->cl_number = 0;
3862
3863 cl_len = cl_index;
9bccf70c 3864
55e303ae
A
3865 if (can_delay && cl_len == MAX_CLUSTERS) {
3866 int i;
3867
3868 /*
3869 * determine if we appear to be writing the file sequentially
3870 * if not, by returning without having pushed any clusters
3871 * we will cause this vnode to be pushed into the sparse cluster mechanism
3872 * used for managing more random I/O patterns
3873 *
3874 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3875 * that's why we're in try_push with can_delay true...
3876 *
3877 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
 3878 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above
91447636
A
3879 * so we can just make a simple pass through, up to, but not including the last one...
3880 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
55e303ae
A
3881 * are sequential
3882 *
3883 * we let the last one be partial as long as it was adjacent to the previous one...
3884 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3885 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3886 */
3887 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
91447636 3888 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
55e303ae 3889 goto dont_try;
91447636 3890 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
55e303ae
A
3891 goto dont_try;
3892 }
3893 }
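	        /*
	         * for example (illustrative page numbers, MAX_UPL_TRANSFER == 256):
	         * clusters covering pages [0,256), [256,512), [512,768)... pass the
	         * test, since each spans exactly MAX_UPL_TRANSFER pages and each
	         * e_addr equals the following b_addr; a short or non-adjacent
	         * cluster anywhere before the last one sends us to dont_try
	         */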
91447636
A
3894 /*
3895 * drop the lock while we're firing off the I/Os...
3896 * this is safe since I'm working off of a private sorted copy
3897 * of the clusters, and I'm going to re-evaluate the public
3898 * state after I retake the lock
3899 */
3900 lck_mtx_unlock(&wbp->cl_lockw);
3901
55e303ae 3902 for (cl_index = 0; cl_index < cl_len; cl_index++) {
91447636
A
3903 int flags;
3904 struct cl_extent cl;
3905
9bccf70c 3906 /*
91447636 3907 * try to push each cluster in turn...
9bccf70c 3908 */
91447636
A
3909 if (l_clusters[cl_index].io_nocache)
3910 flags = IO_NOCACHE;
3911 else
3912 flags = 0;
3913 cl.b_addr = l_clusters[cl_index].b_addr;
3914 cl.e_addr = l_clusters[cl_index].e_addr;
9bccf70c 3915
91447636 3916 cluster_push_x(vp, &cl, EOF, flags);
9bccf70c 3917
91447636
A
3918 l_clusters[cl_index].b_addr = 0;
3919 l_clusters[cl_index].e_addr = 0;
3920
3921 cl_pushed++;
3922
3923 if (push_all == 0)
3924 break;
9bccf70c 3925 }
91447636
A
3926 lck_mtx_lock(&wbp->cl_lockw);
3927
55e303ae 3928dont_try:
9bccf70c
A
3929 if (cl_len > cl_pushed) {
3930 /*
3931 * we didn't push all of the clusters, so
3932 * lets try to merge them back in to the vnode
3933 */
91447636 3934 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
9bccf70c
A
3935 /*
3936 * we picked up some new clusters while we were trying to
91447636
A
3937 * push the old ones... this can happen because I've dropped
3938 * the vnode lock... the sum of the
9bccf70c 3939 * leftovers plus the new cluster count exceeds our ability
55e303ae 3940 * to represent them, so switch to the sparse cluster mechanism
91447636
A
3941 *
3942 * collect the active public clusters...
9bccf70c 3943 */
91447636 3944 sparse_cluster_switch(wbp, vp, EOF);
55e303ae
A
3945
3946 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
91447636 3947 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
9bccf70c 3948 continue;
91447636
A
3949 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3950 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3951 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
9bccf70c 3952
55e303ae 3953 cl_index1++;
9bccf70c 3954 }
55e303ae
A
3955 /*
3956 * update the cluster count
3957 */
91447636 3958 wbp->cl_number = cl_index1;
55e303ae
A
3959
3960 /*
3961 * and collect the original clusters that were moved into the
3962 * local storage for sorting purposes
3963 */
91447636 3964 sparse_cluster_switch(wbp, vp, EOF);
55e303ae 3965
9bccf70c
A
3966 } else {
3967 /*
3968 * we've got room to merge the leftovers back in
3969 * just append them starting at the next 'hole'
91447636 3970 * represented by wbp->cl_number
9bccf70c 3971 */
91447636
A
3972 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3973 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
9bccf70c
A
3974 continue;
3975
91447636
A
3976 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3977 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3978 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
9bccf70c 3979
9bccf70c
A
3980 cl_index1++;
3981 }
3982 /*
3983 * update the cluster count
3984 */
91447636 3985 wbp->cl_number = cl_index1;
9bccf70c
A
3986 }
3987 }
91447636 3988 return(MAX_CLUSTERS - wbp->cl_number);
9bccf70c
A
3989}
3990
3991
3992
3993static int
91447636 3994cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
1c79356b 3995{
1c79356b
A
3996 upl_page_info_t *pl;
3997 upl_t upl;
3998 vm_offset_t upl_offset;
3999 int upl_size;
4000 off_t upl_f_offset;
4001 int pages_in_upl;
4002 int start_pg;
4003 int last_pg;
4004 int io_size;
4005 int io_flags;
55e303ae 4006 int upl_flags;
1c79356b 4007 int size;
91447636
A
4008 int error = 0;
4009 int retval;
1c79356b
A
4010 kern_return_t kret;
4011
4012
9bccf70c 4013 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
91447636 4014 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
9bccf70c 4015
91447636 4016 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
9bccf70c 4017 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 4018
91447636 4019 return (0);
9bccf70c 4020 }
1c79356b 4021 upl_size = pages_in_upl * PAGE_SIZE;
91447636 4022 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
1c79356b 4023
9bccf70c
A
4024 if (upl_f_offset + upl_size >= EOF) {
4025
4026 if (upl_f_offset >= EOF) {
4027 /*
4028 * must have truncated the file and missed
4029 * clearing a dangling cluster (i.e. it's completely
 4030 * beyond the new EOF)
4031 */
4032 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4033
91447636 4034 return(0);
9bccf70c
A
4035 }
4036 size = EOF - upl_f_offset;
1c79356b 4037
55e303ae 4038 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
9bccf70c 4039 pages_in_upl = upl_size / PAGE_SIZE;
55e303ae 4040 } else
9bccf70c 4041 size = upl_size;
55e303ae
A
4042
4043 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4044
91447636
A
4045 /*
4046 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4047 *
4048 * - only pages that are currently dirty are returned... these are the ones we need to clean
4049 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4050 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4051 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4052 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4053 *
4054 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4055 */
4056
4057 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
55e303ae
A
4058 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4059 else
4060 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4061
0b4e3aa0
A
4062 kret = ubc_create_upl(vp,
4063 upl_f_offset,
4064 upl_size,
4065 &upl,
9bccf70c 4066 &pl,
55e303ae 4067 upl_flags);
1c79356b
A
4068 if (kret != KERN_SUCCESS)
4069 panic("cluster_push: failed to get pagelist");
4070
55e303ae 4071 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
9bccf70c 4072
55e303ae
A
4073 /*
4074 * since we only asked for the dirty pages back
4075 * it's possible that we may only get a few or even none, so...
4076 * before we start marching forward, we must make sure we know
4077 * where the last present page is in the UPL, otherwise we could
4078 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4079 * employed by commit_range and abort_range.
4080 */
4081 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4082 if (upl_page_present(pl, last_pg))
4083 break;
9bccf70c 4084 }
55e303ae 4085 pages_in_upl = last_pg + 1;
1c79356b 4086
55e303ae
A
4087 if (pages_in_upl == 0) {
4088 ubc_upl_abort(upl, 0);
1c79356b 4089
55e303ae 4090 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
91447636 4091 return(0);
55e303ae
A
4092 }
4093
4094 for (last_pg = 0; last_pg < pages_in_upl; ) {
4095 /*
4096 * find the next dirty page in the UPL
4097 * this will become the first page in the
4098 * next I/O to generate
4099 */
1c79356b 4100 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
55e303ae 4101 if (upl_dirty_page(pl, start_pg))
1c79356b 4102 break;
55e303ae
A
4103 if (upl_page_present(pl, start_pg))
4104 /*
4105 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
4106 * just release these unchanged since we're not going
4107 * to steal them or change their state
4108 */
4109 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 4110 }
55e303ae
A
4111 if (start_pg >= pages_in_upl)
4112 /*
4113 * done... no more dirty pages to push
4114 */
4115 break;
4116 if (start_pg > last_pg)
4117 /*
4118 * skipped over some non-dirty pages
4119 */
4120 size -= ((start_pg - last_pg) * PAGE_SIZE);
1c79356b 4121
55e303ae
A
4122 /*
4123 * find a range of dirty pages to write
4124 */
1c79356b 4125 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
55e303ae 4126 if (!upl_dirty_page(pl, last_pg))
1c79356b
A
4127 break;
4128 }
4129 upl_offset = start_pg * PAGE_SIZE;
4130
4131 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4132
91447636
A
4133 io_flags = CL_THROTTLE | CL_COMMIT;
4134
4135 if ( !(flags & IO_SYNC))
4136 io_flags |= CL_ASYNC;
4137
4138 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4139 io_flags, (buf_t)NULL, (struct clios *)NULL);
1c79356b 4140
91447636
A
4141 if (error == 0 && retval)
4142 error = retval;
1c79356b
A
4143
4144 size -= io_size;
4145 }
9bccf70c
A
4146 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4147
91447636 4148 return(error);
1c79356b 4149}
b4c24cb9
A
4150
4151
91447636
A
4152/*
4153 * sparse_cluster_switch is called with the write behind lock held
4154 */
4155static void
4156sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
b4c24cb9 4157{
91447636 4158 int cl_index;
b4c24cb9 4159
91447636 4160 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
b4c24cb9 4161
91447636
A
4162 if (wbp->cl_scmap == NULL)
4163 wbp->cl_scdirty = 0;
4164
4165 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4166 int flags;
4167 struct cl_extent cl;
4168
4169 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
b4c24cb9 4170
91447636
A
4171 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4172 if (flags & UPL_POP_DIRTY) {
4173 cl.e_addr = cl.b_addr + 1;
b4c24cb9 4174
91447636
A
4175 sparse_cluster_add(wbp, vp, &cl, EOF);
4176 }
55e303ae
A
4177 }
4178 }
4179 }
91447636
A
4180 wbp->cl_number = 0;
4181
4182 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
55e303ae
A
4183}
4184
4185
91447636
A
4186/*
4187 * sparse_cluster_push is called with the write behind lock held
4188 */
4189static void
4190sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
55e303ae 4191{
91447636
A
4192 struct cl_extent cl;
4193 off_t offset;
4194 u_int length;
55e303ae 4195
91447636 4196 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
55e303ae
A
4197
4198 if (push_all)
91447636 4199 vfs_drt_control(&(wbp->cl_scmap), 1);
55e303ae
A
4200
4201 for (;;) {
91447636 4202 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
55e303ae 4203 break;
55e303ae 4204
91447636
A
4205 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4206 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4207
4208 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
55e303ae 4209
91447636 4210 cluster_push_x(vp, &cl, EOF, 0);
55e303ae
A
4211
4212 if (push_all == 0)
4213 break;
4214 }
91447636 4215 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
55e303ae
A
4216}
4217
4218
91447636
A
4219/*
4220 * sparse_cluster_add is called with the write behind lock held
4221 */
4222static void
4223sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
55e303ae 4224{
91447636
A
4225 u_int new_dirty;
4226 u_int length;
4227 off_t offset;
55e303ae 4228
91447636 4229 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
55e303ae 4230
91447636
A
4231 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4232 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
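	/*
	 * the cluster extent is kept in page numbers (b_addr/e_addr, with e_addr
	 * exclusive); the dirty-region tracker below works in byte offsets and
	 * lengths, hence the conversion
	 */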
55e303ae 4233
91447636 4234 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
55e303ae
A
4235 /*
4236 * no room left in the map
4237 * only a partial update was done
4238 * push out some pages and try again
4239 */
91447636 4240 wbp->cl_scdirty += new_dirty;
55e303ae 4241
91447636 4242 sparse_cluster_push(wbp, vp, EOF, 0);
55e303ae
A
4243
4244 offset += (new_dirty * PAGE_SIZE_64);
4245 length -= (new_dirty * PAGE_SIZE);
4246 }
91447636 4247 wbp->cl_scdirty += new_dirty;
55e303ae 4248
91447636 4249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
55e303ae
A
4250}
4251
4252
4253static int
91447636 4254cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
55e303ae 4255{
55e303ae
A
4256 upl_page_info_t *pl;
4257 upl_t upl;
4258 addr64_t ubc_paddr;
4259 kern_return_t kret;
4260 int error = 0;
91447636
A
4261 int did_read = 0;
4262 int abort_flags;
4263 int upl_flags;
55e303ae 4264
91447636
A
4265 upl_flags = UPL_SET_LITE;
4266 if (! (flags & CL_READ)) {
4267 /*
4268 * "write" operation: let the UPL subsystem know
4269 * that we intend to modify the buffer cache pages
4270 * we're gathering.
4271 */
4272 upl_flags |= UPL_WILL_MODIFY;
4273 }
4274
55e303ae
A
4275 kret = ubc_create_upl(vp,
4276 uio->uio_offset & ~PAGE_MASK_64,
4277 PAGE_SIZE,
4278 &upl,
4279 &pl,
91447636 4280 upl_flags);
55e303ae
A
4281
4282 if (kret != KERN_SUCCESS)
4283 return(EINVAL);
4284
4285 if (!upl_valid_page(pl, 0)) {
4286 /*
4287 * issue a synchronous read to cluster_io
4288 */
91447636
A
4289 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4290 CL_READ, (buf_t)NULL, (struct clios *)NULL);
55e303ae 4291 if (error) {
b4c24cb9
A
4292 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4293
4294 return(error);
4295 }
91447636 4296 did_read = 1;
b4c24cb9 4297 }
55e303ae 4298 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
b4c24cb9 4299
55e303ae
A
4300/*
4301 * NOTE: There is no prototype for the following in BSD. It, and the definitions
 4302 * of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, will be found in
4303 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4304 * way to do so without exporting them to kexts as well.
4305 */
de355530 4306 if (flags & CL_READ)
55e303ae
A
4307// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4308 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
de355530 4309 else
4a249263
A
4310// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4311 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
55e303ae
A
4312
4313 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4314 /*
4315 * issue a synchronous write to cluster_io
4316 */
91447636
A
4317 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4318 0, (buf_t)NULL, (struct clios *)NULL);
de355530 4319 }
cc9f6e38
A
4320 if (error == 0)
4321 uio_update(uio, (user_size_t)xsize);
4322
91447636
A
4323 if (did_read)
4324 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4325 else
4326 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4327
4328 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
55e303ae
A
4329
4330 return (error);
4331}
4332
4333
4334
4335int
4336cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4337{
4338 int pg_offset;
4339 int pg_index;
4340 int csize;
4341 int segflg;
4342 int retval = 0;
4343 upl_page_info_t *pl;
55e303ae
A
4344
4345 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
91447636 4346 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
55e303ae
A
4347
4348 segflg = uio->uio_segflg;
4349
4350 switch(segflg) {
4351
91447636
A
4352 case UIO_USERSPACE32:
4353 case UIO_USERISPACE32:
4354 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4355 break;
4356
55e303ae
A
4357 case UIO_USERSPACE:
4358 case UIO_USERISPACE:
4359 uio->uio_segflg = UIO_PHYS_USERSPACE;
4360 break;
4361
91447636
A
4362 case UIO_USERSPACE64:
4363 case UIO_USERISPACE64:
4364 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4365 break;
4366
4367 case UIO_SYSSPACE32:
4368 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4369 break;
4370
55e303ae
A
4371 case UIO_SYSSPACE:
4372 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4373 break;
91447636
A
4374
4375 case UIO_SYSSPACE64:
4376 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4377 break;
55e303ae
A
4378 }
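	/*
	 * the uio normally describes virtual addresses; switching uio_segflg to
	 * the corresponding UIO_PHYS_* value tells uiomove64() below that the
	 * transfer address is physical (the upl page), and the original segflg
	 * is restored once the copy loop is done
	 */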
4379 pl = ubc_upl_pageinfo(upl);
4380
4381 pg_index = upl_offset / PAGE_SIZE;
4382 pg_offset = upl_offset & PAGE_MASK;
4383 csize = min(PAGE_SIZE - pg_offset, xsize);
4384
4385 while (xsize && retval == 0) {
4386 addr64_t paddr;
4387
4388 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
de355530 4389
55e303ae
A
4390 retval = uiomove64(paddr, csize, uio);
4391
4392 pg_index += 1;
4393 pg_offset = 0;
4394 xsize -= csize;
4395 csize = min(PAGE_SIZE, xsize);
4396 }
4397 uio->uio_segflg = segflg;
4398
55e303ae 4399 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
91447636 4400 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
55e303ae
A
4401
4402 return (retval);
4403}
4404
4405
4406int
91447636 4407cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
55e303ae
A
4408{
4409 int segflg;
4410 int io_size;
4411 int xsize;
4412 int start_offset;
55e303ae
A
4413 int retval = 0;
4414 memory_object_control_t control;
55e303ae
A
4415
4416
4417 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
91447636 4418 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
55e303ae
A
4419
4420 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4421 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4422 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
91447636 4423 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
55e303ae
A
4424
4425 return(0);
4426 }
55e303ae
A
4427 segflg = uio->uio_segflg;
4428
4429 switch(segflg) {
4430
91447636
A
4431 case UIO_USERSPACE32:
4432 case UIO_USERISPACE32:
4433 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4434 break;
4435
4436 case UIO_USERSPACE64:
4437 case UIO_USERISPACE64:
4438 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4439 break;
4440
4441 case UIO_SYSSPACE32:
4442 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4443 break;
4444
4445 case UIO_SYSSPACE64:
4446 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4447 break;
4448
55e303ae
A
4449 case UIO_USERSPACE:
4450 case UIO_USERISPACE:
4451 uio->uio_segflg = UIO_PHYS_USERSPACE;
4452 break;
4453
4454 case UIO_SYSSPACE:
4455 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4456 break;
4457 }
55e303ae 4458
91447636
A
4459 if ( (io_size = *io_resid) ) {
4460 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4461 xsize = uio_resid(uio);
55e303ae 4462
91447636
A
4463 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4464 uio, start_offset, io_size, mark_dirty);
4465 xsize -= uio_resid(uio);
4466 io_size -= xsize;
55e303ae
A
4467 }
4468 uio->uio_segflg = segflg;
4469 *io_resid = io_size;
4470
55e303ae 4471 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
91447636 4472 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
55e303ae
A
4473
4474 return(retval);
4475}
4476
4477
4478int
91447636 4479is_file_clean(vnode_t vp, off_t filesize)
55e303ae
A
4480{
4481 off_t f_offset;
4482 int flags;
4483 int total_dirty = 0;
4484
4485 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4486 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4487 if (flags & UPL_POP_DIRTY) {
4488 total_dirty++;
4489 }
4490 }
4491 }
4492 if (total_dirty)
4493 return(EINVAL);
4494
4495 return (0);
4496}
4497
4498
4499
4500/*
4501 * Dirty region tracking/clustering mechanism.
4502 *
4503 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4504 * dirty regions within a larger space (file). It is primarily intended to
4505 * support clustering in large files with many dirty areas.
4506 *
4507 * The implementation assumes that the dirty regions are pages.
4508 *
4509 * To represent dirty pages within the file, we store bit vectors in a
4510 * variable-size circular hash.
4511 */
4512
4513/*
4514 * Bitvector size. This determines the number of pages we group in a
4515 * single hashtable entry. Each hashtable entry is aligned to this
4516 * size within the file.
4517 */
4518#define DRT_BITVECTOR_PAGES 256
4519
4520/*
4521 * File offset handling.
4522 *
4523 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 4524 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4525 */
4526#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4527#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
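/*
 * For example, with DRT_BITVECTOR_PAGES == 256 and 4 KB pages, each hash
 * entry covers 256 * 4096 == 1 MB (1 << 20) of the file, so
 * DRT_ALIGN_ADDRESS(0x12345678) == 0x12300000.
 */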
4528
4529/*
4530 * Hashtable address field handling.
4531 *
4532 * The low-order bits of the hashtable address are used to conserve
4533 * space.
4534 *
4535 * DRT_HASH_COUNT_MASK must be large enough to store the range
4536 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4537 * to indicate that the bucket is actually unoccupied.
4538 */
4539#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4540#define DRT_HASH_SET_ADDRESS(scm, i, a) \
4541 do { \
4542 (scm)->scm_hashtable[(i)].dhe_control = \
4543 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4544 } while (0)
4545#define DRT_HASH_COUNT_MASK 0x1ff
4546#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4547#define DRT_HASH_SET_COUNT(scm, i, c) \
4548 do { \
4549 (scm)->scm_hashtable[(i)].dhe_control = \
4550 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4551 } while (0)
4552#define DRT_HASH_CLEAR(scm, i) \
4553 do { \
4554 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4555 } while (0)
4556#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4557#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4558#define DRT_HASH_COPY(oscm, oi, scm, i) \
4559 do { \
4560 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4561 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4562 } while(0);
4563
4564
4565/*
4566 * Hash table moduli.
4567 *
4568 * Since the hashtable entry's size is dependent on the size of
4569 * the bitvector, and since the hashtable size is constrained to
4570 * both being prime and fitting within the desired allocation
4571 * size, these values need to be manually determined.
4572 *
 4573 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4574 *
4575 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4576 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4577 */
4578#define DRT_HASH_SMALL_MODULUS 23
4579#define DRT_HASH_LARGE_MODULUS 401
4580
4581#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4582#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
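/*
 * Sanity check on the arithmetic: each entry is an 8-byte dhe_control plus
 * a 256-bit bitvector (8 x u_int32_t == 32 bytes), i.e. 40 bytes, so
 * 23 * 40 == 920 bytes of table (plus the clustermap header) fits in the
 * 1024-byte allocation and 401 * 40 == 16040 bytes fits in the 16384-byte
 * allocation.
 */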
4583
4584/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4585
4586/*
4587 * Hashtable bitvector handling.
4588 *
4589 * Bitvector fields are 32 bits long.
4590 */
4591
4592#define DRT_HASH_SET_BIT(scm, i, bit) \
4593 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4594
4595#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4596 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4597
4598#define DRT_HASH_TEST_BIT(scm, i, bit) \
4599 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4600
4601#define DRT_BITVECTOR_CLEAR(scm, i) \
4602 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4603
4604#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4605 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4606 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4607 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4608
4609
4610
4611/*
4612 * Hashtable entry.
4613 */
4614struct vfs_drt_hashentry {
4615 u_int64_t dhe_control;
4616 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4617};
4618
4619/*
4620 * Dirty Region Tracking structure.
4621 *
4622 * The hashtable is allocated entirely inside the DRT structure.
4623 *
4624 * The hash is a simple circular prime modulus arrangement, the structure
4625 * is resized from small to large if it overflows.
4626 */
4627
4628struct vfs_drt_clustermap {
4629 u_int32_t scm_magic; /* sanity/detection */
4630#define DRT_SCM_MAGIC 0x12020003
4631 u_int32_t scm_modulus; /* current ring size */
4632 u_int32_t scm_buckets; /* number of occupied buckets */
4633 u_int32_t scm_lastclean; /* last entry we cleaned */
4634 u_int32_t scm_iskips; /* number of slot skips */
4635
4636 struct vfs_drt_hashentry scm_hashtable[0];
4637};
4638
4639
4640#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4641#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
4642
4643/*
4644 * Debugging codes and arguments.
4645 */
4646#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4647#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4648#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4649#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4650#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4651 * dirty */
4652 /* 0, setcount */
4653 /* 1 (clean, no map) */
4654 /* 2 (map alloc fail) */
4655 /* 3, resid (partial) */
4656#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4657#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4658 * lastclean, iskips */
4659
4660
55e303ae
A
4661static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4662static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4663static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4664 u_int64_t offset, int *indexp);
4665static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4666 u_int64_t offset,
4667 int *indexp,
4668 int recursed);
4669static kern_return_t vfs_drt_do_mark_pages(
4670 void **cmapp,
4671 u_int64_t offset,
4672 u_int length,
4673 int *setcountp,
4674 int dirty);
4675static void vfs_drt_trace(
4676 struct vfs_drt_clustermap *cmap,
4677 int code,
4678 int arg1,
4679 int arg2,
4680 int arg3,
4681 int arg4);
4682
4683
4684/*
4685 * Allocate and initialise a sparse cluster map.
4686 *
4687 * Will allocate a new map, resize or compact an existing map.
4688 *
4689 * XXX we should probably have at least one intermediate map size,
4690 * as the 1:16 ratio seems a bit drastic.
4691 */
4692static kern_return_t
4693vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4694{
4695 struct vfs_drt_clustermap *cmap, *ocmap;
4696 kern_return_t kret;
4697 u_int64_t offset;
4698 int nsize, i, active_buckets, index, copycount;
4699
4700 ocmap = NULL;
4701 if (cmapp != NULL)
4702 ocmap = *cmapp;
4703
4704 /*
4705 * Decide on the size of the new map.
4706 */
4707 if (ocmap == NULL) {
4708 nsize = DRT_HASH_SMALL_MODULUS;
4709 } else {
4710 /* count the number of active buckets in the old map */
4711 active_buckets = 0;
4712 for (i = 0; i < ocmap->scm_modulus; i++) {
4713 if (!DRT_HASH_VACANT(ocmap, i) &&
4714 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4715 active_buckets++;
4716 }
4717 /*
4718 * If we're currently using the small allocation, check to
4719 * see whether we should grow to the large one.
4720 */
4721 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4722 /* if the ring is nearly full */
4723 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4724 nsize = DRT_HASH_LARGE_MODULUS;
4725 } else {
4726 nsize = DRT_HASH_SMALL_MODULUS;
4727 }
4728 } else {
4729 /* already using the large modulus */
4730 nsize = DRT_HASH_LARGE_MODULUS;
4731 /*
4732 * If the ring is completely full, there's
4733 * nothing useful for us to do. Behave as
4734 * though we had compacted into the new
4735 * array and return.
4736 */
4737 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4738 return(KERN_SUCCESS);
4739 }
4740 }
4741
4742 /*
4743 * Allocate and initialise the new map.
4744 */
4745
4746 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4747 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4748 if (kret != KERN_SUCCESS)
4749 return(kret);
4750 cmap->scm_magic = DRT_SCM_MAGIC;
4751 cmap->scm_modulus = nsize;
4752 cmap->scm_buckets = 0;
4753 cmap->scm_lastclean = 0;
4754 cmap->scm_iskips = 0;
4755 for (i = 0; i < cmap->scm_modulus; i++) {
4756 DRT_HASH_CLEAR(cmap, i);
4757 DRT_HASH_VACATE(cmap, i);
4758 DRT_BITVECTOR_CLEAR(cmap, i);
4759 }
4760
4761 /*
4762 * If there's an old map, re-hash entries from it into the new map.
4763 */
4764 copycount = 0;
4765 if (ocmap != NULL) {
4766 for (i = 0; i < ocmap->scm_modulus; i++) {
4767 /* skip empty buckets */
4768 if (DRT_HASH_VACANT(ocmap, i) ||
4769 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4770 continue;
4771 /* get new index */
4772 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4773 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4774 if (kret != KERN_SUCCESS) {
4775 /* XXX need to bail out gracefully here */
4776 panic("vfs_drt: new cluster map mysteriously too small");
4777 }
4778 /* copy */
4779 DRT_HASH_COPY(ocmap, i, cmap, index);
4780 copycount++;
4781 }
4782 }
4783
4784 /* log what we've done */
4785 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4786
4787 /*
4788 * It's important to ensure that *cmapp always points to
4789 * a valid map, so we must overwrite it before freeing
4790 * the old map.
4791 */
4792 *cmapp = cmap;
4793 if (ocmap != NULL) {
4794 /* emit stats into trace buffer */
4795 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4796 ocmap->scm_modulus,
4797 ocmap->scm_buckets,
4798 ocmap->scm_lastclean,
4799 ocmap->scm_iskips);
4800
4801 vfs_drt_free_map(ocmap);
4802 }
4803 return(KERN_SUCCESS);
4804}
4805
4806
4807/*
4808 * Free a sparse cluster map.
4809 */
4810static kern_return_t
4811vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4812{
55e303ae
A
4813 kmem_free(kernel_map, (vm_offset_t)cmap,
4814 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4815 return(KERN_SUCCESS);
4816}
4817
4818
4819/*
4820 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4821 */
4822static kern_return_t
4823vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4824{
91447636 4825 int index, i;
55e303ae
A
4826
4827 offset = DRT_ALIGN_ADDRESS(offset);
4828 index = DRT_HASH(cmap, offset);
4829
4830 /* traverse the hashtable */
4831 for (i = 0; i < cmap->scm_modulus; i++) {
4832
4833 /*
4834 * If the slot is vacant, we can stop.
4835 */
4836 if (DRT_HASH_VACANT(cmap, index))
4837 break;
4838
4839 /*
4840 * If the address matches our offset, we have success.
4841 */
4842 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4843 *indexp = index;
4844 return(KERN_SUCCESS);
4845 }
4846
4847 /*
4848 * Move to the next slot, try again.
4849 */
4850 index = DRT_HASH_NEXT(cmap, index);
4851 }
4852 /*
4853 * It's not there.
4854 */
4855 return(KERN_FAILURE);
4856}
4857
4858/*
4859 * Find the hashtable slot for the supplied offset. If we haven't allocated
4860 * one yet, allocate one and populate the address field. Note that it will
4861 * not have a nonzero page count and thus will still technically be free, so
4862 * in the case where we are called to clean pages, the slot will remain free.
4863 */
4864static kern_return_t
4865vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4866{
4867 struct vfs_drt_clustermap *cmap;
4868 kern_return_t kret;
4869 int index, i;
4870
4871 cmap = *cmapp;
4872
4873 /* look for an existing entry */
4874 kret = vfs_drt_search_index(cmap, offset, indexp);
4875 if (kret == KERN_SUCCESS)
4876 return(kret);
4877
4878 /* need to allocate an entry */
4879 offset = DRT_ALIGN_ADDRESS(offset);
4880 index = DRT_HASH(cmap, offset);
4881
4882 /* scan from the index forwards looking for a vacant slot */
4883 for (i = 0; i < cmap->scm_modulus; i++) {
4884 /* slot vacant? */
4885 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4886 cmap->scm_buckets++;
4887 if (index < cmap->scm_lastclean)
4888 cmap->scm_lastclean = index;
4889 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4890 DRT_HASH_SET_COUNT(cmap, index, 0);
4891 DRT_BITVECTOR_CLEAR(cmap, index);
4892 *indexp = index;
4893 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4894 return(KERN_SUCCESS);
4895 }
4896 cmap->scm_iskips += i;
4897 index = DRT_HASH_NEXT(cmap, index);
4898 }
4899
4900 /*
4901 * We haven't found a vacant slot, so the map is full. If we're not
4902 * already recursed, try reallocating/compacting it.
4903 */
4904 if (recursed)
4905 return(KERN_FAILURE);
4906 kret = vfs_drt_alloc_map(cmapp);
4907 if (kret == KERN_SUCCESS) {
4908 /* now try to insert again */
4909 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4910 }
4911 return(kret);
4912}
4913
4914/*
4915 * Implementation of set dirty/clean.
4916 *
4917 * In the 'clean' case, not finding a map is OK.
4918 */
4919static kern_return_t
4920vfs_drt_do_mark_pages(
4921 void **private,
4922 u_int64_t offset,
4923 u_int length,
4924 int *setcountp,
4925 int dirty)
4926{
4927 struct vfs_drt_clustermap *cmap, **cmapp;
4928 kern_return_t kret;
4929 int i, index, pgoff, pgcount, setcount, ecount;
4930
4931 cmapp = (struct vfs_drt_clustermap **)private;
4932 cmap = *cmapp;
4933
4934 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4935
4936 if (setcountp != NULL)
4937 *setcountp = 0;
4938
4939 /* allocate a cluster map if we don't already have one */
4940 if (cmap == NULL) {
4941 /* no cluster map, nothing to clean */
4942 if (!dirty) {
4943 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4944 return(KERN_SUCCESS);
4945 }
4946 kret = vfs_drt_alloc_map(cmapp);
4947 if (kret != KERN_SUCCESS) {
4948 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4949 return(kret);
4950 }
4951 }
4952 setcount = 0;
4953
4954 /*
4955 * Iterate over the length of the region.
4956 */
4957 while (length > 0) {
4958 /*
4959 * Get the hashtable index for this offset.
4960 *
4961 * XXX this will add blank entries if we are clearing a range
4962 * that hasn't been dirtied.
4963 */
4964 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4965 cmap = *cmapp; /* may have changed! */
4966 /* this may be a partial-success return */
4967 if (kret != KERN_SUCCESS) {
4968 if (setcountp != NULL)
4969 *setcountp = setcount;
4970 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4971
4972 return(kret);
4973 }
4974
4975 /*
4976 * Work out how many pages we're modifying in this
4977 * hashtable entry.
4978 */
4979 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4980 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
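		/*
		 * Worked example (sketch, assuming 4 KB pages and a 256-page
		 * bitvector, i.e. each hash entry covering a 1 MB window):
		 * for offset 0x103000 and length 0x8000, the aligned address
		 * is 0x100000, so pgoff = 3 and pgcount = min(8, 256 - 3) = 8;
		 * bits 3..10 of this entry are touched and the loop finishes.
		 * A range crossing a window boundary is clamped here and the
		 * remainder is handled on the next pass with the next entry.
		 */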
4981
4982 /*
4983 * Iterate over pages, dirty/clearing as we go.
4984 */
4985 ecount = DRT_HASH_GET_COUNT(cmap, index);
4986 for (i = 0; i < pgcount; i++) {
4987 if (dirty) {
4988 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4989 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4990 ecount++;
4991 setcount++;
4992 }
4993 } else {
4994 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4995 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4996 ecount--;
4997 setcount++;
4998 }
4999 }
5000 }
5001 DRT_HASH_SET_COUNT(cmap, index, ecount);
5002
5003 offset += pgcount * PAGE_SIZE;
5004 length -= pgcount * PAGE_SIZE;
5005 }
5006 if (setcountp != NULL)
5007 *setcountp = setcount;
5008
5009 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5010
5011 return(KERN_SUCCESS);
5012}
5013
5014/*
5015 * Mark a set of pages as dirty/clean.
5016 *
5017 * This is a public interface.
5018 *
5019 * cmapp
5020 * Pointer to storage suitable for holding a pointer. Note that
5021 * this must either be NULL or a value set by this function.
5022 *
5023 * size
5024 * Current file size in bytes; not passed to this function (see XXX below).
5025 *
5026 * offset
5027 * Offset of the first page to be marked as dirty, in bytes. Must be
5028 * page-aligned.
5029 *
5030 * length
5031 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5032 *
5033 * setcountp
5034 * Number of pages newly marked dirty by this call (optional).
5035 *
5036 * Returns KERN_SUCCESS if all the pages were successfully marked.
5037 */
5038static kern_return_t
5039vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5040{
5041 /* XXX size unused, drop from interface */
5042 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5043}
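/*
 * Usage sketch (illustrative only): marking a freshly dirtied range in a
 * per-file map, in the same style as the sparse-cluster code earlier in this
 * file.  The "scmap" pointer and the offset/length values are hypothetical.
 */
#if 0
	void	*scmap = NULL;		/* private map pointer, starts out NULL */
	int	new_dirty;

	if (vfs_drt_mark_pages(&scmap, (off_t)0x100000, 4 * PAGE_SIZE, &new_dirty) == KERN_SUCCESS) {
		/* new_dirty now holds the number of pages newly marked dirty */
	}
#endif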
5044
5045#if 0
5046static kern_return_t
5047vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5048{
5049 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5050}
5051#endif
5052
5053/*
5054 * Get a cluster of dirty pages.
5055 *
5056 * This is a public interface.
5057 *
5058 * cmapp
5059 * Pointer to storage managed by drt_mark_pages. Note that this must
5060 * be NULL or a value set by drt_mark_pages.
5061 *
5062 * offsetp
5063 * Returns the byte offset into the file of the first page in the cluster.
5064 *
5065 * lengthp
5066 * Returns the length in bytes of the cluster of dirty pages.
5067 *
5068 * Returns KERN_SUCCESS if a cluster was found.  If KERN_FAILURE is returned,
5069 * there are no dirty pages meeting the minimum size criteria.  Private storage
5070 * will be released if there are no more dirty pages left in the map.
5071 *
5072 */
5073static kern_return_t
5074vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5075{
5076 struct vfs_drt_clustermap *cmap;
5077 u_int64_t offset;
5078 u_int length;
5079 int index, i, j, fs, ls;
5080
5081 /* sanity */
5082 if ((cmapp == NULL) || (*cmapp == NULL))
5083 return(KERN_FAILURE);
5084 cmap = *cmapp;
5085
5086 /* walk the hashtable */
5087 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5088 index = DRT_HASH(cmap, offset);
5089
5090 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5091 continue;
5092
5093 /* scan the bitfield for a string of bits */
5094 fs = -1;
5095
5096 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5097 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5098 fs = i;
5099 break;
5100 }
5101 }
5102 if (fs == -1) {
5103 /* didn't find any bits set */
5104 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5105 }
5106 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5107 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5108 break;
5109 }
5110
5111 /* compute offset and length, mark pages clean */
5112 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5113 length = ls * PAGE_SIZE;
5114 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5115 cmap->scm_lastclean = index;
5116
5117 /* return successful */
5118 *offsetp = (off_t)offset;
5119 *lengthp = length;
5120
5121 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5122 return(KERN_SUCCESS);
5123 }
5124 /*
5125 * We didn't find anything... the hashtable is empty.
5126 * Emit stats into the trace buffer and
5127 * then free it.
5128 */
5129 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5130 cmap->scm_modulus,
5131 cmap->scm_buckets,
5132 cmap->scm_lastclean,
5133 cmap->scm_iskips);
5134
5135 vfs_drt_free_map(cmap);
5136 *cmapp = NULL;
5137
5138 return(KERN_FAILURE);
5139}
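/*
 * Usage sketch (illustrative only): draining a map cluster by cluster until
 * vfs_drt_get_cluster() returns KERN_FAILURE, at which point the private
 * storage has already been released and the pointer reset to NULL.  The
 * "scmap", "vp" and push_out() names are hypothetical; the sparse-cluster
 * push code earlier in this file follows the same pattern.
 */
#if 0
	off_t	offset;
	u_int	length;

	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* write out the dirty pages in [offset, offset + length) */
		push_out(vp, offset, length);
	}
	/* here scmap is NULL and there is nothing left to push */
#endif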
5140
5141
5142static kern_return_t
5143vfs_drt_control(void **cmapp, int op_type)
5144{
5145 struct vfs_drt_clustermap *cmap;
5146
5147 /* sanity */
5148 if ((cmapp == NULL) || (*cmapp == NULL))
5149 return(KERN_FAILURE);
5150 cmap = *cmapp;
5151
5152 switch (op_type) {
5153	case 0:	/* free the map and clear the caller's pointer */
5154 /* emit stats into trace buffer */
5155 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5156 cmap->scm_modulus,
5157 cmap->scm_buckets,
5158 cmap->scm_lastclean,
5159 cmap->scm_iskips);
5160
5161 vfs_drt_free_map(cmap);
5162 *cmapp = NULL;
5163 break;
5164
5165	case 1:	/* reset the last-cleaned index */
5166 cmap->scm_lastclean = 0;
5167 break;
5168 }
5169 return(KERN_SUCCESS);
5170}
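/*
 * Illustrative sketch (hypothetical "scmap" variable) of the two control
 * operations accepted above: op 0 emits the map statistics, frees the map
 * and clears the caller's pointer; op 1 only resets the last-cleaned index.
 */
#if 0
	(void) vfs_drt_control(&scmap, 1);	/* keep the map, reset scm_lastclean to 0 */
	(void) vfs_drt_control(&scmap, 0);	/* tear the map down; scmap becomes NULL */
#endif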
5171
5172
5173
5174/*
5175 * Emit a summary of the state of the clustermap into the trace buffer
5176 * along with some caller-provided data.
5177 */
5178#if KDEBUG
5179static void
5180vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5181{
5182 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5183}
5184#else
5185static void
5186vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5187 __unused int arg1, __unused int arg2, __unused int arg3,
5188 __unused int arg4)
5189{
5190}
5191#endif
5192
5193#if 0
5194/*
5195 * Perform basic sanity check on the hash entry summary count
5196 * vs. the actual bits set in the entry.
5197 */
5198static void
5199vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5200{
5201 int index, i;
5202 int bits_on;
5203
5204 for (index = 0; index < cmap->scm_modulus; index++) {
5205 if (DRT_HASH_VACANT(cmap, index))
5206 continue;
5207
5208 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5209 if (DRT_HASH_TEST_BIT(cmap, index, i))
5210 bits_on++;
5211 }
5212 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5213 panic("bits_on = %d, index = %d\n", bits_on, index);
5214 }
5215}
5216#endif