1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc_internal.h>
60 #include <sys/buf_internal.h>
61 #include <sys/mount_internal.h>
62 #include <sys/vnode_internal.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/time.h>
66 #include <sys/kernel.h>
67 #include <sys/resourcevar.h>
68 #include <sys/uio_internal.h>
69 #include <libkern/libkern.h>
70 #include <machine/machine_routines.h>
71
72 #include <sys/ubc_internal.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object_types.h>
76 #include <mach/vm_map.h>
77 #include <mach/upl.h>
78
79 #include <vm/vm_kern.h>
80 #include <vm/vm_map.h>
81 #include <vm/vm_pageout.h>
82
83 #include <sys/kdebug.h>
84
85
86
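/*
 * flag bits passed to cluster_io() in 'flags'... most map directly onto
 * the b_flags set on the component buffers (e.g. CL_READ -> B_READ,
 * CL_AGE -> B_AGE, CL_COMMIT -> B_COMMIT_UPL, CL_PRESERVE -> B_PHYS,
 * CL_KEEPCACHED -> B_CACHE); the rest steer hole handling, throttling
 * and the UPL commit/abort policy inside cluster_io() itself
 */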
87 #define CL_READ 0x01
88 #define CL_ASYNC 0x02
89 #define CL_COMMIT 0x04
90 #define CL_PAGEOUT 0x10
91 #define CL_AGE 0x20
92 #define CL_DUMP 0x40
93 #define CL_NOZERO 0x80
94 #define CL_PAGEIN 0x100
95 #define CL_DEV_MEMORY 0x200
96 #define CL_PRESERVE 0x400
97 #define CL_THROTTLE 0x800
98 #define CL_KEEPCACHED 0x1000
99
100
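/*
 * per-stream I/O state shared between the thread issuing a set of
 * asynchronous cluster I/Os and cluster_iodone()... updates and waits
 * on these fields are serialized by the global cl_mtxp mutex
 */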
101 struct clios {
102 u_int io_completed; /* amount of io that has currently completed */
103 u_int io_issued; /* amount of io that was successfully issued */
104 int io_error; /* error code of first error encountered */
105 int io_wanted; /* someone is sleeping waiting for a change in state */
106 };
107
108 static lck_grp_t *cl_mtx_grp;
109 static lck_attr_t *cl_mtx_attr;
110 static lck_grp_attr_t *cl_mtx_grp_attr;
111 static lck_mtx_t *cl_mtxp;
112
113
114 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
115 int flags, buf_t real_bp, struct clios *iostate);
116 static int cluster_iodone(buf_t bp, void *dummy);
117 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
118 static int cluster_hard_throttle_on(vnode_t vp);
119
120 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
121 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
122 off_t headOff, off_t tailOff, int flags);
123 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
124 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
125 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
126 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
127 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
128
129 static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
130
131 static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
132 static void cluster_push_EOF(vnode_t vp, off_t EOF);
133
134 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
135
136 static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
137 static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
138 static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
139
140 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
141 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
142 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
143
144 int is_file_clean(vnode_t, off_t);
145
146 /*
147 * throttle the number of async writes that
148 * can be outstanding on a single vnode
149 * before we issue a synchronous write
150 */
151 #define HARD_THROTTLE_MAXCNT 0
152 #define HARD_THROTTLE_MAXSIZE (64 * 1024)
153
154 int hard_throttle_on_root = 0;
155 struct timeval priority_IO_timestamp_for_root;
156
157
158 void
159 cluster_init(void) {
160 /*
161 * allocate lock group attribute and group
162 */
163 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
164 //lck_grp_attr_setstat(cl_mtx_grp_attr);
165 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
166
167 /*
168 * allocate the lock attribute
169 */
170 cl_mtx_attr = lck_attr_alloc_init();
171 //lck_attr_setdebug(cl_mtx_attr);
172
173 /*
174 * allocate and initialize mutex's used to protect updates and waits
175 * on the cluster_io context
176 */
177 cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
178
179 if (cl_mtxp == NULL)
180 panic("cluster_init: failed to allocate cl_mtxp");
181 }
182
183
184
185 #define CLW_ALLOCATE 0x01
186 #define CLW_RETURNLOCKED 0x02
187 /*
188 * if the read ahead context doesn't yet exist,
189 * allocate and initialize it...
190 * the vnode lock serializes multiple callers
191 * during the actual assignment... first one
192 * to grab the lock wins... the other callers
193 * will release the now unnecessary storage
194 *
195 * once the context is present, try to grab (but don't block on)
196 * the lock associated with it... if someone
197 * else currently owns it, then the read
198 * will run without read-ahead. this allows
199 * multiple readers to run in parallel and
200 * since there's only 1 read ahead context,
201 * there's no real loss in only allowing 1
202 * reader to have read-ahead enabled.
203 */
204 static struct cl_readahead *
205 cluster_get_rap(vnode_t vp)
206 {
207 struct ubc_info *ubc;
208 struct cl_readahead *rap;
209
210 ubc = vp->v_ubcinfo;
211
212 if ((rap = ubc->cl_rahead) == NULL) {
213 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
214
215 bzero(rap, sizeof *rap);
216 rap->cl_lastr = -1;
217 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
218
219 vnode_lock(vp);
220
221 if (ubc->cl_rahead == NULL)
222 ubc->cl_rahead = rap;
223 else {
224 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
225 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
226 rap = ubc->cl_rahead;
227 }
228 vnode_unlock(vp);
229 }
230 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
231 return(rap);
232
233 return ((struct cl_readahead *)NULL);
234 }
235
236
237 /*
238 * if the write behind context doesn't yet exist,
239 * and CLW_ALLOCATE is specified, allocate and initialize it...
240 * the vnode lock serializes multiple callers
241 * during the actual assignment... first one
242 * to grab the lock wins... the other callers
243 * will release the now unnecessary storage
244 *
245 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
246 * the lock associated with the write behind context before
247 * returning
248 */
249
250 static struct cl_writebehind *
251 cluster_get_wbp(vnode_t vp, int flags)
252 {
253 struct ubc_info *ubc;
254 struct cl_writebehind *wbp;
255
256 ubc = vp->v_ubcinfo;
257
258 if ((wbp = ubc->cl_wbehind) == NULL) {
259
260 if ( !(flags & CLW_ALLOCATE))
261 return ((struct cl_writebehind *)NULL);
262
263 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
264
265 bzero(wbp, sizeof *wbp);
266 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
267
268 vnode_lock(vp);
269
270 if (ubc->cl_wbehind == NULL)
271 ubc->cl_wbehind = wbp;
272 else {
273 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
274 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
275 wbp = ubc->cl_wbehind;
276 }
277 vnode_unlock(vp);
278 }
279 if (flags & CLW_RETURNLOCKED)
280 lck_mtx_lock(&wbp->cl_lockw);
281
282 return (wbp);
283 }
284
285
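/*
 * return non-zero if I/O against this vnode should be hard throttled...
 * this only applies to the root device, and only while hard_throttle_on_root
 * is set or a priority I/O to the root has completed within the last
 * 200 milliseconds (hard_throttle_maxelapsed)
 */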
286 static int
287 cluster_hard_throttle_on(vnode_t vp)
288 {
289 static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
290
291 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
292 struct timeval elapsed;
293
294 if (hard_throttle_on_root)
295 return(1);
296
297 microuptime(&elapsed);
298 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
299
300 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
301 return(1);
302 }
303 return(0);
304 }
305
306
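/*
 * I/O completion handler for the buffers issued by cluster_io()...
 * returns immediately until every buf in the transaction is B_DONE,
 * then rolls up the error and resid totals, zero-fills the tail of the
 * last page if 'b_validend' was set, wakes any thread sleeping on the
 * associated clios stream, biodone's the original buf (if any), and
 * commits or aborts the UPL when B_COMMIT_UPL was requested
 */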
307 static int
308 cluster_iodone(buf_t bp, __unused void *dummy)
309 {
310 int b_flags;
311 int error;
312 int total_size;
313 int total_resid;
314 int upl_offset;
315 int zero_offset;
316 upl_t upl;
317 buf_t cbp;
318 buf_t cbp_head;
319 buf_t cbp_next;
320 buf_t real_bp;
321 struct clios *iostate;
322 int commit_size;
323 int pg_offset;
324
325 cbp_head = (buf_t)(bp->b_trans_head);
326
327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
328 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
329
330 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
331 /*
332 * all I/O requests that are part of this transaction
333 * have to complete before we can process it
334 */
335 if ( !(cbp->b_flags & B_DONE)) {
336
337 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
338 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
339
340 return 0;
341 }
342 }
343 error = 0;
344 total_size = 0;
345 total_resid = 0;
346
347 cbp = cbp_head;
348 upl_offset = cbp->b_uploffset;
349 upl = cbp->b_upl;
350 b_flags = cbp->b_flags;
351 real_bp = cbp->b_real_bp;
352 zero_offset= cbp->b_validend;
353 iostate = (struct clios *)cbp->b_iostate;
354
355 if (real_bp)
356 real_bp->b_dev = cbp->b_dev;
357
358 while (cbp) {
359 if ((cbp->b_flags & B_ERROR) && error == 0)
360 error = cbp->b_error;
361
362 total_resid += cbp->b_resid;
363 total_size += cbp->b_bcount;
364
365 cbp_next = cbp->b_trans_next;
366
367 free_io_buf(cbp);
368
369 cbp = cbp_next;
370 }
371 if (zero_offset)
372 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
373
374 if (iostate) {
375 int need_wakeup = 0;
376
377 /*
378 * someone has issued multiple I/Os asynchronously
379 * and is waiting for them to complete (streaming)
380 */
381 lck_mtx_lock(cl_mtxp);
382
383 if (error && iostate->io_error == 0)
384 iostate->io_error = error;
385
386 iostate->io_completed += total_size;
387
388 if (iostate->io_wanted) {
389 /*
390 * someone is waiting for the state of
391 * this io stream to change
392 */
393 iostate->io_wanted = 0;
394 need_wakeup = 1;
395 }
396 lck_mtx_unlock(cl_mtxp);
397
398 if (need_wakeup)
399 wakeup((caddr_t)&iostate->io_wanted);
400 }
401 if ((b_flags & B_NEED_IODONE) && real_bp) {
402 if (error) {
403 real_bp->b_flags |= B_ERROR;
404 real_bp->b_error = error;
405 }
406 real_bp->b_resid = total_resid;
407
408 buf_biodone(real_bp);
409 }
410 if (error == 0 && total_resid)
411 error = EIO;
412
413 if (b_flags & B_COMMIT_UPL) {
414 pg_offset = upl_offset & PAGE_MASK;
415 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
416
417 if (error || (b_flags & B_NOCACHE)) {
418 int upl_abort_code;
419 int page_in = 0;
420 int page_out = 0;
421
422 if (b_flags & B_PAGEIO) {
423 if (b_flags & B_READ)
424 page_in = 1;
425 else
426 page_out = 1;
427 }
428 if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */
429 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
430 else if (page_out && (error != ENXIO)) /* transient error */
431 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
432 else if (page_in)
433 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
434 else
435 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
436
437 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
438 upl_abort_code);
439
440 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
441 (int)upl, upl_offset - pg_offset, commit_size,
442 0x80000000|upl_abort_code, 0);
443
444 } else {
445 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
446
447 if ((b_flags & B_PHYS) && (b_flags & B_READ))
448 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
449
450 if (b_flags & B_AGE)
451 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
452
453 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
454 upl_commit_flags);
455
456 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
457 (int)upl, upl_offset - pg_offset, commit_size,
458 upl_commit_flags, 0);
459 }
460 } else {
461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
462 (int)upl, upl_offset, 0, error, 0);
463 }
464
465 return (error);
466 }
467
468
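/*
 * zero 'size' bytes of the upl starting at 'upl_offset'... if the
 * original buf has a kernel mapping (b_datap), we can simply bzero
 * through it; otherwise we zero the UPL's physical pages directly
 * via bzero_phys
 */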
469 void
470 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
471 {
472 upl_page_info_t *pl;
473
474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
475 upl_offset, size, (int)bp, 0, 0);
476
477 if (bp == NULL || bp->b_datap == 0) {
478
479 pl = ubc_upl_pageinfo(upl);
480
481 while (size) {
482 int page_offset;
483 int page_index;
484 addr64_t zero_addr;
485 int zero_cnt;
486
487 page_index = upl_offset / PAGE_SIZE;
488 page_offset = upl_offset & PAGE_MASK;
489
490 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
491 zero_cnt = min(PAGE_SIZE - page_offset, size);
492
493 bzero_phys(zero_addr, zero_cnt);
494
495 size -= zero_cnt;
496 upl_offset += zero_cnt;
497 }
498 } else
499 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
500
501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
502 upl_offset, size, 0, 0, 0);
503 }
504
505
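/*
 * core I/O engine: maps the byte range onto the device with
 * VNOP_BLOCKMAP, carves it into chunks bounded by the mount's maximum
 * transfer size and segment count, and issues them as a chain of
 * buf_t's through VNOP_STRATEGY... handles holes (zero-fill on read,
 * push-or-fail on write), read/write throttling, and, on failure,
 * unwinds any buffers not yet issued and aborts or commits the UPL
 * as directed by the CL_* flags
 */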
506 static int
507 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
508 int flags, buf_t real_bp, struct clios *iostate)
509 {
510 buf_t cbp;
511 u_int size;
512 u_int io_size;
513 int io_flags;
514 int bmap_flags;
515 int error = 0;
516 int retval = 0;
517 buf_t cbp_head = NULL;
518 buf_t cbp_tail = NULL;
519 int trans_count = 0;
520 u_int pg_count;
521 int pg_offset;
522 u_int max_iosize;
523 u_int max_vectors;
524 int priv;
525 int zero_offset = 0;
526 int async_throttle = 0;
527 mount_t mp;
528
529 mp = vp->v_mount;
530
531 if (mp->mnt_devblocksize > 1) {
532 /*
533 * round the requested size up so that this I/O ends on a
534 * page boundary in case this is a 'write'... if the filesystem
535 * has blocks allocated to back the page beyond the EOF, we want to
536 * make sure to write out the zeros that are sitting beyond the EOF
537 * so that in case the filesystem doesn't explicitly zero this area
538 * if a hole is created via a lseek/write beyond the current EOF,
539 * it will return zeros when it's read back from the disk. If the
540 * physical allocation doesn't extend for the whole page, we'll
541 * only write/read from the disk up to the end of this allocation
542 * via the extent info returned from the VNOP_BLOCKMAP call.
543 */
544 pg_offset = upl_offset & PAGE_MASK;
545
546 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
547 } else {
548 /*
549 * anyone advertising a blocksize of 1 byte probably
550 * can't deal with us rounding up the request size
551 * AFP is one such filesystem/device
552 */
553 size = non_rounded_size;
554 }
555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
556 (int)f_offset, size, upl_offset, flags, 0);
557
558 if (flags & CL_READ) {
559 io_flags = (B_READ);
560 bmap_flags = VNODE_READ;
561
562 max_iosize = mp->mnt_maxreadcnt;
563 max_vectors = mp->mnt_segreadcnt;
564 } else {
565 io_flags = 0;
566 bmap_flags = VNODE_WRITE;
567
568 max_iosize = mp->mnt_maxwritecnt;
569 max_vectors = mp->mnt_segwritecnt;
570 }
571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
572
573 /*
574 * make sure the maximum iosize is a
575 * multiple of the page size
576 */
577 max_iosize &= ~PAGE_MASK;
578
579 if (flags & CL_THROTTLE) {
580 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
581 if (max_iosize > HARD_THROTTLE_MAXSIZE)
582 max_iosize = HARD_THROTTLE_MAXSIZE;
583 async_throttle = HARD_THROTTLE_MAXCNT;
584 } else
585 async_throttle = VNODE_ASYNC_THROTTLE;
586 }
587 if (flags & CL_AGE)
588 io_flags |= B_AGE;
589 if (flags & CL_DUMP)
590 io_flags |= B_NOCACHE;
591 if (flags & (CL_PAGEIN | CL_PAGEOUT))
592 io_flags |= B_PAGEIO;
593 if (flags & CL_COMMIT)
594 io_flags |= B_COMMIT_UPL;
595 if (flags & CL_PRESERVE)
596 io_flags |= B_PHYS;
597 if (flags & CL_KEEPCACHED)
598 io_flags |= B_CACHE;
599
600 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
601 /*
602 * then we are going to end up
603 * with a page that we can't complete (the file size wasn't a multiple
604 * of PAGE_SIZE and we're trying to read to the end of the file
605 * so we'll go ahead and zero out the portion of the page we can't
606 * read in from the file
607 */
608 zero_offset = upl_offset + non_rounded_size;
609 }
610 while (size) {
611 int pg_resid;
612 daddr64_t blkno;
613 daddr64_t lblkno;
614
615 if (size > max_iosize)
616 io_size = max_iosize;
617 else
618 io_size = size;
619
620 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
621 break;
622 }
623 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
624 real_bp->b_blkno = blkno;
625
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
627 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
628
629 if (io_size == 0) {
630 /*
631 * vnop_blockmap didn't return an error... however, it did
632 * return an extent size of 0 which means we can't
633 * make forward progress on this I/O... a hole in the
634 * file would be returned as a blkno of -1 with a non-zero io_size
635 * a real extent is returned with a blkno != -1 and a non-zero io_size
636 */
637 error = EINVAL;
638 break;
639 }
640 if ( !(flags & CL_READ) && blkno == -1) {
641 off_t e_offset;
642
643 /*
644 * we're writing into a 'hole'
645 */
646 if (flags & CL_PAGEOUT) {
647 /*
648 * if we got here via cluster_pageout
649 * then just error the request and return
650 * the 'hole' should already have been covered
651 */
652 error = EINVAL;
653 break;
654 }
655 if ( !(flags & CL_COMMIT)) {
656 /*
657 * currently writes always request the commit to happen
658 * as part of the io completion... however, if the CL_COMMIT
659 * flag isn't specified, then we can't issue the abort_range
660 * since the call site is going to abort or commit the same upl..
661 * in this case we can only return an error
662 */
663 error = EINVAL;
664 break;
665 }
666 /*
667 * we can get here if the cluster code happens to
668 * pick up a page that was dirtied via mmap vs
669 * a 'write' and the page targets a 'hole'...
670 * i.e. the writes to the cluster were sparse
671 * and the file was being written for the first time
672 *
673 * we can also get here if the filesystem supports
674 * 'holes' that are less than PAGE_SIZE.... because
675 * we can't know if the range in the page that covers
676 * the 'hole' has been dirtied via an mmap or not,
677 * we have to assume the worst and try to push the
678 * entire page to storage.
679 *
680 * Try paging out the page individually before
681 * giving up entirely and dumping it (the pageout
682 * path will ensure that the zero extent accounting
683 * has been taken care of before we get back into cluster_io)
684 */
685 ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
686
687 e_offset = round_page_64(f_offset + 1);
688
689 if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
690 error = EINVAL;
691 break;
692 }
693 io_size = e_offset - f_offset;
694
695 f_offset += io_size;
696 upl_offset += io_size;
697
698 if (size >= io_size)
699 size -= io_size;
700 else
701 size = 0;
702 /*
703 * keep track of how much of the original request
704 * that we've actually completed... non_rounded_size
705 * may go negative due to us rounding the request
706 * to a page size multiple (i.e. size > non_rounded_size)
707 */
708 non_rounded_size -= io_size;
709
710 if (non_rounded_size <= 0) {
711 /*
712 * we've transferred all of the data in the original
713 * request, but we were unable to complete the tail
714 * of the last page because the file didn't have
715 * an allocation to back that portion... this is ok.
716 */
717 size = 0;
718 }
719 continue;
720 }
721 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
722 /*
723 * we have now figured out how much I/O we can do - this is in 'io_size'
724 * pg_offset is the starting point in the first page for the I/O
725 * pg_count is the number of full and partial pages that 'io_size' encompasses
726 */
727 pg_offset = upl_offset & PAGE_MASK;
728
729 if (flags & CL_DEV_MEMORY) {
730 /*
731 * currently, can't deal with reading 'holes' in file
732 */
733 if (blkno == -1) {
734 error = EINVAL;
735 break;
736 }
737 /*
738 * treat physical requests as one 'giant' page
739 */
740 pg_count = 1;
741 } else
742 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
743
744 if ((flags & CL_READ) && blkno == -1) {
745 int bytes_to_zero;
746
747 /*
748 * if we're reading and blkno == -1, then we've got a
749 * 'hole' in the file that we need to deal with by zeroing
750 * out the affected area in the upl
751 */
752 if (zero_offset && io_size == size) {
753 /*
754 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
755 * then 'zero_offset' will be non-zero
756 * if the 'hole' returned by vnop_blockmap extends all the way to the eof
757 * (indicated by the io_size finishing off the I/O request for this UPL)
758 * then we're not going to issue an I/O for the
759 * last page in this upl... we need to zero both the hole and the tail
760 * of the page beyond the EOF, since the delayed zero-fill won't kick in
761 */
762 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
763
764 zero_offset = 0;
765 } else
766 bytes_to_zero = io_size;
767
768 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
769
770 if (cbp_head)
771 /*
772 * if there is a current I/O chain pending
773 * then the first page of the group we just zero'd
774 * will be handled by the I/O completion if the zero
775 * fill started in the middle of the page
776 */
777 pg_count = (io_size - pg_offset) / PAGE_SIZE;
778 else {
779 /*
780 * no pending I/O to pick up that first page
781 * so, we have to make sure it gets committed
782 * here.
783 * set the pg_offset to 0 so that the upl_commit_range
784 * starts with this page
785 */
786 pg_count = (io_size + pg_offset) / PAGE_SIZE;
787 pg_offset = 0;
788 }
789 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
790 /*
791 * if we're done with the request for this UPL
792 * then we have to make sure to commit the last page
793 * even if we only partially zero-filled it
794 */
795 pg_count++;
796
797 if (pg_count) {
798 if (pg_offset)
799 pg_resid = PAGE_SIZE - pg_offset;
800 else
801 pg_resid = 0;
802
803 if (flags & CL_COMMIT)
804 ubc_upl_commit_range(upl,
805 (upl_offset + pg_resid) & ~PAGE_MASK,
806 pg_count * PAGE_SIZE,
807 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
808 }
809 upl_offset += io_size;
810 f_offset += io_size;
811 size -= io_size;
812 /*
813 * keep track of how much of the original request
814 * that we've actually completed... non_rounded_size
815 * may go negative due to us rounding the request
816 * to a page size multiple (i.e. size > non_rounded_size)
817 */
818 non_rounded_size -= io_size;
819
820 if (non_rounded_size <= 0) {
821 /*
822 * we've transferred all of the data in the original
823 * request, but we were unable to complete the tail
824 * of the last page because the file didn't have
825 * an allocation to back that portion... this is ok.
826 */
827 size = 0;
828 }
829 if (cbp_head && pg_count)
830 goto start_io;
831 continue;
832
833 }
834 if (pg_count > max_vectors) {
835 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
836 io_size = PAGE_SIZE - pg_offset;
837 pg_count = 1;
838 } else {
839 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
840 pg_count = max_vectors;
841 }
842 }
843
844 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
845 /*
846 * if we're not targeting a virtual device i.e. a disk image
847 * it's safe to dip into the reserve pool since real devices
848 * can complete this I/O request without requiring additional
849 * bufs from the alloc_io_buf pool
850 */
851 priv = 1;
852 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
853 /*
854 * Throttle the speculative IO
855 */
856 priv = 0;
857 else
858 priv = 1;
859
860 cbp = alloc_io_buf(vp, priv);
861
862 if (flags & CL_PAGEOUT) {
863 u_int i;
864
865 for (i = 0; i < pg_count; i++) {
866 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
867 panic("BUSY bp found in cluster_io");
868 }
869 }
870 if (flags & CL_ASYNC) {
871 if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
872 panic("buf_setcallback failed\n");
873 }
874 cbp->b_flags |= io_flags;
875
876 cbp->b_lblkno = lblkno;
877 cbp->b_blkno = blkno;
878 cbp->b_bcount = io_size;
879
880 if (buf_setupl(cbp, upl, upl_offset))
881 panic("buf_setupl failed\n");
882
883 cbp->b_trans_next = (buf_t)NULL;
884
885 if ((cbp->b_iostate = (void *)iostate))
886 /*
887 * caller wants to track the state of this
888 * io... bump the amount issued against this stream
889 */
890 iostate->io_issued += io_size;
891
892 if (flags & CL_READ) {
893 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
894 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
895 }
896 else {
897 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
898 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
899 }
900
901 if (cbp_head) {
902 cbp_tail->b_trans_next = cbp;
903 cbp_tail = cbp;
904 } else {
905 cbp_head = cbp;
906 cbp_tail = cbp;
907 }
908 (buf_t)(cbp->b_trans_head) = cbp_head;
909 trans_count++;
910
911 upl_offset += io_size;
912 f_offset += io_size;
913 size -= io_size;
914 /*
915 * keep track of how much of the original request
916 * that we've actually completed... non_rounded_size
917 * may go negative due to us rounding the request
918 * to a page size multiple (i.e. size > non_rounded_size)
919 */
920 non_rounded_size -= io_size;
921
922 if (non_rounded_size <= 0) {
923 /*
924 * we've transferred all of the data in the original
925 * request, but we were unable to complete the tail
926 * of the last page because the file didn't have
927 * an allocation to back that portion... this is ok.
928 */
929 size = 0;
930 }
931 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
932 /*
933 * if we have no more I/O to issue or
934 * the current I/O we've prepared fully
935 * completes the last page in this request
936 * and it's either an ASYNC request or
937 * we've already accumulated more than 8 I/O's into
938 * this transaction and it's not an I/O directed to
939 * special DEVICE memory
940 * then go ahead and issue the I/O
941 */
942 start_io:
943 if (real_bp) {
944 cbp_head->b_flags |= B_NEED_IODONE;
945 cbp_head->b_real_bp = real_bp;
946 } else
947 cbp_head->b_real_bp = (buf_t)NULL;
948
949 if (size == 0) {
950 /*
951 * we're about to issue the last I/O for this upl
952 * if this was a read to the eof and the eof doesn't
953 * finish on a page boundary, then we need to zero-fill
954 * the rest of the page....
955 */
956 cbp_head->b_validend = zero_offset;
957 } else
958 cbp_head->b_validend = 0;
959
960 if (flags & CL_THROTTLE)
961 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
962
963 for (cbp = cbp_head; cbp;) {
964 buf_t cbp_next;
965
966 if ( !(io_flags & B_READ))
967 vnode_startwrite(vp);
968
969 cbp_next = cbp->b_trans_next;
970
971 (void) VNOP_STRATEGY(cbp);
972 cbp = cbp_next;
973 }
974 if ( !(flags & CL_ASYNC)) {
975 int dummy;
976
977 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
978 buf_biowait(cbp);
979
980 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
981 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
982 error = 0; /* drop the error */
983 else {
984 if (retval == 0)
985 retval = error;
986 error = 0;
987 }
988 }
989 }
990 cbp_head = (buf_t)NULL;
991 cbp_tail = (buf_t)NULL;
992
993 trans_count = 0;
994 }
995 }
996 if (error) {
997 int abort_size;
998
999 io_size = 0;
1000
1001 for (cbp = cbp_head; cbp;) {
1002 buf_t cbp_next;
1003
1004 upl_offset -= cbp->b_bcount;
1005 size += cbp->b_bcount;
1006 io_size += cbp->b_bcount;
1007
1008 cbp_next = cbp->b_trans_next;
1009 free_io_buf(cbp);
1010 cbp = cbp_next;
1011 }
1012 if (iostate) {
1013 int need_wakeup = 0;
1014
1015 /*
1016 * update the error condition for this stream
1017 * since we never really issued the io
1018 * just go ahead and adjust it back
1019 */
1020 lck_mtx_lock(cl_mtxp);
1021
1022 if (iostate->io_error == 0)
1023 iostate->io_error = error;
1024 iostate->io_issued -= io_size;
1025
1026 if (iostate->io_wanted) {
1027 /*
1028 * someone is waiting for the state of
1029 * this io stream to change
1030 */
1031 iostate->io_wanted = 0;
1032 need_wakeup = 1;
1033 }
1034 lck_mtx_unlock(cl_mtxp);
1035
1036 if (need_wakeup)
1037 wakeup((caddr_t)&iostate->io_wanted);
1038 }
1039 pg_offset = upl_offset & PAGE_MASK;
1040 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1041
1042 if (flags & CL_COMMIT) {
1043 int upl_abort_code;
1044
1045 if (flags & CL_PRESERVE) {
1046 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1047 UPL_COMMIT_FREE_ON_EMPTY);
1048 } else {
1049 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1050 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1051 else if (flags & CL_PAGEIN)
1052 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1053 else
1054 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1055
1056 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1057 upl_abort_code);
1058 }
1059 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1060 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1061 }
1062 if (real_bp) {
1063 real_bp->b_flags |= B_ERROR;
1064 real_bp->b_error = error;
1065
1066 buf_biodone(real_bp);
1067 }
1068 if (retval == 0)
1069 retval = error;
1070 }
1071 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1072 (int)f_offset, size, upl_offset, retval, 0);
1073
1074 return (retval);
1075 }
1076
1077
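/*
 * issue an advisory read of up to MAX_UPL_TRANSFER pages starting at
 * f_offset (clipped to the end of the file) and return the number of
 * pages asked for... returns 0 if f_offset is at or beyond the EOF
 */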
1078 static int
1079 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1080 {
1081 int pages_in_prefetch;
1082
1083 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1084 (int)f_offset, size, (int)filesize, 0, 0);
1085
1086 if (f_offset >= filesize) {
1087 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1088 (int)f_offset, 0, 0, 0, 0);
1089 return(0);
1090 }
1091 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1092 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1093 else
1094 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1095
1096 if ((off_t)size > (filesize - f_offset))
1097 size = filesize - f_offset;
1098 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1099
1100 advisory_read(vp, filesize, f_offset, size);
1101
1102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1103 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1104
1105 return (pages_in_prefetch);
1106 }
1107
1108
1109
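/*
 * sequential read-ahead heuristic: reset the read-ahead state when the
 * access pattern stops looking sequential, otherwise double cl_ralen
 * (up to MAX_UPL_TRANSFER pages) and prefetch past the highest page
 * already read ahead (cl_maxra)... skipped entirely if the next page
 * is already resident in the cache
 */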
1110 static void
1111 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1112 {
1113 daddr64_t r_addr;
1114 off_t f_offset;
1115 int size_of_prefetch;
1116
1117
1118 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1119 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1120
1121 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1123 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1124 return;
1125 }
1126 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1127 (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1128 rap->cl_ralen = 0;
1129 rap->cl_maxra = 0;
1130
1131 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1132 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1133
1134 return;
1135 }
1136 if (extent->e_addr < rap->cl_maxra) {
1137 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1138
1139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1140 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1141 return;
1142 }
1143 }
1144 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1145 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1146
1147 size_of_prefetch = 0;
1148
1149 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1150
1151 if (size_of_prefetch) {
1152 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1153 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1154 return;
1155 }
1156 if (f_offset < filesize) {
1157 daddr64_t read_size;
1158
1159 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1160
1161 read_size = (extent->e_addr + 1) - extent->b_addr;
1162
1163 if (read_size > rap->cl_ralen) {
1164 if (read_size > MAX_UPL_TRANSFER)
1165 rap->cl_ralen = MAX_UPL_TRANSFER;
1166 else
1167 rap->cl_ralen = read_size;
1168 }
1169 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1170
1171 if (size_of_prefetch)
1172 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1173 }
1174 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1175 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1176 }
1177
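/*
 * cluster_pageout: push the dirty pages described by 'upl' out to the
 * backing store... typically called from a filesystem's VNOP_PAGEOUT
 * handler, roughly as follows (a sketch only -- the argument names
 * follow the vnop_pageout_args convention and the filesystem supplies
 * its own notion of the EOF):
 *
 *	return (cluster_pageout(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *	    ap->a_size, filesize, ap->a_flags));
 */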
1178 int
1179 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1180 int size, off_t filesize, int flags)
1181 {
1182 int io_size;
1183 int rounded_size;
1184 off_t max_size;
1185 int local_flags;
1186 struct cl_writebehind *wbp;
1187
1188 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1189 /*
1190 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1191 * then we don't want to enforce this throttle... if we do, we can
1192 * potentially deadlock since we're stalling the pageout thread at a time
1193 * when the disk image might need additional memory (which won't be available
1194 * if the pageout thread can't run)... instead we'll just depend on the throttle
1195 * that the pageout thread now has in place to deal with external files
1196 */
1197 local_flags = CL_PAGEOUT;
1198 else
1199 local_flags = CL_PAGEOUT | CL_THROTTLE;
1200
1201 if ((flags & UPL_IOSYNC) == 0)
1202 local_flags |= CL_ASYNC;
1203 if ((flags & UPL_NOCOMMIT) == 0)
1204 local_flags |= CL_COMMIT;
1205 if ((flags & UPL_KEEPCACHED))
1206 local_flags |= CL_KEEPCACHED;
1207
1208
1209 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1210 (int)f_offset, size, (int)filesize, local_flags, 0);
1211
1212 /*
1213 * If they didn't specify any I/O, then we are done...
1214 * we can't issue an abort because we don't know how
1215 * big the upl really is
1216 */
1217 if (size <= 0)
1218 return (EINVAL);
1219
1220 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1221 if (local_flags & CL_COMMIT)
1222 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1223 return (EROFS);
1224 }
1225 /*
1226 * can't page-out from a negative offset
1227 * or if we're starting beyond the EOF
1228 * or if the file offset isn't page aligned
1229 * or the size requested isn't a multiple of PAGE_SIZE
1230 */
1231 if (f_offset < 0 || f_offset >= filesize ||
1232 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1233 if (local_flags & CL_COMMIT)
1234 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1235 return (EINVAL);
1236 }
1237 max_size = filesize - f_offset;
1238
1239 if (size < max_size)
1240 io_size = size;
1241 else
1242 io_size = max_size;
1243
1244 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1245
1246 if (size > rounded_size) {
1247 if (local_flags & CL_COMMIT)
1248 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1249 UPL_ABORT_FREE_ON_EMPTY);
1250 }
1251 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1252 wbp->cl_hasbeenpaged = 1;
1253
1254 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1255 local_flags, (buf_t)NULL, (struct clios *)NULL));
1256 }
1257
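/*
 * cluster_pagein: bring the pages described by 'upl' in from the
 * backing store and, if the access pattern looks sequential, kick off
 * read-ahead... typically called from a filesystem's VNOP_PAGEIN
 * handler, roughly as follows (a sketch only -- the argument names
 * follow the vnop_pagein_args convention):
 *
 *	return (cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset,
 *	    ap->a_size, filesize, ap->a_flags));
 */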
1258 int
1259 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1260 int size, off_t filesize, int flags)
1261 {
1262 u_int io_size;
1263 int rounded_size;
1264 off_t max_size;
1265 int retval;
1266 int local_flags = 0;
1267
1268 if (upl == NULL || size < 0)
1269 panic("cluster_pagein: NULL upl passed in");
1270
1271 if ((flags & UPL_IOSYNC) == 0)
1272 local_flags |= CL_ASYNC;
1273 if ((flags & UPL_NOCOMMIT) == 0)
1274 local_flags |= CL_COMMIT;
1275
1276
1277 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1278 (int)f_offset, size, (int)filesize, local_flags, 0);
1279
1280 /*
1281 * can't page-in from a negative offset
1282 * or if we're starting beyond the EOF
1283 * or if the file offset isn't page aligned
1284 * or the size requested isn't a multiple of PAGE_SIZE
1285 */
1286 if (f_offset < 0 || f_offset >= filesize ||
1287 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1288 if (local_flags & CL_COMMIT)
1289 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1290 return (EINVAL);
1291 }
1292 max_size = filesize - f_offset;
1293
1294 if (size < max_size)
1295 io_size = size;
1296 else
1297 io_size = max_size;
1298
1299 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1300
1301 if (size > rounded_size && (local_flags & CL_COMMIT))
1302 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1303 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1304
1305 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1306 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1307
1308 if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1309 struct cl_readahead *rap;
1310
1311 rap = cluster_get_rap(vp);
1312
1313 if (rap != NULL) {
1314 struct cl_extent extent;
1315
1316 extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1317 extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1318
1319 if (rounded_size == PAGE_SIZE) {
1320 /*
1321 * we haven't read the last page of the file in yet
1322 * so let's try to read ahead if we're in
1323 * a sequential access pattern
1324 */
1325 cluster_rd_ahead(vp, &extent, filesize, rap);
1326 }
1327 rap->cl_lastr = extent.e_addr;
1328
1329 lck_mtx_unlock(&rap->cl_lockr);
1330 }
1331 }
1332 return (retval);
1333 }
1334
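/*
 * issue an asynchronous cluster I/O for a traditional buf... the buf's
 * logical block number is translated to a file offset with
 * ubc_blktooff and its upl is passed straight through to cluster_io
 */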
1335 int
1336 cluster_bp(buf_t bp)
1337 {
1338 off_t f_offset;
1339 int flags;
1340
1341 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1342 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1343
1344 if (bp->b_flags & B_READ)
1345 flags = CL_ASYNC | CL_READ;
1346 else
1347 flags = CL_ASYNC;
1348
1349 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1350
1351 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1352 }
1353
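/*
 * top level write entry point: decides, per uio vector, whether to go
 * through the cache (cluster_write_x), use the physically contiguous
 * path (cluster_phys_write), or take the direct uncached path
 * (cluster_nocopy_write), based on IO_NOCACHE, the buffer type, and
 * the page alignment of both the file offset and the user address...
 * a filesystem's VNOP_WRITE handler would call it roughly as follows
 * (a sketch only -- the EOF and zero-fill arguments are filesystem policy):
 *
 *	retval = cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, ioflag);
 */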
1354 int
1355 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1356 {
1357 int prev_resid;
1358 u_int clip_size;
1359 off_t max_io_size;
1360 int upl_size;
1361 int upl_flags;
1362 upl_t upl;
1363 int retval = 0;
1364 int flags;
1365
1366 flags = xflags;
1367
1368 if (vp->v_flag & VNOCACHE_DATA)
1369 flags |= IO_NOCACHE;
1370
1371 if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1372 /*
1373 * go do a write through the cache if one of the following is true....
1374 * NOCACHE is not true
1375 * there is no uio structure or it doesn't target USERSPACE
1376 */
1377 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1378 }
1379
1380 #if LP64_DEBUG
1381 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1382 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1383 }
1384 #endif /* LP64_DEBUG */
1385
1386 while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1387 u_int64_t iov_len;
1388 u_int64_t iov_base;
1389
1390 /*
1391 * we know we have a resid, so this is safe
1392 * skip over any empty vectors
1393 */
1394 iov_len = uio_iov_len(uio);
1395
1396 while (iov_len == 0) {
1397 uio_next_iov(uio);
1398 uio->uio_iovcnt--;
1399 iov_len = uio_iov_len(uio);
1400 }
1401 iov_base = uio_iov_base(uio);
1402
1403 upl_size = PAGE_SIZE;
1404 upl_flags = UPL_QUERY_OBJECT_TYPE;
1405
1406 // LP64todo - fix this!
1407 if ((vm_map_get_upl(current_map(),
1408 CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK,
1409 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1410 /*
1411 * the user app must have passed in an invalid address
1412 */
1413 return (EFAULT);
1414 }
1415
1416 /*
1417 * We check every vector target but if it is physically
1418 * contiguous space, we skip the sanity checks.
1419 */
1420 if (upl_flags & UPL_PHYS_CONTIG) {
1421 int zflags;
1422
1423 zflags = flags & ~IO_TAILZEROFILL;
1424 zflags |= IO_HEADZEROFILL;
1425
1426 if (flags & IO_HEADZEROFILL) {
1427 /*
1428 * in case we have additional vectors, we don't want to do this again
1429 */
1430 flags &= ~IO_HEADZEROFILL;
1431
1432 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1433 return(retval);
1434 }
1435 retval = cluster_phys_write(vp, uio, newEOF);
1436
1437 if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1438 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1439 }
1440 }
1441 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1442 /*
1443 * we're here because we don't have a physically contiguous target buffer
1444 * go do a write through the cache if one of the following is true....
1445 * the total xfer size is less than a page...
1446 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1447 */
1448 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1449 }
1450 // LP64todo - fix this!
1451 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1452 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1453 /*
1454 * Bring the file offset up to a pagesize boundary for this write
1455 * this will also bring the base address to a page boundary
1456 * since they both are currently on the same offset within a page
1457 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1458 * so the computed clip_size must always be less than the current uio_resid
1459 */
1460 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1461
1462 /*
1463 * Fake the resid going into the cluster_write_x call
1464 * and restore it on the way out.
1465 */
1466 // LP64todo - fix this
1467 prev_resid = uio_resid(uio);
1468 uio_setresid(uio, clip_size);
1469
1470 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1471
1472 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1473 } else {
1474 /*
1475 * can't get both the file offset and the buffer offset aligned to a page boundary
1476 * so fire an I/O through the cache for this entire vector
1477 */
1478 // LP64todo - fix this
1479 clip_size = iov_len;
1480 // LP64todo - fix this
1481 prev_resid = uio_resid(uio);
1482 uio_setresid(uio, clip_size);
1483
1484 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1485
1486 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1487 }
1488 } else {
1489 /*
1490 * If we come in here, we know the offset into
1491 * the file is on a pagesize boundary and the
1492 * target buffer address is also on a page boundary
1493 */
1494 max_io_size = newEOF - uio->uio_offset;
1495 // LP64todo - fix this
1496 clip_size = uio_resid(uio);
1497 if (iov_len < clip_size)
1498 // LP64todo - fix this!
1499 clip_size = iov_len;
1500 if (max_io_size < clip_size)
1501 clip_size = max_io_size;
1502
1503 if (clip_size < PAGE_SIZE) {
1504 /*
1505 * Take care of tail end of write in this vector
1506 */
1507 // LP64todo - fix this
1508 prev_resid = uio_resid(uio);
1509 uio_setresid(uio, clip_size);
1510
1511 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1512
1513 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1514 } else {
1515 /* round clip_size down to a multiple of pagesize */
1516 clip_size = clip_size & ~(PAGE_MASK);
1517 // LP64todo - fix this
1518 prev_resid = uio_resid(uio);
1519 uio_setresid(uio, clip_size);
1520
1521 retval = cluster_nocopy_write(vp, uio, newEOF);
1522
1523 if ((retval == 0) && uio_resid(uio))
1524 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1525
1526 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1527 }
1528 } /* end else */
1529 } /* end while */
1530
1531 return(retval);
1532 }
1533
1534
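/*
 * direct (uncached) write path: wires the user buffer down with
 * vm_map_get_upl, dumps any pages already cached over the target range
 * (ubc_range_op/UPL_ROP_DUMP), and streams asynchronous cluster_io()
 * requests, never allowing more than 2 * MAX_UPL_TRANSFER * PAGE_SIZE
 * bytes to be outstanding at once... all I/O issued as part of this
 * stream is waited for before returning
 */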
1535 static int
1536 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1537 {
1538 upl_t upl;
1539 upl_page_info_t *pl;
1540 vm_offset_t upl_offset;
1541 int io_size;
1542 int io_flag;
1543 int upl_size;
1544 int upl_needed_size;
1545 int pages_in_pl;
1546 int upl_flags;
1547 kern_return_t kret;
1548 int i;
1549 int force_data_sync;
1550 int error = 0;
1551 struct clios iostate;
1552 struct cl_writebehind *wbp;
1553 struct iovec *iov;
1554
1555 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1556 (int)uio->uio_offset, (int)uio_resid(uio),
1557 (int)newEOF, 0, 0);
1558
1559 /*
1560 * When we enter this routine, we know
1561 * -- the offset into the file is on a pagesize boundary
1562 * -- the resid is a page multiple
1563 * -- the resid will not exceed iov_len
1564 */
1565
1566 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1567
1568 cluster_try_push(wbp, vp, newEOF, 0, 1);
1569
1570 lck_mtx_unlock(&wbp->cl_lockw);
1571 }
1572 iostate.io_completed = 0;
1573 iostate.io_issued = 0;
1574 iostate.io_error = 0;
1575 iostate.io_wanted = 0;
1576
1577 iov = uio->uio_iov;
1578
1579 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1580 io_size = uio_resid(uio);
1581
1582 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1583 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1584
1585 // LP64todo - fix this!
1586 upl_offset = CAST_DOWN(vm_offset_t, iov->iov_base) & PAGE_MASK;
1587
1588 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1589
1590 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1591 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1592
1593 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1594 pages_in_pl = 0;
1595 upl_size = upl_needed_size;
1596 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1597 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1598
1599 // LP64todo - fix this!
1600 kret = vm_map_get_upl(current_map(),
1601 CAST_DOWN(vm_offset_t, iov->iov_base) & ~PAGE_MASK,
1602 &upl_size,
1603 &upl,
1604 NULL,
1605 &pages_in_pl,
1606 &upl_flags,
1607 force_data_sync);
1608
1609 if (kret != KERN_SUCCESS) {
1610 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1611 0, 0, 0, kret, 0);
1612 /*
1613 * cluster_nocopy_write: failed to get pagelist
1614 *
1615 * we may have already spun some portion of this request
1616 * off as async requests... we need to wait for the I/O
1617 * to complete before returning
1618 */
1619 goto wait_for_writes;
1620 }
1621 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1622 pages_in_pl = upl_size / PAGE_SIZE;
1623
1624 for (i = 0; i < pages_in_pl; i++) {
1625 if (!upl_valid_page(pl, i))
1626 break;
1627 }
1628 if (i == pages_in_pl)
1629 break;
1630
1631 /*
1632 * didn't get all the pages back that we
1633 * needed... release this upl and try again
1634 */
1635 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1636 UPL_ABORT_FREE_ON_EMPTY);
1637 }
1638 if (force_data_sync >= 3) {
1639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1640 i, pages_in_pl, upl_size, kret, 0);
1641 /*
1642 * for some reason, we couldn't acquire a hold on all
1643 * the pages needed in the user's address space
1644 *
1645 * we may have already spun some portion of this request
1646 * off as async requests... we need to wait for the I/O
1647 * to complete before returning
1648 */
1649 goto wait_for_writes;
1650 }
1651
1652 /*
1653 * Consider the possibility that upl_size wasn't satisfied.
1654 */
1655 if (upl_size != upl_needed_size)
1656 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1657
1658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1659 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1660
1661 if (io_size == 0) {
1662 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1663 UPL_ABORT_FREE_ON_EMPTY);
1664 /*
1665 * we may have already spun some portion of this request
1666 * off as async requests... we need to wait for the I/O
1667 * to complete before returning
1668 */
1669 goto wait_for_writes;
1670 }
1671 /*
1672 * Now look for pages already in the cache
1673 * and throw them away.
1674 * uio->uio_offset is page aligned within the file
1675 * io_size is a multiple of PAGE_SIZE
1676 */
1677 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1678
1679 /*
1680 * we want to push out these writes asynchronously so that we can overlap
1681 * the preparation of the next I/O
1682 * if there are already too many outstanding writes
1683 * wait until some complete before issuing the next
1684 */
1685 lck_mtx_lock(cl_mtxp);
1686
1687 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1688 iostate.io_wanted = 1;
1689 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1690 }
1691 lck_mtx_unlock(cl_mtxp);
1692
1693 if (iostate.io_error) {
1694 /*
1695 * one of the earlier writes we issued ran into a hard error
1696 * don't issue any more writes, cleanup the UPL
1697 * that was just created but not used, then
1698 * go wait for all writes that are part of this stream
1699 * to complete before returning the error to the caller
1700 */
1701 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1702 UPL_ABORT_FREE_ON_EMPTY);
1703
1704 goto wait_for_writes;
1705 }
1706 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1707
1708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1709 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1710
1711 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1712 io_size, io_flag, (buf_t)NULL, &iostate);
1713
1714 iov->iov_len -= io_size;
1715 ((u_int32_t)iov->iov_base) += io_size;
1716 uio_setresid(uio, (uio_resid(uio) - io_size));
1717 uio->uio_offset += io_size;
1718
1719 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1720 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1721
1722 } /* end while */
1723
1724 wait_for_writes:
1725 /*
1726 * make sure all async writes issued as part of this stream
1727 * have completed before we return
1728 */
1729 lck_mtx_lock(cl_mtxp);
1730
1731 while (iostate.io_issued != iostate.io_completed) {
1732 iostate.io_wanted = 1;
1733 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1734 }
1735 lck_mtx_unlock(cl_mtxp);
1736
1737 if (iostate.io_error)
1738 error = iostate.io_error;
1739
1740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1741 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1742
1743 return (error);
1744 }
1745
1746
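/*
 * write from a physically contiguous user buffer: any head or tail
 * that isn't aligned to the device block size is handled by
 * cluster_align_phys_io, and the aligned middle is issued as a single
 * synchronous CL_DEV_MEMORY cluster_io()
 */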
1747 static int
1748 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1749 {
1750 upl_page_info_t *pl;
1751 addr64_t src_paddr;
1752 upl_t upl;
1753 vm_offset_t upl_offset;
1754 int tail_size;
1755 int io_size;
1756 int upl_size;
1757 int upl_needed_size;
1758 int pages_in_pl;
1759 int upl_flags;
1760 kern_return_t kret;
1761 int error = 0;
1762 u_int64_t iov_base;
1763 int devblocksize;
1764 struct cl_writebehind *wbp;
1765
1766 devblocksize = vp->v_mount->mnt_devblocksize;
1767 /*
1768 * When we enter this routine, we know
1769 * -- the resid will not exceed iov_len
1770 * -- the vector target address is physically contiguous
1771 */
1772 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1773
1774 cluster_try_push(wbp, vp, newEOF, 0, 1);
1775
1776 lck_mtx_unlock(&wbp->cl_lockw);
1777 }
1778 #if LP64_DEBUG
1779 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1780 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1781 }
1782 #endif /* LP64_DEBUG */
1783
1784 // LP64todo - fix this!
1785 io_size = uio_iov_len(uio);
1786 iov_base = uio_iov_base(uio);
1787 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1788 upl_needed_size = upl_offset + io_size;
1789
1790 pages_in_pl = 0;
1791 upl_size = upl_needed_size;
1792 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1793 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1794
1795 // LP64todo - fix this!
1796 kret = vm_map_get_upl(current_map(),
1797 CAST_DOWN(upl_offset_t, iov_base) & ~PAGE_MASK,
1798 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1799
1800 if (kret != KERN_SUCCESS) {
1801 /*
1802 * cluster_phys_write: failed to get pagelist
1803 * note: return kret here
1804 */
1805 return(EINVAL);
1806 }
1807 /*
1808 * Consider the possibility that upl_size wasn't satisfied.
1809 * This is a failure in the physical memory case.
1810 */
1811 if (upl_size < upl_needed_size) {
1812 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1813 return(EINVAL);
1814 }
1815 pl = ubc_upl_pageinfo(upl);
1816
1817 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)(iov_base & PAGE_MASK));
1818
1819 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1820 int head_size;
1821
1822 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1823
1824 if (head_size > io_size)
1825 head_size = io_size;
1826
1827 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1828
1829 if (error) {
1830 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1831
1832 return(EINVAL);
1833 }
1834 upl_offset += head_size;
1835 src_paddr += head_size;
1836 io_size -= head_size;
1837 }
1838 tail_size = io_size & (devblocksize - 1);
1839 io_size -= tail_size;
1840
1841 if (io_size) {
1842 /*
1843 * issue a synchronous write to cluster_io
1844 */
1845 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1846 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1847 }
1848 if (error == 0) {
1849 /*
1850 * The cluster_io write completed successfully,
1851 * update the uio structure
1852 */
1853 uio_setresid(uio, (uio_resid(uio) - io_size));
1854 uio_iov_len_add(uio, -io_size);
1855 uio_iov_base_add(uio, io_size);
1856 uio->uio_offset += io_size;
1857 src_paddr += io_size;
1858
1859 if (tail_size)
1860 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1861 }
1862 /*
1863 * just release our hold on the physically contiguous
1864 * region without changing any state
1865 */
1866 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1867
1868 return (error);
1869 }
1870
1871
1872 static int
1873 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1874 {
1875 upl_page_info_t *pl;
1876 upl_t upl;
1877 vm_offset_t upl_offset = 0;
1878 int upl_size;
1879 off_t upl_f_offset;
1880 int pages_in_upl;
1881 int start_offset;
1882 int xfer_resid;
1883 int io_size;
1884 int io_offset;
1885 int bytes_to_zero;
1886 int bytes_to_move;
1887 kern_return_t kret;
1888 int retval = 0;
1889 int io_resid;
1890 long long total_size;
1891 long long zero_cnt;
1892 off_t zero_off;
1893 long long zero_cnt1;
1894 off_t zero_off1;
1895 struct cl_extent cl;
1896 int intersection;
1897 struct cl_writebehind *wbp;
1898
1899 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1900 {
1901 if (wbp->cl_hasbeenpaged) {
1902 /*
1903 * this vnode had pages cleaned to it by
1904 * the pager which indicates that either
1905 * it's not very 'hot', or the system is
1906 * being overwhelmed by a lot of dirty
1907 * data being delayed in the VM cache...
1908 * in either event, we'll push our remaining
1909 * delayed data at this point... this will
1910 * be more efficient than paging out 1 page at
1911 * a time, and will also act as a throttle
1912 * by delaying this client from writing any
1913 * more data until all its delayed data has
1914 * at least been queued to the underlying driver.
1915 */
1916 if (wbp->cl_number || wbp->cl_scmap)
1917 cluster_push_EOF(vp, newEOF);
1918
1919 wbp->cl_hasbeenpaged = 0;
1920 }
1921 }
1922 if (uio) {
1923 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1924 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1925
1926 // LP64todo - fix this
1927 io_resid = uio_resid(uio);
1928 } else {
1929 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1930 0, 0, (int)oldEOF, (int)newEOF, 0);
1931
1932 io_resid = 0;
1933 }
1934 zero_cnt = 0;
1935 zero_cnt1 = 0;
1936 zero_off = 0;
1937 zero_off1 = 0;
1938
1939 if (flags & IO_HEADZEROFILL) {
1940 /*
1941 * some filesystems (HFS is one) don't support unallocated holes within a file...
1942 * so we zero fill the intervening space between the old EOF and the offset
1943 * where the next chunk of real data begins.... ftruncate will also use this
1944 * routine to zero fill to the new EOF when growing a file... in this case, the
1945 * uio structure will not be provided
1946 */
1947 if (uio) {
1948 if (headOff < uio->uio_offset) {
1949 zero_cnt = uio->uio_offset - headOff;
1950 zero_off = headOff;
1951 }
1952 } else if (headOff < newEOF) {
1953 zero_cnt = newEOF - headOff;
1954 zero_off = headOff;
1955 }
1956 }
1957 if (flags & IO_TAILZEROFILL) {
1958 if (uio) {
1959 // LP64todo - fix this
1960 zero_off1 = uio->uio_offset + uio_resid(uio);
1961
1962 if (zero_off1 < tailOff)
1963 zero_cnt1 = tailOff - zero_off1;
1964 }
1965 }
1966 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1967 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1968 retval, 0, 0, 0, 0);
1969 return (0);
1970 }
1971
1972 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1973 /*
1974 * for this iteration of the loop, figure out where our starting point is
1975 */
1976 if (zero_cnt) {
1977 start_offset = (int)(zero_off & PAGE_MASK_64);
1978 upl_f_offset = zero_off - start_offset;
1979 } else if (io_resid) {
1980 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1981 upl_f_offset = uio->uio_offset - start_offset;
1982 } else {
1983 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1984 upl_f_offset = zero_off1 - start_offset;
1985 }
1986 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1987 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1988
1989 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1990 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1991
1992 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1993
1994 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1995 /*
1996 * assumption... total_size <= io_resid
1997 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1998 */
1999 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
2000 total_size -= start_offset;
2001 xfer_resid = total_size;
2002
2003 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2004
2005 if (retval)
2006 break;
2007
2008 io_resid -= (total_size - xfer_resid);
2009 total_size = xfer_resid;
2010 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2011 upl_f_offset = uio->uio_offset - start_offset;
2012
2013 if (total_size == 0) {
2014 if (start_offset) {
2015 /*
2016 * the write did not finish on a page boundary
2017 * which will leave upl_f_offset pointing to the
2018 * beginning of the last page written instead of
2019 * the page beyond it... bump it in this case
2020 * so that the cluster code records the last page
2021 * written as dirty
2022 */
2023 upl_f_offset += PAGE_SIZE_64;
2024 }
2025 upl_size = 0;
2026
2027 goto check_cluster;
2028 }
2029 }
2030 /*
2031 * compute the size of the upl needed to encompass
2032 * the requested write... limit each call to cluster_io
2033 * to the maximum UPL size... cluster_io will clip if
2034 * this exceeds the maximum io_size for the device,
2035 * make sure to account for
2036 * a starting offset that's not page aligned
2037 */
2038 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2039
2040 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2041 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2042
2043 pages_in_upl = upl_size / PAGE_SIZE;
2044 io_size = upl_size - start_offset;
2045
2046 if ((long long)io_size > total_size)
2047 io_size = total_size;
2048
2049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2050
2051
2052 /*
2053 * Gather the pages from the buffer cache.
2054 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2055 * that we intend to modify these pages.
2056 */
2057 kret = ubc_create_upl(vp,
2058 upl_f_offset,
2059 upl_size,
2060 &upl,
2061 &pl,
2062 UPL_SET_LITE | UPL_WILL_MODIFY);
2063 if (kret != KERN_SUCCESS)
2064 panic("cluster_write: failed to get pagelist");
2065
2066 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2067 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2068
2069 if (start_offset && !upl_valid_page(pl, 0)) {
2070 int read_size;
2071
2072 /*
2073 * we're starting in the middle of the first page of the upl
2074 * and the page isn't currently valid, so we're going to have
2075 * to read it in first... this is a synchronous operation
2076 */
2077 read_size = PAGE_SIZE;
2078
2079 if ((upl_f_offset + read_size) > newEOF)
2080 read_size = newEOF - upl_f_offset;
2081
2082 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2083 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2084 if (retval) {
2085 /*
2086 * we had an error during the read which causes us to abort
2087 * the current cluster_write request... before we do, we need
2088 * to release the rest of the pages in the upl without modifying
2089 * their state and mark the failed page in error
2090 */
2091 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2092
2093 if (upl_size > PAGE_SIZE)
2094 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2095
2096 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2097 (int)upl, 0, 0, retval, 0);
2098 break;
2099 }
2100 }
2101 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2102 /*
2103 * the last offset we're writing to in this upl does not end on a page
2104 * boundary... if it's not beyond the old EOF, then we'll also need to
2105 * pre-read this page in if it isn't already valid
2106 */
2107 upl_offset = upl_size - PAGE_SIZE;
2108
2109 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2110 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2111 int read_size;
2112
2113 read_size = PAGE_SIZE;
2114
2115 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2116 read_size = newEOF - (upl_f_offset + upl_offset);
2117
2118 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2119 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2120 if (retval) {
2121 /*
2122 * we had an error during the read which causes us to abort
2123 * the current cluster_write request... before we do, we
2124 * need to release the rest of the pages in the upl without
2125 * modifying their state and mark the failed page in error
2126 */
2127 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2128
2129 if (upl_size > PAGE_SIZE)
2130 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2131
2132 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2133 (int)upl, 0, 0, retval, 0);
2134 break;
2135 }
2136 }
2137 }
2138 xfer_resid = io_size;
2139 io_offset = start_offset;
2140
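/*
 * zero fill as much of the head region (zero_cnt bytes starting at
 * zero_off) as falls within this upl... with IO_NOZEROVALID, pages
 * that are already valid are skipped; with IO_NOZERODIRTY alone,
 * only pages that are already dirty are skipped
 */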
2141 while (zero_cnt && xfer_resid) {
2142
2143 if (zero_cnt < (long long)xfer_resid)
2144 bytes_to_zero = zero_cnt;
2145 else
2146 bytes_to_zero = xfer_resid;
2147
2148 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2149 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2150 } else {
2151 int zero_pg_index;
2152
2153 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2154 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2155
2156 if ( !upl_valid_page(pl, zero_pg_index)) {
2157 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2158
2159 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2160 !upl_dirty_page(pl, zero_pg_index)) {
2161 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2162 }
2163 }
2164 xfer_resid -= bytes_to_zero;
2165 zero_cnt -= bytes_to_zero;
2166 zero_off += bytes_to_zero;
2167 io_offset += bytes_to_zero;
2168 }
2169 if (xfer_resid && io_resid) {
2170 bytes_to_move = min(io_resid, xfer_resid);
2171
2172 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2173
2174 if (retval) {
2175
2176 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2177
2178 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2179 (int)upl, 0, 0, retval, 0);
2180 } else {
2181 io_resid -= bytes_to_move;
2182 xfer_resid -= bytes_to_move;
2183 io_offset += bytes_to_move;
2184 }
2185 }
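/*
 * now zero fill any tail region (zero_cnt1 bytes starting at
 * zero_off1) that falls within this upl, applying the same
 * IO_NOZEROVALID / IO_NOZERODIRTY rules as the head zero fill above
 */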
2186 while (xfer_resid && zero_cnt1 && retval == 0) {
2187
2188 if (zero_cnt1 < (long long)xfer_resid)
2189 bytes_to_zero = zero_cnt1;
2190 else
2191 bytes_to_zero = xfer_resid;
2192
2193 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2194 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2195 } else {
2196 int zero_pg_index;
2197
2198 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2199 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2200
2201 if ( !upl_valid_page(pl, zero_pg_index)) {
2202 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2203 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2204 !upl_dirty_page(pl, zero_pg_index)) {
2205 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2206 }
2207 }
2208 xfer_resid -= bytes_to_zero;
2209 zero_cnt1 -= bytes_to_zero;
2210 zero_off1 += bytes_to_zero;
2211 io_offset += bytes_to_zero;
2212 }
2213
2214 if (retval == 0) {
2215 int cl_index;
2216 int can_delay;
2217
2218 io_size += start_offset;
2219
2220 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2221 /*
2222 * if we're extending the file with this write
2223 * we'll zero fill the rest of the page so that
2224 * if the file gets extended again in such a way as to leave a
2225 * hole starting at this EOF, we'll have zeros in the correct spot
2226 */
2227 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2228 }
2229 if (flags & IO_SYNC)
2230 /*
2231 * if the IO_SYNC flag is set then we need to
2232 * bypass any clusters and immediately issue
2233 * the I/O
2234 */
2235 goto issue_io;
2236 check_cluster:
2237 /*
2238 * take the lock to protect our accesses
2239 * of the writebehind and sparse cluster state
2240 */
2241 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2242
2243 /*
2244 * calculate the last logical block number
2245 * that this delayed I/O encompassed
2246 */
2247 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2248
2249 if (wbp->cl_scmap) {
2250
2251 if ( !(flags & IO_NOCACHE)) {
2252 /*
2253 * we've fallen into the sparse
2254 * cluster method of delaying dirty pages
2255 * first, we need to release the upl if we hold one
2256 * since pages in it may be present in the sparse cluster map
2257 * and may span 2 separate buckets there... if they do and
2258 * we happen to have to flush a bucket to make room and it intersects
2259 * this upl, a deadlock may result on page BUSY
2260 */
2261 if (upl_size)
2262 ubc_upl_commit_range(upl, 0, upl_size,
2263 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2264
2265 sparse_cluster_add(wbp, vp, &cl, newEOF);
2266
2267 lck_mtx_unlock(&wbp->cl_lockw);
2268
2269 continue;
2270 }
2271 /*
2272 * must have done cached writes that fell into
2273 * the sparse cluster mechanism... we've switched
2274 * to uncached writes on the file, so go ahead
2275 * and push whatever's in the sparse map
2276 * and switch back to normal clustering
2277 *
2278 * see the comment above concerning a possible deadlock...
2279 */
2280 if (upl_size) {
2281 ubc_upl_commit_range(upl, 0, upl_size,
2282 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2283 /*
2284 * setting upl_size to 0 keeps us from committing a
2285 * second time in the start_new_cluster path
2286 */
2287 upl_size = 0;
2288 }
2289 sparse_cluster_push(wbp, vp, newEOF, 1);
2290
2291 wbp->cl_number = 0;
2292 /*
2293 * no clusters of either type present at this point
2294 * so just go directly to start_new_cluster since
2295 * we know we need to delay this I/O since we've
2296 * already released the pages back into the cache
2297 * to avoid the deadlock with sparse_cluster_push
2298 */
2299 goto start_new_cluster;
2300 }
2301 upl_offset = 0;
2302
2303 if (wbp->cl_number == 0)
2304 /*
2305 * no clusters currently present
2306 */
2307 goto start_new_cluster;
2308
2309 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2310 /*
2311 * check each cluster that we currently hold
2312 * try to merge some or all of this write into
2313 * one or more of the existing clusters... if
2314 * any portion of the write remains, start a
2315 * new cluster
2316 */
2317 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2318 /*
2319 * the current write starts at or after the current cluster
2320 */
2321 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2322 /*
2323 * we have a write that fits entirely
2324 * within the existing cluster limits
2325 */
2326 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2327 /*
2328 * update our idea of where the cluster ends
2329 */
2330 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2331 break;
2332 }
2333 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2334 /*
2335 * we have a write that starts in the middle of the current cluster
2336 * but extends beyond the cluster's limit... we know this because
2337 * of the previous checks
2338 * we'll extend the current cluster to the max
2339 * and update the b_addr for the current write to reflect that
2340 * the head of it was absorbed into this cluster...
2341 * note that we'll always have a leftover tail in this case since
2342 * full absorption would have occurred in the clause above
2343 */
2344 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2345
2346 if (upl_size) {
2347 daddr64_t start_pg_in_upl;
2348
2349 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2350
2351 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2352 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2353
2354 ubc_upl_commit_range(upl, upl_offset, intersection,
2355 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2356 upl_f_offset += intersection;
2357 upl_offset += intersection;
2358 upl_size -= intersection;
2359 }
2360 }
2361 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2362 }
2363 /*
2364 * we come here for the case where the current write starts
2365 * beyond the limit of the existing cluster or we have a leftover
2366 * tail after a partial absorption
2367 *
2368 * in either case, we'll check the remaining clusters before
2369 * starting a new one
2370 */
2371 } else {
2372 /*
2373 * the current write starts in front of the cluster we're currently considering
2374 */
2375 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2376 /*
2377 * we can just merge the new request into
2378 * this cluster and leave it in the cache
2379 * since the resulting cluster is still
2380 * less than the maximum allowable size
2381 */
2382 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2383
2384 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2385 /*
2386 * the current write completely
2387 * envelops the existing cluster and since
2388 * each write is limited to at most MAX_UPL_TRANSFER bytes
2389 * we can just use the start and last blocknos of the write
2390 * to generate the cluster limits
2391 */
2392 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2393 }
2394 break;
2395 }
2396
2397 /*
2398 * if we were to combine this write with the current cluster
2399 * we would exceed the cluster size limit.... so,
2400 * let's see if there's any overlap of the new I/O with
2401 * the cluster we're currently considering... in fact, we'll
2402 * stretch the cluster out to its full limit and see if we
2403 * get an intersection with the current write
2404 *
2405 */
2406 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2407 /*
2408 * the current write extends into the proposed cluster
2409 * clip the length of the current write after first combining its
2410 * tail with the newly shaped cluster
2411 */
2412 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2413
2414 if (upl_size) {
2415 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2416
2417 if (intersection > upl_size)
2418 /*
2419 * because the current write may consist of a number of pages found in the cache
2420 * which are not part of the UPL, we may have an intersection that exceeds
2421 * the size of the UPL that is also part of this write
2422 */
2423 intersection = upl_size;
2424
2425 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2426 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2427 upl_size -= intersection;
2428 }
2429 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2430 }
2431 /*
2432 * if we get here, there was no way to merge
2433 * any portion of this write with this cluster
2434 * or we could only merge part of it which
2435 * will leave a tail...
2436 * we'll check the remaining clusters before starting a new one
2437 */
2438 }
2439 }
2440 if (cl_index < wbp->cl_number)
2441 /*
2442 * we found an existing cluster(s) that we
2443 * could entirely merge this I/O into
2444 */
2445 goto delay_io;
2446
2447 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2448 /*
2449 * we didn't find an existing cluster to
2450 * merge into, but there's room to start
2451 * a new one
2452 */
2453 goto start_new_cluster;
2454
2455 /*
2456 * no existing cluster to merge with and no
2457 * room to start a new one... we'll try
2458 * pushing one of the existing ones... if none of
2459 * them are able to be pushed, we'll switch
2460 * to the sparse cluster mechanism
2461 * cluster_try_push updates cl_number to the
2462 * number of remaining clusters... and
2463 * returns the number of currently unused clusters
2464 */
2465 int ret_cluster_try_push = 0;
2466 /* if writes are not deferred, call cluster push immediately */
2467 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2468 if (flags & IO_NOCACHE)
2469 can_delay = 0;
2470 else
2471 can_delay = 1;
2472
2473 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2474 }
2475
2476 /* execute the following regardless of whether writes are deferred or not */
2477 if (ret_cluster_try_push == 0) {
2478 /*
2479 * no more room in the normal cluster mechanism
2480 * so let's switch to the more expansive but expensive
2481 * sparse mechanism....
2482 * first, we need to release the upl if we hold one
2483 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2484 * and may span 2 separate buckets there... if they do and
2485 * we happen to have to flush a bucket to make room and it intersects
2486 * this upl, a deadlock may result on page BUSY
2487 */
2488 if (upl_size)
2489 ubc_upl_commit_range(upl, upl_offset, upl_size,
2490 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2491
2492 sparse_cluster_switch(wbp, vp, newEOF);
2493 sparse_cluster_add(wbp, vp, &cl, newEOF);
2494
2495 lck_mtx_unlock(&wbp->cl_lockw);
2496
2497 continue;
2498 }
2499 /*
2500 * we pushed one cluster successfully, so we must be sequentially writing this file
2501 * otherwise, we would have failed and fallen into the sparse cluster support
2502 * so let's take the opportunity to push out additional clusters as long as we
2503 * remain below the throttle... this will give us better I/O locality if we're
2504 * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
2505 * however, we don't want to push so much out that the write throttle kicks in and
2506 * hangs this thread up until some of the I/O completes...
2507 */
2508 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2509 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2510 cluster_try_push(wbp, vp, newEOF, 0, 0);
2511 }
2512
2513 start_new_cluster:
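/*
 * record this write as a brand new delayed cluster... io_nocache
 * remembers whether it came from an IO_NOCACHE write (presumably so
 * the push path can flush it with the matching cache behavior)
 */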
2514 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2515 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2516
2517 if (flags & IO_NOCACHE)
2518 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2519 else
2520 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2521 wbp->cl_number++;
2522 delay_io:
2523 if (upl_size)
2524 ubc_upl_commit_range(upl, upl_offset, upl_size,
2525 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2526
2527 lck_mtx_unlock(&wbp->cl_lockw);
2528
2529 continue;
2530 issue_io:
2531 /*
2532 * we don't hold the vnode lock at this point
2533 *
2534 * because we had to ask for a UPL that provides currently non-present pages, the
2535 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2536 * upon committing it... this is not the behavior we want since it's possible for
2537 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2538 * in order to maintain some semblance of coherency with mapped writes
2539 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2540 * so that we correctly deal with a change in state of the hardware modify bit...
2541 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2542 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2543 * responsible for generating the correct sized I/O(s)
2544 */
2545 ubc_upl_commit_range(upl, 0, upl_size,
2546 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2547
2548 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2549
2550 retval = cluster_push_x(vp, &cl, newEOF, flags);
2551 }
2552 }
2553 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2554 retval, 0, io_resid, 0, 0);
2555
2556 return (retval);
2557 }
2558
2559 int
2560 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2561 {
2562 int prev_resid;
2563 u_int clip_size;
2564 off_t max_io_size;
2565 int upl_size;
2566 int upl_flags;
2567 upl_t upl;
2568 int retval = 0;
2569 int flags;
2570
2571 flags = xflags;
2572
2573 if (vp->v_flag & VNOCACHE_DATA)
2574 flags |= IO_NOCACHE;
2575 if (vp->v_flag & VRAOFF)
2576 flags |= IO_RAOFF;
2577
2578 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2579 /*
2580 * go do a read through the cache if one of the following is true....
2581 * NOCACHE is not true
2582 * the uio request doesn't target USERSPACE
2583 */
2584 return (cluster_read_x(vp, uio, filesize, flags));
2585 }
2586
2587 #if LP64_DEBUG
2588 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2589 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2590 }
2591 #endif /* LP64_DEBUG */
2592
2593 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2594 u_int64_t iov_len;
2595 u_int64_t iov_base;
2596
2597 /*
2598 * we know we have a resid, so this is safe
2599 * skip over any empty vectors
2600 */
2601 iov_len = uio_iov_len(uio);
2602
2603 while (iov_len == 0) {
2604 uio_next_iov(uio);
2605 uio->uio_iovcnt--;
2606 iov_len = uio_iov_len(uio);
2607 }
2608 iov_base = uio_iov_base(uio);
2609 upl_size = PAGE_SIZE;
2610 upl_flags = UPL_QUERY_OBJECT_TYPE;
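/*
 * probe a single page of the target buffer... the returned upl_flags
 * tell us what kind of memory object backs it, which determines below
 * whether we can take the physically contiguous path or must fall
 * back to the cached / nocopy paths
 */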
2611
2612 // LP64todo - fix this!
2613 if ((vm_map_get_upl(current_map(),
2614 CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK,
2615 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2616 /*
2617 * the user app must have passed in an invalid address
2618 */
2619 return (EFAULT);
2620 }
2621
2622 /*
2623 * We check every vector target but if it is physically
2624 * contiguous space, we skip the sanity checks.
2625 */
2626 if (upl_flags & UPL_PHYS_CONTIG) {
2627 retval = cluster_phys_read(vp, uio, filesize);
2628 }
2629 else if (uio_resid(uio) < PAGE_SIZE) {
2630 /*
2631 * we're here because we don't have a physically contiguous target buffer
2632 * go do a read through the cache if
2633 * the total xfer size is less than a page...
2634 */
2635 return (cluster_read_x(vp, uio, filesize, flags));
2636 }
2637 // LP64todo - fix this!
2638 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2639 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2640 /*
2641 * Bring the file offset read up to a pagesize boundary
2642 * this will also bring the base address to a page boundary
2643 * since they both are currently on the same offset within a page
2644 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2645 * so the computed clip_size must always be less than the current uio_resid
2646 */
2647 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2648
2649 /*
2650 * Fake the resid going into the cluster_read_x call
2651 * and restore it on the way out.
2652 */
2653 prev_resid = uio_resid(uio);
2654 // LP64todo - fix this
2655 uio_setresid(uio, clip_size);
2656
2657 retval = cluster_read_x(vp, uio, filesize, flags);
2658
2659 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2660 } else {
2661 /*
2662 * can't get both the file offset and the buffer offset aligned to a page boundary
2663 * so fire an I/O through the cache for this entire vector
2664 */
2665 // LP64todo - fix this!
2666 clip_size = iov_len;
2667 prev_resid = uio_resid(uio);
2668 uio_setresid(uio, clip_size);
2669
2670 retval = cluster_read_x(vp, uio, filesize, flags);
2671
2672 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2673 }
2674 } else {
2675 /*
2676 * If we come in here, we know the offset into
2677 * the file is on a pagesize boundary
2678 */
2679 max_io_size = filesize - uio->uio_offset;
2680 // LP64todo - fix this
2681 clip_size = uio_resid(uio);
2682 if (iov_len < clip_size)
2683 clip_size = iov_len;
2684 if (max_io_size < clip_size)
2685 clip_size = (int)max_io_size;
2686
2687 if (clip_size < PAGE_SIZE) {
2688 /*
2689 * Take care of the tail end of the read in this vector.
2690 */
2691 // LP64todo - fix this
2692 prev_resid = uio_resid(uio);
2693 uio_setresid(uio, clip_size);
2694
2695 retval = cluster_read_x(vp, uio, filesize, flags);
2696
2697 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2698 } else {
2699 /* round clip_size down to a multiple of pagesize */
2700 clip_size = clip_size & ~(PAGE_MASK);
2701 // LP64todo - fix this
2702 prev_resid = uio_resid(uio);
2703 uio_setresid(uio, clip_size);
2704
2705 retval = cluster_nocopy_read(vp, uio, filesize);
2706
2707 if ((retval==0) && uio_resid(uio))
2708 retval = cluster_read_x(vp, uio, filesize, flags);
2709
2710 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2711 }
2712 } /* end else */
2713 } /* end while */
2714
2715 return(retval);
2716 }
2717
2718 static int
2719 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2720 {
2721 upl_page_info_t *pl;
2722 upl_t upl;
2723 vm_offset_t upl_offset;
2724 int upl_size;
2725 off_t upl_f_offset;
2726 int start_offset;
2727 int start_pg;
2728 int last_pg;
2729 int uio_last = 0;
2730 int pages_in_upl;
2731 off_t max_size;
2732 off_t last_ioread_offset;
2733 off_t last_request_offset;
2734 u_int size_of_prefetch;
2735 u_int io_size;
2736 kern_return_t kret;
2737 int error = 0;
2738 int retval = 0;
2739 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2740 u_int rd_ahead_enabled = 1;
2741 u_int prefetch_enabled = 1;
2742 struct cl_readahead * rap;
2743 struct clios iostate;
2744 struct cl_extent extent;
2745
2746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2747 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2748
2749 // LP64todo - fix this
2750 last_request_offset = uio->uio_offset + uio_resid(uio);
2751
2752 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2753 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2754 rd_ahead_enabled = 0;
2755 rap = NULL;
2756 } else {
2757 if (cluster_hard_throttle_on(vp)) {
2758 rd_ahead_enabled = 0;
2759 prefetch_enabled = 0;
2760
2761 max_rd_size = HARD_THROTTLE_MAXSIZE;
2762 }
2763 if ((rap = cluster_get_rap(vp)) == NULL)
2764 rd_ahead_enabled = 0;
2765 }
2766 if (last_request_offset > filesize)
2767 last_request_offset = filesize;
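/*
 * express the request as a range of file-relative page indices...
 * the read-ahead code below works in these units
 */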
2768 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2769 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2770
2771 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2772 /*
2773 * determine if we already have a read-ahead in the pipe courtesy of the
2774 * last read system call that was issued...
2775 * if so, pick up its extent to determine where we should start
2776 * with respect to any read-ahead that might be necessary to
2777 * garner all the data needed to complete this read system call
2778 */
2779 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2780
2781 if (last_ioread_offset < uio->uio_offset)
2782 last_ioread_offset = (off_t)0;
2783 else if (last_ioread_offset > last_request_offset)
2784 last_ioread_offset = last_request_offset;
2785 } else
2786 last_ioread_offset = (off_t)0;
2787
2788 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2789 /*
2790 * compute the size of the upl needed to encompass
2791 * the requested read... limit each call to cluster_io
2792 * to the maximum UPL size... cluster_io will clip if
2793 * this exceeds the maximum io_size for the device,
2794 * make sure to account for
2795 * a starting offset that's not page aligned
2796 */
2797 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2798 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2799 max_size = filesize - uio->uio_offset;
2800
2801 // LP64todo - fix this!
2802 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2803 io_size = uio_resid(uio);
2804 else
2805 io_size = max_size;
2806
2807 if (!(flags & IO_NOCACHE)) {
2808
2809 while (io_size) {
2810 u_int io_resid;
2811 u_int io_requested;
2812
2813 /*
2814 * if we keep finding the pages we need already in the cache, then
2815 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2816 * to determine that we have all the pages we need... once we miss in
2817 * the cache and have issued an I/O, then we'll assume that we're likely
2818 * to continue to miss in the cache and it's to our advantage to try and prefetch
2819 */
2820 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2821 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2822 /*
2823 * we've already issued I/O for this request and
2824 * there's still work to do and
2825 * our prefetch stream is running dry, so issue a
2826 * pre-fetch I/O... the I/O latency will overlap
2827 * with the copying of the data
2828 */
2829 if (size_of_prefetch > max_rd_size)
2830 size_of_prefetch = max_rd_size;
2831
2832 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2833
2834 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2835
2836 if (last_ioread_offset > last_request_offset)
2837 last_ioread_offset = last_request_offset;
2838 }
2839 }
2840 /*
2841 * limit the size of the copy we're about to do so that
2842 * we can notice that our I/O pipe is running dry and
2843 * get the next I/O issued before it does go dry
2844 */
2845 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2846 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2847 else
2848 io_resid = io_size;
2849
2850 io_requested = io_resid;
2851
2852 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2853
2854 io_size -= (io_requested - io_resid);
2855
2856 if (retval || io_resid)
2857 /*
2858 * if we run into a real error or
2859 * a page that is not in the cache
2860 * we need to leave streaming mode
2861 */
2862 break;
2863
2864 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2865 /*
2866 * we've already finished the I/O for this read request
2867 * let's see if we should do a read-ahead
2868 */
2869 cluster_rd_ahead(vp, &extent, filesize, rap);
2870 }
2871 }
2872 if (retval)
2873 break;
2874 if (io_size == 0) {
2875 if (rap != NULL) {
2876 if (extent.e_addr < rap->cl_lastr)
2877 rap->cl_maxra = 0;
2878 rap->cl_lastr = extent.e_addr;
2879 }
2880 break;
2881 }
2882 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2883 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2884 max_size = filesize - uio->uio_offset;
2885 }
2886 if (io_size > max_rd_size)
2887 io_size = max_rd_size;
2888
2889 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2890
2891 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2892 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2893 pages_in_upl = upl_size / PAGE_SIZE;
2894
2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2896 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2897
2898 kret = ubc_create_upl(vp,
2899 upl_f_offset,
2900 upl_size,
2901 &upl,
2902 &pl,
2903 UPL_SET_LITE);
2904 if (kret != KERN_SUCCESS)
2905 panic("cluster_read: failed to get pagelist");
2906
2907 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2908 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2909
2910 /*
2911 * scan from the beginning of the upl looking for the first
2912 * non-valid page.... this will become the first page in
2913 * the request we're going to make to 'cluster_io'... if all
2914 * of the pages are valid, we won't call through to 'cluster_io'
2915 */
2916 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2917 if (!upl_valid_page(pl, start_pg))
2918 break;
2919 }
2920
2921 /*
2922 * scan from the starting invalid page looking for a valid
2923 * page before the end of the upl is reached, if we
2924 * find one, then it will be the last page of the request to
2925 * 'cluster_io'
2926 */
2927 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2928 if (upl_valid_page(pl, last_pg))
2929 break;
2930 }
2931 iostate.io_completed = 0;
2932 iostate.io_issued = 0;
2933 iostate.io_error = 0;
2934 iostate.io_wanted = 0;
2935
2936 if (start_pg < last_pg) {
2937 /*
2938 * we found a range of 'invalid' pages that must be filled
2939 * if the last page in this range is the last page of the file
2940 * we may have to clip the size of it to keep from reading past
2941 * the end of the last physical block associated with the file
2942 */
2943 upl_offset = start_pg * PAGE_SIZE;
2944 io_size = (last_pg - start_pg) * PAGE_SIZE;
2945
2946 if ((upl_f_offset + upl_offset + io_size) > filesize)
2947 io_size = filesize - (upl_f_offset + upl_offset);
2948
2949 /*
2950 * issue an asynchronous read to cluster_io
2951 */
2952
2953 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2954 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2955 }
2956 if (error == 0) {
2957 /*
2958 * if the read completed successfully, or there was no I/O request
2959 * issued, then copy the data into user land via 'cluster_copy_upl_data'...
2960 * we'll first add on any 'valid'
2961 * pages that were present in the upl when we acquired it.
2962 */
2963 u_int val_size;
2964
2965 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2966 if (!upl_valid_page(pl, uio_last))
2967 break;
2968 }
2969 /*
2970 * compute size to transfer this round, if uio->uio_resid is
2971 * still non-zero after this attempt, we'll loop around and
2972 * set up for another I/O.
2973 */
2974 val_size = (uio_last * PAGE_SIZE) - start_offset;
2975
2976 if (val_size > max_size)
2977 val_size = max_size;
2978
2979 if (val_size > uio_resid(uio))
2980 // LP64todo - fix this
2981 val_size = uio_resid(uio);
2982
2983 if (last_ioread_offset == 0)
2984 last_ioread_offset = uio->uio_offset + val_size;
2985
2986 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2987 /*
2988 * if there's still I/O left to do for this request, and...
2989 * we're not in hard throttle mode, then issue a
2990 * pre-fetch I/O... the I/O latency will overlap
2991 * with the copying of the data
2992 */
2993 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2994
2995 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2996
2997 if (last_ioread_offset > last_request_offset)
2998 last_ioread_offset = last_request_offset;
2999
3000 } else if ((uio->uio_offset + val_size) == last_request_offset) {
3001 /*
3002 * this transfer will finish this request, so...
3003 * let's try to read ahead if we're in
3004 * a sequential access pattern and we haven't
3005 * explicitly disabled it
3006 */
3007 if (rd_ahead_enabled)
3008 cluster_rd_ahead(vp, &extent, filesize, rap);
3009
3010 if (rap != NULL) {
3011 if (extent.e_addr < rap->cl_lastr)
3012 rap->cl_maxra = 0;
3013 rap->cl_lastr = extent.e_addr;
3014 }
3015 }
3016 lck_mtx_lock(cl_mtxp);
3017
3018 while (iostate.io_issued != iostate.io_completed) {
3019 iostate.io_wanted = 1;
3020 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3021 }
3022 lck_mtx_unlock(cl_mtxp);
3023
3024 if (iostate.io_error)
3025 error = iostate.io_error;
3026 else
3027 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3028 }
3029 if (start_pg < last_pg) {
3030 /*
3031 * compute the range of pages that we actually issued an I/O for
3032 * and either commit them as valid if the I/O succeeded
3033 * or abort them if the I/O failed
3034 */
3035 io_size = (last_pg - start_pg) * PAGE_SIZE;
3036
3037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3038 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3039
3040 if (error || (flags & IO_NOCACHE))
3041 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3042 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3043 else
3044 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3045 UPL_COMMIT_CLEAR_DIRTY |
3046 UPL_COMMIT_FREE_ON_EMPTY |
3047 UPL_COMMIT_INACTIVATE);
3048
3049 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3050 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3051 }
3052 if ((last_pg - start_pg) < pages_in_upl) {
3053 int cur_pg;
3054 int commit_flags;
3055
3056 /*
3057 * the set of pages that we issued an I/O for did not encompass
3058 * the entire upl... so just release these without modifying
3059 * their state
3060 */
3061 if (error)
3062 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3063 else {
3064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3065 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3066
3067 if (start_pg) {
3068 /*
3069 * we found some already valid pages at the beginning of
3070 * the upl... commit these back to the inactive list with
3071 * reference cleared
3072 */
3073 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3074 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3075 | UPL_COMMIT_INACTIVATE;
3076
3077 if (upl_dirty_page(pl, cur_pg))
3078 commit_flags |= UPL_COMMIT_SET_DIRTY;
3079
3080 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3081 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3082 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3083 else
3084 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3085 PAGE_SIZE, commit_flags);
3086 }
3087 }
3088 if (last_pg < uio_last) {
3089 /*
3090 * we found some already valid pages immediately after the
3091 * pages we issued I/O for, commit these back to the
3092 * inactive list with reference cleared
3093 */
3094 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3095 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3096 | UPL_COMMIT_INACTIVATE;
3097
3098 if (upl_dirty_page(pl, cur_pg))
3099 commit_flags |= UPL_COMMIT_SET_DIRTY;
3100
3101 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3102 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3103 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3104 else
3105 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3106 PAGE_SIZE, commit_flags);
3107 }
3108 }
3109 if (uio_last < pages_in_upl) {
3110 /*
3111 * there were some invalid pages beyond the valid pages
3112 * that we didn't issue an I/O for, just release them
3113 * unchanged
3114 */
3115 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3116 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3117 }
3118
3119 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3120 (int)upl, -1, -1, 0, 0);
3121 }
3122 }
3123 if (retval == 0)
3124 retval = error;
3125
3126 if ( uio_resid(uio) ) {
3127 if (cluster_hard_throttle_on(vp)) {
3128 rd_ahead_enabled = 0;
3129 prefetch_enabled = 0;
3130
3131 max_rd_size = HARD_THROTTLE_MAXSIZE;
3132 } else {
3133 if (rap != NULL)
3134 rd_ahead_enabled = 1;
3135 prefetch_enabled = 1;
3136
3137 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3138 }
3139 }
3140 }
3141 if (rap != NULL) {
3142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3143 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3144
3145 lck_mtx_unlock(&rap->cl_lockr);
3146 } else {
3147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3148 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3149 }
3150
3151 return (retval);
3152 }
3153
3154
3155 static int
3156 cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3157 {
3158 upl_t upl;
3159 upl_page_info_t *pl;
3160 vm_offset_t upl_offset;
3161 off_t max_io_size;
3162 int io_size;
3163 int upl_size;
3164 int upl_needed_size;
3165 int pages_in_pl;
3166 int upl_flags;
3167 kern_return_t kret;
3168 struct iovec *iov;
3169 int i;
3170 int force_data_sync;
3171 int retval = 0;
3172 int no_zero_fill = 0;
3173 int abort_flag = 0;
3174 struct clios iostate;
3175 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3176 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3177
3178
3179 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3180 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3181
3182 /*
3183 * When we enter this routine, we know
3184 * -- the offset into the file is on a pagesize boundary
3185 * -- the resid is a page multiple
3186 * -- the resid will not exceed iov_len
3187 */
3188
3189 iostate.io_completed = 0;
3190 iostate.io_issued = 0;
3191 iostate.io_error = 0;
3192 iostate.io_wanted = 0;
3193
3194 iov = uio->uio_iov;
3195
3196 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3197
3198 if (cluster_hard_throttle_on(vp)) {
3199 max_rd_size = HARD_THROTTLE_MAXSIZE;
3200 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3201 } else {
3202 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3203 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3204 }
3205 max_io_size = filesize - uio->uio_offset;
3206
3207 // LP64todo - fix this
3208 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3209 io_size = max_io_size;
3210 else
3211 io_size = uio_resid(uio);
3212
3213 /*
3214 * First look for pages already in the cache
3215 * and move them to user space.
3216 */
3217 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3218
3219 if (retval) {
3220 /*
3221 * we may have already spun some portion of this request
3222 * off as async requests... we need to wait for the I/O
3223 * to complete before returning
3224 */
3225 goto wait_for_reads;
3226 }
3227 /*
3228 * If we are already finished with this read, then return
3229 */
3230 if (io_size == 0) {
3231 /*
3232 * we may have already spun some portion of this request
3233 * off as async requests... we need to wait for the I/O
3234 * to complete before returning
3235 */
3236 goto wait_for_reads;
3237 }
3238 max_io_size = io_size;
3239
3240 if (max_io_size > max_rd_size)
3241 max_io_size = max_rd_size;
3242
3243 io_size = 0;
3244
3245 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
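/*
 * UPL_ROP_ABSENT returns (in io_size) the length of the run of pages
 * starting at uio_offset that are NOT currently present in the
 * cache... only that run is issued as uncached I/O below; anything
 * already resident is better served through the cached path
 */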
3246
3247 if (io_size == 0)
3248 /*
3249 * we may have already spun some portion of this request
3250 * off as async requests... we need to wait for the I/O
3251 * to complete before returning
3252 */
3253 goto wait_for_reads;
3254
3255 // LP64todo - fix this!
3256 upl_offset = CAST_DOWN(vm_offset_t, iov->iov_base) & PAGE_MASK;
3257 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3258
3259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3260 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
3261
3262 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3263 no_zero_fill = 1;
3264 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3265 } else {
3266 no_zero_fill = 0;
3267 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3268 }
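/*
 * try up to 3 times to build a UPL whose pages are all valid...
 * each retry adds UPL_FORCE_DATA_SYNC, which (presumably) pushes out
 * pending modifications so the pages come back valid before we wire
 * them for the device read
 */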
3269 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3270 pages_in_pl = 0;
3271 upl_size = upl_needed_size;
3272 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3273
3274 if (no_zero_fill)
3275 upl_flags |= UPL_NOZEROFILL;
3276 if (force_data_sync)
3277 upl_flags |= UPL_FORCE_DATA_SYNC;
3278
3279 // LP64todo - fix this!
3280 kret = vm_map_create_upl(current_map(),
3281 (vm_map_offset_t)(CAST_DOWN(vm_offset_t, iov->iov_base) & ~PAGE_MASK),
3282 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3283
3284 if (kret != KERN_SUCCESS) {
3285 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3286 (int)upl_offset, upl_size, io_size, kret, 0);
3287 /*
3288 * cluster_nocopy_read: failed to get pagelist
3289 *
3290 * we may have already spun some portion of this request
3291 * off as async requests... we need to wait for the I/O
3292 * to complete before returning
3293 */
3294 goto wait_for_reads;
3295 }
3296 pages_in_pl = upl_size / PAGE_SIZE;
3297 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3298
3299 for (i = 0; i < pages_in_pl; i++) {
3300 if (!upl_valid_page(pl, i))
3301 break;
3302 }
3303 if (i == pages_in_pl)
3304 break;
3305
3306 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3307 }
3308 if (force_data_sync >= 3) {
3309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3310 (int)upl_offset, upl_size, io_size, kret, 0);
3311
3312 goto wait_for_reads;
3313 }
3314 /*
3315 * Consider the possibility that upl_size wasn't satisfied.
3316 */
3317 if (upl_size != upl_needed_size)
3318 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
3319
3320 if (io_size == 0) {
3321 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3322 goto wait_for_reads;
3323 }
3324 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3325 (int)upl_offset, upl_size, io_size, kret, 0);
3326
3327 /*
3328 * request asynchronously so that we can overlap
3329 * the preparation of the next I/O...
3330 * if there are already too many outstanding reads,
3331 * wait until some have completed before issuing the next read
3332 */
3333 lck_mtx_lock(cl_mtxp);
3334
3335 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3336 iostate.io_wanted = 1;
3337 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3338 }
3339 lck_mtx_unlock(cl_mtxp);
3340
3341 if (iostate.io_error) {
3342 /*
3343 * one of the earlier reads we issued ran into a hard error
3344 * don't issue any more reads, cleanup the UPL
3345 * that was just created but not used, then
3346 * go wait for any other reads to complete before
3347 * returning the error to the caller
3348 */
3349 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3350
3351 goto wait_for_reads;
3352 }
3353 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3354 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3355
3356 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3357 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3358 (buf_t)NULL, &iostate);
3359
3360 /*
3361 * update the uio structure
3362 */
3363 ((u_int32_t)iov->iov_base) += io_size;
3364 iov->iov_len -= io_size;
3365 uio_setresid(uio, (uio_resid(uio) - io_size));
3366 uio->uio_offset += io_size;
3367
3368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3369 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3370
3371 } /* end while */
3372
3373 wait_for_reads:
3374 /*
3375 * make sure all async reads that are part of this stream
3376 * have completed before we return
3377 */
3378 lck_mtx_lock(cl_mtxp);
3379
3380 while (iostate.io_issued != iostate.io_completed) {
3381 iostate.io_wanted = 1;
3382 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3383 }
3384 lck_mtx_unlock(cl_mtxp);
3385
3386 if (iostate.io_error)
3387 retval = iostate.io_error;
3388
3389 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3390 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3391
3392 return (retval);
3393 }
3394
3395
3396 static int
3397 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3398 {
3399 upl_page_info_t *pl;
3400 upl_t upl;
3401 vm_offset_t upl_offset;
3402 addr64_t dst_paddr;
3403 off_t max_size;
3404 #if LP64KERN
3405 int64_t io_size;
3406 u_int64_t iov_len;
3407 u_int64_t iov_base;
3408 #else
3409 int io_size;
3410 uint iov_len;
3411 uint iov_base;
3412 #endif
3413 int tail_size;
3414 int upl_size;
3415 int upl_needed_size;
3416 int pages_in_pl;
3417 int upl_flags;
3418 kern_return_t kret;
3419 struct clios iostate;
3420 int error;
3421 int devblocksize;
3422
3423 devblocksize = vp->v_mount->mnt_devblocksize;
3424 /*
3425 * When we enter this routine, we know
3426 * -- the resid will not exceed iov_len
3427 * -- the target address is physically contiguous
3428 */
3429
3430 #if LP64_DEBUG
3431 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3432 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3433 }
3434 #endif /* LP64_DEBUG */
3435
3436 iov_len = uio_iov_len(uio);
3437 iov_base = uio_iov_base(uio);
3438
3439 max_size = filesize - uio->uio_offset;
3440
3441 // LP64todo - fix this!
3442 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3443 io_size = iov_len;
3444 else
3445 io_size = max_size;
3446
3447 // LP64todo - fix this!
3448 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3449 upl_needed_size = upl_offset + io_size;
3450
3451 error = 0;
3452 pages_in_pl = 0;
3453 upl_size = upl_needed_size;
3454 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3455
3456 kret = vm_map_get_upl(current_map(),
3457 CAST_DOWN(vm_offset_t, iov_base) & ~PAGE_MASK,
3458 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3459
3460 if (kret != KERN_SUCCESS) {
3461 /*
3462 * cluster_phys_read: failed to get pagelist
3463 */
3464 return(EINVAL);
3465 }
3466 if (upl_size < upl_needed_size) {
3467 /*
3468 * The upl_size wasn't satisfied.
3469 */
3470 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3471
3472 return(EINVAL);
3473 }
3474 pl = ubc_upl_pageinfo(upl);
3475
3476 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)(iov_base & PAGE_MASK));
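/*
 * upl_phys_page() returns a physical page number, so shifting it
 * left by 12 (4K pages) yields the page's physical byte address;
 * the low 12 bits of iov_base supply the offset within that page
 */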
3477
3478 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3479 int head_size;
3480
3481 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3482
3483 if (head_size > io_size)
3484 head_size = io_size;
3485
3486 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3487
3488 if (error) {
3489 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3490
3491 return(EINVAL);
3492 }
3493 upl_offset += head_size;
3494 dst_paddr += head_size;
3495 io_size -= head_size;
3496 }
3497 tail_size = io_size & (devblocksize - 1);
3498 io_size -= tail_size;
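/*
 * whatever remains that isn't a whole device block (tail_size) is
 * deferred to cluster_align_phys_io once the block-aligned portion
 * below has been issued and has completed
 */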
3499
3500 iostate.io_completed = 0;
3501 iostate.io_issued = 0;
3502 iostate.io_error = 0;
3503 iostate.io_wanted = 0;
3504
3505 while (io_size && error == 0) {
3506 int xsize;
3507
3508 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3509 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3510 else
3511 xsize = io_size;
3512 /*
3513 * request asynchronously so that we can overlap
3514 * the preparation of the next I/O... we'll do
3515 * the commit after all the I/O has completed
3516 * since it's all issued against the same UPL...
3517 * if there are already too many outstanding reads,
3518 * wait until some have completed before issuing the next
3519 */
3520 lck_mtx_lock(cl_mtxp);
3521
3522 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3523 iostate.io_wanted = 1;
3524 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3525 }
3526 lck_mtx_unlock(cl_mtxp);
3527
3528 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3529 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3530 (buf_t)NULL, &iostate);
3531 /*
3532 * The cluster_io read was issued successfully,
3533 * update the uio structure
3534 */
3535 if (error == 0) {
3536 uio_setresid(uio, (uio_resid(uio) - xsize));
3537 uio_iov_base_add(uio, xsize);
3538 uio_iov_len_add(uio, -xsize);
3539 uio->uio_offset += xsize;
3540 dst_paddr += xsize;
3541 upl_offset += xsize;
3542 io_size -= xsize;
3543 }
3544 }
3545 /*
3546 * make sure all async reads that are part of this stream
3547 * have completed before we proceed
3548 */
3549 lck_mtx_lock(cl_mtxp);
3550
3551 while (iostate.io_issued != iostate.io_completed) {
3552 iostate.io_wanted = 1;
3553 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3554 }
3555 lck_mtx_unlock(cl_mtxp);
3556
3557 if (iostate.io_error)
3558 error = iostate.io_error;
3559
3560 if (error == 0 && tail_size)
3561 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3562
3563 /*
3564 * just release our hold on the physically contiguous
3565 * region without changing any state
3566 */
3567 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3568
3569 return (error);
3570 }
3571
3572
3573 /*
3574 * generate advisory I/O's in the largest chunks possible
3575 * the completed pages will be released into the VM cache
3576 */
3577 int
3578 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3579 {
3580 upl_page_info_t *pl;
3581 upl_t upl;
3582 vm_offset_t upl_offset;
3583 int upl_size;
3584 off_t upl_f_offset;
3585 int start_offset;
3586 int start_pg;
3587 int last_pg;
3588 int pages_in_upl;
3589 off_t max_size;
3590 int io_size;
3591 kern_return_t kret;
3592 int retval = 0;
3593 int issued_io;
3594 int skip_range;
3595
3596 if ( !UBCINFOEXISTS(vp))
3597 return(EINVAL);
3598
3599 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3600 (int)f_offset, resid, (int)filesize, 0, 0);
3601
3602 while (resid && f_offset < filesize && retval == 0) {
3603 /*
3604 * compute the size of the upl needed to encompass
3605 * the requested read... limit each call to cluster_io
3606 * to the maximum UPL size... cluster_io will clip if
3607 * this exceeds the maximum io_size for the device...
3608 * make sure to account for
3609 * a starting offset that's not page aligned
3610 */
3611 start_offset = (int)(f_offset & PAGE_MASK_64);
3612 upl_f_offset = f_offset - (off_t)start_offset;
3613 max_size = filesize - f_offset;
3614
3615 if (resid < max_size)
3616 io_size = resid;
3617 else
3618 io_size = max_size;
3619
3620 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3621 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3622 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3623
3624 skip_range = 0;
3625 /*
3626 * return the number of contiguously present pages in the cache
3627 * starting at upl_f_offset within the file
3628 */
3629 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3630
3631 if (skip_range) {
3632 /*
3633 * skip over pages already present in the cache
3634 */
3635 io_size = skip_range - start_offset;
3636
3637 f_offset += io_size;
3638 resid -= io_size;
3639
3640 if (skip_range == upl_size)
3641 continue;
3642 /*
3643 * have to issue some real I/O
3644 * at this point, we know it's starting on a page boundary
3645 * because we've skipped over at least the first page in the request
3646 */
3647 start_offset = 0;
3648 upl_f_offset += skip_range;
3649 upl_size -= skip_range;
3650 }
3651 pages_in_upl = upl_size / PAGE_SIZE;
3652
3653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3654 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3655
3656 kret = ubc_create_upl(vp,
3657 upl_f_offset,
3658 upl_size,
3659 &upl,
3660 &pl,
3661 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3662 if (kret != KERN_SUCCESS)
3663 return(retval);
3664 issued_io = 0;
3665
3666 /*
3667 * before we start marching forward, we must make sure we end on
3668 * a present page, otherwise we will be working with a freed
3669 * upl
3670 */
3671 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3672 if (upl_page_present(pl, last_pg))
3673 break;
3674 }
3675 pages_in_upl = last_pg + 1;
3676
3677
3678 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3679 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3680
3681
3682 for (last_pg = 0; last_pg < pages_in_upl; ) {
3683 /*
3684 * scan from the beginning of the upl looking for the first
3685 * page that is present.... this will become the first page in
3686 * the request we're going to make to 'cluster_io'... if all
3687 * of the pages are absent, we won't call through to 'cluster_io'
3688 */
3689 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3690 if (upl_page_present(pl, start_pg))
3691 break;
3692 }
3693
3694 /*
3695 * scan from the starting present page looking for an absent
3696 * page before the end of the upl is reached, if we
3697 * find one, then it will terminate the range of pages being
3698 * presented to 'cluster_io'
3699 */
3700 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3701 if (!upl_page_present(pl, last_pg))
3702 break;
3703 }
3704
3705 if (last_pg > start_pg) {
3706 /*
3707 * we found a range of pages that must be filled
3708 * if the last page in this range is the last page of the file
3709 * we may have to clip the size of it to keep from reading past
3710 * the end of the last physical block associated with the file
3711 */
3712 upl_offset = start_pg * PAGE_SIZE;
3713 io_size = (last_pg - start_pg) * PAGE_SIZE;
3714
3715 if ((upl_f_offset + upl_offset + io_size) > filesize)
3716 io_size = filesize - (upl_f_offset + upl_offset);
3717
3718 /*
3719 * issue an asynchronous read to cluster_io
3720 */
3721 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3722 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3723
3724 issued_io = 1;
3725 }
3726 }
3727 if (issued_io == 0)
3728 ubc_upl_abort(upl, 0);
3729
3730 io_size = upl_size - start_offset;
3731
3732 if (io_size > resid)
3733 io_size = resid;
3734 f_offset += io_size;
3735 resid -= io_size;
3736 }
3737
3738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3739 (int)f_offset, resid, retval, 0, 0);
3740
3741 return(retval);
3742 }
3743
3744
3745 int
3746 cluster_push(vnode_t vp, int flags)
3747 {
3748 int retval;
3749 struct cl_writebehind *wbp;
3750
3751 if ( !UBCINFOEXISTS(vp)) {
3752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3753 return (0);
3754 }
3755 /* return if deferred write is set */
3756 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3757 return (0);
3758 }
3759 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3761 return (0);
3762 }
3763 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3764 lck_mtx_unlock(&wbp->cl_lockw);
3765
3766 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3767 return(0);
3768 }
3769 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3770 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3771
3772 if (wbp->cl_scmap) {
3773 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3774
3775 retval = 1;
3776 } else
3777 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3778
3779 lck_mtx_unlock(&wbp->cl_lockw);
3780
3781 if (flags & IO_SYNC)
3782 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3783
3784 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3785 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3786
3787 return (retval);
3788 }
3789
3790
3791 __private_extern__ void
3792 cluster_release(struct ubc_info *ubc)
3793 {
3794 struct cl_writebehind *wbp;
3795 struct cl_readahead *rap;
3796
3797 if ((wbp = ubc->cl_wbehind)) {
3798
3799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3800
3801 if (wbp->cl_scmap)
3802 vfs_drt_control(&(wbp->cl_scmap), 0);
3803 } else {
3804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3805 }
3806
3807 rap = ubc->cl_rahead;
3808
3809 if (wbp != NULL) {
3810 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3811 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3812 }
3813 if ((rap = ubc->cl_rahead)) {
3814 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3815 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3816 }
3817 ubc->cl_rahead = NULL;
3818 ubc->cl_wbehind = NULL;
3819
3820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3821 }
3822
3823
3824 static void
3825 cluster_push_EOF(vnode_t vp, off_t EOF)
3826 {
3827 struct cl_writebehind *wbp;
3828
3829 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3830
3831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3832 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3833
3834 if (wbp->cl_scmap)
3835 sparse_cluster_push(wbp, vp, EOF, 1);
3836 else
3837 cluster_try_push(wbp, vp, EOF, 0, 1);
3838
3839 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3840 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3841
3842 lck_mtx_unlock(&wbp->cl_lockw);
3843 }
3844
3845
3846 static int
3847 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3848 {
3849 int cl_index;
3850 int cl_index1;
3851 int min_index;
3852 int cl_len;
3853 int cl_pushed = 0;
3854 struct cl_wextent l_clusters[MAX_CLUSTERS];
3855
3856 /*
3857 * the write behind context exists and has
3858 * already been locked...
3859 *
3860 * make a local 'sorted' copy of the clusters
3861 * and clear wbp->cl_number so that new clusters can
3862 * be developed
3863 */
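/*
 * note: the nested loops below amount to a simple selection sort on
 * b_addr... each outer pass copies the lowest remaining cluster into
 * l_clusters[] and marks the source entry empty by setting its b_addr
 * equal to its e_addr (the same "empty" test is used again when the
 * unpushed leftovers are merged back in near the end of this routine)
 */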
3864 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3865 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3866 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3867 continue;
3868 if (min_index == -1)
3869 min_index = cl_index1;
3870 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3871 min_index = cl_index1;
3872 }
3873 if (min_index == -1)
3874 break;
3875 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3876 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3877 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3878
3879 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3880 }
3881 wbp->cl_number = 0;
3882
3883 cl_len = cl_index;
3884
3885 if (can_delay && cl_len == MAX_CLUSTERS) {
3886 int i;
3887
3888 /*
3889 * determine if we appear to be writing the file sequentially
3890 * if not, by returning without having pushed any clusters
3891 * we will cause this vnode to be pushed into the sparse cluster mechanism
3892 * used for managing more random I/O patterns
3893 *
3894 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3895 * that's why we're in try_push with can_delay true...
3896 *
3897 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3898 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
3899 * so we can just make a simple pass through, up to, but not including the last one...
3900 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3901 * are sequential
3902 *
3903 * we let the last one be partial as long as it was adjacent to the previous one...
3904 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3905 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3906 */
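/*
 * for illustration (assuming MAX_UPL_TRANSFER is 256 pages): a sorted run
 * of clusters [0,256) [256,512) [512,768) ... passes this check, since
 * every cluster but the last spans exactly MAX_UPL_TRANSFER pages and
 * each e_addr equals the b_addr of the cluster that follows... a gap or
 * a short interior cluster sends us to 'dont_try' instead
 */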
3907 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3908 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3909 goto dont_try;
3910 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3911 goto dont_try;
3912 }
3913 }
3914 /*
3915 * drop the lock while we're firing off the I/Os...
3916 * this is safe since I'm working off of a private sorted copy
3917 * of the clusters, and I'm going to re-evaluate the public
3918 * state after I retake the lock
3919 */
3920 lck_mtx_unlock(&wbp->cl_lockw);
3921
3922 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3923 int flags;
3924 struct cl_extent cl;
3925
3926 /*
3927 * try to push each cluster in turn...
3928 */
3929 if (l_clusters[cl_index].io_nocache)
3930 flags = IO_NOCACHE;
3931 else
3932 flags = 0;
3933 cl.b_addr = l_clusters[cl_index].b_addr;
3934 cl.e_addr = l_clusters[cl_index].e_addr;
3935
3936 cluster_push_x(vp, &cl, EOF, flags);
3937
3938 l_clusters[cl_index].b_addr = 0;
3939 l_clusters[cl_index].e_addr = 0;
3940
3941 cl_pushed++;
3942
3943 if (push_all == 0)
3944 break;
3945 }
3946 lck_mtx_lock(&wbp->cl_lockw);
3947
3948 dont_try:
3949 if (cl_len > cl_pushed) {
3950 /*
3951 * we didn't push all of the clusters, so
3952 * let's try to merge them back into the vnode
3953 */
3954 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3955 /*
3956 * we picked up some new clusters while we were trying to
3957 * push the old ones... this can happen because I've dropped
3958 * the write behind lock... the sum of the
3959 * leftovers plus the new cluster count exceeds our ability
3960 * to represent them, so switch to the sparse cluster mechanism
3961 *
3962 * collect the active public clusters...
3963 */
3964 sparse_cluster_switch(wbp, vp, EOF);
3965
3966 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3967 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3968 continue;
3969 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3970 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3971 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3972
3973 cl_index1++;
3974 }
3975 /*
3976 * update the cluster count
3977 */
3978 wbp->cl_number = cl_index1;
3979
3980 /*
3981 * and collect the original clusters that were moved into the
3982 * local storage for sorting purposes
3983 */
3984 sparse_cluster_switch(wbp, vp, EOF);
3985
3986 } else {
3987 /*
3988 * we've got room to merge the leftovers back in
3989 * just append them starting at the next 'hole'
3990 * represented by wbp->cl_number
3991 */
3992 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3993 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3994 continue;
3995
3996 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3997 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3998 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3999
4000 cl_index1++;
4001 }
4002 /*
4003 * update the cluster count
4004 */
4005 wbp->cl_number = cl_index1;
4006 }
4007 }
4008 return(MAX_CLUSTERS - wbp->cl_number);
4009 }
4010
4011
4012
4013 static int
4014 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
4015 {
4016 upl_page_info_t *pl;
4017 upl_t upl;
4018 vm_offset_t upl_offset;
4019 int upl_size;
4020 off_t upl_f_offset;
4021 int pages_in_upl;
4022 int start_pg;
4023 int last_pg;
4024 int io_size;
4025 int io_flags;
4026 int upl_flags;
4027 int size;
4028 int error = 0;
4029 int retval;
4030 kern_return_t kret;
4031
4032
4033 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4034 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4035
4036 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4037 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4038
4039 return (0);
4040 }
4041 upl_size = pages_in_upl * PAGE_SIZE;
4042 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4043
4044 if (upl_f_offset + upl_size >= EOF) {
4045
4046 if (upl_f_offset >= EOF) {
4047 /*
4048 * must have truncated the file and missed
4049 * clearing a dangling cluster (i.e. it's completely
4050 * beyond the new EOF)
4051 */
4052 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4053
4054 return(0);
4055 }
4056 size = EOF - upl_f_offset;
4057
4058 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4059 pages_in_upl = upl_size / PAGE_SIZE;
4060 } else
4061 size = upl_size;
4062
4063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4064
4065 /*
4066 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4067 *
4068 * - only pages that are currently dirty are returned... these are the ones we need to clean
4069 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4070 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4071 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4072 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4073 *
4074 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4075 */
4076
4077 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4078 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4079 else
4080 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4081
4082 kret = ubc_create_upl(vp,
4083 upl_f_offset,
4084 upl_size,
4085 &upl,
4086 &pl,
4087 upl_flags);
4088 if (kret != KERN_SUCCESS)
4089 panic("cluster_push: failed to get pagelist");
4090
4091 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4092
4093 /*
4094 * since we only asked for the dirty pages back
4095 * it's possible that we may only get a few or even none, so...
4096 * before we start marching forward, we must make sure we know
4097 * where the last present page is in the UPL, otherwise we could
4098 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4099 * employed by commit_range and abort_range.
4100 */
4101 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4102 if (upl_page_present(pl, last_pg))
4103 break;
4104 }
4105 pages_in_upl = last_pg + 1;
4106
4107 if (pages_in_upl == 0) {
4108 ubc_upl_abort(upl, 0);
4109
4110 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4111 return(0);
4112 }
4113
4114 for (last_pg = 0; last_pg < pages_in_upl; ) {
4115 /*
4116 * find the next dirty page in the UPL
4117 * this will become the first page in the
4118 * next I/O to generate
4119 */
4120 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4121 if (upl_dirty_page(pl, start_pg))
4122 break;
4123 if (upl_page_present(pl, start_pg))
4124 /*
4125 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
4126 * just release these unchanged since we're not going
4127 * to steal them or change their state
4128 */
4129 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4130 }
4131 if (start_pg >= pages_in_upl)
4132 /*
4133 * done... no more dirty pages to push
4134 */
4135 break;
4136 if (start_pg > last_pg)
4137 /*
4138 * skipped over some non-dirty pages
4139 */
4140 size -= ((start_pg - last_pg) * PAGE_SIZE);
4141
4142 /*
4143 * find a range of dirty pages to write
4144 */
4145 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4146 if (!upl_dirty_page(pl, last_pg))
4147 break;
4148 }
4149 upl_offset = start_pg * PAGE_SIZE;
4150
4151 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4152
4153 io_flags = CL_THROTTLE | CL_COMMIT;
4154
4155 if ( !(flags & IO_SYNC))
4156 io_flags |= CL_ASYNC;
4157
4158 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4159 io_flags, (buf_t)NULL, (struct clios *)NULL);
4160
4161 if (error == 0 && retval)
4162 error = retval;
4163
4164 size -= io_size;
4165 }
4166 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4167
4168 return(error);
4169 }
4170
4171
4172 /*
4173 * sparse_cluster_switch is called with the write behind lock held
4174 */
4175 static void
4176 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4177 {
4178 int cl_index;
4179
4180 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4181
4182 if (wbp->cl_scmap == NULL)
4183 wbp->cl_scdirty = 0;
4184
4185 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4186 int flags;
4187 struct cl_extent cl;
4188
4189 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4190
4191 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4192 if (flags & UPL_POP_DIRTY) {
4193 cl.e_addr = cl.b_addr + 1;
4194
4195 sparse_cluster_add(wbp, vp, &cl, EOF);
4196 }
4197 }
4198 }
4199 }
4200 wbp->cl_number = 0;
4201
4202 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4203 }
4204
4205
4206 /*
4207 * sparse_cluster_push is called with the write behind lock held
4208 */
4209 static void
4210 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4211 {
4212 struct cl_extent cl;
4213 off_t offset;
4214 u_int length;
4215
4216 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4217
4218 if (push_all)
4219 vfs_drt_control(&(wbp->cl_scmap), 1);
4220
4221 for (;;) {
4222 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4223 break;
4224
4225 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4226 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4227
4228 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4229
4230 cluster_push_x(vp, &cl, EOF, 0);
4231
4232 if (push_all == 0)
4233 break;
4234 }
4235 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4236 }
4237
4238
4239 /*
4240 * sparse_cluster_add is called with the write behind lock held
4241 */
4242 static void
4243 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4244 {
4245 u_int new_dirty;
4246 u_int length;
4247 off_t offset;
4248
4249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4250
4251 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4252 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4253
4254 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4255 /*
4256 * no room left in the map
4257 * only a partial update was done
4258 * push out some pages and try again
4259 */
4260 wbp->cl_scdirty += new_dirty;
4261
4262 sparse_cluster_push(wbp, vp, EOF, 0);
4263
4264 offset += (new_dirty * PAGE_SIZE_64);
4265 length -= (new_dirty * PAGE_SIZE);
4266 }
4267 wbp->cl_scdirty += new_dirty;
4268
4269 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4270 }
4271
4272
4273 static int
4274 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4275 {
4276 struct iovec *iov;
4277 upl_page_info_t *pl;
4278 upl_t upl;
4279 addr64_t ubc_paddr;
4280 kern_return_t kret;
4281 int error = 0;
4282 int did_read = 0;
4283 int abort_flags;
4284 int upl_flags;
4285
4286 iov = uio->uio_iov;
4287
4288 upl_flags = UPL_SET_LITE;
4289 if (! (flags & CL_READ)) {
4290 /*
4291 * "write" operation: let the UPL subsystem know
4292 * that we intend to modify the buffer cache pages
4293 * we're gathering.
4294 */
4295 upl_flags |= UPL_WILL_MODIFY;
4296 }
4297
4298 kret = ubc_create_upl(vp,
4299 uio->uio_offset & ~PAGE_MASK_64,
4300 PAGE_SIZE,
4301 &upl,
4302 &pl,
4303 upl_flags);
4304
4305 if (kret != KERN_SUCCESS)
4306 return(EINVAL);
4307
4308 if (!upl_valid_page(pl, 0)) {
4309 /*
4310 * issue a synchronous read to cluster_io
4311 */
4312 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4313 CL_READ, (buf_t)NULL, (struct clios *)NULL);
4314 if (error) {
4315 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4316
4317 return(error);
4318 }
4319 did_read = 1;
4320 }
4321 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4322
4323 /*
4324 * NOTE: There is no prototype for the following in BSD. It, and the definitions
4325 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4326 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4327 * way to do so without exporting them to kexts as well.
4328 */
4329 if (flags & CL_READ)
4330 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4331 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
4332 else
4333 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4334 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
4335
4336 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4337 /*
4338 * issue a synchronous write to cluster_io
4339 */
4340 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4341 0, (buf_t)NULL, (struct clios *)NULL);
4342 }
4343 if (error == 0) {
4344 uio->uio_offset += xsize;
4345 uio_iov_base_add(uio, xsize);
4346 uio_iov_len_add(uio, -xsize);
4347 uio_setresid(uio, (uio_resid(uio) - xsize));
4348 }
4349 if (did_read)
4350 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4351 else
4352 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4353
4354 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4355
4356 return (error);
4357 }
4358
4359
4360
4361 int
4362 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4363 {
4364 int pg_offset;
4365 int pg_index;
4366 int csize;
4367 int segflg;
4368 int retval = 0;
4369 upl_page_info_t *pl;
4370
4371 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4372 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4373
4374 segflg = uio->uio_segflg;
4375
4376 switch(segflg) {
4377
4378 case UIO_USERSPACE32:
4379 case UIO_USERISPACE32:
4380 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4381 break;
4382
4383 case UIO_USERSPACE:
4384 case UIO_USERISPACE:
4385 uio->uio_segflg = UIO_PHYS_USERSPACE;
4386 break;
4387
4388 case UIO_USERSPACE64:
4389 case UIO_USERISPACE64:
4390 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4391 break;
4392
4393 case UIO_SYSSPACE32:
4394 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4395 break;
4396
4397 case UIO_SYSSPACE:
4398 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4399 break;
4400
4401 case UIO_SYSSPACE64:
4402 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4403 break;
4404 }
4405 pl = ubc_upl_pageinfo(upl);
4406
4407 pg_index = upl_offset / PAGE_SIZE;
4408 pg_offset = upl_offset & PAGE_MASK;
4409 csize = min(PAGE_SIZE - pg_offset, xsize);
4410
4411 while (xsize && retval == 0) {
4412 addr64_t paddr;
4413
4414 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4415
4416 retval = uiomove64(paddr, csize, uio);
4417
4418 pg_index += 1;
4419 pg_offset = 0;
4420 xsize -= csize;
4421 csize = min(PAGE_SIZE, xsize);
4422 }
4423 uio->uio_segflg = segflg;
4424
4425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4426 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4427
4428 return (retval);
4429 }
4430
4431
4432 int
4433 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4434 {
4435 int segflg;
4436 int io_size;
4437 int xsize;
4438 int start_offset;
4439 int retval = 0;
4440 memory_object_control_t control;
4441
4442
4443 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4444 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4445
4446 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4447 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4448 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4449 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4450
4451 return(0);
4452 }
4453 segflg = uio->uio_segflg;
4454
4455 switch(segflg) {
4456
4457 case UIO_USERSPACE32:
4458 case UIO_USERISPACE32:
4459 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4460 break;
4461
4462 case UIO_USERSPACE64:
4463 case UIO_USERISPACE64:
4464 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4465 break;
4466
4467 case UIO_SYSSPACE32:
4468 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4469 break;
4470
4471 case UIO_SYSSPACE64:
4472 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4473 break;
4474
4475 case UIO_USERSPACE:
4476 case UIO_USERISPACE:
4477 uio->uio_segflg = UIO_PHYS_USERSPACE;
4478 break;
4479
4480 case UIO_SYSSPACE:
4481 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4482 break;
4483 }
4484
4485 if ( (io_size = *io_resid) ) {
4486 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4487 xsize = uio_resid(uio);
4488
4489 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4490 uio, start_offset, io_size, mark_dirty);
4491 xsize -= uio_resid(uio);
4492 io_size -= xsize;
4493 }
4494 uio->uio_segflg = segflg;
4495 *io_resid = io_size;
4496
4497 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4498 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4499
4500 return(retval);
4501 }
4502
4503
4504 int
4505 is_file_clean(vnode_t vp, off_t filesize)
4506 {
4507 off_t f_offset;
4508 int flags;
4509 int total_dirty = 0;
4510
4511 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4512 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4513 if (flags & UPL_POP_DIRTY) {
4514 total_dirty++;
4515 }
4516 }
4517 }
4518 if (total_dirty)
4519 return(EINVAL);
4520
4521 return (0);
4522 }
4523
4524
4525
4526 /*
4527 * Dirty region tracking/clustering mechanism.
4528 *
4529 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4530 * dirty regions within a larger space (file). It is primarily intended to
4531 * support clustering in large files with many dirty areas.
4532 *
4533 * The implementation assumes that the dirty regions are pages.
4534 *
4535 * To represent dirty pages within the file, we store bit vectors in a
4536 * variable-size circular hash.
4537 */
4538
4539 /*
4540 * Bitvector size. This determines the number of pages we group in a
4541 * single hashtable entry. Each hashtable entry is aligned to this
4542 * size within the file.
4543 */
4544 #define DRT_BITVECTOR_PAGES 256
4545
4546 /*
4547 * File offset handling.
4548 *
4549 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4550 * the correct formula is (~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1))
4551 */
4552 #define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4553 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
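/*
 * worked example (assuming 4K pages, so each hash entry spans
 * 256 * 4096 bytes == 1MB of the file): for a file offset of 0x00123456,
 * DRT_ALIGN_ADDRESS() yields 0x00100000, and the page in question is
 * page (0x00123456 - 0x00100000) / PAGE_SIZE == 0x23 (35) within that
 * entry's bitvector
 */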
4554
4555 /*
4556 * Hashtable address field handling.
4557 *
4558 * The low-order bits of the hashtable address are used to conserve
4559 * space.
4560 *
4561 * DRT_HASH_COUNT_MASK must be large enough to store the range
4562 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4563 * to indicate that the bucket is actually unoccupied.
4564 */
4565 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4566 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
4567 do { \
4568 (scm)->scm_hashtable[(i)].dhe_control = \
4569 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4570 } while (0)
4571 #define DRT_HASH_COUNT_MASK 0x1ff
4572 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4573 #define DRT_HASH_SET_COUNT(scm, i, c) \
4574 do { \
4575 (scm)->scm_hashtable[(i)].dhe_control = \
4576 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4577 } while (0)
4578 #define DRT_HASH_CLEAR(scm, i) \
4579 do { \
4580 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4581 } while (0)
4582 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4583 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
4584 #define DRT_HASH_COPY(oscm, oi, scm, i) \
4585 do { \
4586 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4587 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4588 } while (0)
4589
4590
4591 /*
4592 * Hash table moduli.
4593 *
4594 * Since the hashtable entry's size is dependent on the size of
4595 * the bitvector, and since the hashtable size is constrained to
4596 * both being prime and fitting within the desired allocation
4597 * size, these values need to be manually determined.
4598 *
4599 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4600 *
4601 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4602 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4603 */
4604 #define DRT_HASH_SMALL_MODULUS 23
4605 #define DRT_HASH_LARGE_MODULUS 401
4606
4607 #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4608 #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
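/*
 * sanity arithmetic for the values above: each hash entry is
 * 8 (dhe_control) + (256 / 32) * 4 (dhe_bitvector) == 40 bytes, so the
 * entries occupy 23 * 40 == 920 and 401 * 40 == 16040 bytes, leaving the
 * quoted 104 and 344 spare bytes of the 1024 and 16384 byte allocations
 * (that spare space also has to cover the vfs_drt_clustermap header)...
 * both moduli are prime, as the hash scheme requires
 */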
4609
4610 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4611
4612 /*
4613 * Hashtable bitvector handling.
4614 *
4615 * Bitvector fields are 32 bits long.
4616 */
4617
4618 #define DRT_HASH_SET_BIT(scm, i, bit) \
4619 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4620
4621 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4622 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4623
4624 #define DRT_HASH_TEST_BIT(scm, i, bit) \
4625 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4626
4627 #define DRT_BITVECTOR_CLEAR(scm, i) \
4628 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4629
4630 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4631 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4632 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4633 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
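/*
 * continuing the worked example above: page index 35 within an entry is
 * tracked in dhe_bitvector[35 / 32] == dhe_bitvector[1], under the mask
 * (1 << (35 % 32)) == (1 << 3)
 */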
4634
4635
4636
4637 /*
4638 * Hashtable entry.
4639 */
4640 struct vfs_drt_hashentry {
4641 u_int64_t dhe_control;
4642 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4643 };
4644
4645 /*
4646 * Dirty Region Tracking structure.
4647 *
4648 * The hashtable is allocated entirely inside the DRT structure.
4649 *
4650 * The hash is a simple circular prime modulus arrangement, the structure
4651 * is resized from small to large if it overflows.
4652 */
4653
4654 struct vfs_drt_clustermap {
4655 u_int32_t scm_magic; /* sanity/detection */
4656 #define DRT_SCM_MAGIC 0x12020003
4657 u_int32_t scm_modulus; /* current ring size */
4658 u_int32_t scm_buckets; /* number of occupied buckets */
4659 u_int32_t scm_lastclean; /* last entry we cleaned */
4660 u_int32_t scm_iskips; /* number of slot skips */
4661
4662 struct vfs_drt_hashentry scm_hashtable[0];
4663 };
4664
4665
4666 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4667 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
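/*
 * lookups hash the DRT_ALIGN_ADDRESS()-aligned offset with DRT_HASH() and
 * then probe linearly with DRT_HASH_NEXT() until either a matching entry
 * or a vacant slot is found (see vfs_drt_search_index() and
 * vfs_drt_get_index() below)
 */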
4668
4669 /*
4670 * Debugging codes and arguments.
4671 */
4672 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4673 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4674 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4675 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4676 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4677 * dirty */
4678 /* 0, setcount */
4679 /* 1 (clean, no map) */
4680 /* 2 (map alloc fail) */
4681 /* 3, resid (partial) */
4682 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4683 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4684 * lastclean, iskips */
4685
4686
4687 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4688 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4689 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4690 u_int64_t offset, int *indexp);
4691 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4692 u_int64_t offset,
4693 int *indexp,
4694 int recursed);
4695 static kern_return_t vfs_drt_do_mark_pages(
4696 void **cmapp,
4697 u_int64_t offset,
4698 u_int length,
4699 int *setcountp,
4700 int dirty);
4701 static void vfs_drt_trace(
4702 struct vfs_drt_clustermap *cmap,
4703 int code,
4704 int arg1,
4705 int arg2,
4706 int arg3,
4707 int arg4);
4708
4709
4710 /*
4711 * Allocate and initialise a sparse cluster map.
4712 *
4713 * Will allocate a new map, resize or compact an existing map.
4714 *
4715 * XXX we should probably have at least one intermediate map size,
4716 * as the 1:16 ratio seems a bit drastic.
4717 */
4718 static kern_return_t
4719 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4720 {
4721 struct vfs_drt_clustermap *cmap, *ocmap;
4722 kern_return_t kret;
4723 u_int64_t offset;
4724 int nsize, i, active_buckets, index, copycount;
4725
4726 ocmap = NULL;
4727 if (cmapp != NULL)
4728 ocmap = *cmapp;
4729
4730 /*
4731 * Decide on the size of the new map.
4732 */
4733 if (ocmap == NULL) {
4734 nsize = DRT_HASH_SMALL_MODULUS;
4735 } else {
4736 /* count the number of active buckets in the old map */
4737 active_buckets = 0;
4738 for (i = 0; i < ocmap->scm_modulus; i++) {
4739 if (!DRT_HASH_VACANT(ocmap, i) &&
4740 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4741 active_buckets++;
4742 }
4743 /*
4744 * If we're currently using the small allocation, check to
4745 * see whether we should grow to the large one.
4746 */
4747 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4748 /* if the ring is nearly full */
4749 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4750 nsize = DRT_HASH_LARGE_MODULUS;
4751 } else {
4752 nsize = DRT_HASH_SMALL_MODULUS;
4753 }
4754 } else {
4755 /* already using the large modulus */
4756 nsize = DRT_HASH_LARGE_MODULUS;
4757 /*
4758 * If the ring is completely full, there's
4759 * nothing useful for us to do. Behave as
4760 * though we had compacted into the new
4761 * array and return.
4762 */
4763 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4764 return(KERN_SUCCESS);
4765 }
4766 }
4767
4768 /*
4769 * Allocate and initialise the new map.
4770 */
4771
4772 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4773 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4774 if (kret != KERN_SUCCESS)
4775 return(kret);
4776 cmap->scm_magic = DRT_SCM_MAGIC;
4777 cmap->scm_modulus = nsize;
4778 cmap->scm_buckets = 0;
4779 cmap->scm_lastclean = 0;
4780 cmap->scm_iskips = 0;
4781 for (i = 0; i < cmap->scm_modulus; i++) {
4782 DRT_HASH_CLEAR(cmap, i);
4783 DRT_HASH_VACATE(cmap, i);
4784 DRT_BITVECTOR_CLEAR(cmap, i);
4785 }
4786
4787 /*
4788 * If there's an old map, re-hash entries from it into the new map.
4789 */
4790 copycount = 0;
4791 if (ocmap != NULL) {
4792 for (i = 0; i < ocmap->scm_modulus; i++) {
4793 /* skip empty buckets */
4794 if (DRT_HASH_VACANT(ocmap, i) ||
4795 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4796 continue;
4797 /* get new index */
4798 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4799 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4800 if (kret != KERN_SUCCESS) {
4801 /* XXX need to bail out gracefully here */
4802 panic("vfs_drt: new cluster map mysteriously too small");
4803 }
4804 /* copy */
4805 DRT_HASH_COPY(ocmap, i, cmap, index);
4806 copycount++;
4807 }
4808 }
4809
4810 /* log what we've done */
4811 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4812
4813 /*
4814 * It's important to ensure that *cmapp always points to
4815 * a valid map, so we must overwrite it before freeing
4816 * the old map.
4817 */
4818 *cmapp = cmap;
4819 if (ocmap != NULL) {
4820 /* emit stats into trace buffer */
4821 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4822 ocmap->scm_modulus,
4823 ocmap->scm_buckets,
4824 ocmap->scm_lastclean,
4825 ocmap->scm_iskips);
4826
4827 vfs_drt_free_map(ocmap);
4828 }
4829 return(KERN_SUCCESS);
4830 }
4831
4832
4833 /*
4834 * Free a sparse cluster map.
4835 */
4836 static kern_return_t
4837 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4838 {
4839 kmem_free(kernel_map, (vm_offset_t)cmap,
4840 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4841 return(KERN_SUCCESS);
4842 }
4843
4844
4845 /*
4846 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4847 */
4848 static kern_return_t
4849 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4850 {
4851 int index, i;
4852
4853 offset = DRT_ALIGN_ADDRESS(offset);
4854 index = DRT_HASH(cmap, offset);
4855
4856 /* traverse the hashtable */
4857 for (i = 0; i < cmap->scm_modulus; i++) {
4858
4859 /*
4860 * If the slot is vacant, we can stop.
4861 */
4862 if (DRT_HASH_VACANT(cmap, index))
4863 break;
4864
4865 /*
4866 * If the address matches our offset, we have success.
4867 */
4868 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4869 *indexp = index;
4870 return(KERN_SUCCESS);
4871 }
4872
4873 /*
4874 * Move to the next slot, try again.
4875 */
4876 index = DRT_HASH_NEXT(cmap, index);
4877 }
4878 /*
4879 * It's not there.
4880 */
4881 return(KERN_FAILURE);
4882 }
4883
4884 /*
4885 * Find the hashtable slot for the supplied offset. If we haven't allocated
4886 * one yet, allocate one and populate the address field. Note that it will
4887 * not have a nonzero page count and thus will still technically be free, so
4888 * in the case where we are called to clean pages, the slot will remain free.
4889 */
4890 static kern_return_t
4891 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4892 {
4893 struct vfs_drt_clustermap *cmap;
4894 kern_return_t kret;
4895 int index, i;
4896
4897 cmap = *cmapp;
4898
4899 /* look for an existing entry */
4900 kret = vfs_drt_search_index(cmap, offset, indexp);
4901 if (kret == KERN_SUCCESS)
4902 return(kret);
4903
4904 /* need to allocate an entry */
4905 offset = DRT_ALIGN_ADDRESS(offset);
4906 index = DRT_HASH(cmap, offset);
4907
4908 /* scan from the index forwards looking for a vacant slot */
4909 for (i = 0; i < cmap->scm_modulus; i++) {
4910 /* slot vacant? */
4911 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4912 cmap->scm_buckets++;
4913 if (index < cmap->scm_lastclean)
4914 cmap->scm_lastclean = index;
4915 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4916 DRT_HASH_SET_COUNT(cmap, index, 0);
4917 DRT_BITVECTOR_CLEAR(cmap, index);
4918 *indexp = index;
4919 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4920 return(KERN_SUCCESS);
4921 }
4922 cmap->scm_iskips += i;
4923 index = DRT_HASH_NEXT(cmap, index);
4924 }
4925
4926 /*
4927 * We haven't found a vacant slot, so the map is full. If we haven't
4928 * already recursed, try reallocating/compacting it.
4929 */
4930 if (recursed)
4931 return(KERN_FAILURE);
4932 kret = vfs_drt_alloc_map(cmapp);
4933 if (kret == KERN_SUCCESS) {
4934 /* now try to insert again */
4935 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4936 }
4937 return(kret);
4938 }
4939
4940 /*
4941 * Implementation of set dirty/clean.
4942 *
4943 * In the 'clean' case, not finding a map is OK.
4944 */
4945 static kern_return_t
4946 vfs_drt_do_mark_pages(
4947 void **private,
4948 u_int64_t offset,
4949 u_int length,
4950 int *setcountp,
4951 int dirty)
4952 {
4953 struct vfs_drt_clustermap *cmap, **cmapp;
4954 kern_return_t kret;
4955 int i, index, pgoff, pgcount, setcount, ecount;
4956
4957 cmapp = (struct vfs_drt_clustermap **)private;
4958 cmap = *cmapp;
4959
4960 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4961
4962 if (setcountp != NULL)
4963 *setcountp = 0;
4964
4965 /* allocate a cluster map if we don't already have one */
4966 if (cmap == NULL) {
4967 /* no cluster map, nothing to clean */
4968 if (!dirty) {
4969 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4970 return(KERN_SUCCESS);
4971 }
4972 kret = vfs_drt_alloc_map(cmapp);
4973 if (kret != KERN_SUCCESS) {
4974 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4975 return(kret);
4976 }
4977 }
4978 setcount = 0;
4979
4980 /*
4981 * Iterate over the length of the region.
4982 */
4983 while (length > 0) {
4984 /*
4985 * Get the hashtable index for this offset.
4986 *
4987 * XXX this will add blank entries if we are clearing a range
4988 * that hasn't been dirtied.
4989 */
4990 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4991 cmap = *cmapp; /* may have changed! */
4992 /* this may be a partial-success return */
4993 if (kret != KERN_SUCCESS) {
4994 if (setcountp != NULL)
4995 *setcountp = setcount;
4996 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4997
4998 return(kret);
4999 }
5000
5001 /*
5002 * Work out how many pages we're modifying in this
5003 * hashtable entry.
5004 */
5005 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
5006 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
5007
5008 /*
5009 * Iterate over pages, dirtying or clearing as we go.
5010 */
5011 ecount = DRT_HASH_GET_COUNT(cmap, index);
5012 for (i = 0; i < pgcount; i++) {
5013 if (dirty) {
5014 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5015 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
5016 ecount++;
5017 setcount++;
5018 }
5019 } else {
5020 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
5021 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
5022 ecount--;
5023 setcount++;
5024 }
5025 }
5026 }
5027 DRT_HASH_SET_COUNT(cmap, index, ecount);
5028
5029 offset += pgcount * PAGE_SIZE;
5030 length -= pgcount * PAGE_SIZE;
5031 }
5032 if (setcountp != NULL)
5033 *setcountp = setcount;
5034
5035 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5036
5037 return(KERN_SUCCESS);
5038 }
5039
5040 /*
5041 * Mark a set of pages as dirty/clean.
5042 *
5043 * This is a public interface.
5044 *
5045 * cmapp
5046 * Pointer to storage suitable for holding a pointer. Note that
5047 * this must either be NULL or a value set by this function.
5048 *
5049 * size
5050 * Current file size in bytes.
5051 *
5052 * offset
5053 * Offset of the first page to be marked as dirty, in bytes. Must be
5054 * page-aligned.
5055 *
5056 * length
5057 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5058 *
5059 * setcountp
5060 * Number of pages newly marked dirty by this call (optional).
5061 *
5062 * Returns KERN_SUCCESS if all the pages were successfully marked.
5063 */
5064 static kern_return_t
5065 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5066 {
5067 /* XXX size unused, drop from interface */
5068 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5069 }
5070
5071 #if 0
5072 static kern_return_t
5073 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5074 {
5075 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5076 }
5077 #endif
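/*
 * illustrative sketch (not compiled) of how the two public entry points
 * fit together... the real callers are sparse_cluster_add() and
 * sparse_cluster_push() earlier in this file; 'scmap', 'offset', 'length'
 * and the push step are hypothetical stand-ins
 */
#if 0
static void
vfs_drt_example(off_t offset, u_int length)
{
	void	*scmap = NULL;		/* private storage managed by the map code */
	off_t	cl_offset;
	u_int	cl_length;
	int	new_dirty;

	/* record a page-aligned dirty range (typically called repeatedly) */
	(void) vfs_drt_mark_pages(&scmap, offset, length, &new_dirty);

	/* drain the map, one cluster of contiguous dirty pages at a time */
	while (vfs_drt_get_cluster(&scmap, &cl_offset, &cl_length) == KERN_SUCCESS) {
		/* push pages [cl_offset, cl_offset + cl_length) to disk here */
	}
	/*
	 * once no dirty pages remain, vfs_drt_get_cluster() frees the map
	 * and resets scmap to NULL
	 */
}
#endif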
5078
5079 /*
5080 * Get a cluster of dirty pages.
5081 *
5082 * This is a public interface.
5083 *
5084 * cmapp
5085 * Pointer to storage managed by drt_mark_pages. Note that this must
5086 * be NULL or a value set by drt_mark_pages.
5087 *
5088 * offsetp
5089 * Returns the byte offset into the file of the first page in the cluster.
5090 *
5091 * lengthp
5092 * Returns the length in bytes of the cluster of dirty pages.
5093 *
5094 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
5095 * are no dirty pages meeting the minimum size criteria. Private storage will
5096 * be released if there are no more dirty pages left in the map.
5097 *
5098 */
5099 static kern_return_t
5100 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5101 {
5102 struct vfs_drt_clustermap *cmap;
5103 u_int64_t offset;
5104 u_int length;
5105 int index, i, j, fs, ls;
5106
5107 /* sanity */
5108 if ((cmapp == NULL) || (*cmapp == NULL))
5109 return(KERN_FAILURE);
5110 cmap = *cmapp;
5111
5112 /* walk the hashtable */
5113 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5114 index = DRT_HASH(cmap, offset);
5115
5116 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5117 continue;
5118
5119 /* scan the bitfield for a string of bits */
5120 fs = -1;
5121
5122 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5123 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5124 fs = i;
5125 break;
5126 }
5127 }
5128 if (fs == -1) {
5129 /* didn't find any bits set */
5130 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5131 }
5132 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5133 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5134 break;
5135 }
5136
5137 /* compute offset and length, mark pages clean */
5138 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5139 length = ls * PAGE_SIZE;
5140 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5141 cmap->scm_lastclean = index;
5142
5143 /* return successful */
5144 *offsetp = (off_t)offset;
5145 *lengthp = length;
5146
5147 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5148 return(KERN_SUCCESS);
5149 }
5150 /*
5151 * We didn't find anything... hashtable is empty
5152 * emit stats into trace buffer and
5153 * then free it
5154 */
5155 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5156 cmap->scm_modulus,
5157 cmap->scm_buckets,
5158 cmap->scm_lastclean,
5159 cmap->scm_iskips);
5160
5161 vfs_drt_free_map(cmap);
5162 *cmapp = NULL;
5163
5164 return(KERN_FAILURE);
5165 }
5166
5167
5168 static kern_return_t
5169 vfs_drt_control(void **cmapp, int op_type)
5170 {
5171 struct vfs_drt_clustermap *cmap;
5172
5173 /* sanity */
5174 if ((cmapp == NULL) || (*cmapp == NULL))
5175 return(KERN_FAILURE);
5176 cmap = *cmapp;
5177
5178 switch (op_type) {
5179 case 0:
5180 /* emit stats into trace buffer */
5181 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5182 cmap->scm_modulus,
5183 cmap->scm_buckets,
5184 cmap->scm_lastclean,
5185 cmap->scm_iskips);
5186
5187 vfs_drt_free_map(cmap);
5188 *cmapp = NULL;
5189 break;
5190
5191 case 1:
5192 cmap->scm_lastclean = 0;
5193 break;
5194 }
5195 return(KERN_SUCCESS);
5196 }
5197
5198
5199
5200 /*
5201 * Emit a summary of the state of the clustermap into the trace buffer
5202 * along with some caller-provided data.
5203 */
5204 #if KDEBUG
5205 static void
5206 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5207 {
5208 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5209 }
5210 #else
5211 static void
5212 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5213 __unused int arg1, __unused int arg2, __unused int arg3,
5214 __unused int arg4)
5215 {
5216 }
5217 #endif
5218
5219 #if 0
5220 /*
5221 * Perform basic sanity check on the hash entry summary count
5222 * vs. the actual bits set in the entry.
5223 */
5224 static void
5225 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5226 {
5227 int index, i;
5228 int bits_on;
5229
5230 for (index = 0; index < cmap->scm_modulus; index++) {
5231 if (DRT_HASH_VACANT(cmap, index))
5232 continue;
5233
5234 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5235 if (DRT_HASH_TEST_BIT(cmap, index, i))
5236 bits_on++;
5237 }
5238 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5239 panic("bits_on = %d, index = %d\n", bits_on, index);
5240 }
5241 }
5242 #endif