[apple/xnu.git] / bsd / vfs / vfs_cluster.c (xnu-792.22.5)
1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
62 */
63
64 #include <sys/param.h>
65 #include <sys/proc_internal.h>
66 #include <sys/buf_internal.h>
67 #include <sys/mount_internal.h>
68 #include <sys/vnode_internal.h>
69 #include <sys/trace.h>
70 #include <sys/malloc.h>
71 #include <sys/time.h>
72 #include <sys/kernel.h>
73 #include <sys/resourcevar.h>
74 #include <sys/uio_internal.h>
75 #include <libkern/libkern.h>
76 #include <machine/machine_routines.h>
77
78 #include <sys/ubc_internal.h>
79
80 #include <mach/mach_types.h>
81 #include <mach/memory_object_types.h>
82 #include <mach/vm_map.h>
83 #include <mach/upl.h>
84
85 #include <vm/vm_kern.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_pageout.h>
88
89 #include <sys/kdebug.h>
90
91
92 #define CL_READ 0x01
93 #define CL_ASYNC 0x02
94 #define CL_COMMIT 0x04
95 #define CL_PAGEOUT 0x10
96 #define CL_AGE 0x20
97 #define CL_DUMP 0x40
98 #define CL_NOZERO 0x80
99 #define CL_PAGEIN 0x100
100 #define CL_DEV_MEMORY 0x200
101 #define CL_PRESERVE 0x400
102 #define CL_THROTTLE 0x800
103 #define CL_KEEPCACHED 0x1000
104
105
106 struct clios {
107 u_int io_completed; /* amount of io that has currently completed */
108 u_int io_issued; /* amount of io that was successfully issued */
109 int io_error; /* error code of first error encountered */
110 int io_wanted; /* someone is sleeping waiting for a change in state */
111 };
112
113 static lck_grp_t *cl_mtx_grp;
114 static lck_attr_t *cl_mtx_attr;
115 static lck_grp_attr_t *cl_mtx_grp_attr;
116 static lck_mtx_t *cl_mtxp;
117
118
119 static int cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
120 int flags, buf_t real_bp, struct clios *iostate);
121 static int cluster_iodone(buf_t bp, void *dummy);
122 static int cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize);
123 static int cluster_hard_throttle_on(vnode_t vp);
124
125 static int cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags);
126 static int cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF,
127 off_t headOff, off_t tailOff, int flags);
128 static int cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize);
129 static int cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF);
130 static int cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize);
131 static int cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF);
132 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags);
133
134 static void cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra);
135
136 static int cluster_push_x(vnode_t vp, struct cl_extent *, off_t EOF, int flags);
137 static void cluster_push_EOF(vnode_t vp, off_t EOF);
138
139 static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int can_delay, int push_all);
140
141 static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF);
142 static void sparse_cluster_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_all);
143 static void sparse_cluster_add(struct cl_writebehind *, vnode_t vp, struct cl_extent *, off_t EOF);
144
145 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
146 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
147 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
148
149 int is_file_clean(vnode_t, off_t);
150
151 /*
152 * throttle the number of async writes that
153 * can be outstanding on a single vnode
154 * before we issue a synchronous write
155 */
156 #define HARD_THROTTLE_MAXCNT 0
157 #define HARD_THROTTLE_MAXSIZE (64 * 1024)
158
159 int hard_throttle_on_root = 0;
160 struct timeval priority_IO_timestamp_for_root;
161
162
163 void
164 cluster_init(void) {
165 /*
166 * allocate lock group attribute and group
167 */
168 cl_mtx_grp_attr = lck_grp_attr_alloc_init();
169 cl_mtx_grp = lck_grp_alloc_init("cluster I/O", cl_mtx_grp_attr);
170
171 /*
172 * allocate the lock attribute
173 */
174 cl_mtx_attr = lck_attr_alloc_init();
175
176 /*
177 * allocate and initialize mutex's used to protect updates and waits
178 * on the cluster_io context
179 */
180 cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr);
181
182 if (cl_mtxp == NULL)
183 panic("cluster_init: failed to allocate cl_mtxp");
184 }
185
186
187
188 #define CLW_ALLOCATE 0x01
189 #define CLW_RETURNLOCKED 0x02
190 /*
191 * if the read ahead context doesn't yet exist,
192 * allocate and initialize it...
193 * the vnode lock serializes multiple callers
194 * during the actual assignment... first one
195 * to grab the lock wins... the other callers
196 * will release the now unnecessary storage
197 *
198 * once the context is present, try to grab (but don't block on)
199 * the lock associated with it... if someone
200  * else currently owns it, then the read
201 * will run without read-ahead. this allows
202 * multiple readers to run in parallel and
203 * since there's only 1 read ahead context,
204 * there's no real loss in only allowing 1
205 * reader to have read-ahead enabled.
206 */
207 static struct cl_readahead *
208 cluster_get_rap(vnode_t vp)
209 {
210 struct ubc_info *ubc;
211 struct cl_readahead *rap;
212
213 ubc = vp->v_ubcinfo;
214
215 if ((rap = ubc->cl_rahead) == NULL) {
216 MALLOC_ZONE(rap, struct cl_readahead *, sizeof *rap, M_CLRDAHEAD, M_WAITOK);
217
218 bzero(rap, sizeof *rap);
219 rap->cl_lastr = -1;
220 lck_mtx_init(&rap->cl_lockr, cl_mtx_grp, cl_mtx_attr);
221
222 vnode_lock(vp);
223
224 if (ubc->cl_rahead == NULL)
225 ubc->cl_rahead = rap;
226 else {
227 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
228 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
229 rap = ubc->cl_rahead;
230 }
231 vnode_unlock(vp);
232 }
233 if (lck_mtx_try_lock(&rap->cl_lockr) == TRUE)
234 return(rap);
235
236 return ((struct cl_readahead *)NULL);
237 }
238
239
240 /*
241 * if the write behind context doesn't yet exist,
242 * and CLW_ALLOCATE is specified, allocate and initialize it...
243 * the vnode lock serializes multiple callers
244 * during the actual assignment... first one
245 * to grab the lock wins... the other callers
246 * will release the now unnecessary storage
247 *
248 * if CLW_RETURNLOCKED is set, grab (blocking if necessary)
249 * the lock associated with the write behind context before
250 * returning
251 */
252
253 static struct cl_writebehind *
254 cluster_get_wbp(vnode_t vp, int flags)
255 {
256 struct ubc_info *ubc;
257 struct cl_writebehind *wbp;
258
259 ubc = vp->v_ubcinfo;
260
261 if ((wbp = ubc->cl_wbehind) == NULL) {
262
263 if ( !(flags & CLW_ALLOCATE))
264 return ((struct cl_writebehind *)NULL);
265
266 MALLOC_ZONE(wbp, struct cl_writebehind *, sizeof *wbp, M_CLWRBEHIND, M_WAITOK);
267
268 bzero(wbp, sizeof *wbp);
269 lck_mtx_init(&wbp->cl_lockw, cl_mtx_grp, cl_mtx_attr);
270
271 vnode_lock(vp);
272
273 if (ubc->cl_wbehind == NULL)
274 ubc->cl_wbehind = wbp;
275 else {
276 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
277 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
278 wbp = ubc->cl_wbehind;
279 }
280 vnode_unlock(vp);
281 }
282 if (flags & CLW_RETURNLOCKED)
283 lck_mtx_lock(&wbp->cl_lockw);
284
285 return (wbp);
286 }
287
288
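/*
 * return non-zero if I/O against this vnode should be heavily throttled...
 * only applies to the root device, and only while hard_throttle_on_root is
 * set or a priority I/O against root has occurred within the last 200 ms
 */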
289 static int
290 cluster_hard_throttle_on(vnode_t vp)
291 {
292 static struct timeval hard_throttle_maxelapsed = { 0, 200000 };
293
294 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
295 struct timeval elapsed;
296
297 if (hard_throttle_on_root)
298 return(1);
299
300 microuptime(&elapsed);
301 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
302
303 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
304 return(1);
305 }
306 return(0);
307 }
308
309
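/*
 * completion routine for the buffers issued by cluster_io... each buffer
 * points back at its transaction head and nothing happens until every
 * buffer in the chain has B_DONE set... at that point the per-buffer
 * results are rolled up, any waiting iostate is updated and woken, the
 * 'real' buf (if any) is biodone'd, and the upl is committed or aborted
 * as indicated by the saved b_flags
 */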
310 static int
311 cluster_iodone(buf_t bp, __unused void *dummy)
312 {
313 int b_flags;
314 int error;
315 int total_size;
316 int total_resid;
317 int upl_offset;
318 int zero_offset;
319 upl_t upl;
320 buf_t cbp;
321 buf_t cbp_head;
322 buf_t cbp_next;
323 buf_t real_bp;
324 struct clios *iostate;
325 int commit_size;
326 int pg_offset;
327
328 cbp_head = (buf_t)(bp->b_trans_head);
329
330 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
331 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
332
333 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
334 /*
335 * all I/O requests that are part of this transaction
336 * have to complete before we can process it
337 */
338 if ( !(cbp->b_flags & B_DONE)) {
339
340 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
341 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
342
343 return 0;
344 }
345 }
346 error = 0;
347 total_size = 0;
348 total_resid = 0;
349
350 cbp = cbp_head;
351 upl_offset = cbp->b_uploffset;
352 upl = cbp->b_upl;
353 b_flags = cbp->b_flags;
354 real_bp = cbp->b_real_bp;
355 zero_offset= cbp->b_validend;
356 iostate = (struct clios *)cbp->b_iostate;
357
358 if (real_bp)
359 real_bp->b_dev = cbp->b_dev;
360
361 while (cbp) {
362 if ((cbp->b_flags & B_ERROR) && error == 0)
363 error = cbp->b_error;
364
365 total_resid += cbp->b_resid;
366 total_size += cbp->b_bcount;
367
368 cbp_next = cbp->b_trans_next;
369
370 free_io_buf(cbp);
371
372 cbp = cbp_next;
373 }
374 if (zero_offset)
375 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
376
377 if (iostate) {
378 int need_wakeup = 0;
379
380 /*
381  * someone has issued multiple I/Os asynchronously
382 * and is waiting for them to complete (streaming)
383 */
384 lck_mtx_lock(cl_mtxp);
385
386 if (error && iostate->io_error == 0)
387 iostate->io_error = error;
388
389 iostate->io_completed += total_size;
390
391 if (iostate->io_wanted) {
392 /*
393 * someone is waiting for the state of
394 * this io stream to change
395 */
396 iostate->io_wanted = 0;
397 need_wakeup = 1;
398 }
399 lck_mtx_unlock(cl_mtxp);
400
401 if (need_wakeup)
402 wakeup((caddr_t)&iostate->io_wanted);
403 }
404 if ((b_flags & B_NEED_IODONE) && real_bp) {
405 if (error) {
406 real_bp->b_flags |= B_ERROR;
407 real_bp->b_error = error;
408 }
409 real_bp->b_resid = total_resid;
410
411 buf_biodone(real_bp);
412 }
413 if (error == 0 && total_resid)
414 error = EIO;
415
416 if (b_flags & B_COMMIT_UPL) {
417 pg_offset = upl_offset & PAGE_MASK;
418 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
419
420 if (error || (b_flags & B_NOCACHE)) {
421 int upl_abort_code;
422 int page_in = 0;
423 int page_out = 0;
424
425 if (b_flags & B_PAGEIO) {
426 if (b_flags & B_READ)
427 page_in = 1;
428 else
429 page_out = 1;
430 }
431 if (b_flags & B_CACHE) /* leave pages in the cache unchanged on error */
432 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
433 else if (page_out && (error != ENXIO)) /* transient error */
434 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
435 else if (page_in)
436 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
437 else
438 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
439
440 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
441 upl_abort_code);
442
443 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
444 (int)upl, upl_offset - pg_offset, commit_size,
445 0x80000000|upl_abort_code, 0);
446
447 } else {
448 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
449
450 if ((b_flags & B_PHYS) && (b_flags & B_READ))
451 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
452
453 if (b_flags & B_AGE)
454 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
455
456 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
457 upl_commit_flags);
458
459 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
460 (int)upl, upl_offset - pg_offset, commit_size,
461 upl_commit_flags, 0);
462 }
463 } else {
464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
465 (int)upl, upl_offset, 0, error, 0);
466 }
467
468 return (error);
469 }
470
471
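/*
 * zero 'size' bytes of the upl starting at 'upl_offset'... if the caller
 * supplied a buf with a kernel mapping (b_datap), zero through that
 * mapping, otherwise zero the upl's physical pages directly via bzero_phys
 */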
472 void
473 cluster_zero(upl_t upl, vm_offset_t upl_offset, int size, buf_t bp)
474 {
475 upl_page_info_t *pl;
476
477 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
478 upl_offset, size, (int)bp, 0, 0);
479
480 if (bp == NULL || bp->b_datap == 0) {
481
482 pl = ubc_upl_pageinfo(upl);
483
484 while (size) {
485 int page_offset;
486 int page_index;
487 addr64_t zero_addr;
488 int zero_cnt;
489
490 page_index = upl_offset / PAGE_SIZE;
491 page_offset = upl_offset & PAGE_MASK;
492
493 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
494 zero_cnt = min(PAGE_SIZE - page_offset, size);
495
496 bzero_phys(zero_addr, zero_cnt);
497
498 size -= zero_cnt;
499 upl_offset += zero_cnt;
500 }
501 } else
502 bzero((caddr_t)((vm_offset_t)bp->b_datap + upl_offset), size);
503
504 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
505 upl_offset, size, 0, 0, 0);
506 }
507
508
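/*
 * core I/O engine for the cluster layer... carves the range described by
 * (upl, upl_offset, f_offset, non_rounded_size) into bufs sized by what
 * VNOP_BLOCKMAP returns and by the device's max transfer and segment
 * limits, chains them into transactions and hands them to VNOP_STRATEGY...
 * holes (blkno == -1) are zero filled on reads and pushed or failed on
 * writes... synchronous callers wait here for completion, CL_ASYNC
 * requests are finished later by cluster_iodone
 */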
509 static int
510 cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int non_rounded_size,
511 int flags, buf_t real_bp, struct clios *iostate)
512 {
513 buf_t cbp;
514 u_int size;
515 u_int io_size;
516 int io_flags;
517 int bmap_flags;
518 int error = 0;
519 int retval = 0;
520 buf_t cbp_head = NULL;
521 buf_t cbp_tail = NULL;
522 int trans_count = 0;
523 u_int pg_count;
524 int pg_offset;
525 u_int max_iosize;
526 u_int max_vectors;
527 int priv;
528 int zero_offset = 0;
529 int async_throttle = 0;
530 mount_t mp;
531
532 mp = vp->v_mount;
533
534 if (mp->mnt_devblocksize > 1) {
535 /*
536 * round the requested size up so that this I/O ends on a
537 * page boundary in case this is a 'write'... if the filesystem
538 * has blocks allocated to back the page beyond the EOF, we want to
539 * make sure to write out the zero's that are sitting beyond the EOF
540 * so that in case the filesystem doesn't explicitly zero this area
541 * if a hole is created via a lseek/write beyond the current EOF,
542 * it will return zeros when it's read back from the disk. If the
543 * physical allocation doesn't extend for the whole page, we'll
544 * only write/read from the disk up to the end of this allocation
545 * via the extent info returned from the VNOP_BLOCKMAP call.
546 */
547 pg_offset = upl_offset & PAGE_MASK;
548
549 size = (((non_rounded_size + pg_offset) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - pg_offset;
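		/*
		 * e.g. with PAGE_SIZE 4096, upl_offset 0x1200 and non_rounded_size 0x2d00,
		 * pg_offset is 0x200 and size becomes ((0x2d00 + 0x200 + 0xfff) & ~0xfff) - 0x200,
		 * i.e. 0x2e00, so that upl_offset + size (0x1200 + 0x2e00 = 0x4000) ends
		 * on a page boundary
		 */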
550 } else {
551 /*
552 * anyone advertising a blocksize of 1 byte probably
553 * can't deal with us rounding up the request size
554 * AFP is one such filesystem/device
555 */
556 size = non_rounded_size;
557 }
558 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
559 (int)f_offset, size, upl_offset, flags, 0);
560
561 if (flags & CL_READ) {
562 io_flags = (B_READ);
563 bmap_flags = VNODE_READ;
564
565 max_iosize = mp->mnt_maxreadcnt;
566 max_vectors = mp->mnt_segreadcnt;
567 } else {
568 io_flags = 0;
569 bmap_flags = VNODE_WRITE;
570
571 max_iosize = mp->mnt_maxwritecnt;
572 max_vectors = mp->mnt_segwritecnt;
573 }
574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_NONE, max_iosize, max_vectors, mp->mnt_devblocksize, 0, 0);
575
576 /*
577 * make sure the maximum iosize is a
578 * multiple of the page size
579 */
580 max_iosize &= ~PAGE_MASK;
581
582 if (flags & CL_THROTTLE) {
583 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
584 if (max_iosize > HARD_THROTTLE_MAXSIZE)
585 max_iosize = HARD_THROTTLE_MAXSIZE;
586 async_throttle = HARD_THROTTLE_MAXCNT;
587 } else
588 async_throttle = VNODE_ASYNC_THROTTLE;
589 }
590 if (flags & CL_AGE)
591 io_flags |= B_AGE;
592 if (flags & CL_DUMP)
593 io_flags |= B_NOCACHE;
594 if (flags & (CL_PAGEIN | CL_PAGEOUT))
595 io_flags |= B_PAGEIO;
596 if (flags & CL_COMMIT)
597 io_flags |= B_COMMIT_UPL;
598 if (flags & CL_PRESERVE)
599 io_flags |= B_PHYS;
600 if (flags & CL_KEEPCACHED)
601 io_flags |= B_CACHE;
602
603 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
604 /*
605 * then we are going to end up
606 * with a page that we can't complete (the file size wasn't a multiple
607  * of PAGE_SIZE and we're trying to read to the end of the file),
608 * so we'll go ahead and zero out the portion of the page we can't
609 * read in from the file
610 */
611 zero_offset = upl_offset + non_rounded_size;
612 }
613 while (size) {
614 int pg_resid;
615 daddr64_t blkno;
616 daddr64_t lblkno;
617
618 if (size > max_iosize)
619 io_size = max_iosize;
620 else
621 io_size = size;
622
623 if ((error = VNOP_BLOCKMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, bmap_flags, NULL))) {
624 break;
625 }
626 if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno))
627 real_bp->b_blkno = blkno;
628
629 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
630 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
631
632 if (io_size == 0) {
633 /*
634 * vnop_blockmap didn't return an error... however, it did
635 * return an extent size of 0 which means we can't
636 * make forward progress on this I/O... a hole in the
637 * file would be returned as a blkno of -1 with a non-zero io_size
638 * a real extent is returned with a blkno != -1 and a non-zero io_size
639 */
640 error = EINVAL;
641 break;
642 }
643 if ( !(flags & CL_READ) && blkno == -1) {
644 off_t e_offset;
645
646 /*
647 * we're writing into a 'hole'
648 */
649 if (flags & CL_PAGEOUT) {
650 /*
651 * if we got here via cluster_pageout
652 * then just error the request and return
653 * the 'hole' should already have been covered
654 */
655 error = EINVAL;
656 break;
657 }
658 if ( !(flags & CL_COMMIT)) {
659 /*
660 * currently writes always request the commit to happen
661 * as part of the io completion... however, if the CL_COMMIT
662  * flag isn't specified, then we can't issue the abort_range
663 * since the call site is going to abort or commit the same upl..
664 * in this case we can only return an error
665 */
666 error = EINVAL;
667 break;
668 }
669 /*
670 * we can get here if the cluster code happens to
671 * pick up a page that was dirtied via mmap vs
672 * a 'write' and the page targets a 'hole'...
673 * i.e. the writes to the cluster were sparse
674 * and the file was being written for the first time
675 *
676 * we can also get here if the filesystem supports
677 * 'holes' that are less than PAGE_SIZE.... because
678 * we can't know if the range in the page that covers
679 * the 'hole' has been dirtied via an mmap or not,
680 * we have to assume the worst and try to push the
681 * entire page to storage.
682 *
683 * Try paging out the page individually before
684 * giving up entirely and dumping it (the pageout
685  * path will ensure that the zero extent accounting
686 * has been taken care of before we get back into cluster_io)
687 */
688 ubc_upl_abort_range(upl, trunc_page(upl_offset), PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
689
690 e_offset = round_page_64(f_offset + 1);
691
692 if (ubc_sync_range(vp, f_offset, e_offset, UBC_PUSHDIRTY) == 0) {
693 error = EINVAL;
694 break;
695 }
696 io_size = e_offset - f_offset;
697
698 f_offset += io_size;
699 upl_offset += io_size;
700
701 if (size >= io_size)
702 size -= io_size;
703 else
704 size = 0;
705 /*
706 * keep track of how much of the original request
707 * that we've actually completed... non_rounded_size
708 * may go negative due to us rounding the request
709 * to a page size multiple (i.e. size > non_rounded_size)
710 */
711 non_rounded_size -= io_size;
712
713 if (non_rounded_size <= 0) {
714 /*
715 * we've transferred all of the data in the original
716 * request, but we were unable to complete the tail
717 * of the last page because the file didn't have
718 * an allocation to back that portion... this is ok.
719 */
720 size = 0;
721 }
722 continue;
723 }
724 lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64);
725 /*
726 * we have now figured out how much I/O we can do - this is in 'io_size'
727 * pg_offset is the starting point in the first page for the I/O
728 * pg_count is the number of full and partial pages that 'io_size' encompasses
729 */
730 pg_offset = upl_offset & PAGE_MASK;
731
732 if (flags & CL_DEV_MEMORY) {
733 /*
734 * currently, can't deal with reading 'holes' in file
735 */
736 if (blkno == -1) {
737 error = EINVAL;
738 break;
739 }
740 /*
741 * treat physical requests as one 'giant' page
742 */
743 pg_count = 1;
744 } else
745 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
746
747 if ((flags & CL_READ) && blkno == -1) {
748 int bytes_to_zero;
749
750 /*
751 * if we're reading and blkno == -1, then we've got a
752 * 'hole' in the file that we need to deal with by zeroing
753 * out the affected area in the upl
754 */
755 if (zero_offset && io_size == size) {
756 /*
757 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
758  * then 'zero_offset' will be non-zero
759  * if the 'hole' returned by vnop_blockmap extends all the way to the eof
760  * (indicated by the io_size finishing off the I/O request for this UPL)
761  * then we're not going to issue an I/O for the
762 * last page in this upl... we need to zero both the hole and the tail
763 * of the page beyond the EOF, since the delayed zero-fill won't kick in
764 */
765 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
766
767 zero_offset = 0;
768 } else
769 bytes_to_zero = io_size;
770
771 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
772
773 if (cbp_head)
774 /*
775 * if there is a current I/O chain pending
776 * then the first page of the group we just zero'd
777 * will be handled by the I/O completion if the zero
778 * fill started in the middle of the page
779 */
780 pg_count = (io_size - pg_offset) / PAGE_SIZE;
781 else {
782 /*
783 * no pending I/O to pick up that first page
784 * so, we have to make sure it gets committed
785 * here.
786 * set the pg_offset to 0 so that the upl_commit_range
787 * starts with this page
788 */
789 pg_count = (io_size + pg_offset) / PAGE_SIZE;
790 pg_offset = 0;
791 }
792 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
793 /*
794 * if we're done with the request for this UPL
795 * then we have to make sure to commit the last page
796 * even if we only partially zero-filled it
797 */
798 pg_count++;
799
800 if (pg_count) {
801 if (pg_offset)
802 pg_resid = PAGE_SIZE - pg_offset;
803 else
804 pg_resid = 0;
805
806 if (flags & CL_COMMIT)
807 ubc_upl_commit_range(upl,
808 (upl_offset + pg_resid) & ~PAGE_MASK,
809 pg_count * PAGE_SIZE,
810 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
811 }
812 upl_offset += io_size;
813 f_offset += io_size;
814 size -= io_size;
815 /*
816 * keep track of how much of the original request
817 * that we've actually completed... non_rounded_size
818 * may go negative due to us rounding the request
819 * to a page size multiple (i.e. size > non_rounded_size)
820 */
821 non_rounded_size -= io_size;
822
823 if (non_rounded_size <= 0) {
824 /*
825 * we've transferred all of the data in the original
826 * request, but we were unable to complete the tail
827 * of the last page because the file didn't have
828 * an allocation to back that portion... this is ok.
829 */
830 size = 0;
831 }
832 if (cbp_head && pg_count)
833 goto start_io;
834 continue;
835
836 }
837 if (pg_count > max_vectors) {
838 if (((pg_count - max_vectors) * PAGE_SIZE) > io_size) {
839 io_size = PAGE_SIZE - pg_offset;
840 pg_count = 1;
841 } else {
842 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
843 pg_count = max_vectors;
844 }
845 }
846
847 if ( !(mp->mnt_kern_flag & MNTK_VIRTUALDEV))
848 /*
849 * if we're not targeting a virtual device i.e. a disk image
850 * it's safe to dip into the reserve pool since real devices
851 * can complete this I/O request without requiring additional
852 * bufs from the alloc_io_buf pool
853 */
854 priv = 1;
855 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
856 /*
857 * Throttle the speculative IO
858 */
859 priv = 0;
860 else
861 priv = 1;
862
863 cbp = alloc_io_buf(vp, priv);
864
865 if (flags & CL_PAGEOUT) {
866 u_int i;
867
868 for (i = 0; i < pg_count; i++) {
869 if (buf_invalblkno(vp, lblkno + i, 0) == EBUSY)
870 panic("BUSY bp found in cluster_io");
871 }
872 }
873 if (flags & CL_ASYNC) {
874 if (buf_setcallback(cbp, (void *)cluster_iodone, NULL))
875 panic("buf_setcallback failed\n");
876 }
877 cbp->b_flags |= io_flags;
878
879 cbp->b_lblkno = lblkno;
880 cbp->b_blkno = blkno;
881 cbp->b_bcount = io_size;
882
883 if (buf_setupl(cbp, upl, upl_offset))
884 panic("buf_setupl failed\n");
885
886 cbp->b_trans_next = (buf_t)NULL;
887
888 if ((cbp->b_iostate = (void *)iostate))
889 /*
890 * caller wants to track the state of this
891 * io... bump the amount issued against this stream
892 */
893 iostate->io_issued += io_size;
894
895 if (flags & CL_READ) {
896 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
897 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
898 }
899 else {
900 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
901 (int)cbp->b_lblkno, (int)cbp->b_blkno, upl_offset, io_size, 0);
902 }
903
904 if (cbp_head) {
905 cbp_tail->b_trans_next = cbp;
906 cbp_tail = cbp;
907 } else {
908 cbp_head = cbp;
909 cbp_tail = cbp;
910 }
911 (buf_t)(cbp->b_trans_head) = cbp_head;
912 trans_count++;
913
914 upl_offset += io_size;
915 f_offset += io_size;
916 size -= io_size;
917 /*
918 * keep track of how much of the original request
919 * that we've actually completed... non_rounded_size
920 * may go negative due to us rounding the request
921 * to a page size multiple (i.e. size > non_rounded_size)
922 */
923 non_rounded_size -= io_size;
924
925 if (non_rounded_size <= 0) {
926 /*
927 * we've transferred all of the data in the original
928 * request, but we were unable to complete the tail
929 * of the last page because the file didn't have
930 * an allocation to back that portion... this is ok.
931 */
932 size = 0;
933 }
934 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || trans_count > 8)) || size == 0) {
935 /*
936 * if we have no more I/O to issue or
937 * the current I/O we've prepared fully
938 * completes the last page in this request
939 * and it's either an ASYNC request or
940 * we've already accumulated more than 8 I/O's into
941 * this transaction and it's not an I/O directed to
942 * special DEVICE memory
943 * then go ahead and issue the I/O
944 */
945 start_io:
946 if (real_bp) {
947 cbp_head->b_flags |= B_NEED_IODONE;
948 cbp_head->b_real_bp = real_bp;
949 } else
950 cbp_head->b_real_bp = (buf_t)NULL;
951
952 if (size == 0) {
953 /*
954 * we're about to issue the last I/O for this upl
955 * if this was a read to the eof and the eof doesn't
956  * finish on a page boundary, then we need to zero-fill
957 * the rest of the page....
958 */
959 cbp_head->b_validend = zero_offset;
960 } else
961 cbp_head->b_validend = 0;
962
963 if (flags & CL_THROTTLE)
964 (void)vnode_waitforwrites(vp, async_throttle, 0, 0, (char *)"cluster_io");
965
966 for (cbp = cbp_head; cbp;) {
967 buf_t cbp_next;
968
969 if ( !(io_flags & B_READ))
970 vnode_startwrite(vp);
971
972 cbp_next = cbp->b_trans_next;
973
974 (void) VNOP_STRATEGY(cbp);
975 cbp = cbp_next;
976 }
977 if ( !(flags & CL_ASYNC)) {
978 int dummy;
979
980 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
981 buf_biowait(cbp);
982
983 if ((error = cluster_iodone(cbp_head, (void *)&dummy))) {
984 if (((flags & (CL_PAGEOUT | CL_KEEPCACHED)) == CL_PAGEOUT) && (error == ENXIO))
985 error = 0; /* drop the error */
986 else {
987 if (retval == 0)
988 retval = error;
989 error = 0;
990 }
991 }
992 }
993 cbp_head = (buf_t)NULL;
994 cbp_tail = (buf_t)NULL;
995
996 trans_count = 0;
997 }
998 }
999 if (error) {
1000 int abort_size;
1001
1002 io_size = 0;
1003
1004 for (cbp = cbp_head; cbp;) {
1005 buf_t cbp_next;
1006
1007 upl_offset -= cbp->b_bcount;
1008 size += cbp->b_bcount;
1009 io_size += cbp->b_bcount;
1010
1011 cbp_next = cbp->b_trans_next;
1012 free_io_buf(cbp);
1013 cbp = cbp_next;
1014 }
1015 if (iostate) {
1016 int need_wakeup = 0;
1017
1018 /*
1019 * update the error condition for this stream
1020 * since we never really issued the io
1021 * just go ahead and adjust it back
1022 */
1023 lck_mtx_lock(cl_mtxp);
1024
1025 if (iostate->io_error == 0)
1026 iostate->io_error = error;
1027 iostate->io_issued -= io_size;
1028
1029 if (iostate->io_wanted) {
1030 /*
1031 * someone is waiting for the state of
1032 * this io stream to change
1033 */
1034 iostate->io_wanted = 0;
1035 need_wakeup = 1;
1036 }
1037 lck_mtx_unlock(cl_mtxp);
1038
1039 if (need_wakeup)
1040 wakeup((caddr_t)&iostate->io_wanted);
1041 }
1042 pg_offset = upl_offset & PAGE_MASK;
1043 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1044
1045 if (flags & CL_COMMIT) {
1046 int upl_abort_code;
1047
1048 if (flags & CL_PRESERVE) {
1049 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
1050 UPL_COMMIT_FREE_ON_EMPTY);
1051 } else {
1052 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
1053 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
1054 else if (flags & CL_PAGEIN)
1055 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1056 else
1057 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1058
1059 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
1060 upl_abort_code);
1061 }
1062 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
1063 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1064 }
1065 if (real_bp) {
1066 real_bp->b_flags |= B_ERROR;
1067 real_bp->b_error = error;
1068
1069 buf_biodone(real_bp);
1070 }
1071 if (retval == 0)
1072 retval = error;
1073 }
1074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
1075 (int)f_offset, size, upl_offset, retval, 0);
1076
1077 return (retval);
1078 }
1079
1080
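/*
 * speculatively read up to MAX_UPL_TRANSFER pages starting at f_offset,
 * clipped to the end of the file, by way of advisory_read... returns the
 * number of pages requested so the caller can advance its read-ahead window
 */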
1081 static int
1082 cluster_rd_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize)
1083 {
1084 int pages_in_prefetch;
1085
1086 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
1087 (int)f_offset, size, (int)filesize, 0, 0);
1088
1089 if (f_offset >= filesize) {
1090 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1091 (int)f_offset, 0, 0, 0, 0);
1092 return(0);
1093 }
1094 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1095 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1096 else
1097 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1098
1099 if ((off_t)size > (filesize - f_offset))
1100 size = filesize - f_offset;
1101 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1102
1103 advisory_read(vp, filesize, f_offset, size);
1104
1105 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
1106 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1107
1108 return (pages_in_prefetch);
1109 }
1110
1111
1112
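/*
 * sequential read-ahead... if the current extent follows the last read
 * (rap->cl_lastr), the read-ahead window (cl_ralen) is doubled up to
 * MAX_UPL_TRANSFER pages and prefetched just beyond the furthest page
 * already read ahead (cl_maxra)... a non-sequential access resets the window
 */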
1113 static void
1114 cluster_rd_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *rap)
1115 {
1116 daddr64_t r_addr;
1117 off_t f_offset;
1118 int size_of_prefetch;
1119
1120
1121 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
1122 (int)extent->b_addr, (int)extent->e_addr, (int)rap->cl_lastr, 0, 0);
1123
1124 if (extent->b_addr == rap->cl_lastr && extent->b_addr == extent->e_addr) {
1125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1126 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 0, 0);
1127 return;
1128 }
1129 if (rap->cl_lastr == -1 || (extent->b_addr != rap->cl_lastr && extent->b_addr != (rap->cl_lastr + 1) &&
1130 (extent->b_addr != (rap->cl_maxra + 1) || rap->cl_ralen == 0))) {
1131 rap->cl_ralen = 0;
1132 rap->cl_maxra = 0;
1133
1134 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1135 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 1, 0);
1136
1137 return;
1138 }
1139 if (extent->e_addr < rap->cl_maxra) {
1140 if ((rap->cl_maxra - extent->e_addr) > (MAX_UPL_TRANSFER / 4)) {
1141
1142 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1143 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 2, 0);
1144 return;
1145 }
1146 }
1147 r_addr = max(extent->e_addr, rap->cl_maxra) + 1;
1148 f_offset = (off_t)(r_addr * PAGE_SIZE_64);
1149
1150 size_of_prefetch = 0;
1151
1152 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
1153
1154 if (size_of_prefetch) {
1155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1156 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 3, 0);
1157 return;
1158 }
1159 if (f_offset < filesize) {
1160 daddr64_t read_size;
1161
1162 rap->cl_ralen = rap->cl_ralen ? min(MAX_UPL_TRANSFER, rap->cl_ralen << 1) : 1;
1163
1164 read_size = (extent->e_addr + 1) - extent->b_addr;
1165
1166 if (read_size > rap->cl_ralen) {
1167 if (read_size > MAX_UPL_TRANSFER)
1168 rap->cl_ralen = MAX_UPL_TRANSFER;
1169 else
1170 rap->cl_ralen = read_size;
1171 }
1172 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, rap->cl_ralen * PAGE_SIZE, filesize);
1173
1174 if (size_of_prefetch)
1175 rap->cl_maxra = (r_addr + size_of_prefetch) - 1;
1176 }
1177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
1178 rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 4, 0);
1179 }
1180
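/*
 * VNOP_PAGEOUT engine... validates the request, clips it to the EOF,
 * aborts any pages beyond the rounded transfer when it owns the commit,
 * and pushes the remainder through cluster_io as a (possibly throttled)
 * pageout
 */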
1181 int
1182 cluster_pageout(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1183 int size, off_t filesize, int flags)
1184 {
1185 int io_size;
1186 int rounded_size;
1187 off_t max_size;
1188 int local_flags;
1189 struct cl_writebehind *wbp;
1190
1191 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
1192 /*
1193 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
1194 * then we don't want to enforce this throttle... if we do, we can
1195 * potentially deadlock since we're stalling the pageout thread at a time
1196 * when the disk image might need additional memory (which won't be available
1197 * if the pageout thread can't run)... instead we'll just depend on the throttle
1198 * that the pageout thread now has in place to deal with external files
1199 */
1200 local_flags = CL_PAGEOUT;
1201 else
1202 local_flags = CL_PAGEOUT | CL_THROTTLE;
1203
1204 if ((flags & UPL_IOSYNC) == 0)
1205 local_flags |= CL_ASYNC;
1206 if ((flags & UPL_NOCOMMIT) == 0)
1207 local_flags |= CL_COMMIT;
1208 if ((flags & UPL_KEEPCACHED))
1209 local_flags |= CL_KEEPCACHED;
1210
1211
1212 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
1213 (int)f_offset, size, (int)filesize, local_flags, 0);
1214
1215 /*
1216 * If they didn't specify any I/O, then we are done...
1217 * we can't issue an abort because we don't know how
1218 * big the upl really is
1219 */
1220 if (size <= 0)
1221 return (EINVAL);
1222
1223 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1224 if (local_flags & CL_COMMIT)
1225 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1226 return (EROFS);
1227 }
1228 /*
1229  * can't page-out from a negative offset
1230 * or if we're starting beyond the EOF
1231 * or if the file offset isn't page aligned
1232 * or the size requested isn't a multiple of PAGE_SIZE
1233 */
1234 if (f_offset < 0 || f_offset >= filesize ||
1235 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
1236 if (local_flags & CL_COMMIT)
1237 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1238 return (EINVAL);
1239 }
1240 max_size = filesize - f_offset;
1241
1242 if (size < max_size)
1243 io_size = size;
1244 else
1245 io_size = max_size;
1246
1247 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1248
1249 if (size > rounded_size) {
1250 if (local_flags & CL_COMMIT)
1251 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1252 UPL_ABORT_FREE_ON_EMPTY);
1253 }
1254 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1255 wbp->cl_hasbeenpaged = 1;
1256
1257 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
1258 local_flags, (buf_t)NULL, (struct clios *)NULL));
1259 }
1260
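/*
 * VNOP_PAGEIN engine... validates and clips the request, issues it through
 * cluster_io with CL_READ | CL_PAGEIN, and then updates the read-ahead
 * state if read-ahead is enabled for this vnode
 */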
1261 int
1262 cluster_pagein(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset,
1263 int size, off_t filesize, int flags)
1264 {
1265 u_int io_size;
1266 int rounded_size;
1267 off_t max_size;
1268 int retval;
1269 int local_flags = 0;
1270
1271 if (upl == NULL || size < 0)
1272 panic("cluster_pagein: NULL upl passed in");
1273
1274 if ((flags & UPL_IOSYNC) == 0)
1275 local_flags |= CL_ASYNC;
1276 if ((flags & UPL_NOCOMMIT) == 0)
1277 local_flags |= CL_COMMIT;
1278
1279
1280 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1281 (int)f_offset, size, (int)filesize, local_flags, 0);
1282
1283 /*
1284 * can't page-in from a negative offset
1285 * or if we're starting beyond the EOF
1286 * or if the file offset isn't page aligned
1287 * or the size requested isn't a multiple of PAGE_SIZE
1288 */
1289 if (f_offset < 0 || f_offset >= filesize ||
1290 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1291 if (local_flags & CL_COMMIT)
1292 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1293 return (EINVAL);
1294 }
1295 max_size = filesize - f_offset;
1296
1297 if (size < max_size)
1298 io_size = size;
1299 else
1300 io_size = max_size;
1301
1302 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1303
1304 if (size > rounded_size && (local_flags & CL_COMMIT))
1305 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1306 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1307
1308 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
1309 local_flags | CL_READ | CL_PAGEIN, (buf_t)NULL, (struct clios *)NULL);
1310
1311 if (retval == 0 && !(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1312 struct cl_readahead *rap;
1313
1314 rap = cluster_get_rap(vp);
1315
1316 if (rap != NULL) {
1317 struct cl_extent extent;
1318
1319 extent.b_addr = (daddr64_t)(f_offset / PAGE_SIZE_64);
1320 extent.e_addr = (daddr64_t)((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1321
1322 if (rounded_size == PAGE_SIZE) {
1323 /*
1324  * we haven't read the last page of the file yet
1325 * so let's try to read ahead if we're in
1326 * a sequential access pattern
1327 */
1328 cluster_rd_ahead(vp, &extent, filesize, rap);
1329 }
1330 rap->cl_lastr = extent.e_addr;
1331
1332 lck_mtx_unlock(&rap->cl_lockr);
1333 }
1334 }
1335 return (retval);
1336 }
1337
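/*
 * push a conventional buf through the cluster layer... the buf's logical
 * block is translated to a file offset via ubc_blktooff and the I/O is
 * issued asynchronously, with cluster_iodone completing the original buf
 */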
1338 int
1339 cluster_bp(buf_t bp)
1340 {
1341 off_t f_offset;
1342 int flags;
1343
1344 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1345 (int)bp, (int)bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1346
1347 if (bp->b_flags & B_READ)
1348 flags = CL_ASYNC | CL_READ;
1349 else
1350 flags = CL_ASYNC;
1351
1352 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1353
1354 return (cluster_io(bp->b_vp, bp->b_upl, 0, f_offset, bp->b_bcount, flags, bp, (struct clios *)NULL));
1355 }
1356
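/*
 * top level write entry point... decides, per uio vector, whether to write
 * through the cache (cluster_write_x), go direct with a page aligned
 * transfer (cluster_nocopy_write), or treat the source as a single
 * physically contiguous region (cluster_phys_write)
 */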
1357 int
1358 cluster_write(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int xflags)
1359 {
1360 int prev_resid;
1361 u_int clip_size;
1362 off_t max_io_size;
1363 int upl_size;
1364 int upl_flags;
1365 upl_t upl;
1366 int retval = 0;
1367 int flags;
1368
1369 flags = xflags;
1370
1371 if (vp->v_flag & VNOCACHE_DATA)
1372 flags |= IO_NOCACHE;
1373
1374 if ( (!(flags & IO_NOCACHE)) || (!uio) || (!UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
1375 /*
1376 * go do a write through the cache if one of the following is true....
1377 * NOCACHE is not true
1378 * there is no uio structure or it doesn't target USERSPACE
1379 */
1380 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1381 }
1382
1383 #if LP64_DEBUG
1384 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1385 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1386 }
1387 #endif /* LP64_DEBUG */
1388
1389 while (uio_resid(uio) && uio->uio_offset < newEOF && retval == 0) {
1390 user_size_t iov_len;
1391 user_addr_t iov_base;
1392
1393 /*
1394 * we know we have a resid, so this is safe
1395  * skip over any empty vectors
1396 */
1397 uio_update(uio, (user_size_t)0);
1398
1399 iov_len = uio_curriovlen(uio);
1400 iov_base = uio_curriovbase(uio);
1401
1402 upl_size = PAGE_SIZE;
1403 upl_flags = UPL_QUERY_OBJECT_TYPE;
1404
1405 // LP64todo - fix this!
1406 if ((vm_map_get_upl(current_map(),
1407 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1408 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
1409 /*
1410 * the user app must have passed in an invalid address
1411 */
1412 return (EFAULT);
1413 }
1414
1415 /*
1416 * We check every vector target but if it is physically
1417 * contiguous space, we skip the sanity checks.
1418 */
1419 if (upl_flags & UPL_PHYS_CONTIG) {
1420 int zflags;
1421
1422 zflags = flags & ~IO_TAILZEROFILL;
1423 zflags |= IO_HEADZEROFILL;
1424
1425 if (flags & IO_HEADZEROFILL) {
1426 /*
1427 * in case we have additional vectors, we don't want to do this again
1428 */
1429 flags &= ~IO_HEADZEROFILL;
1430
1431 if ((retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, zflags)))
1432 return(retval);
1433 }
1434 retval = cluster_phys_write(vp, uio, newEOF);
1435
1436 if (uio_resid(uio) == 0 && (flags & IO_TAILZEROFILL)) {
1437 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, zflags));
1438 }
1439 }
1440 else if ((uio_resid(uio) < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL))) {
1441 /*
1442  * we're here because we don't have a physically contiguous target buffer
1443 * go do a write through the cache if one of the following is true....
1444 * the total xfer size is less than a page...
1445 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1446 */
1447 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags));
1448 }
1449 // LP64todo - fix this!
1450 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1451 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
1452 /*
1453 * Bring the file offset write up to a pagesize boundary
1454 * this will also bring the base address to a page boundary
1455 * since they both are currently on the same offset within a page
1456 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1457 * so the computed clip_size must always be less than the current uio_resid
1458 */
1459 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1460
1461 /*
1462 * Fake the resid going into the cluster_write_x call
1463 * and restore it on the way out.
1464 */
1465 // LP64todo - fix this
1466 prev_resid = uio_resid(uio);
1467 uio_setresid(uio, clip_size);
1468
1469 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1470
1471 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1472 } else {
1473 /*
1474 * can't get both the file offset and the buffer offset aligned to a page boundary
1475 * so fire an I/O through the cache for this entire vector
1476 */
1477 // LP64todo - fix this
1478 clip_size = iov_len;
1479 // LP64todo - fix this
1480 prev_resid = uio_resid(uio);
1481 uio_setresid(uio, clip_size);
1482
1483 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1484
1485 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1486 }
1487 } else {
1488 /*
1489 * If we come in here, we know the offset into
1490 * the file is on a pagesize boundary and the
1491 * target buffer address is also on a page boundary
1492 */
1493 max_io_size = newEOF - uio->uio_offset;
1494 // LP64todo - fix this
1495 clip_size = uio_resid(uio);
1496 if (iov_len < clip_size)
1497 // LP64todo - fix this!
1498 clip_size = iov_len;
1499 if (max_io_size < clip_size)
1500 clip_size = max_io_size;
1501
1502 if (clip_size < PAGE_SIZE) {
1503 /*
1504 * Take care of tail end of write in this vector
1505 */
1506 // LP64todo - fix this
1507 prev_resid = uio_resid(uio);
1508 uio_setresid(uio, clip_size);
1509
1510 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1511
1512 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1513 } else {
1514 /* round clip_size down to a multiple of pagesize */
1515 clip_size = clip_size & ~(PAGE_MASK);
1516 // LP64todo - fix this
1517 prev_resid = uio_resid(uio);
1518 uio_setresid(uio, clip_size);
1519
1520 retval = cluster_nocopy_write(vp, uio, newEOF);
1521
1522 if ((retval == 0) && uio_resid(uio))
1523 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, flags);
1524
1525 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
1526 }
1527 } /* end else */
1528 } /* end while */
1529
1530 return(retval);
1531 }
1532
1533
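/*
 * direct (uncached) write path... wires the user buffer a chunk at a time
 * with vm_map_get_upl, dumps any overlapping pages already in the cache
 * (UPL_ROP_DUMP), and streams the chunks out asynchronously through
 * cluster_io, throttling on the amount of outstanding I/O tracked in the
 * local clios state
 */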
1534 static int
1535 cluster_nocopy_write(vnode_t vp, struct uio *uio, off_t newEOF)
1536 {
1537 upl_t upl;
1538 upl_page_info_t *pl;
1539 vm_offset_t upl_offset;
1540 int io_size;
1541 int io_flag;
1542 int upl_size;
1543 int upl_needed_size;
1544 int pages_in_pl;
1545 int upl_flags;
1546 kern_return_t kret;
1547 int i;
1548 int force_data_sync;
1549 int error = 0;
1550 struct clios iostate;
1551 struct cl_writebehind *wbp;
1552
1553
1554 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1555 (int)uio->uio_offset, (int)uio_resid(uio),
1556 (int)newEOF, 0, 0);
1557
1558 /*
1559 * When we enter this routine, we know
1560 * -- the offset into the file is on a pagesize boundary
1561 * -- the resid is a page multiple
1562 * -- the resid will not exceed iov_len
1563 */
1564
1565 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1566
1567 cluster_try_push(wbp, vp, newEOF, 0, 1);
1568
1569 lck_mtx_unlock(&wbp->cl_lockw);
1570 }
1571 iostate.io_completed = 0;
1572 iostate.io_issued = 0;
1573 iostate.io_error = 0;
1574 iostate.io_wanted = 0;
1575
1576 while (uio_resid(uio) && uio->uio_offset < newEOF && error == 0) {
1577 user_addr_t iov_base;
1578
1579 io_size = uio_resid(uio);
1580
1581 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1582 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1583
1584 iov_base = uio_curriovbase(uio);
1585
1586 // LP64todo - fix this!
1587 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
1588
1589 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1590
1591 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1592 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
1593
1594 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1595 pages_in_pl = 0;
1596 upl_size = upl_needed_size;
1597 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1598 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1599
1600 // LP64todo - fix this!
1601 kret = vm_map_get_upl(current_map(),
1602 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1603 &upl_size,
1604 &upl,
1605 NULL,
1606 &pages_in_pl,
1607 &upl_flags,
1608 force_data_sync);
1609
1610 if (kret != KERN_SUCCESS) {
1611 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1612 0, 0, 0, kret, 0);
1613 /*
1614 * cluster_nocopy_write: failed to get pagelist
1615 *
1616 * we may have already spun some portion of this request
1617 * off as async requests... we need to wait for the I/O
1618 * to complete before returning
1619 */
1620 goto wait_for_writes;
1621 }
1622 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1623 pages_in_pl = upl_size / PAGE_SIZE;
1624
1625 for (i = 0; i < pages_in_pl; i++) {
1626 if (!upl_valid_page(pl, i))
1627 break;
1628 }
1629 if (i == pages_in_pl)
1630 break;
1631
1632 /*
1633 * didn't get all the pages back that we
1634 * needed... release this upl and try again
1635 */
1636 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1637 UPL_ABORT_FREE_ON_EMPTY);
1638 }
1639 if (force_data_sync >= 3) {
1640 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1641 i, pages_in_pl, upl_size, kret, 0);
1642 /*
1643 * for some reason, we couldn't acquire a hold on all
1644 * the pages needed in the user's address space
1645 *
1646 * we may have already spun some portion of this request
1647 * off as async requests... we need to wait for the I/O
1648 * to complete before returning
1649 */
1650 goto wait_for_writes;
1651 }
1652
1653 /*
1654 * Consider the possibility that upl_size wasn't satisfied.
1655 */
1656 if (upl_size != upl_needed_size)
1657 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1658
1659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1660 (int)upl_offset, upl_size, (int)iov_base, io_size, 0);
1661
1662 if (io_size == 0) {
1663 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1664 UPL_ABORT_FREE_ON_EMPTY);
1665 /*
1666 * we may have already spun some portion of this request
1667 * off as async requests... we need to wait for the I/O
1668 * to complete before returning
1669 */
1670 goto wait_for_writes;
1671 }
1672 /*
1673 * Now look for pages already in the cache
1674 * and throw them away.
1675 * uio->uio_offset is page aligned within the file
1676 * io_size is a multiple of PAGE_SIZE
1677 */
1678 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1679
1680 /*
1681  * we want to push out these writes asynchronously so that we can overlap
1682 * the preparation of the next I/O
1683 * if there are already too many outstanding writes
1684 * wait until some complete before issuing the next
1685 */
1686 lck_mtx_lock(cl_mtxp);
1687
1688 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1689 iostate.io_wanted = 1;
1690 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1691 }
1692 lck_mtx_unlock(cl_mtxp);
1693
1694 if (iostate.io_error) {
1695 /*
1696 * one of the earlier writes we issued ran into a hard error
1697 * don't issue any more writes, cleanup the UPL
1698 * that was just created but not used, then
1699 * go wait for all writes that are part of this stream
1700 * to complete before returning the error to the caller
1701 */
1702 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1703 UPL_ABORT_FREE_ON_EMPTY);
1704
1705 goto wait_for_writes;
1706 }
1707 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1708
1709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1710 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1711
1712 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1713 io_size, io_flag, (buf_t)NULL, &iostate);
1714
1715 uio_update(uio, (user_size_t)io_size);
1716
1717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1718 (int)upl_offset, (int)uio->uio_offset, (int)uio_resid(uio), error, 0);
1719
1720 } /* end while */
1721
1722 wait_for_writes:
1723 /*
1724 * make sure all async writes issued as part of this stream
1725 * have completed before we return
1726 */
1727 lck_mtx_lock(cl_mtxp);
1728
1729 while (iostate.io_issued != iostate.io_completed) {
1730 iostate.io_wanted = 1;
1731 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_write", 0);
1732 }
1733 lck_mtx_unlock(cl_mtxp);
1734
1735 if (iostate.io_error)
1736 error = iostate.io_error;
1737
1738 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1739 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1740
1741 return (error);
1742 }
1743
1744
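/*
 * write from a physically contiguous user buffer... any head or tail piece
 * that isn't aligned to the device block size goes through
 * cluster_align_phys_io, the rest is issued as a single synchronous
 * CL_DEV_MEMORY transfer
 */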
1745 static int
1746 cluster_phys_write(vnode_t vp, struct uio *uio, off_t newEOF)
1747 {
1748 upl_page_info_t *pl;
1749 addr64_t src_paddr;
1750 upl_t upl;
1751 vm_offset_t upl_offset;
1752 int tail_size;
1753 int io_size;
1754 int upl_size;
1755 int upl_needed_size;
1756 int pages_in_pl;
1757 int upl_flags;
1758 kern_return_t kret;
1759 int error = 0;
1760 user_addr_t iov_base;
1761 int devblocksize;
1762 struct cl_writebehind *wbp;
1763
1764 devblocksize = vp->v_mount->mnt_devblocksize;
1765 /*
1766 * When we enter this routine, we know
1767 * -- the resid will not exceed iov_len
1768  * -- the vector target address is physically contiguous
1769 */
1770 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) != NULL) {
1771
1772 cluster_try_push(wbp, vp, newEOF, 0, 1);
1773
1774 lck_mtx_unlock(&wbp->cl_lockw);
1775 }
1776 #if LP64_DEBUG
1777 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
1778 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
1779 }
1780 #endif /* LP64_DEBUG */
1781
1782 // LP64todo - fix this!
1783 io_size = (int)uio_curriovlen(uio);
1784 iov_base = uio_curriovbase(uio);
1785
1786 upl_offset = CAST_DOWN(upl_offset_t, iov_base) & PAGE_MASK;
1787 upl_needed_size = upl_offset + io_size;
1788
1789 pages_in_pl = 0;
1790 upl_size = upl_needed_size;
1791 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1792 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1793
1794 // LP64todo - fix this!
1795 kret = vm_map_get_upl(current_map(),
1796 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
1797 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1798
1799 if (kret != KERN_SUCCESS) {
1800 /*
1801 * cluster_phys_write: failed to get pagelist
1802 * note: return kret here
1803 */
1804 return(EINVAL);
1805 }
1806 /*
1807 * Consider the possibility that upl_size wasn't satisfied.
1808 * This is a failure in the physical memory case.
1809 */
1810 if (upl_size < upl_needed_size) {
1811 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1812 return(EINVAL);
1813 }
1814 pl = ubc_upl_pageinfo(upl);
1815
1816 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
1817
1818 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1819 int head_size;
1820
1821 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1822
1823 if (head_size > io_size)
1824 head_size = io_size;
1825
1826 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, 0);
1827
1828 if (error) {
1829 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1830
1831 return(EINVAL);
1832 }
1833 upl_offset += head_size;
1834 src_paddr += head_size;
1835 io_size -= head_size;
1836 }
1837 tail_size = io_size & (devblocksize - 1);
1838 io_size -= tail_size;
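/*
 * any residual that's smaller than a device block is split off as
 * tail_size here and handled via cluster_align_phys_io once the
 * block-aligned portion of the write has completed
 */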
1839
1840 if (io_size) {
1841 /*
1842 * issue a synchronous write to cluster_io
1843 */
1844 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1845 io_size, CL_DEV_MEMORY, (buf_t)NULL, (struct clios *)NULL);
1846 }
1847 if (error == 0) {
1848 /*
1849 * The cluster_io write completed successfully,
1850 * update the uio structure
1851 */
1852 uio_update(uio, (user_size_t)io_size);
1853
1854 src_paddr += io_size;
1855
1856 if (tail_size)
1857 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0);
1858 }
1859 /*
1860 * just release our hold on the physically contiguous
1861 * region without changing any state
1862 */
1863 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1864
1865 return (error);
1866 }
1867
1868
1869 static int
1870 cluster_write_x(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t headOff, off_t tailOff, int flags)
1871 {
1872 upl_page_info_t *pl;
1873 upl_t upl;
1874 vm_offset_t upl_offset = 0;
1875 int upl_size;
1876 off_t upl_f_offset;
1877 int pages_in_upl;
1878 int start_offset;
1879 int xfer_resid;
1880 int io_size;
1881 int io_offset;
1882 int bytes_to_zero;
1883 int bytes_to_move;
1884 kern_return_t kret;
1885 int retval = 0;
1886 int io_resid;
1887 long long total_size;
1888 long long zero_cnt;
1889 off_t zero_off;
1890 long long zero_cnt1;
1891 off_t zero_off1;
1892 struct cl_extent cl;
1893 int intersection;
1894 struct cl_writebehind *wbp;
1895
1896 if ((wbp = cluster_get_wbp(vp, 0)) != NULL)
1897 {
1898 if (wbp->cl_hasbeenpaged) {
1899 /*
1900 * this vnode had pages cleaned to it by
1901 * the pager which indicates that either
1902 * it's not very 'hot', or the system is
1903 * being overwhelmed by a lot of dirty
1904 * data being delayed in the VM cache...
1905 * in either event, we'll push our remaining
1906 * delayed data at this point... this will
1907 * be more efficient than paging out 1 page at
1908 * a time, and will also act as a throttle
1909 * by delaying this client from writing any
1910 * more data until all his delayed data has
1911 * at least been queued to the underlying driver.
1912 */
1913 if (wbp->cl_number || wbp->cl_scmap)
1914 cluster_push_EOF(vp, newEOF);
1915
1916 wbp->cl_hasbeenpaged = 0;
1917 }
1918 }
1919 if (uio) {
1920 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1921 (int)uio->uio_offset, uio_resid(uio), (int)oldEOF, (int)newEOF, 0);
1922
1923 // LP64todo - fix this
1924 io_resid = uio_resid(uio);
1925 } else {
1926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1927 0, 0, (int)oldEOF, (int)newEOF, 0);
1928
1929 io_resid = 0;
1930 }
1931 zero_cnt = 0;
1932 zero_cnt1 = 0;
1933 zero_off = 0;
1934 zero_off1 = 0;
1935
1936 if (flags & IO_HEADZEROFILL) {
1937 /*
1938 * some filesystems (HFS is one) don't support unallocated holes within a file...
1939 * so we zero fill the intervening space between the old EOF and the offset
1940 * where the next chunk of real data begins.... ftruncate will also use this
1941 * routine to zero fill to the new EOF when growing a file... in this case, the
1942 * uio structure will not be provided
1943 */
1944 if (uio) {
1945 if (headOff < uio->uio_offset) {
1946 zero_cnt = uio->uio_offset - headOff;
1947 zero_off = headOff;
1948 }
1949 } else if (headOff < newEOF) {
1950 zero_cnt = newEOF - headOff;
1951 zero_off = headOff;
1952 }
1953 }
1954 if (flags & IO_TAILZEROFILL) {
1955 if (uio) {
1956 // LP64todo - fix this
1957 zero_off1 = uio->uio_offset + uio_resid(uio);
1958
1959 if (zero_off1 < tailOff)
1960 zero_cnt1 = tailOff - zero_off1;
1961 }
1962 }
1963 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1964 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1965 retval, 0, 0, 0, 0);
1966 return (0);
1967 }
1968
1969 while ((total_size = (io_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1970 /*
1971 * for this iteration of the loop, figure out where our starting point is
1972 */
1973 if (zero_cnt) {
1974 start_offset = (int)(zero_off & PAGE_MASK_64);
1975 upl_f_offset = zero_off - start_offset;
1976 } else if (io_resid) {
1977 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1978 upl_f_offset = uio->uio_offset - start_offset;
1979 } else {
1980 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1981 upl_f_offset = zero_off1 - start_offset;
1982 }
1983 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1984 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1985
1986 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1987 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1988
1989 cl.b_addr = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
1990
1991 if (uio && ((flags & (IO_NOCACHE | IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0)) {
1992 /*
1993 * assumption... total_size <= io_resid
1994 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1995 */
1996 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1997 total_size -= start_offset;
1998 xfer_resid = total_size;
1999
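/*
 * try to satisfy as much of this write as possible by copying
 * directly into pages already resident in the ubc... on return,
 * xfer_resid holds the portion that could not be handled that way
 */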
2000 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
2001
2002 if (retval)
2003 break;
2004
2005 io_resid -= (total_size - xfer_resid);
2006 total_size = xfer_resid;
2007 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2008 upl_f_offset = uio->uio_offset - start_offset;
2009
2010 if (total_size == 0) {
2011 if (start_offset) {
2012 /*
2013 * the write did not finish on a page boundary
2014 * which will leave upl_f_offset pointing to the
2015 * beginning of the last page written instead of
2016 * the page beyond it... bump it in this case
2017 * so that the cluster code records the last page
2018 * written as dirty
2019 */
2020 upl_f_offset += PAGE_SIZE_64;
2021 }
2022 upl_size = 0;
2023
2024 goto check_cluster;
2025 }
2026 }
2027 /*
2028 * compute the size of the upl needed to encompass
2029 * the requested write... limit each call to cluster_io
2030 * to the maximum UPL size... cluster_io will clip if
2031 * this exceeds the maximum io_size for the device,
2032 * make sure to account for
2033 * a starting offset that's not page aligned
2034 */
2035 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2036
2037 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2038 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2039
2040 pages_in_upl = upl_size / PAGE_SIZE;
2041 io_size = upl_size - start_offset;
2042
2043 if ((long long)io_size > total_size)
2044 io_size = total_size;
2045
2046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
2047
2048
2049 /*
2050 * Gather the pages from the buffer cache.
2051 * The UPL_WILL_MODIFY flag lets the UPL subsystem know
2052 * that we intend to modify these pages.
2053 */
2054 kret = ubc_create_upl(vp,
2055 upl_f_offset,
2056 upl_size,
2057 &upl,
2058 &pl,
2059 UPL_SET_LITE | UPL_WILL_MODIFY);
2060 if (kret != KERN_SUCCESS)
2061 panic("cluster_write: failed to get pagelist");
2062
2063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
2064 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
2065
2066 if (start_offset && !upl_valid_page(pl, 0)) {
2067 int read_size;
2068
2069 /*
2070 * we're starting in the middle of the first page of the upl
2071 * and the page isn't currently valid, so we're going to have
2072 * to read it in first... this is a synchronous operation
2073 */
2074 read_size = PAGE_SIZE;
2075
2076 if ((upl_f_offset + read_size) > newEOF)
2077 read_size = newEOF - upl_f_offset;
2078
2079 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
2080 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2081 if (retval) {
2082 /*
2083 * we had an error during the read which causes us to abort
2084 * the current cluster_write request... before we do, we need
2085 * to release the rest of the pages in the upl without modifying
2086 * their state and mark the failed page in error
2087 */
2088 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2089
2090 if (upl_size > PAGE_SIZE)
2091 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2092
2093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2094 (int)upl, 0, 0, retval, 0);
2095 break;
2096 }
2097 }
2098 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
2099 /*
2100 * the last offset we're writing to in this upl does not end on a page
2101 * boundary... if it's not beyond the old EOF, then we'll also need to
2102 * pre-read this page in if it isn't already valid
2103 */
2104 upl_offset = upl_size - PAGE_SIZE;
2105
2106 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
2107 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
2108 int read_size;
2109
2110 read_size = PAGE_SIZE;
2111
2112 if ((upl_f_offset + upl_offset + read_size) > newEOF)
2113 read_size = newEOF - (upl_f_offset + upl_offset);
2114
2115 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
2116 CL_READ, (buf_t)NULL, (struct clios *)NULL);
2117 if (retval) {
2118 /*
2119 * we had an error during the read which causes us to abort
2120 * the current cluster_write request... before we do, we
2121 * need to release the rest of the pages in the upl without
2122 * modifying their state and mark the failed page in error
2123 */
2124 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
2125
2126 if (upl_size > PAGE_SIZE)
2127 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2128
2129 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2130 (int)upl, 0, 0, retval, 0);
2131 break;
2132 }
2133 }
2134 }
2135 xfer_resid = io_size;
2136 io_offset = start_offset;
2137
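/*
 * zero fill the head range requested via IO_HEADZEROFILL... if
 * IO_NOZEROVALID or IO_NOZERODIRTY is set, pages that are already
 * valid (or valid and dirty, respectively) are left untouched
 */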
2138 while (zero_cnt && xfer_resid) {
2139
2140 if (zero_cnt < (long long)xfer_resid)
2141 bytes_to_zero = zero_cnt;
2142 else
2143 bytes_to_zero = xfer_resid;
2144
2145 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2146 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2147 } else {
2148 int zero_pg_index;
2149
2150 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
2151 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
2152
2153 if ( !upl_valid_page(pl, zero_pg_index)) {
2154 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2155
2156 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2157 !upl_dirty_page(pl, zero_pg_index)) {
2158 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2159 }
2160 }
2161 xfer_resid -= bytes_to_zero;
2162 zero_cnt -= bytes_to_zero;
2163 zero_off += bytes_to_zero;
2164 io_offset += bytes_to_zero;
2165 }
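/*
 * copy this pass's worth of the caller's data from the uio into
 * the upl via cluster_copy_upl_data
 */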
2166 if (xfer_resid && io_resid) {
2167 bytes_to_move = min(io_resid, xfer_resid);
2168
2169 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
2170
2171 if (retval) {
2172
2173 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2174
2175 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
2176 (int)upl, 0, 0, retval, 0);
2177 } else {
2178 io_resid -= bytes_to_move;
2179 xfer_resid -= bytes_to_move;
2180 io_offset += bytes_to_move;
2181 }
2182 }
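/*
 * zero fill the tail range requested via IO_TAILZEROFILL, applying
 * the same IO_NOZEROVALID / IO_NOZERODIRTY rules as the head range
 */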
2183 while (xfer_resid && zero_cnt1 && retval == 0) {
2184
2185 if (zero_cnt1 < (long long)xfer_resid)
2186 bytes_to_zero = zero_cnt1;
2187 else
2188 bytes_to_zero = xfer_resid;
2189
2190 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
2191 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2192 } else {
2193 int zero_pg_index;
2194
2195 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
2196 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
2197
2198 if ( !upl_valid_page(pl, zero_pg_index)) {
2199 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2200 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
2201 !upl_dirty_page(pl, zero_pg_index)) {
2202 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
2203 }
2204 }
2205 xfer_resid -= bytes_to_zero;
2206 zero_cnt1 -= bytes_to_zero;
2207 zero_off1 += bytes_to_zero;
2208 io_offset += bytes_to_zero;
2209 }
2210
2211 if (retval == 0) {
2212 int cl_index;
2213 int can_delay;
2214
2215 io_size += start_offset;
2216
2217 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
2218 /*
2219 * if we're extending the file with this write
2220 * we'll zero fill the rest of the page so that
2221 * if the file gets extended again in such a way as to leave a
2222 * hole starting at this EOF, we'll have zero's in the correct spot
2223 */
2224 cluster_zero(upl, io_size, upl_size - io_size, NULL);
2225 }
2226 if (flags & IO_SYNC)
2227 /*
2228 * if the IO_SYNC flag is set then we need to
2229 * bypass any clusters and immediately issue
2230 * the I/O
2231 */
2232 goto issue_io;
2233 check_cluster:
2234 /*
2235 * take the lock to protect our accesses
2236 * of the writebehind and sparse cluster state
2237 */
2238 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
2239
2240 /*
2241 * calculate the last logical block number
2242 * that this delayed I/O encompassed
2243 */
2244 cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
2245
2246 if (wbp->cl_scmap) {
2247
2248 if ( !(flags & IO_NOCACHE)) {
2249 /*
2250 * we've fallen into the sparse
2251 * cluster method of delaying dirty pages
2252 * first, we need to release the upl if we hold one
2253 * since pages in it may be present in the sparse cluster map
2254 * and may span 2 separate buckets there... if they do and
2255 * we happen to have to flush a bucket to make room and it intersects
2256 * this upl, a deadlock may result on page BUSY
2257 */
2258 if (upl_size)
2259 ubc_upl_commit_range(upl, 0, upl_size,
2260 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2261
2262 sparse_cluster_add(wbp, vp, &cl, newEOF);
2263
2264 lck_mtx_unlock(&wbp->cl_lockw);
2265
2266 continue;
2267 }
2268 /*
2269 * must have done cached writes that fell into
2270 * the sparse cluster mechanism... we've switched
2271 * to uncached writes on the file, so go ahead
2272 * and push whatever's in the sparse map
2273 * and switch back to normal clustering
2274 *
2275 * see the comment above concerning a possible deadlock...
2276 */
2277 if (upl_size) {
2278 ubc_upl_commit_range(upl, 0, upl_size,
2279 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2280 /*
2281 * setting upl_size to 0 keeps us from committing a
2282 * second time in the start_new_cluster path
2283 */
2284 upl_size = 0;
2285 }
2286 sparse_cluster_push(wbp, vp, newEOF, 1);
2287
2288 wbp->cl_number = 0;
2289 /*
2290 * no clusters of either type present at this point
2291 * so just go directly to start_new_cluster since
2292 * we know we need to delay this I/O since we've
2293 * already released the pages back into the cache
2294 * to avoid the deadlock with sparse_cluster_push
2295 */
2296 goto start_new_cluster;
2297 }
2298 upl_offset = 0;
2299
2300 if (wbp->cl_number == 0)
2301 /*
2302 * no clusters currently present
2303 */
2304 goto start_new_cluster;
2305
2306 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
2307 /*
2308 * check each cluster that we currently hold
2309 * try to merge some or all of this write into
2310 * one or more of the existing clusters... if
2311 * any portion of the write remains, start a
2312 * new cluster
2313 */
2314 if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
2315 /*
2316 * the current write starts at or after the current cluster
2317 */
2318 if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2319 /*
2320 * we have a write that fits entirely
2321 * within the existing cluster limits
2322 */
2323 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
2324 /*
2325 * update our idea of where the cluster ends
2326 */
2327 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2328 break;
2329 }
2330 if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER)) {
2331 /*
2332 * we have a write that starts in the middle of the current cluster
2333 * but extends beyond the cluster's limit... we know this because
2334 * of the previous checks
2335 * we'll extend the current cluster to the max
2336 * and update the b_addr for the current write to reflect that
2337 * the head of it was absorbed into this cluster...
2338 * note that we'll always have a leftover tail in this case since
2339 * full absorption would have occurred in the clause above
2340 */
2341 wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + MAX_UPL_TRANSFER;
2342
2343 if (upl_size) {
2344 daddr64_t start_pg_in_upl;
2345
2346 start_pg_in_upl = (daddr64_t)(upl_f_offset / PAGE_SIZE_64);
2347
2348 if (start_pg_in_upl < wbp->cl_clusters[cl_index].e_addr) {
2349 intersection = (int)((wbp->cl_clusters[cl_index].e_addr - start_pg_in_upl) * PAGE_SIZE);
2350
2351 ubc_upl_commit_range(upl, upl_offset, intersection,
2352 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2353 upl_f_offset += intersection;
2354 upl_offset += intersection;
2355 upl_size -= intersection;
2356 }
2357 }
2358 cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
2359 }
2360 /*
2361 * we come here for the case where the current write starts
2362 * beyond the limit of the existing cluster or we have a leftover
2363 * tail after a partial absorption
2364 *
2365 * in either case, we'll check the remaining clusters before
2366 * starting a new one
2367 */
2368 } else {
2369 /*
2370 * the current write starts in front of the cluster we're currently considering
2371 */
2372 if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= MAX_UPL_TRANSFER) {
2373 /*
2374 * we can just merge the new request into
2375 * this cluster and leave it in the cache
2376 * since the resulting cluster is still
2377 * less than the maximum allowable size
2378 */
2379 wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
2380
2381 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
2382 /*
2383 * the current write completely
2384 * envelops the existing cluster and since
2385 * each write is limited to at most MAX_UPL_TRANSFER bytes
2386 * we can just use the start and last blocknos of the write
2387 * to generate the cluster limits
2388 */
2389 wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
2390 }
2391 break;
2392 }
2393
2394 /*
2395 * if we were to combine this write with the current cluster
2396 * we would exceed the cluster size limit.... so,
2397 * let's see if there's any overlap of the new I/O with
2398 * the cluster we're currently considering... in fact, we'll
2399 * stretch the cluster out to its full limit and see if we
2400 * get an intersection with the current write
2401 *
2402 */
2403 if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER) {
2404 /*
2405 * the current write extends into the proposed cluster
2406 * clip the length of the current write after first combining its
2407 * tail with the newly shaped cluster
2408 */
2409 wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - MAX_UPL_TRANSFER;
2410
2411 if (upl_size) {
2412 intersection = (int)((cl.e_addr - wbp->cl_clusters[cl_index].b_addr) * PAGE_SIZE);
2413
2414 if (intersection > upl_size)
2415 /*
2416 * because the current write may consist of a number of pages found in the cache
2417 * which are not part of the UPL, we may have an intersection that exceeds
2418 * the size of the UPL that is also part of this write
2419 */
2420 intersection = upl_size;
2421
2422 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2423 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2424 upl_size -= intersection;
2425 }
2426 cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
2427 }
2428 /*
2429 * if we get here, there was no way to merge
2430 * any portion of this write with this cluster
2431 * or we could only merge part of it which
2432 * will leave a tail...
2433 * we'll check the remaining clusters before starting a new one
2434 */
2435 }
2436 }
2437 if (cl_index < wbp->cl_number)
2438 /*
2439 * we found an existing cluster(s) that we
2440 * could entirely merge this I/O into
2441 */
2442 goto delay_io;
2443
2444 if (wbp->cl_number < MAX_CLUSTERS && !(flags & IO_NOCACHE))
2445 /*
2446 * we didn't find an existing cluster to
2447 * merge into, but there's room to start
2448 * a new one
2449 */
2450 goto start_new_cluster;
2451
2452 /*
2453 * no existing cluster to merge with and no
2454 * room to start a new one... we'll try
2455 * pushing one of the existing ones... if none of
2456 * them are able to be pushed, we'll switch
2457 * to the sparse cluster mechanism
2458 * cluster_try_push updates cl_number to the
2459 * number of remaining clusters... and
2460 * returns the number of currently unused clusters
2461 */
2462 int ret_cluster_try_push = 0;
2463 /* if writes are not deferred, call cluster push immediately */
2464 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2465 if (flags & IO_NOCACHE)
2466 can_delay = 0;
2467 else
2468 can_delay = 1;
2469
2470 ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, can_delay, 0);
2471 }
2472
2473 /* execute the following regardless of whether writes are deferred */
2474 if (ret_cluster_try_push == 0) {
2475 /*
2476 * no more room in the normal cluster mechanism
2477 * so let's switch to the more expansive but expensive
2478 * sparse mechanism....
2479 * first, we need to release the upl if we hold one
2480 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2481 * and may span 2 separate buckets there... if they do and
2482 * we happen to have to flush a bucket to make room and it intersects
2483 * this upl, a deadlock may result on page BUSY
2484 */
2485 if (upl_size)
2486 ubc_upl_commit_range(upl, upl_offset, upl_size,
2487 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2488
2489 sparse_cluster_switch(wbp, vp, newEOF);
2490 sparse_cluster_add(wbp, vp, &cl, newEOF);
2491
2492 lck_mtx_unlock(&wbp->cl_lockw);
2493
2494 continue;
2495 }
2496 /*
2497 * we pushed one cluster successfully, so we must be sequentially writing this file
2498 * otherwise, we would have failed and fallen into the sparse cluster support
2499 * so let's take the opportunity to push out additional clusters as long as we
2500 * remain below the throttle... this will give us better I/O locality if we're
2501 * in a copy loop (i.e. we won't jump back and forth between the read and write points)
2502 * however, we don't want to push so much out that the write throttle kicks in and
2503 * hangs this thread up until some of the I/O completes...
2504 */
2505 if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
2506 while (wbp->cl_number && (vp->v_numoutput <= (VNODE_ASYNC_THROTTLE / 2)))
2507 cluster_try_push(wbp, vp, newEOF, 0, 0);
2508 }
2509
2510 start_new_cluster:
2511 wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
2512 wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
2513
2514 if (flags & IO_NOCACHE)
2515 wbp->cl_clusters[wbp->cl_number].io_nocache = 1;
2516 else
2517 wbp->cl_clusters[wbp->cl_number].io_nocache = 0;
2518 wbp->cl_number++;
2519 delay_io:
2520 if (upl_size)
2521 ubc_upl_commit_range(upl, upl_offset, upl_size,
2522 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2523
2524 lck_mtx_unlock(&wbp->cl_lockw);
2525
2526 continue;
2527 issue_io:
2528 /*
2529 * we don't hold the vnode lock at this point
2530 *
2531 * because we had to ask for a UPL that provides currently non-present pages, the
2532 * UPL has been automatically set to clear the dirty flags (both software and hardware)
2533 * upon committing it... this is not the behavior we want since it's possible for
2534 * pages currently present as part of a mapped file to be dirtied while the I/O is in flight.
2535 * in order to maintain some semblance of coherency with mapped writes
2536 * we need to drop the current upl and pick it back up with COPYOUT_FROM set
2537 * so that we correctly deal with a change in state of the hardware modify bit...
2538 * we do this via cluster_push_x... by passing along the IO_SYNC flag, we force
2539 * cluster_push_x to wait until all the I/Os have completed... cluster_push_x is also
2540 * responsible for generating the correct sized I/O(s)
2541 */
2542 ubc_upl_commit_range(upl, 0, upl_size,
2543 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2544
2545 cl.e_addr = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
2546
2547 retval = cluster_push_x(vp, &cl, newEOF, flags);
2548 }
2549 }
2550 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2551 retval, 0, io_resid, 0, 0);
2552
2553 return (retval);
2554 }
2555
2556 int
2557 cluster_read(vnode_t vp, struct uio *uio, off_t filesize, int xflags)
2558 {
2559 int prev_resid;
2560 u_int clip_size;
2561 off_t max_io_size;
2562 int upl_size;
2563 int upl_flags;
2564 upl_t upl;
2565 int retval = 0;
2566 int flags;
2567
2568 flags = xflags;
2569
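/*
 * honor the per-vnode settings... VNOCACHE_DATA forces this read to
 * bypass the cache and VRAOFF turns off read-ahead for it
 */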
2570 if (vp->v_flag & VNOCACHE_DATA)
2571 flags |= IO_NOCACHE;
2572 if (vp->v_flag & VRAOFF)
2573 flags |= IO_RAOFF;
2574
2575 if (!((flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg))) {
2576 /*
2577 * go do a read through the cache if one of the following is true....
2578 * NOCACHE is not true
2579 * the uio request doesn't target USERSPACE
2580 */
2581 return (cluster_read_x(vp, uio, filesize, flags));
2582 }
2583
2584 #if LP64_DEBUG
2585 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
2586 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
2587 }
2588 #endif /* LP64_DEBUG */
2589
2590 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2591 user_size_t iov_len;
2592 user_addr_t iov_base;
2593
2594 /*
2595 * we know we have a resid, so this is safe
2596 * skip over any empty vectors
2597 */
2598 uio_update(uio, (user_size_t)0);
2599
2600 iov_len = uio_curriovlen(uio);
2601 iov_base = uio_curriovbase(uio);
2602
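/*
 * query the memory backing the first page of this vector so we can
 * tell below whether the target buffer is physically contiguous and
 * should take the cluster_phys_read path
 */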
2603 upl_size = PAGE_SIZE;
2604 upl_flags = UPL_QUERY_OBJECT_TYPE;
2605
2606 // LP64todo - fix this!
2607 if ((vm_map_get_upl(current_map(),
2608 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
2609 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS) {
2610 /*
2611 * the user app must have passed in an invalid address
2612 */
2613 return (EFAULT);
2614 }
2615
2616 /*
2617 * We check every vector target but if it is physically
2618 * contiguous space, we skip the sanity checks.
2619 */
2620 if (upl_flags & UPL_PHYS_CONTIG) {
2621 retval = cluster_phys_read(vp, uio, filesize);
2622 }
2623 else if (uio_resid(uio) < PAGE_SIZE) {
2624 /*
2625 * we're here because we don't have a physically contiguous target buffer
2626 * go do a read through the cache if
2627 * the total xfer size is less than a page...
2628 */
2629 return (cluster_read_x(vp, uio, filesize, flags));
2630 }
2631 // LP64todo - fix this!
2632 else if (((int)uio->uio_offset & PAGE_MASK) || (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2633 if (((int)uio->uio_offset & PAGE_MASK) == (CAST_DOWN(int, iov_base) & PAGE_MASK)) {
2634 /*
2635 * Bring the file offset read up to a pagesize boundary
2636 * this will also bring the base address to a page boundary
2637 * since they both are currently on the same offset within a page
2638 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2639 * so the computed clip_size must always be less than the current uio_resid
2640 */
2641 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2642
2643 /*
2644 * Fake the resid going into the cluster_read_x call
2645 * and restore it on the way out.
2646 */
2647 prev_resid = uio_resid(uio);
2648 // LP64todo - fix this
2649 uio_setresid(uio, clip_size);
2650
2651 retval = cluster_read_x(vp, uio, filesize, flags);
2652
2653 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2654 } else {
2655 /*
2656 * can't get both the file offset and the buffer offset aligned to a page boundary
2657 * so fire an I/O through the cache for this entire vector
2658 */
2659 // LP64todo - fix this!
2660 clip_size = iov_len;
2661 prev_resid = uio_resid(uio);
2662 uio_setresid(uio, clip_size);
2663
2664 retval = cluster_read_x(vp, uio, filesize, flags);
2665
2666 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2667 }
2668 } else {
2669 /*
2670 * If we come in here, we know the offset into
2671 * the file is on a pagesize boundary
2672 */
2673 max_io_size = filesize - uio->uio_offset;
2674 // LP64todo - fix this
2675 clip_size = uio_resid(uio);
2676 if (iov_len < clip_size)
2677 clip_size = iov_len;
2678 if (max_io_size < clip_size)
2679 clip_size = (int)max_io_size;
2680
2681 if (clip_size < PAGE_SIZE) {
2682 /*
2683 * Take care of the tail end of the read in this vector.
2684 */
2685 // LP64todo - fix this
2686 prev_resid = uio_resid(uio);
2687 uio_setresid(uio, clip_size);
2688
2689 retval = cluster_read_x(vp, uio, filesize, flags);
2690
2691 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2692 } else {
2693 /* round clip_size down to a multiple of pagesize */
2694 clip_size = clip_size & ~(PAGE_MASK);
2695 // LP64todo - fix this
2696 prev_resid = uio_resid(uio);
2697 uio_setresid(uio, clip_size);
2698
2699 retval = cluster_nocopy_read(vp, uio, filesize);
2700
2701 if ((retval==0) && uio_resid(uio))
2702 retval = cluster_read_x(vp, uio, filesize, flags);
2703
2704 uio_setresid(uio, prev_resid - (clip_size - uio_resid(uio)));
2705 }
2706 } /* end else */
2707 } /* end while */
2708
2709 return(retval);
2710 }
2711
2712 static int
2713 cluster_read_x(vnode_t vp, struct uio *uio, off_t filesize, int flags)
2714 {
2715 upl_page_info_t *pl;
2716 upl_t upl;
2717 vm_offset_t upl_offset;
2718 int upl_size;
2719 off_t upl_f_offset;
2720 int start_offset;
2721 int start_pg;
2722 int last_pg;
2723 int uio_last = 0;
2724 int pages_in_upl;
2725 off_t max_size;
2726 off_t last_ioread_offset;
2727 off_t last_request_offset;
2728 u_int size_of_prefetch;
2729 u_int io_size;
2730 kern_return_t kret;
2731 int error = 0;
2732 int retval = 0;
2733 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2734 u_int rd_ahead_enabled = 1;
2735 u_int prefetch_enabled = 1;
2736 struct cl_readahead * rap;
2737 struct clios iostate;
2738 struct cl_extent extent;
2739
2740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2741 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
2742
2743 // LP64todo - fix this
2744 last_request_offset = uio->uio_offset + uio_resid(uio);
2745
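/*
 * read-ahead is skipped when the caller asked for it to be off (IO_RAOFF),
 * for uncached reads, or when the request appears confined to a single
 * page... if we're being hard throttled, prefetching is disabled as well
 * and the maximum size of each read we issue is clamped
 */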
2746 if ((flags & (IO_RAOFF|IO_NOCACHE)) ||
2747 ((last_request_offset & ~PAGE_MASK_64) == (uio->uio_offset & ~PAGE_MASK_64))) {
2748 rd_ahead_enabled = 0;
2749 rap = NULL;
2750 } else {
2751 if (cluster_hard_throttle_on(vp)) {
2752 rd_ahead_enabled = 0;
2753 prefetch_enabled = 0;
2754
2755 max_rd_size = HARD_THROTTLE_MAXSIZE;
2756 }
2757 if ((rap = cluster_get_rap(vp)) == NULL)
2758 rd_ahead_enabled = 0;
2759 }
2760 if (last_request_offset > filesize)
2761 last_request_offset = filesize;
2762 extent.b_addr = uio->uio_offset / PAGE_SIZE_64;
2763 extent.e_addr = (last_request_offset - 1) / PAGE_SIZE_64;
2764
2765 if (rap != NULL && rap->cl_ralen && (rap->cl_lastr == extent.b_addr || (rap->cl_lastr + 1) == extent.b_addr)) {
2766 /*
2767 * determine if we already have a read-ahead in the pipe courtesy of the
2768 * last read system call that was issued...
2769 * if so, pick up its extent to determine where we should start
2770 * with respect to any read-ahead that might be necessary to
2771 * garner all the data needed to complete this read system call
2772 */
2773 last_ioread_offset = (rap->cl_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2774
2775 if (last_ioread_offset < uio->uio_offset)
2776 last_ioread_offset = (off_t)0;
2777 else if (last_ioread_offset > last_request_offset)
2778 last_ioread_offset = last_request_offset;
2779 } else
2780 last_ioread_offset = (off_t)0;
2781
2782 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
2783 /*
2784 * compute the size of the upl needed to encompass
2785 * the requested read... limit each call to cluster_io
2786 * to the maximum UPL size... cluster_io will clip if
2787 * this exceeds the maximum io_size for the device...
2788 * make sure to account for
2789 * a starting offset that's not page aligned
2790 */
2791 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2792 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2793 max_size = filesize - uio->uio_offset;
2794
2795 // LP64todo - fix this!
2796 if ((off_t)((unsigned int)uio_resid(uio)) < max_size)
2797 io_size = uio_resid(uio);
2798 else
2799 io_size = max_size;
2800
2801 if (!(flags & IO_NOCACHE)) {
2802
2803 while (io_size) {
2804 u_int io_resid;
2805 u_int io_requested;
2806
2807 /*
2808 * if we keep finding the pages we need already in the cache, then
2809 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2810 * to determine that we have all the pages we need... once we miss in
2811 * the cache and have issued an I/O, then we'll assume that we're likely
2812 * to continue to miss in the cache and it's to our advantage to try and prefetch
2813 */
2814 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2815 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2816 /*
2817 * we've already issued I/O for this request and
2818 * there's still work to do and
2819 * our prefetch stream is running dry, so issue a
2820 * pre-fetch I/O... the I/O latency will overlap
2821 * with the copying of the data
2822 */
2823 if (size_of_prefetch > max_rd_size)
2824 size_of_prefetch = max_rd_size;
2825
2826 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2827
2828 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2829
2830 if (last_ioread_offset > last_request_offset)
2831 last_ioread_offset = last_request_offset;
2832 }
2833 }
2834 /*
2835 * limit the size of the copy we're about to do so that
2836 * we can notice that our I/O pipe is running dry and
2837 * get the next I/O issued before it does go dry
2838 */
2839 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2840 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2841 else
2842 io_resid = io_size;
2843
2844 io_requested = io_resid;
2845
2846 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2847
2848 io_size -= (io_requested - io_resid);
2849
2850 if (retval || io_resid)
2851 /*
2852 * if we run into a real error or
2853 * a page that is not in the cache
2854 * we need to leave streaming mode
2855 */
2856 break;
2857
2858 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2859 /*
2860 * we've already finished the I/O for this read request
2861 * let's see if we should do a read-ahead
2862 */
2863 cluster_rd_ahead(vp, &extent, filesize, rap);
2864 }
2865 }
2866 if (retval)
2867 break;
2868 if (io_size == 0) {
2869 if (rap != NULL) {
2870 if (extent.e_addr < rap->cl_lastr)
2871 rap->cl_maxra = 0;
2872 rap->cl_lastr = extent.e_addr;
2873 }
2874 break;
2875 }
2876 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2877 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2878 max_size = filesize - uio->uio_offset;
2879 }
2880 if (io_size > max_rd_size)
2881 io_size = max_rd_size;
2882
2883 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2884
2885 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2886 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2887 pages_in_upl = upl_size / PAGE_SIZE;
2888
2889 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2890 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2891
2892 kret = ubc_create_upl(vp,
2893 upl_f_offset,
2894 upl_size,
2895 &upl,
2896 &pl,
2897 UPL_SET_LITE);
2898 if (kret != KERN_SUCCESS)
2899 panic("cluster_read: failed to get pagelist");
2900
2901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2902 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2903
2904 /*
2905 * scan from the beginning of the upl looking for the first
2906 * non-valid page.... this will become the first page in
2907 * the request we're going to make to 'cluster_io'... if all
2908 * of the pages are valid, we won't call through to 'cluster_io'
2909 */
2910 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2911 if (!upl_valid_page(pl, start_pg))
2912 break;
2913 }
2914
2915 /*
2916 * scan from the starting invalid page looking for a valid
2917 * page before the end of the upl is reached, if we
2918 * find one, then it will be the last page of the request to
2919 * 'cluster_io'
2920 */
2921 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2922 if (upl_valid_page(pl, last_pg))
2923 break;
2924 }
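/*
 * set up the clios state used to track any async I/O we issue against
 * this upl so we can rendezvous with it before copying data out to
 * the caller
 */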
2925 iostate.io_completed = 0;
2926 iostate.io_issued = 0;
2927 iostate.io_error = 0;
2928 iostate.io_wanted = 0;
2929
2930 if (start_pg < last_pg) {
2931 /*
2932 * we found a range of 'invalid' pages that must be filled...
2933 * if the last page in this range is the last page of the file
2934 * we may have to clip the size of it to keep from reading past
2935 * the end of the last physical block associated with the file
2936 */
2937 upl_offset = start_pg * PAGE_SIZE;
2938 io_size = (last_pg - start_pg) * PAGE_SIZE;
2939
2940 if ((upl_f_offset + upl_offset + io_size) > filesize)
2941 io_size = filesize - (upl_f_offset + upl_offset);
2942
2943 /*
2944 * issue an asynchronous read to cluster_io
2945 */
2946
2947 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2948 io_size, CL_READ | CL_ASYNC, (buf_t)NULL, &iostate);
2949 }
2950 if (error == 0) {
2951 /*
2952 * if the read completed successfully, or there was no I/O request
2953 * issued, then copy the data into user land via 'cluster_copy_upl_data'
2954 * we'll first add on any 'valid'
2955 * pages that were present in the upl when we acquired it.
2956 */
2957 u_int val_size;
2958
2959 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2960 if (!upl_valid_page(pl, uio_last))
2961 break;
2962 }
2963 /*
2964 * compute size to transfer this round, if uio->uio_resid is
2965 * still non-zero after this attempt, we'll loop around and
2966 * set up for another I/O.
2967 */
2968 val_size = (uio_last * PAGE_SIZE) - start_offset;
2969
2970 if (val_size > max_size)
2971 val_size = max_size;
2972
2973 if (val_size > uio_resid(uio))
2974 // LP64todo - fix this
2975 val_size = uio_resid(uio);
2976
2977 if (last_ioread_offset == 0)
2978 last_ioread_offset = uio->uio_offset + val_size;
2979
2980 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2981 /*
2982 * if there's still I/O left to do for this request, and...
2983 * we're not in hard throttle mode, then issue a
2984 * pre-fetch I/O... the I/O latency will overlap
2985 * with the copying of the data
2986 */
2987 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize);
2988
2989 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2990
2991 if (last_ioread_offset > last_request_offset)
2992 last_ioread_offset = last_request_offset;
2993
2994 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2995 /*
2996 * this transfer will finish this request, so...
2997 * let's try to read ahead if we're in
2998 * a sequential access pattern and we haven't
2999 * explicitly disabled it
3000 */
3001 if (rd_ahead_enabled)
3002 cluster_rd_ahead(vp, &extent, filesize, rap);
3003
3004 if (rap != NULL) {
3005 if (extent.e_addr < rap->cl_lastr)
3006 rap->cl_maxra = 0;
3007 rap->cl_lastr = extent.e_addr;
3008 }
3009 }
3010 lck_mtx_lock(cl_mtxp);
3011
3012 while (iostate.io_issued != iostate.io_completed) {
3013 iostate.io_wanted = 1;
3014 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_x", 0);
3015 }
3016 lck_mtx_unlock(cl_mtxp);
3017
3018 if (iostate.io_error)
3019 error = iostate.io_error;
3020 else
3021 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
3022 }
3023 if (start_pg < last_pg) {
3024 /*
3025 * compute the range of pages that we actually issued an I/O for
3026 * and either commit them as valid if the I/O succeeded
3027 * or abort them if the I/O failed
3028 */
3029 io_size = (last_pg - start_pg) * PAGE_SIZE;
3030
3031 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3032 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3033
3034 if (error || (flags & IO_NOCACHE))
3035 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
3036 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3037 else
3038 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
3039 UPL_COMMIT_CLEAR_DIRTY |
3040 UPL_COMMIT_FREE_ON_EMPTY |
3041 UPL_COMMIT_INACTIVATE);
3042
3043 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3044 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
3045 }
3046 if ((last_pg - start_pg) < pages_in_upl) {
3047 int cur_pg;
3048 int commit_flags;
3049
3050 /*
3051 * the set of pages that we issued an I/O for did not encompass
3052 * the entire upl... so just release these without modifying
3053 * their state
3054 */
3055 if (error)
3056 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3057 else {
3058 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
3059 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
3060
3061 if (start_pg) {
3062 /*
3063 * we found some already valid pages at the beginning of
3064 * the upl... commit these back to the inactive list with
3065 * reference cleared
3066 */
3067 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
3068 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3069 | UPL_COMMIT_INACTIVATE;
3070
3071 if (upl_dirty_page(pl, cur_pg))
3072 commit_flags |= UPL_COMMIT_SET_DIRTY;
3073
3074 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3075 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3076 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3077 else
3078 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3079 PAGE_SIZE, commit_flags);
3080 }
3081 }
3082 if (last_pg < uio_last) {
3083 /*
3084 * we found some already valid pages immediately after the
3085 * pages we issued I/O for... commit these back to the
3086 * inactive list with reference cleared
3087 */
3088 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
3089 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
3090 | UPL_COMMIT_INACTIVATE;
3091
3092 if (upl_dirty_page(pl, cur_pg))
3093 commit_flags |= UPL_COMMIT_SET_DIRTY;
3094
3095 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (flags & IO_NOCACHE))
3096 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
3097 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3098 else
3099 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
3100 PAGE_SIZE, commit_flags);
3101 }
3102 }
3103 if (uio_last < pages_in_upl) {
3104 /*
3105 * there were some invalid pages beyond the valid pages
3106 * that we didn't issue an I/O for, just release them
3107 * unchanged
3108 */
3109 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
3110 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3111 }
3112
3113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
3114 (int)upl, -1, -1, 0, 0);
3115 }
3116 }
3117 if (retval == 0)
3118 retval = error;
3119
3120 if ( uio_resid(uio) ) {
3121 if (cluster_hard_throttle_on(vp)) {
3122 rd_ahead_enabled = 0;
3123 prefetch_enabled = 0;
3124
3125 max_rd_size = HARD_THROTTLE_MAXSIZE;
3126 } else {
3127 if (rap != NULL)
3128 rd_ahead_enabled = 1;
3129 prefetch_enabled = 1;
3130
3131 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3132 }
3133 }
3134 }
3135 if (rap != NULL) {
3136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3137 (int)uio->uio_offset, uio_resid(uio), rap->cl_lastr, retval, 0);
3138
3139 lck_mtx_unlock(&rap->cl_lockr);
3140 } else {
3141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
3142 (int)uio->uio_offset, uio_resid(uio), 0, retval, 0);
3143 }
3144
3145 return (retval);
3146 }
3147
3148
3149 static int
3150 cluster_nocopy_read(vnode_t vp, struct uio *uio, off_t filesize)
3151 {
3152 upl_t upl;
3153 upl_page_info_t *pl;
3154 vm_offset_t upl_offset;
3155 off_t max_io_size;
3156 int io_size;
3157 int upl_size;
3158 int upl_needed_size;
3159 int pages_in_pl;
3160 int upl_flags;
3161 kern_return_t kret;
3162 int i;
3163 int force_data_sync;
3164 int retval = 0;
3165 int no_zero_fill = 0;
3166 int abort_flag = 0;
3167 struct clios iostate;
3168 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3169 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
3170
3171
3172 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
3173 (int)uio->uio_offset, uio_resid(uio), (int)filesize, 0, 0);
3174
3175 /*
3176 * When we enter this routine, we know
3177 * -- the offset into the file is on a pagesize boundary
3178 * -- the resid is a page multiple
3179 * -- the resid will not exceed iov_len
3180 */
3181
3182 iostate.io_completed = 0;
3183 iostate.io_issued = 0;
3184 iostate.io_error = 0;
3185 iostate.io_wanted = 0;
3186
3187 while (uio_resid(uio) && uio->uio_offset < filesize && retval == 0) {
3188 user_addr_t iov_base;
3189
3190 if (cluster_hard_throttle_on(vp)) {
3191 max_rd_size = HARD_THROTTLE_MAXSIZE;
3192 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
3193 } else {
3194 max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3195 max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 8;
3196 }
3197 max_io_size = filesize - uio->uio_offset;
3198
3199 // LP64todo - fix this
3200 if (max_io_size < (off_t)((unsigned int)uio_resid(uio)))
3201 io_size = max_io_size;
3202 else
3203 io_size = uio_resid(uio);
3204
3205 /*
3206 * First look for pages already in the cache
3207 * and move them to user space.
3208 */
3209 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
3210
3211 if (retval) {
3212 /*
3213 * we may have already spun some portion of this request
3214 * off as async requests... we need to wait for the I/O
3215 * to complete before returning
3216 */
3217 goto wait_for_reads;
3218 }
3219 /*
3220 * If we are already finished with this read, then return
3221 */
3222 if (io_size == 0) {
3223 /*
3224 * we may have already spun some portion of this request
3225 * off as async requests... we need to wait for the I/O
3226 * to complete before returning
3227 */
3228 goto wait_for_reads;
3229 }
3230 max_io_size = io_size;
3231
3232 if (max_io_size > max_rd_size)
3233 max_io_size = max_rd_size;
3234
3235 io_size = 0;
3236
3237 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
3238
3239 if (io_size == 0)
3240 /*
3241 * we may have already spun some portion of this request
3242 * off as async requests... we need to wait for the I/O
3243 * to complete before returning
3244 */
3245 goto wait_for_reads;
3246
3247 iov_base = uio_curriovbase(uio);
3248
3249 // LP64todo - fix this!
3250 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3251 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
3252
3253 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
3254 (int)upl_offset, upl_needed_size, (int)iov_base, io_size, 0);
3255
3256 if (upl_offset == 0 && ((io_size & PAGE_MASK) == 0)) {
3257 no_zero_fill = 1;
3258 abort_flag = UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY;
3259 } else {
3260 no_zero_fill = 0;
3261 abort_flag = UPL_ABORT_FREE_ON_EMPTY;
3262 }
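/*
 * create a UPL over the user's buffer so the read can target it
 * directly... we accept the UPL only when every page in it comes back
 * valid, otherwise we abort it and try again (up to 3 attempts in all),
 * adding UPL_FORCE_DATA_SYNC after the first attempt
 */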
3263 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
3264 pages_in_pl = 0;
3265 upl_size = upl_needed_size;
3266 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3267
3268 if (no_zero_fill)
3269 upl_flags |= UPL_NOZEROFILL;
3270 if (force_data_sync)
3271 upl_flags |= UPL_FORCE_DATA_SYNC;
3272
3273 // LP64todo - fix this!
3274 kret = vm_map_create_upl(current_map(),
3275 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3276 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags);
3277
3278 if (kret != KERN_SUCCESS) {
3279 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3280 (int)upl_offset, upl_size, io_size, kret, 0);
3281 /*
3282 * cluster_nocopy_read: failed to get pagelist
3283 *
3284 * we may have already spun some portion of this request
3285 * off as async requests... we need to wait for the I/O
3286 * to complete before returning
3287 */
3288 goto wait_for_reads;
3289 }
3290 pages_in_pl = upl_size / PAGE_SIZE;
3291 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
3292
3293 for (i = 0; i < pages_in_pl; i++) {
3294 if (!upl_valid_page(pl, i))
3295 break;
3296 }
3297 if (i == pages_in_pl)
3298 break;
3299
3300 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3301 }
3302 if (force_data_sync >= 3) {
3303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3304 (int)upl_offset, upl_size, io_size, kret, 0);
3305
3306 goto wait_for_reads;
3307 }
3308 /*
3309 * Consider the possibility that upl_size wasn't satisfied.
3310 */
3311 if (upl_size != upl_needed_size)
3312 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
3313
3314 if (io_size == 0) {
3315 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3316 goto wait_for_reads;
3317 }
3318 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
3319 (int)upl_offset, upl_size, io_size, kret, 0);
3320
3321 /*
3322 * request asynchronously so that we can overlap
3323 * the preparation of the next I/O
3324 * if there are already too many outstanding reads
3325 * wait until some have completed before issuing the next read
3326 */
3327 lck_mtx_lock(cl_mtxp);
3328
3329 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
3330 iostate.io_wanted = 1;
3331 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3332 }
3333 lck_mtx_unlock(cl_mtxp);
3334
3335 if (iostate.io_error) {
3336 /*
3337 * one of the earlier reads we issued ran into a hard error
3338 * don't issue any more reads... clean up the UPL
3339 * that was just created but not used, then
3340 * go wait for any other reads to complete before
3341 * returning the error to the caller
3342 */
3343 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size, abort_flag);
3344
3345 goto wait_for_reads;
3346 }
3347 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
3348 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
3349
3350 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset, io_size,
3351 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
3352 (buf_t)NULL, &iostate);
3353
3354 /*
3355 * update the uio structure
3356 */
3357 uio_update(uio, (user_size_t)io_size);
3358
3359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
3360 (int)upl, (int)uio->uio_offset, (int)uio_resid(uio), retval, 0);
3361
3362 } /* end while */
3363
3364 wait_for_reads:
3365 /*
3366 * make sure all async reads that are part of this stream
3367 * have completed before we return
3368 */
3369 lck_mtx_lock(cl_mtxp);
3370
3371 while (iostate.io_issued != iostate.io_completed) {
3372 iostate.io_wanted = 1;
3373 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_nocopy_read", 0);
3374 }
3375 lck_mtx_unlock(cl_mtxp);
3376
3377 if (iostate.io_error)
3378 retval = iostate.io_error;
3379
3380 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3381 (int)uio->uio_offset, (int)uio_resid(uio), 6, retval, 0);
3382
3383 return (retval);
3384 }
3385
3386
3387 static int
3388 cluster_phys_read(vnode_t vp, struct uio *uio, off_t filesize)
3389 {
3390 upl_page_info_t *pl;
3391 upl_t upl;
3392 vm_offset_t upl_offset;
3393 addr64_t dst_paddr;
3394 off_t max_size;
3395 int io_size;
3396 user_size_t iov_len;
3397 user_addr_t iov_base;
3398 int tail_size;
3399 int upl_size;
3400 int upl_needed_size;
3401 int pages_in_pl;
3402 int upl_flags;
3403 kern_return_t kret;
3404 struct clios iostate;
3405 int error;
3406 int devblocksize;
3407
3408 devblocksize = vp->v_mount->mnt_devblocksize;
3409 /*
3410 * When we enter this routine, we know
3411 * -- the resid will not exceed iov_len
3412 * -- the target address is physically contiguous
3413 */
3414
3415 #if LP64_DEBUG
3416 if (IS_VALID_UIO_SEGFLG(uio->uio_segflg) == 0) {
3417 panic("%s :%d - invalid uio_segflg\n", __FILE__, __LINE__);
3418 }
3419 #endif /* LP64_DEBUG */
3420
3421 iov_len = uio_curriovlen(uio);
3422 iov_base = uio_curriovbase(uio);
3423
3424 max_size = filesize - uio->uio_offset;
3425
3426 // LP64todo - fix this!
3427 if (max_size < 0 || (u_int64_t)max_size > iov_len)
3428 io_size = iov_len;
3429 else
3430 io_size = max_size;
3431
3432 // LP64todo - fix this!
3433 upl_offset = CAST_DOWN(vm_offset_t, iov_base) & PAGE_MASK;
3434 upl_needed_size = upl_offset + io_size;
3435
3436 error = 0;
3437 pages_in_pl = 0;
3438 upl_size = upl_needed_size;
3439 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3440
3441 kret = vm_map_get_upl(current_map(),
3442 (vm_map_offset_t)(iov_base & ~((user_addr_t)PAGE_MASK)),
3443 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3444
3445 if (kret != KERN_SUCCESS) {
3446 /*
3447 * cluster_phys_read: failed to get pagelist
3448 */
3449 return(EINVAL);
3450 }
3451 if (upl_size < upl_needed_size) {
3452 /*
3453 * The upl_size wasn't satisfied.
3454 */
3455 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3456
3457 return(EINVAL);
3458 }
3459 pl = ubc_upl_pageinfo(upl);
3460
3461 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)upl_offset;
3462
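/*
 * as in the write path, any leading fragment that isn't aligned to
 * the device block size is handled with cluster_align_phys_io before
 * the block-aligned portion of the read is issued
 */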
3463 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3464 int head_size;
3465
3466 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3467
3468 if (head_size > io_size)
3469 head_size = io_size;
3470
3471 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, CL_READ);
3472
3473 if (error) {
3474 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3475
3476 return(EINVAL);
3477 }
3478 upl_offset += head_size;
3479 dst_paddr += head_size;
3480 io_size -= head_size;
3481 }
3482 tail_size = io_size & (devblocksize - 1);
3483 io_size -= tail_size;
3484
3485 iostate.io_completed = 0;
3486 iostate.io_issued = 0;
3487 iostate.io_error = 0;
3488 iostate.io_wanted = 0;
3489
3490 while (io_size && error == 0) {
3491 int xsize;
3492
3493 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3494 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3495 else
3496 xsize = io_size;
3497 /*
3498 * request asynchronously so that we can overlap
3499 * the preparation of the next I/O... we'll do
3500 * the commit after all the I/O has completed
3501 * since it's all issued against the same UPL
3502 * if there are already too many outstanding reads
3503 * wait until some have completed before issuing the next
3504 */
3505 lck_mtx_lock(cl_mtxp);
3506
3507 while ((iostate.io_issued - iostate.io_completed) > (8 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3508 iostate.io_wanted = 1;
3509 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3510 }
3511 lck_mtx_unlock(cl_mtxp);
3512
3513 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize,
3514 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3515 (buf_t)NULL, &iostate);
3516 /*
3517 * The cluster_io read was issued successfully,
3518 * update the uio structure
3519 */
3520 if (error == 0) {
3521 uio_update(uio, (user_size_t)xsize);
3522
3523 dst_paddr += xsize;
3524 upl_offset += xsize;
3525 io_size -= xsize;
3526 }
3527 }
3528 /*
3529 * make sure all async reads that are part of this stream
3530 * have completed before we proceed
3531 */
3532 lck_mtx_lock(cl_mtxp);
3533
3534 while (iostate.io_issued != iostate.io_completed) {
3535 iostate.io_wanted = 1;
3536 msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_phys_read", 0);
3537 }
3538 lck_mtx_unlock(cl_mtxp);
3539
3540 if (iostate.io_error)
3541 error = iostate.io_error;
3542
3543 if (error == 0 && tail_size)
3544 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ);
3545
3546 /*
3547 * just release our hold on the physically contiguous
3548 * region without changing any state
3549 */
3550 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3551
3552 return (error);
3553 }
3554
3555
3556 /*
3557 * generate advisory I/O's in the largest chunks possible
3558 * the completed pages will be released into the VM cache
3559 */
3560 int
3561 advisory_read(vnode_t vp, off_t filesize, off_t f_offset, int resid)
3562 {
3563 upl_page_info_t *pl;
3564 upl_t upl;
3565 vm_offset_t upl_offset;
3566 int upl_size;
3567 off_t upl_f_offset;
3568 int start_offset;
3569 int start_pg;
3570 int last_pg;
3571 int pages_in_upl;
3572 off_t max_size;
3573 int io_size;
3574 kern_return_t kret;
3575 int retval = 0;
3576 int issued_io;
3577 int skip_range;
3578
3579 if ( !UBCINFOEXISTS(vp))
3580 return(EINVAL);
3581
3582 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3583 (int)f_offset, resid, (int)filesize, 0, 0);
3584
3585 while (resid && f_offset < filesize && retval == 0) {
3586 /*
3587 * compute the size of the upl needed to encompass
3588 * the requested read... limit each call to cluster_io
3589 * to the maximum UPL size... cluster_io will clip if
3590 * this exceeds the maximum io_size for the device...
3591 * make sure to account for
3592 * a starting offset that's not page aligned
3593 */
3594 start_offset = (int)(f_offset & PAGE_MASK_64);
3595 upl_f_offset = f_offset - (off_t)start_offset;
3596 max_size = filesize - f_offset;
3597
3598 if (resid < max_size)
3599 io_size = resid;
3600 else
3601 io_size = max_size;
3602
3603 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3604 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3605 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3606
3607 skip_range = 0;
3608 /*
3609 * return the number of contiguously present pages in the cache
3610 * starting at upl_f_offset within the file
3611 */
3612 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3613
3614 if (skip_range) {
3615 /*
3616 * skip over pages already present in the cache
3617 */
3618 io_size = skip_range - start_offset;
3619
3620 f_offset += io_size;
3621 resid -= io_size;
3622
3623 if (skip_range == upl_size)
3624 continue;
3625 /*
3626 * have to issue some real I/O...
3627 * at this point, we know it's starting on a page boundary
3628 * because we've skipped over at least the first page in the request
3629 */
3630 start_offset = 0;
3631 upl_f_offset += skip_range;
3632 upl_size -= skip_range;
3633 }
3634 pages_in_upl = upl_size / PAGE_SIZE;
3635
3636 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3637 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3638
3639 kret = ubc_create_upl(vp,
3640 upl_f_offset,
3641 upl_size,
3642 &upl,
3643 &pl,
3644 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3645 if (kret != KERN_SUCCESS)
3646 return(retval);
3647 issued_io = 0;
3648
3649 /*
3650 * before we start marching forward, we must make sure we end on
3651 * a present page, otherwise we will be working with a freed
3652 * upl
3653 */
3654 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3655 if (upl_page_present(pl, last_pg))
3656 break;
3657 }
3658 pages_in_upl = last_pg + 1;
3659
3660
3661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3662 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3663
3664
3665 for (last_pg = 0; last_pg < pages_in_upl; ) {
3666 /*
3667 * scan from the beginning of the upl looking for the first
3668 * page that is present.... this will become the first page in
3669 * the request we're going to make to 'cluster_io'... if all
3670 * of the pages are absent, we won't call through to 'cluster_io'
3671 */
3672 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3673 if (upl_page_present(pl, start_pg))
3674 break;
3675 }
3676
3677 /*
3678 * scan from the starting present page looking for an absent
3679 * page before the end of the upl is reached, if we
3680 * find one, then it will terminate the range of pages being
3681 * presented to 'cluster_io'
3682 */
3683 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3684 if (!upl_page_present(pl, last_pg))
3685 break;
3686 }
3687
3688 if (last_pg > start_pg) {
3689 /*
3690 * we found a range of pages that must be filled...
3691 * if the last page in this range is the last page of the file,
3692 * we may have to clip the size of it to keep from reading past
3693 * the end of the last physical block associated with the file
3694 */
3695 upl_offset = start_pg * PAGE_SIZE;
3696 io_size = (last_pg - start_pg) * PAGE_SIZE;
3697
3698 if ((upl_f_offset + upl_offset + io_size) > filesize)
3699 io_size = filesize - (upl_f_offset + upl_offset);
3700
3701 /*
3702 * issue an asynchronous read to cluster_io
3703 */
3704 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
3705 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (buf_t)NULL, (struct clios *)NULL);
3706
3707 issued_io = 1;
3708 }
3709 }
3710 if (issued_io == 0)
3711 ubc_upl_abort(upl, 0);
3712
3713 io_size = upl_size - start_offset;
3714
3715 if (io_size > resid)
3716 io_size = resid;
3717 f_offset += io_size;
3718 resid -= io_size;
3719 }
3720
3721 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3722 (int)f_offset, resid, retval, 0, 0);
3723
3724 return(retval);
3725 }
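/*
 * Illustrative sketch (not part of the original source): one way a
 * filesystem might drive advisory_read() above to pre-fault a region of
 * a file into the cache.  The vnode, file size and request window are
 * hypothetical; only the advisory_read() signature is taken from this file.
 */
#if 0
static void
example_readahead_hint(vnode_t vp, off_t filesize, off_t start, int len)
{
	/*
	 * issue advisory (asynchronous, cached) reads for [start, start + len);
	 * advisory_read clips the request at filesize and skips pages that
	 * are already resident in the cache
	 */
	(void) advisory_read(vp, filesize, start, len);
}
#endif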
3726
3727
3728 int
3729 cluster_push(vnode_t vp, int flags)
3730 {
3731 int retval;
3732 struct cl_writebehind *wbp;
3733
3734 if ( !UBCINFOEXISTS(vp)) {
3735 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -1, 0);
3736 return (0);
3737 }
3738 /* return if deferred write is set */
3739 if (((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && (flags & IO_DEFWRITE)) {
3740 return (0);
3741 }
3742 if ((wbp = cluster_get_wbp(vp, CLW_RETURNLOCKED)) == NULL) {
3743 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -2, 0);
3744 return (0);
3745 }
3746 if (wbp->cl_number == 0 && wbp->cl_scmap == NULL) {
3747 lck_mtx_unlock(&wbp->cl_lockw);
3748
3749 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_NONE, (int)vp, flags, 0, -3, 0);
3750 return(0);
3751 }
3752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3753 (int)wbp->cl_scmap, wbp->cl_number, flags, 0, 0);
3754
3755 if (wbp->cl_scmap) {
3756 sparse_cluster_push(wbp, vp, ubc_getsize(vp), 1);
3757
3758 retval = 1;
3759 } else
3760 retval = cluster_try_push(wbp, vp, ubc_getsize(vp), 0, 1);
3761
3762 lck_mtx_unlock(&wbp->cl_lockw);
3763
3764 if (flags & IO_SYNC)
3765 (void)vnode_waitforwrites(vp, 0, 0, 0, (char *)"cluster_push");
3766
3767 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3768 (int)wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
3769
3770 return (retval);
3771 }
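/*
 * Illustrative sketch (not part of the original source): how a filesystem's
 * fsync-style path might drive cluster_push() above.  The vnode and the
 * decision to pass IO_SYNC are hypothetical; only the cluster_push()
 * signature and flag usage are taken from this file.
 */
#if 0
static int
example_flush_write_behind(vnode_t vp, int waitfor)
{
	/*
	 * push any delayed-write clusters held for this vnode; with IO_SYNC
	 * cluster_push also waits for the resulting writes to drain
	 */
	(void) cluster_push(vp, (waitfor == MNT_WAIT) ? IO_SYNC : 0);

	return (0);
}
#endif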
3772
3773
3774 __private_extern__ void
3775 cluster_release(struct ubc_info *ubc)
3776 {
3777 struct cl_writebehind *wbp;
3778 struct cl_readahead *rap;
3779
3780 if ((wbp = ubc->cl_wbehind)) {
3781
3782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
3783
3784 if (wbp->cl_scmap)
3785 vfs_drt_control(&(wbp->cl_scmap), 0);
3786 } else {
3787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)ubc, 0, 0, 0, 0);
3788 }
3789
3790 rap = ubc->cl_rahead;
3791
3792 if (wbp != NULL) {
3793 lck_mtx_destroy(&wbp->cl_lockw, cl_mtx_grp);
3794 FREE_ZONE((void *)wbp, sizeof *wbp, M_CLWRBEHIND);
3795 }
3796 if ((rap = ubc->cl_rahead)) {
3797 lck_mtx_destroy(&rap->cl_lockr, cl_mtx_grp);
3798 FREE_ZONE((void *)rap, sizeof *rap, M_CLRDAHEAD);
3799 }
3800 ubc->cl_rahead = NULL;
3801 ubc->cl_wbehind = NULL;
3802
3803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)ubc, (int)rap, (int)wbp, 0, 0);
3804 }
3805
3806
3807 static void
3808 cluster_push_EOF(vnode_t vp, off_t EOF)
3809 {
3810 struct cl_writebehind *wbp;
3811
3812 wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
3813
3814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3815 (int)wbp->cl_scmap, wbp->cl_number, (int)EOF, 0, 0);
3816
3817 if (wbp->cl_scmap)
3818 sparse_cluster_push(wbp, vp, EOF, 1);
3819 else
3820 cluster_try_push(wbp, vp, EOF, 0, 1);
3821
3822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3823 (int)wbp->cl_scmap, wbp->cl_number, 0, 0, 0);
3824
3825 lck_mtx_unlock(&wbp->cl_lockw);
3826 }
3827
3828
3829 static int
3830 cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int can_delay, int push_all)
3831 {
3832 int cl_index;
3833 int cl_index1;
3834 int min_index;
3835 int cl_len;
3836 int cl_pushed = 0;
3837 struct cl_wextent l_clusters[MAX_CLUSTERS];
3838
3839 /*
3840 * the write behind context exists and has
3841 * already been locked...
3842 *
3843 * make a local 'sorted' copy of the clusters
3844 * and clear wbp->cl_number so that new clusters can
3845 * be developed
3846 */
3847 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
3848 for (min_index = -1, cl_index1 = 0; cl_index1 < wbp->cl_number; cl_index1++) {
3849 if (wbp->cl_clusters[cl_index1].b_addr == wbp->cl_clusters[cl_index1].e_addr)
3850 continue;
3851 if (min_index == -1)
3852 min_index = cl_index1;
3853 else if (wbp->cl_clusters[cl_index1].b_addr < wbp->cl_clusters[min_index].b_addr)
3854 min_index = cl_index1;
3855 }
3856 if (min_index == -1)
3857 break;
3858 l_clusters[cl_index].b_addr = wbp->cl_clusters[min_index].b_addr;
3859 l_clusters[cl_index].e_addr = wbp->cl_clusters[min_index].e_addr;
3860 l_clusters[cl_index].io_nocache = wbp->cl_clusters[min_index].io_nocache;
3861
3862 wbp->cl_clusters[min_index].b_addr = wbp->cl_clusters[min_index].e_addr;
3863 }
3864 wbp->cl_number = 0;
3865
3866 cl_len = cl_index;
3867
3868 if (can_delay && cl_len == MAX_CLUSTERS) {
3869 int i;
3870
3871 /*
3872 * determine if we appear to be writing the file sequentially...
3873 * if not, by returning without having pushed any clusters
3874 * we will cause this vnode to be pushed into the sparse cluster mechanism
3875 * used for managing more random I/O patterns
3876 *
3877 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3878 * that's why we're in try_push with can_delay true...
3879 *
3880 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3881 * is adjacent to the next (i.e. we're looking for sequential writes)... they were sorted above,
3882 * so we can just make a simple pass through, up to, but not including the last one...
3883 * note that e_addr is not inclusive, so it will be equal to the b_addr of the next cluster if they
3884 * are sequential
3885 *
3886 * we let the last one be partial as long as it was adjacent to the previous one...
3887 * we need to do this to deal with multi-threaded servers that might write an I/O or two out
3888 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3889 */
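/*
 * worked example (illustrative page numbers only): if MAX_UPL_TRANSFER
 * were 256 pages, the sorted clusters [0,256) [256,512) [512,768) ...
 * all pass the test below, since each of the first MAX_CLUSTERS - 1
 * spans exactly MAX_UPL_TRANSFER pages and each e_addr equals the next
 * cluster's b_addr... a gap such as [0,256) [300,420) fails the
 * adjacency test and we jump to 'dont_try' without pushing anything
 */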
3890 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3891 if ((l_clusters[i].e_addr - l_clusters[i].b_addr) != MAX_UPL_TRANSFER)
3892 goto dont_try;
3893 if (l_clusters[i].e_addr != l_clusters[i+1].b_addr)
3894 goto dont_try;
3895 }
3896 }
3897 /*
3898 * drop the lock while we're firing off the I/Os...
3899 * this is safe since I'm working off of a private sorted copy
3900 * of the clusters, and I'm going to re-evaluate the public
3901 * state after I retake the lock
3902 */
3903 lck_mtx_unlock(&wbp->cl_lockw);
3904
3905 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3906 int flags;
3907 struct cl_extent cl;
3908
3909 /*
3910 * try to push each cluster in turn...
3911 */
3912 if (l_clusters[cl_index].io_nocache)
3913 flags = IO_NOCACHE;
3914 else
3915 flags = 0;
3916 cl.b_addr = l_clusters[cl_index].b_addr;
3917 cl.e_addr = l_clusters[cl_index].e_addr;
3918
3919 cluster_push_x(vp, &cl, EOF, flags);
3920
3921 l_clusters[cl_index].b_addr = 0;
3922 l_clusters[cl_index].e_addr = 0;
3923
3924 cl_pushed++;
3925
3926 if (push_all == 0)
3927 break;
3928 }
3929 lck_mtx_lock(&wbp->cl_lockw);
3930
3931 dont_try:
3932 if (cl_len > cl_pushed) {
3933 /*
3934 * we didn't push all of the clusters, so
3935 * lets try to merge them back in to the vnode
3936 */
3937 if ((MAX_CLUSTERS - wbp->cl_number) < (cl_len - cl_pushed)) {
3938 /*
3939 * we picked up some new clusters while we were trying to
3940 * push the old ones... this can happen because I've dropped
3941 * the write behind lock... the sum of the
3942 * leftovers plus the new cluster count exceeds our ability
3943 * to represent them, so switch to the sparse cluster mechanism
3944 *
3945 * collect the active public clusters...
3946 */
3947 sparse_cluster_switch(wbp, vp, EOF);
3948
3949 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3950 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3951 continue;
3952 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3953 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3954 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3955
3956 cl_index1++;
3957 }
3958 /*
3959 * update the cluster count
3960 */
3961 wbp->cl_number = cl_index1;
3962
3963 /*
3964 * and collect the original clusters that were moved into the
3965 * local storage for sorting purposes
3966 */
3967 sparse_cluster_switch(wbp, vp, EOF);
3968
3969 } else {
3970 /*
3971 * we've got room to merge the leftovers back in
3972 * just append them starting at the next 'hole'
3973 * represented by wbp->cl_number
3974 */
3975 for (cl_index = 0, cl_index1 = wbp->cl_number; cl_index < cl_len; cl_index++) {
3976 if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
3977 continue;
3978
3979 wbp->cl_clusters[cl_index1].b_addr = l_clusters[cl_index].b_addr;
3980 wbp->cl_clusters[cl_index1].e_addr = l_clusters[cl_index].e_addr;
3981 wbp->cl_clusters[cl_index1].io_nocache = l_clusters[cl_index].io_nocache;
3982
3983 cl_index1++;
3984 }
3985 /*
3986 * update the cluster count
3987 */
3988 wbp->cl_number = cl_index1;
3989 }
3990 }
3991 return(MAX_CLUSTERS - wbp->cl_number);
3992 }
3993
3994
3995
3996 static int
3997 cluster_push_x(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags)
3998 {
3999 upl_page_info_t *pl;
4000 upl_t upl;
4001 vm_offset_t upl_offset;
4002 int upl_size;
4003 off_t upl_f_offset;
4004 int pages_in_upl;
4005 int start_pg;
4006 int last_pg;
4007 int io_size;
4008 int io_flags;
4009 int upl_flags;
4010 int size;
4011 int error = 0;
4012 int retval;
4013 kern_return_t kret;
4014
4015
4016 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
4017 (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0);
4018
4019 if ((pages_in_upl = (int)(cl->e_addr - cl->b_addr)) == 0) {
4020 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
4021
4022 return (0);
4023 }
4024 upl_size = pages_in_upl * PAGE_SIZE;
4025 upl_f_offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4026
4027 if (upl_f_offset + upl_size >= EOF) {
4028
4029 if (upl_f_offset >= EOF) {
4030 /*
4031 * must have truncated the file and missed
4032 * clearing a dangling cluster (i.e. it's completely
4033 * beyond the new EOF)
4034 */
4035 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
4036
4037 return(0);
4038 }
4039 size = EOF - upl_f_offset;
4040
4041 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
4042 pages_in_upl = upl_size / PAGE_SIZE;
4043 } else
4044 size = upl_size;
4045
4046 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
4047
4048 /*
4049 * by asking for UPL_COPYOUT_FROM and UPL_RET_ONLY_DIRTY, we get the following desirable behavior
4050 *
4051 * - only pages that are currently dirty are returned... these are the ones we need to clean
4052 * - the hardware dirty bit is cleared when the page is gathered into the UPL... the software dirty bit is set
4053 * - if we have to abort the I/O for some reason, the software dirty bit is left set since we didn't clean the page
4054 * - when we commit the page, the software dirty bit is cleared... the hardware dirty bit is untouched so that if
4055 * someone dirties this page while the I/O is in progress, we don't lose track of the new state
4056 *
4057 * when the I/O completes, we no longer ask for an explicit clear of the DIRTY state (either soft or hard)
4058 */
4059
4060 if ((vp->v_flag & VNOCACHE_DATA) || (flags & IO_NOCACHE))
4061 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
4062 else
4063 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
4064
4065 kret = ubc_create_upl(vp,
4066 upl_f_offset,
4067 upl_size,
4068 &upl,
4069 &pl,
4070 upl_flags);
4071 if (kret != KERN_SUCCESS)
4072 panic("cluster_push: failed to get pagelist");
4073
4074 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
4075
4076 /*
4077 * since we only asked for the dirty pages back
4078 * it's possible that we may only get a few or even none, so...
4079 * before we start marching forward, we must make sure we know
4080 * where the last present page is in the UPL, otherwise we could
4081 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
4082 * employed by commit_range and abort_range.
4083 */
4084 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
4085 if (upl_page_present(pl, last_pg))
4086 break;
4087 }
4088 pages_in_upl = last_pg + 1;
4089
4090 if (pages_in_upl == 0) {
4091 ubc_upl_abort(upl, 0);
4092
4093 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
4094 return(0);
4095 }
4096
4097 for (last_pg = 0; last_pg < pages_in_upl; ) {
4098 /*
4099 * find the next dirty page in the UPL
4100 * this will become the first page in the
4101 * next I/O to generate
4102 */
4103 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
4104 if (upl_dirty_page(pl, start_pg))
4105 break;
4106 if (upl_page_present(pl, start_pg))
4107 /*
4108 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
4109 * just release these unchanged since we're not going
4110 * to steal them or change their state
4111 */
4112 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
4113 }
4114 if (start_pg >= pages_in_upl)
4115 /*
4116 * done... no more dirty pages to push
4117 */
4118 break;
4119 if (start_pg > last_pg)
4120 /*
4121 * skipped over some non-dirty pages
4122 */
4123 size -= ((start_pg - last_pg) * PAGE_SIZE);
4124
4125 /*
4126 * find a range of dirty pages to write
4127 */
4128 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
4129 if (!upl_dirty_page(pl, last_pg))
4130 break;
4131 }
4132 upl_offset = start_pg * PAGE_SIZE;
4133
4134 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
4135
4136 io_flags = CL_THROTTLE | CL_COMMIT;
4137
4138 if ( !(flags & IO_SYNC))
4139 io_flags |= CL_ASYNC;
4140
4141 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
4142 io_flags, (buf_t)NULL, (struct clios *)NULL);
4143
4144 if (error == 0 && retval)
4145 error = retval;
4146
4147 size -= io_size;
4148 }
4149 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
4150
4151 return(error);
4152 }
4153
4154
4155 /*
4156 * sparse_cluster_switch is called with the write behind lock held
4157 */
4158 static void
4159 sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF)
4160 {
4161 int cl_index;
4162
4163 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4164
4165 if (wbp->cl_scmap == NULL)
4166 wbp->cl_scdirty = 0;
4167
4168 for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
4169 int flags;
4170 struct cl_extent cl;
4171
4172 for (cl.b_addr = wbp->cl_clusters[cl_index].b_addr; cl.b_addr < wbp->cl_clusters[cl_index].e_addr; cl.b_addr++) {
4173
4174 if (ubc_page_op(vp, (off_t)(cl.b_addr * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
4175 if (flags & UPL_POP_DIRTY) {
4176 cl.e_addr = cl.b_addr + 1;
4177
4178 sparse_cluster_add(wbp, vp, &cl, EOF);
4179 }
4180 }
4181 }
4182 }
4183 wbp->cl_number = 0;
4184
4185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4186 }
4187
4188
4189 /*
4190 * sparse_cluster_push is called with the write behind lock held
4191 */
4192 static void
4193 sparse_cluster_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_all)
4194 {
4195 struct cl_extent cl;
4196 off_t offset;
4197 u_int length;
4198
4199 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, push_all, 0);
4200
4201 if (push_all)
4202 vfs_drt_control(&(wbp->cl_scmap), 1);
4203
4204 for (;;) {
4205 if (vfs_drt_get_cluster(&(wbp->cl_scmap), &offset, &length) != KERN_SUCCESS)
4206 break;
4207
4208 cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
4209 cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
4210
4211 wbp->cl_scdirty -= (int)(cl.e_addr - cl.b_addr);
4212
4213 cluster_push_x(vp, &cl, EOF, 0);
4214
4215 if (push_all == 0)
4216 break;
4217 }
4218 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4219 }
4220
4221
4222 /*
4223 * sparse_cluster_add is called with the write behind lock held
4224 */
4225 static void
4226 sparse_cluster_add(struct cl_writebehind *wbp, vnode_t vp, struct cl_extent *cl, off_t EOF)
4227 {
4228 u_int new_dirty;
4229 u_int length;
4230 off_t offset;
4231
4232 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)wbp->cl_scmap, wbp->cl_scdirty, (int)cl->b_addr, (int)cl->e_addr, 0);
4233
4234 offset = (off_t)(cl->b_addr * PAGE_SIZE_64);
4235 length = ((u_int)(cl->e_addr - cl->b_addr)) * PAGE_SIZE;
4236
4237 while (vfs_drt_mark_pages(&(wbp->cl_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
4238 /*
4239 * no room left in the map
4240 * only a partial update was done
4241 * push out some pages and try again
4242 */
4243 wbp->cl_scdirty += new_dirty;
4244
4245 sparse_cluster_push(wbp, vp, EOF, 0);
4246
4247 offset += (new_dirty * PAGE_SIZE_64);
4248 length -= (new_dirty * PAGE_SIZE);
4249 }
4250 wbp->cl_scdirty += new_dirty;
4251
4252 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)wbp->cl_scmap, wbp->cl_scdirty, 0, 0);
4253 }
4254
4255
4256 static int
4257 cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, int xsize, int flags)
4258 {
4259 upl_page_info_t *pl;
4260 upl_t upl;
4261 addr64_t ubc_paddr;
4262 kern_return_t kret;
4263 int error = 0;
4264 int did_read = 0;
4265 int abort_flags;
4266 int upl_flags;
4267
4268 upl_flags = UPL_SET_LITE;
4269 if (! (flags & CL_READ)) {
4270 /*
4271 * "write" operation: let the UPL subsystem know
4272 * that we intend to modify the buffer cache pages
4273 * we're gathering.
4274 */
4275 upl_flags |= UPL_WILL_MODIFY;
4276 }
4277
4278 kret = ubc_create_upl(vp,
4279 uio->uio_offset & ~PAGE_MASK_64,
4280 PAGE_SIZE,
4281 &upl,
4282 &pl,
4283 upl_flags);
4284
4285 if (kret != KERN_SUCCESS)
4286 return(EINVAL);
4287
4288 if (!upl_valid_page(pl, 0)) {
4289 /*
4290 * issue a synchronous read to cluster_io
4291 */
4292 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4293 CL_READ, (buf_t)NULL, (struct clios *)NULL);
4294 if (error) {
4295 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
4296
4297 return(error);
4298 }
4299 did_read = 1;
4300 }
4301 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
4302
4303 /*
4304 * NOTE: There is no prototype for the following in BSD. It, and the definitions
4305 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
4306 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
4307 * way to do so without exporting them to kexts as well.
4308 */
4309 if (flags & CL_READ)
4310 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
4311 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
4312 else
4313 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
4314 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
4315
4316 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
4317 /*
4318 * issue a synchronous write to cluster_io
4319 */
4320 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE,
4321 0, (buf_t)NULL, (struct clios *)NULL);
4322 }
4323 if (error == 0)
4324 uio_update(uio, (user_size_t)xsize);
4325
4326 if (did_read)
4327 abort_flags = UPL_ABORT_FREE_ON_EMPTY;
4328 else
4329 abort_flags = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
4330
4331 ubc_upl_abort_range(upl, 0, PAGE_SIZE, abort_flags);
4332
4333 return (error);
4334 }
4335
4336
4337
4338 int
4339 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
4340 {
4341 int pg_offset;
4342 int pg_index;
4343 int csize;
4344 int segflg;
4345 int retval = 0;
4346 upl_page_info_t *pl;
4347
4348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4349 (int)uio->uio_offset, uio_resid(uio), upl_offset, xsize, 0);
4350
4351 segflg = uio->uio_segflg;
4352
4353 switch(segflg) {
4354
4355 case UIO_USERSPACE32:
4356 case UIO_USERISPACE32:
4357 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4358 break;
4359
4360 case UIO_USERSPACE:
4361 case UIO_USERISPACE:
4362 uio->uio_segflg = UIO_PHYS_USERSPACE;
4363 break;
4364
4365 case UIO_USERSPACE64:
4366 case UIO_USERISPACE64:
4367 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4368 break;
4369
4370 case UIO_SYSSPACE32:
4371 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4372 break;
4373
4374 case UIO_SYSSPACE:
4375 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4376 break;
4377
4378 case UIO_SYSSPACE64:
4379 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4380 break;
4381 }
4382 pl = ubc_upl_pageinfo(upl);
4383
4384 pg_index = upl_offset / PAGE_SIZE;
4385 pg_offset = upl_offset & PAGE_MASK;
4386 csize = min(PAGE_SIZE - pg_offset, xsize);
4387
4388 while (xsize && retval == 0) {
4389 addr64_t paddr;
4390
4391 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
4392
4393 retval = uiomove64(paddr, csize, uio);
4394
4395 pg_index += 1;
4396 pg_offset = 0;
4397 xsize -= csize;
4398 csize = min(PAGE_SIZE, xsize);
4399 }
4400 uio->uio_segflg = segflg;
4401
4402 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4403 (int)uio->uio_offset, uio_resid(uio), retval, segflg, 0);
4404
4405 return (retval);
4406 }
4407
4408
4409 int
4410 cluster_copy_ubc_data(vnode_t vp, struct uio *uio, int *io_resid, int mark_dirty)
4411 {
4412 int segflg;
4413 int io_size;
4414 int xsize;
4415 int start_offset;
4416 int retval = 0;
4417 memory_object_control_t control;
4418
4419
4420 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
4421 (int)uio->uio_offset, uio_resid(uio), 0, *io_resid, 0);
4422
4423 control = ubc_getobject(vp, UBC_FLAGS_NONE);
4424 if (control == MEMORY_OBJECT_CONTROL_NULL) {
4425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4426 (int)uio->uio_offset, uio_resid(uio), retval, 3, 0);
4427
4428 return(0);
4429 }
4430 segflg = uio->uio_segflg;
4431
4432 switch(segflg) {
4433
4434 case UIO_USERSPACE32:
4435 case UIO_USERISPACE32:
4436 uio->uio_segflg = UIO_PHYS_USERSPACE32;
4437 break;
4438
4439 case UIO_USERSPACE64:
4440 case UIO_USERISPACE64:
4441 uio->uio_segflg = UIO_PHYS_USERSPACE64;
4442 break;
4443
4444 case UIO_SYSSPACE32:
4445 uio->uio_segflg = UIO_PHYS_SYSSPACE32;
4446 break;
4447
4448 case UIO_SYSSPACE64:
4449 uio->uio_segflg = UIO_PHYS_SYSSPACE64;
4450 break;
4451
4452 case UIO_USERSPACE:
4453 case UIO_USERISPACE:
4454 uio->uio_segflg = UIO_PHYS_USERSPACE;
4455 break;
4456
4457 case UIO_SYSSPACE:
4458 uio->uio_segflg = UIO_PHYS_SYSSPACE;
4459 break;
4460 }
4461
4462 if ( (io_size = *io_resid) ) {
4463 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
4464 xsize = uio_resid(uio);
4465
4466 retval = memory_object_control_uiomove(control, uio->uio_offset - start_offset,
4467 uio, start_offset, io_size, mark_dirty);
4468 xsize -= uio_resid(uio);
4469 io_size -= xsize;
4470 }
4471 uio->uio_segflg = segflg;
4472 *io_resid = io_size;
4473
4474 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
4475 (int)uio->uio_offset, uio_resid(uio), retval, 0x80000000 | segflg, 0);
4476
4477 return(retval);
4478 }
4479
4480
4481 int
4482 is_file_clean(vnode_t vp, off_t filesize)
4483 {
4484 off_t f_offset;
4485 int flags;
4486 int total_dirty = 0;
4487
4488 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4489 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4490 if (flags & UPL_POP_DIRTY) {
4491 total_dirty++;
4492 }
4493 }
4494 }
4495 if (total_dirty)
4496 return(EINVAL);
4497
4498 return (0);
4499 }
4500
4501
4502
4503 /*
4504 * Dirty region tracking/clustering mechanism.
4505 *
4506 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4507 * dirty regions within a larger space (file). It is primarily intended to
4508 * support clustering in large files with many dirty areas.
4509 *
4510 * The implementation assumes that the dirty regions are pages.
4511 *
4512 * To represent dirty pages within the file, we store bit vectors in a
4513 * variable-size circular hash.
4514 */
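/*
 * Illustrative sketch (not part of the original source): the calling
 * pattern the sparse cluster code above uses, shown in isolation.
 * 'scmap', 'offset', 'length' and 'new_dirty' are hypothetical locals;
 * the vfs_drt_* signatures are the ones defined below.
 */
#if 0
static void
example_drt_usage(void)
{
	void	*scmap = NULL;		/* opaque map, allocated on first mark */
	off_t	offset;
	u_int	length;
	int	new_dirty;

	/* record a page-aligned dirty region (the map grows/compacts as needed) */
	(void) vfs_drt_mark_pages(&scmap, (off_t)0x100000, 3 * PAGE_SIZE, &new_dirty);

	/* pull back contiguous runs of dirty pages until the map drains */
	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS) {
		/* ... push pages in [offset, offset + length) ... */
	}
	/*
	 * scmap is freed and NULLed by the final vfs_drt_get_cluster failure;
	 * vfs_drt_control(&scmap, 0) would release it explicitly instead
	 */
}
#endif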
4515
4516 /*
4517 * Bitvector size. This determines the number of pages we group in a
4518 * single hashtable entry. Each hashtable entry is aligned to this
4519 * size within the file.
4520 */
4521 #define DRT_BITVECTOR_PAGES 256
4522
4523 /*
4524 * File offset handling.
4525 *
4526 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4527 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4528 */
4529 #define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4530 #define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
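/*
 * worked example: with DRT_BITVECTOR_PAGES == 256 and 4K pages, each
 * bucket covers 1MB of the file, so DRT_ALIGN_ADDRESS(0x012A3456) yields
 * 0x01200000 (the low 20 bits are cleared by DRT_ADDRESS_MASK)
 */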
4531
4532 /*
4533 * Hashtable address field handling.
4534 *
4535 * The low-order bits of the hashtable address are used to conserve
4536 * space.
4537 *
4538 * DRT_HASH_COUNT_MASK must be large enough to store the range
4539 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4540 * to indicate that the bucket is actually unoccupied.
4541 */
4542 #define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4543 #define DRT_HASH_SET_ADDRESS(scm, i, a) \
4544 do { \
4545 (scm)->scm_hashtable[(i)].dhe_control = \
4546 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4547 } while (0)
4548 #define DRT_HASH_COUNT_MASK 0x1ff
4549 #define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4550 #define DRT_HASH_SET_COUNT(scm, i, c) \
4551 do { \
4552 (scm)->scm_hashtable[(i)].dhe_control = \
4553 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4554 } while (0)
4555 #define DRT_HASH_CLEAR(scm, i) \
4556 do { \
4557 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4558 } while (0)
4559 #define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4560 #define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
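/*
 * a bucket is marked vacant by setting its count field to DRT_HASH_COUNT_MASK
 * (0x1ff == 511), a value that can never be a legitimate page count since a
 * bucket covers at most DRT_BITVECTOR_PAGES (256) pages
 */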
4561 #define DRT_HASH_COPY(oscm, oi, scm, i) \
4562 do { \
4563 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4564 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4565 } while(0)
4566
4567
4568 /*
4569 * Hash table moduli.
4570 *
4571 * Since the hashtable entry's size is dependent on the size of
4572 * the bitvector, and since the hashtable size is constrained to
4573 * both being prime and fitting within the desired allocation
4574 * size, these values need to be manually determined.
4575 *
4576 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4577 *
4578 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4579 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4580 */
4581 #define DRT_HASH_SMALL_MODULUS 23
4582 #define DRT_HASH_LARGE_MODULUS 401
4583
4584 #define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4585 #define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
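/*
 * sizing check (informational): 23 * 40 == 920 bytes of hashtable in the
 * 1024-byte small allocation and 401 * 40 == 16040 bytes in the 16384-byte
 * large allocation; the 'spare' figures above are the allocation size minus
 * the hashtable alone and do not account for the clustermap header fields
 */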
4586
4587 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4588
4589 /*
4590 * Hashtable bitvector handling.
4591 *
4592 * Bitvector fields are 32 bits long.
4593 */
4594
4595 #define DRT_HASH_SET_BIT(scm, i, bit) \
4596 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4597
4598 #define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4599 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4600
4601 #define DRT_HASH_TEST_BIT(scm, i, bit) \
4602 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4603
4604 #define DRT_BITVECTOR_CLEAR(scm, i) \
4605 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4606
4607 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4608 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4609 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4610 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
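/*
 * worked example: page 37 within a bucket is tracked by
 * dhe_bitvector[37 / 32] == dhe_bitvector[1], bit (37 % 32) == 5
 */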
4611
4612
4613
4614 /*
4615 * Hashtable entry.
4616 */
4617 struct vfs_drt_hashentry {
4618 u_int64_t dhe_control;
4619 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4620 };
4621
4622 /*
4623 * Dirty Region Tracking structure.
4624 *
4625 * The hashtable is allocated entirely inside the DRT structure.
4626 *
4627 * The hash is a simple circular prime modulus arrangement, the structure
4628 * is resized from small to large if it overflows.
4629 */
4630
4631 struct vfs_drt_clustermap {
4632 u_int32_t scm_magic; /* sanity/detection */
4633 #define DRT_SCM_MAGIC 0x12020003
4634 u_int32_t scm_modulus; /* current ring size */
4635 u_int32_t scm_buckets; /* number of occupied buckets */
4636 u_int32_t scm_lastclean; /* last entry we cleaned */
4637 u_int32_t scm_iskips; /* number of slot skips */
4638
4639 struct vfs_drt_hashentry scm_hashtable[0];
4640 };
4641
4642
4643 #define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4644 #define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
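/*
 * worked example: buckets are keyed by the 1MB-aligned byte offset, so with
 * the small modulus an offset of 25MB (25 * 1048576 == 26214400) hashes to
 * slot 26214400 % 23 == 12, and DRT_HASH_NEXT wraps slot 22 back to slot 0
 */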
4645
4646 /*
4647 * Debugging codes and arguments.
4648 */
4649 #define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4650 #define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4651 #define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4652 #define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4653 #define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4654 * dirty */
4655 /* 0, setcount */
4656 /* 1 (clean, no map) */
4657 /* 2 (map alloc fail) */
4658 /* 3, resid (partial) */
4659 #define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4660 #define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4661 * lastclean, iskips */
4662
4663
4664 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4665 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4666 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4667 u_int64_t offset, int *indexp);
4668 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4669 u_int64_t offset,
4670 int *indexp,
4671 int recursed);
4672 static kern_return_t vfs_drt_do_mark_pages(
4673 void **cmapp,
4674 u_int64_t offset,
4675 u_int length,
4676 int *setcountp,
4677 int dirty);
4678 static void vfs_drt_trace(
4679 struct vfs_drt_clustermap *cmap,
4680 int code,
4681 int arg1,
4682 int arg2,
4683 int arg3,
4684 int arg4);
4685
4686
4687 /*
4688 * Allocate and initialise a sparse cluster map.
4689 *
4690 * Will allocate a new map, resize or compact an existing map.
4691 *
4692 * XXX we should probably have at least one intermediate map size,
4693 * as the 1:16 ratio seems a bit drastic.
4694 */
4695 static kern_return_t
4696 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4697 {
4698 struct vfs_drt_clustermap *cmap, *ocmap;
4699 kern_return_t kret;
4700 u_int64_t offset;
4701 int nsize, i, active_buckets, index, copycount;
4702
4703 ocmap = NULL;
4704 if (cmapp != NULL)
4705 ocmap = *cmapp;
4706
4707 /*
4708 * Decide on the size of the new map.
4709 */
4710 if (ocmap == NULL) {
4711 nsize = DRT_HASH_SMALL_MODULUS;
4712 } else {
4713 /* count the number of active buckets in the old map */
4714 active_buckets = 0;
4715 for (i = 0; i < ocmap->scm_modulus; i++) {
4716 if (!DRT_HASH_VACANT(ocmap, i) &&
4717 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4718 active_buckets++;
4719 }
4720 /*
4721 * If we're currently using the small allocation, check to
4722 * see whether we should grow to the large one.
4723 */
4724 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4725 /* if the ring is nearly full */
4726 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4727 nsize = DRT_HASH_LARGE_MODULUS;
4728 } else {
4729 nsize = DRT_HASH_SMALL_MODULUS;
4730 }
4731 } else {
4732 /* already using the large modulus */
4733 nsize = DRT_HASH_LARGE_MODULUS;
4734 /*
4735 * If the ring is completely full, there's
4736 * nothing useful for us to do. Behave as
4737 * though we had compacted into the new
4738 * array and return.
4739 */
4740 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4741 return(KERN_SUCCESS);
4742 }
4743 }
4744
4745 /*
4746 * Allocate and initialise the new map.
4747 */
4748
4749 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4750 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4751 if (kret != KERN_SUCCESS)
4752 return(kret);
4753 cmap->scm_magic = DRT_SCM_MAGIC;
4754 cmap->scm_modulus = nsize;
4755 cmap->scm_buckets = 0;
4756 cmap->scm_lastclean = 0;
4757 cmap->scm_iskips = 0;
4758 for (i = 0; i < cmap->scm_modulus; i++) {
4759 DRT_HASH_CLEAR(cmap, i);
4760 DRT_HASH_VACATE(cmap, i);
4761 DRT_BITVECTOR_CLEAR(cmap, i);
4762 }
4763
4764 /*
4765 * If there's an old map, re-hash entries from it into the new map.
4766 */
4767 copycount = 0;
4768 if (ocmap != NULL) {
4769 for (i = 0; i < ocmap->scm_modulus; i++) {
4770 /* skip empty buckets */
4771 if (DRT_HASH_VACANT(ocmap, i) ||
4772 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4773 continue;
4774 /* get new index */
4775 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4776 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4777 if (kret != KERN_SUCCESS) {
4778 /* XXX need to bail out gracefully here */
4779 panic("vfs_drt: new cluster map mysteriously too small");
4780 }
4781 /* copy */
4782 DRT_HASH_COPY(ocmap, i, cmap, index);
4783 copycount++;
4784 }
4785 }
4786
4787 /* log what we've done */
4788 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4789
4790 /*
4791 * It's important to ensure that *cmapp always points to
4792 * a valid map, so we must overwrite it before freeing
4793 * the old map.
4794 */
4795 *cmapp = cmap;
4796 if (ocmap != NULL) {
4797 /* emit stats into trace buffer */
4798 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4799 ocmap->scm_modulus,
4800 ocmap->scm_buckets,
4801 ocmap->scm_lastclean,
4802 ocmap->scm_iskips);
4803
4804 vfs_drt_free_map(ocmap);
4805 }
4806 return(KERN_SUCCESS);
4807 }
4808
4809
4810 /*
4811 * Free a sparse cluster map.
4812 */
4813 static kern_return_t
4814 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4815 {
4816 kmem_free(kernel_map, (vm_offset_t)cmap,
4817 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4818 return(KERN_SUCCESS);
4819 }
4820
4821
4822 /*
4823 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4824 */
4825 static kern_return_t
4826 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4827 {
4828 int index, i;
4829
4830 offset = DRT_ALIGN_ADDRESS(offset);
4831 index = DRT_HASH(cmap, offset);
4832
4833 /* traverse the hashtable */
4834 for (i = 0; i < cmap->scm_modulus; i++) {
4835
4836 /*
4837 * If the slot is vacant, we can stop.
4838 */
4839 if (DRT_HASH_VACANT(cmap, index))
4840 break;
4841
4842 /*
4843 * If the address matches our offset, we have success.
4844 */
4845 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4846 *indexp = index;
4847 return(KERN_SUCCESS);
4848 }
4849
4850 /*
4851 * Move to the next slot, try again.
4852 */
4853 index = DRT_HASH_NEXT(cmap, index);
4854 }
4855 /*
4856 * It's not there.
4857 */
4858 return(KERN_FAILURE);
4859 }
4860
4861 /*
4862 * Find the hashtable slot for the supplied offset. If we haven't allocated
4863 * one yet, allocate one and populate the address field. Note that it will
4864 * have a zero page count and thus will still technically be free, so
4865 * in the case where we are called to clean pages, the slot will remain free.
4866 */
4867 static kern_return_t
4868 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4869 {
4870 struct vfs_drt_clustermap *cmap;
4871 kern_return_t kret;
4872 int index, i;
4873
4874 cmap = *cmapp;
4875
4876 /* look for an existing entry */
4877 kret = vfs_drt_search_index(cmap, offset, indexp);
4878 if (kret == KERN_SUCCESS)
4879 return(kret);
4880
4881 /* need to allocate an entry */
4882 offset = DRT_ALIGN_ADDRESS(offset);
4883 index = DRT_HASH(cmap, offset);
4884
4885 /* scan from the index forwards looking for a vacant slot */
4886 for (i = 0; i < cmap->scm_modulus; i++) {
4887 /* slot vacant? */
4888 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4889 cmap->scm_buckets++;
4890 if (index < cmap->scm_lastclean)
4891 cmap->scm_lastclean = index;
4892 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4893 DRT_HASH_SET_COUNT(cmap, index, 0);
4894 DRT_BITVECTOR_CLEAR(cmap, index);
4895 *indexp = index;
4896 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4897 return(KERN_SUCCESS);
4898 }
4899 cmap->scm_iskips += i;
4900 index = DRT_HASH_NEXT(cmap, index);
4901 }
4902
4903 /*
4904 * We haven't found a vacant slot, so the map is full. If we're not
4905 * already recursed, try reallocating/compacting it.
4906 */
4907 if (recursed)
4908 return(KERN_FAILURE);
4909 kret = vfs_drt_alloc_map(cmapp);
4910 if (kret == KERN_SUCCESS) {
4911 /* now try to insert again */
4912 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4913 }
4914 return(kret);
4915 }
4916
4917 /*
4918 * Implementation of set dirty/clean.
4919 *
4920 * In the 'clean' case, not finding a map is OK.
4921 */
4922 static kern_return_t
4923 vfs_drt_do_mark_pages(
4924 void **private,
4925 u_int64_t offset,
4926 u_int length,
4927 int *setcountp,
4928 int dirty)
4929 {
4930 struct vfs_drt_clustermap *cmap, **cmapp;
4931 kern_return_t kret;
4932 int i, index, pgoff, pgcount, setcount, ecount;
4933
4934 cmapp = (struct vfs_drt_clustermap **)private;
4935 cmap = *cmapp;
4936
4937 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4938
4939 if (setcountp != NULL)
4940 *setcountp = 0;
4941
4942 /* allocate a cluster map if we don't already have one */
4943 if (cmap == NULL) {
4944 /* no cluster map, nothing to clean */
4945 if (!dirty) {
4946 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4947 return(KERN_SUCCESS);
4948 }
4949 kret = vfs_drt_alloc_map(cmapp);
4950 if (kret != KERN_SUCCESS) {
4951 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4952 return(kret);
4953 }
4954 }
4955 setcount = 0;
4956
4957 /*
4958 * Iterate over the length of the region.
4959 */
4960 while (length > 0) {
4961 /*
4962 * Get the hashtable index for this offset.
4963 *
4964 * XXX this will add blank entries if we are clearing a range
4965 * that hasn't been dirtied.
4966 */
4967 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4968 cmap = *cmapp; /* may have changed! */
4969 /* this may be a partial-success return */
4970 if (kret != KERN_SUCCESS) {
4971 if (setcountp != NULL)
4972 *setcountp = setcount;
4973 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4974
4975 return(kret);
4976 }
4977
4978 /*
4979 * Work out how many pages we're modifying in this
4980 * hashtable entry.
4981 */
4982 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4983 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4984
4985 /*
4986 * Iterate over pages, dirty/clearing as we go.
4987 */
4988 ecount = DRT_HASH_GET_COUNT(cmap, index);
4989 for (i = 0; i < pgcount; i++) {
4990 if (dirty) {
4991 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4992 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4993 ecount++;
4994 setcount++;
4995 }
4996 } else {
4997 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4998 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4999 ecount--;
5000 setcount++;
5001 }
5002 }
5003 }
5004 DRT_HASH_SET_COUNT(cmap, index, ecount);
5005
5006 offset += pgcount * PAGE_SIZE;
5007 length -= pgcount * PAGE_SIZE;
5008 }
5009 if (setcountp != NULL)
5010 *setcountp = setcount;
5011
5012 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
5013
5014 return(KERN_SUCCESS);
5015 }
5016
5017 /*
5018 * Mark a set of pages as dirty/clean.
5019 *
5020 * This is a public interface.
5021 *
5022 * cmapp
5023 * Pointer to storage suitable for holding a pointer. Note that
5024 * this must either be NULL or a value set by this function.
5025 *
5026 * size
5027 * Current file size in bytes (not part of the current interface; see the XXX note below).
5028 *
5029 * offset
5030 * Offset of the first page to be marked as dirty, in bytes. Must be
5031 * page-aligned.
5032 *
5033 * length
5034 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
5035 *
5036 * setcountp
5037 * Number of pages newly marked dirty by this call (optional).
5038 *
5039 * Returns KERN_SUCCESS if all the pages were successfully marked.
5040 */
5041 static kern_return_t
5042 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
5043 {
5044 /* XXX size unused, drop from interface */
5045 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
5046 }
5047
5048 #if 0
5049 static kern_return_t
5050 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
5051 {
5052 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
5053 }
5054 #endif
5055
5056 /*
5057 * Get a cluster of dirty pages.
5058 *
5059 * This is a public interface.
5060 *
5061 * cmapp
5062 * Pointer to storage managed by drt_mark_pages. Note that this must
5063 * be NULL or a value set by drt_mark_pages.
5064 *
5065 * offsetp
5066 * Returns the byte offset into the file of the first page in the cluster.
5067 *
5068 * lengthp
5069 * Returns the length in bytes of the cluster of dirty pages.
5070 *
5071 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
5072 * are no dirty pages meeting the minimum size criteria. Private storage will
5073 * be released if there are no more dirty pages left in the map.
5074 *
5075 */
5076 static kern_return_t
5077 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
5078 {
5079 struct vfs_drt_clustermap *cmap;
5080 u_int64_t offset;
5081 u_int length;
5082 int index, i, j, fs, ls;
5083
5084 /* sanity */
5085 if ((cmapp == NULL) || (*cmapp == NULL))
5086 return(KERN_FAILURE);
5087 cmap = *cmapp;
5088
5089 /* walk the hashtable */
5090 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
5091 index = DRT_HASH(cmap, offset);
5092
5093 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
5094 continue;
5095
5096 /* scan the bitfield for a string of bits */
5097 fs = -1;
5098
5099 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5100 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
5101 fs = i;
5102 break;
5103 }
5104 }
5105 if (fs == -1) {
5106 /* didn't find any bits set */
5107 panic("vfs_drt: entry summary count > 0 but no bits set in map");
5108 }
5109 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
5110 if (!DRT_HASH_TEST_BIT(cmap, index, i))
5111 break;
5112 }
5113
5114 /* compute offset and length, mark pages clean */
5115 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
5116 length = ls * PAGE_SIZE;
5117 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
5118 cmap->scm_lastclean = index;
5119
5120 /* return successful */
5121 *offsetp = (off_t)offset;
5122 *lengthp = length;
5123
5124 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
5125 return(KERN_SUCCESS);
5126 }
5127 /*
5128 * We didn't find anything... the hashtable is empty, so
5129 * emit stats into trace buffer and
5130 * then free it
5131 */
5132 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5133 cmap->scm_modulus,
5134 cmap->scm_buckets,
5135 cmap->scm_lastclean,
5136 cmap->scm_iskips);
5137
5138 vfs_drt_free_map(cmap);
5139 *cmapp = NULL;
5140
5141 return(KERN_FAILURE);
5142 }
5143
5144
5145 static kern_return_t
5146 vfs_drt_control(void **cmapp, int op_type)
5147 {
5148 struct vfs_drt_clustermap *cmap;
5149
5150 /* sanity */
5151 if ((cmapp == NULL) || (*cmapp == NULL))
5152 return(KERN_FAILURE);
5153 cmap = *cmapp;
5154
5155 switch (op_type) {
5156 case 0:
5157 /* emit stats into trace buffer */
5158 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
5159 cmap->scm_modulus,
5160 cmap->scm_buckets,
5161 cmap->scm_lastclean,
5162 cmap->scm_iskips);
5163
5164 vfs_drt_free_map(cmap);
5165 *cmapp = NULL;
5166 break;
5167
5168 case 1:
5169 cmap->scm_lastclean = 0;
5170 break;
5171 }
5172 return(KERN_SUCCESS);
5173 }
5174
5175
5176
5177 /*
5178 * Emit a summary of the state of the clustermap into the trace buffer
5179 * along with some caller-provided data.
5180 */
5181 #if KDEBUG
5182 static void
5183 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
5184 {
5185 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
5186 }
5187 #else
5188 static void
5189 vfs_drt_trace(__unused struct vfs_drt_clustermap *cmap, __unused int code,
5190 __unused int arg1, __unused int arg2, __unused int arg3,
5191 __unused int arg4)
5192 {
5193 }
5194 #endif
5195
5196 #if 0
5197 /*
5198 * Perform basic sanity check on the hash entry summary count
5199 * vs. the actual bits set in the entry.
5200 */
5201 static void
5202 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
5203 {
5204 int index, i;
5205 int bits_on;
5206
5207 for (index = 0; index < cmap->scm_modulus; index++) {
5208 if (DRT_HASH_VACANT(cmap, index))
5209 continue;
5210
5211 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
5212 if (DRT_HASH_TEST_BIT(cmap, index, i))
5213 bits_on++;
5214 }
5215 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
5216 panic("bits_on = %d, index = %d\n", bits_on, index);
5217 }
5218 }
5219 #endif