]> git.saurik.com Git - apple/xnu.git/blob - bsd/vfs/vfs_cluster.c
xnu-517.12.7.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_cluster.c
1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/time.h>
66 #include <sys/kernel.h>
67 #include <sys/resourcevar.h>
68 #include <libkern/libkern.h>
69 #include <machine/machine_routines.h>
70
71 #include <sys/ubc.h>
72 #include <vm/vm_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object_types.h>
76
77 #include <sys/kdebug.h>
78
/*
 * Flag bits accepted in the 'flags' argument of cluster_io().
 * The per-flag notes describe what cluster_io()/cluster_iodone()
 * in this file do when the bit is set.
 */
79 #define CL_READ 0x01 /* issue a read (B_READ); when clear the I/O is marked B_WRITEINPROG */
80 #define CL_ASYNC 0x02 /* don't biowait; completion is handled by the cluster_iodone callback */
81 #define CL_COMMIT 0x04 /* commit/abort the affected UPL range when the I/O finishes or fails (B_COMMIT_UPL) */
82 #define CL_PAGEOUT 0x10 /* request is a pageout (B_PAGEOUT) */
83 #define CL_AGE 0x20 /* sets B_AGE, which adds UPL_COMMIT_INACTIVATE at commit time */
84 #define CL_DUMP 0x40 /* sets B_NOCACHE, which makes completion abort the UPL range instead of committing it */
85 #define CL_NOZERO 0x80 /* on reads, don't zero the tail of a final partial page */
86 #define CL_PAGEIN 0x100 /* request is a pagein (B_PGIN); failures abort with UPL_ABORT_ERROR */
87 #define CL_DEV_MEMORY 0x200 /* treat the request as one 'giant' page; holes are rejected with EINVAL */
88 #define CL_PRESERVE 0x400 /* sets B_PHYS; on a setup error the range is committed rather than aborted */
89 #define CL_THROTTLE 0x800 /* sleep while the vnode's v_numoutput is at or above the throttle limit */
90
91
/*
 * State shared between an issuer of a stream of I/Os and the
 * completion path.  cluster_io() adds to io_issued for every buf it
 * prepares (and subtracts again if the bufs are never issued because
 * of an error); cluster_iodone() adds to io_completed, records the
 * first error in io_error, and if io_wanted is set clears it and
 * does a wakeup() on the address of io_wanted.
 */
92 struct clios {
93 u_int io_completed; /* amount of io that has currently completed */
94 u_int io_issued; /* amount of io that was successfully issued */
95 int io_error; /* error code of first error encountered */
96 int io_wanted; /* someone is sleeping waiting for a change in state */
97 };
98
99
100 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
101 int size, struct buf *bp);
102 static int cluster_read_x(struct vnode *vp, struct uio *uio,
103 off_t filesize, int devblocksize, int flags);
104 static int cluster_write_x(struct vnode *vp, struct uio *uio,
105 off_t oldEOF, off_t newEOF, off_t headOff,
106 off_t tailOff, int devblocksize, int flags);
107 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
108 off_t filesize, int devblocksize, int flags);
109 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
110 off_t newEOF, int devblocksize, int flags);
111 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
112 off_t filesize, int devblocksize, int flags);
113 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
114 off_t newEOF, int devblocksize, int flags);
115 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
116 addr64_t usr_paddr, int xsize, int devblocksize, int flags);
117 static int cluster_push_x(struct vnode *vp, off_t EOF, unsigned int first, unsigned int last, int can_delay);
118 static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);
119
120 static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
121 static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
122 static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);
123
124 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
125 static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
126 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
127 static kern_return_t vfs_drt_control(void **cmapp, int op_type);
128
129 int ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
130
131
132 /*
133 * throttle the number of async writes that
134 * can be outstanding on a single vnode
135 * before we issue a synchronous write
136 */
137 #define ASYNC_THROTTLE 18 /* normal v_numoutput limit used by cluster_io; cluster_iodone wakes sleepers at <= ASYNC_THROTTLE / 3 */
138 #define HARD_THROTTLE_MAXCNT 1 /* v_numoutput limit used instead when cluster_hard_throttle_on() returns 1 */
139 #define HARD_THROTTLE_MAXSIZE (64 * 1024) /* cap applied to max_iosize in cluster_io while hard throttled */
140
/*
 * Inputs to cluster_hard_throttle_on(): a non-zero hard_throttle_on_root
 * forces throttling of I/O on the root device, and
 * priority_IO_timestamp_for_root is compared against the current time.
 * NOTE(review): neither variable is written in this part of the file;
 * presumably they are updated elsewhere -- confirm.
 */
141 int hard_throttle_on_root = 0;
142 struct timeval priority_IO_timestamp_for_root;
143
144
145 static int
146 cluster_hard_throttle_on(vp)
147 struct vnode *vp;
148 {
149 static struct timeval hard_throttle_maxelapsed = { 0, 300000 };
150
151 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
152 struct timeval elapsed;
153
154 if (hard_throttle_on_root)
155 return(1);
156
157 elapsed = time;
158 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
159
160 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
161 return(1);
162 }
163 return(0);
164 }
165
166
167 static int
168 cluster_iodone(bp)
169 struct buf *bp;
170 {
171 int b_flags;
172 int error;
173 int total_size;
174 int total_resid;
175 int upl_offset;
176 int zero_offset;
177 upl_t upl;
178 struct buf *cbp;
179 struct buf *cbp_head;
180 struct buf *cbp_next;
181 struct buf *real_bp;
182 struct vnode *vp;
183 struct clios *iostate;
184 int commit_size;
185 int pg_offset;
186
187
188 cbp_head = (struct buf *)(bp->b_trans_head);
189
190 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
191 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
192
193 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
194 /*
195 * all I/O requests that are part of this transaction
196 * have to complete before we can process it
197 */
198 if ( !(cbp->b_flags & B_DONE)) {
199
200 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
201 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
202
203 return 0;
204 }
205 }
206 error = 0;
207 total_size = 0;
208 total_resid = 0;
209
210 cbp = cbp_head;
211 upl_offset = cbp->b_uploffset;
212 upl = cbp->b_pagelist;
213 b_flags = cbp->b_flags;
214 real_bp = cbp->b_real_bp;
215 vp = cbp->b_vp;
216 zero_offset= cbp->b_validend;
217 iostate = (struct clios *)cbp->b_iostate;
218
219 while (cbp) {
220 if ((cbp->b_flags & B_ERROR) && error == 0)
221 error = cbp->b_error;
222
223 total_resid += cbp->b_resid;
224 total_size += cbp->b_bcount;
225
226 cbp_next = cbp->b_trans_next;
227
228 free_io_buf(cbp);
229
230 cbp = cbp_next;
231 }
232 if (zero_offset)
233 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
234
235 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
236 vp->v_flag &= ~VTHROTTLED;
237 wakeup((caddr_t)&vp->v_numoutput);
238 }
239 if (iostate) {
240 /*
241 * someone has issued multiple I/Os asynchrounsly
242 * and is waiting for them to complete (streaming)
243 */
244 if (error && iostate->io_error == 0)
245 iostate->io_error = error;
246
247 iostate->io_completed += total_size;
248
249 if (iostate->io_wanted) {
250 /*
251 * someone is waiting for the state of
252 * this io stream to change
253 */
254 iostate->io_wanted = 0;
255 wakeup((caddr_t)&iostate->io_wanted);
256 }
257 }
258 if ((b_flags & B_NEED_IODONE) && real_bp) {
259 if (error) {
260 real_bp->b_flags |= B_ERROR;
261 real_bp->b_error = error;
262 }
263 real_bp->b_resid = total_resid;
264
265 biodone(real_bp);
266 }
267 if (error == 0 && total_resid)
268 error = EIO;
269
270 if (b_flags & B_COMMIT_UPL) {
271 pg_offset = upl_offset & PAGE_MASK;
272 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
273
274 if (error || (b_flags & B_NOCACHE)) {
275 int upl_abort_code;
276
277 if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
278 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
279 else if (b_flags & B_PGIN)
280 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
281 else
282 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
283
284 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
285 upl_abort_code);
286
287 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
288 (int)upl, upl_offset - pg_offset, commit_size,
289 0x80000000|upl_abort_code, 0);
290
291 } else {
292 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
293
294 if (b_flags & B_PHYS) {
295 if (b_flags & B_READ)
296 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
297 } else if ( !(b_flags & B_PAGEOUT))
298 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
299
300 if (b_flags & B_AGE)
301 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
302
303 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
304 upl_commit_flags);
305
306 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
307 (int)upl, upl_offset - pg_offset, commit_size,
308 upl_commit_flags, 0);
309 }
310 } else
311 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
312 (int)upl, upl_offset, 0, error, 0);
313
314 return (error);
315 }
316
317
318 static void
319 cluster_zero(upl, upl_offset, size, bp)
320 upl_t upl;
321 vm_offset_t upl_offset;
322 int size;
323 struct buf *bp;
324 {
325 upl_page_info_t *pl;
326
327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
328 upl_offset, size, (int)bp, 0, 0);
329
330 if (bp == NULL || bp->b_data == NULL) {
331
332 pl = ubc_upl_pageinfo(upl);
333
334 while (size) {
335 int page_offset;
336 int page_index;
337 addr64_t zero_addr;
338 int zero_cnt;
339
340 page_index = upl_offset / PAGE_SIZE;
341 page_offset = upl_offset & PAGE_MASK;
342
343 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
344 zero_cnt = min(PAGE_SIZE - page_offset, size);
345
346 bzero_phys(zero_addr, zero_cnt);
347
348 size -= zero_cnt;
349 upl_offset += zero_cnt;
350 }
351 } else
352 bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);
353
354 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
355 upl_offset, size, 0, 0, 0);
356 }
357
358 static int
359 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
360 struct vnode *vp;
361 upl_t upl;
362 vm_offset_t upl_offset;
363 off_t f_offset;
364 int non_rounded_size;
365 int devblocksize;
366 int flags;
367 struct buf *real_bp;
368 struct clios *iostate;
369 {
370 struct buf *cbp;
371 u_int size;
372 u_int io_size;
373 int io_flags;
374 int error = 0;
375 int retval = 0;
376 struct buf *cbp_head = 0;
377 struct buf *cbp_tail = 0;
378 int buf_count = 0;
379 int pg_count;
380 int pg_offset;
381 u_int max_iosize;
382 u_int max_vectors;
383 int priv;
384 int zero_offset = 0;
385 int async_throttle;
386
387 if (devblocksize)
388 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
389 else
390 size = non_rounded_size;
391
392 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
393 (int)f_offset, size, upl_offset, flags, 0);
394
395
396 if (flags & CL_READ) {
397 io_flags = (B_VECTORLIST | B_READ);
398
399 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
400 } else {
401 io_flags = (B_VECTORLIST | B_WRITEINPROG);
402
403 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
404 }
405 /*
406 * make sure the maximum iosize are at least the size of a page
407 * and that they are multiples of the page size
408 */
409 max_iosize &= ~PAGE_MASK;
410
411 if (flags & CL_THROTTLE) {
412 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
413 if (max_iosize > HARD_THROTTLE_MAXSIZE)
414 max_iosize = HARD_THROTTLE_MAXSIZE;
415 async_throttle = HARD_THROTTLE_MAXCNT;
416 } else
417 async_throttle = ASYNC_THROTTLE;
418 }
419 if (flags & CL_AGE)
420 io_flags |= B_AGE;
421 if (flags & CL_DUMP)
422 io_flags |= B_NOCACHE;
423 if (flags & CL_PAGEIN)
424 io_flags |= B_PGIN;
425 if (flags & CL_PAGEOUT)
426 io_flags |= B_PAGEOUT;
427 if (flags & CL_COMMIT)
428 io_flags |= B_COMMIT_UPL;
429 if (flags & CL_PRESERVE)
430 io_flags |= B_PHYS;
431
432 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
433 /*
434 * then we are going to end up
435 * with a page that we can't complete (the file size wasn't a multiple
436 * of PAGE_SIZE and we're trying to read to the end of the file
437 * so we'll go ahead and zero out the portion of the page we can't
438 * read in from the file
439 */
440 zero_offset = upl_offset + non_rounded_size;
441 }
442 while (size) {
443 int vsize;
444 int i;
445 int pg_resid;
446 int num_contig;
447 daddr_t lblkno;
448 daddr_t blkno;
449
450 if (size > max_iosize)
451 io_size = max_iosize;
452 else
453 io_size = size;
454
455 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
456 if (error == EOPNOTSUPP)
457 panic("VOP_CMAP Unimplemented");
458 break;
459 }
460
461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
462 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
463
464 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
465 if (flags & CL_PAGEOUT) {
466 error = EINVAL;
467 break;
468 };
469
470 /* Try paging out the page individually before
471 giving up entirely and dumping it (it could
472 be mapped in a "hole" and require allocation
473 before the I/O:
474 */
475 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
476 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
477 error = EINVAL;
478 break;
479 };
480
481 f_offset += PAGE_SIZE_64;
482 upl_offset += PAGE_SIZE;
483 size -= PAGE_SIZE;
484 continue;
485 }
486 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
487 /*
488 * we have now figured out how much I/O we can do - this is in 'io_size'
489 * pg_offset is the starting point in the first page for the I/O
490 * pg_count is the number of full and partial pages that 'io_size' encompasses
491 */
492 pg_offset = upl_offset & PAGE_MASK;
493
494 if (flags & CL_DEV_MEMORY) {
495 /*
496 * currently, can't deal with reading 'holes' in file
497 */
498 if ((long)blkno == -1) {
499 error = EINVAL;
500 break;
501 }
502 /*
503 * treat physical requests as one 'giant' page
504 */
505 pg_count = 1;
506 } else
507 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
508
509 if ((flags & CL_READ) && (long)blkno == -1) {
510 int bytes_to_zero;
511
512 /*
513 * if we're reading and blkno == -1, then we've got a
514 * 'hole' in the file that we need to deal with by zeroing
515 * out the affected area in the upl
516 */
517 if (zero_offset && io_size == size) {
518 /*
519 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
520 * than 'zero_offset' will be non-zero
521 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
522 * (indicated by the io_size finishing off the I/O request for this UPL)
523 * than we're not going to issue an I/O for the
524 * last page in this upl... we need to zero both the hole and the tail
525 * of the page beyond the EOF, since the delayed zero-fill won't kick in
526 */
527 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
528
529 zero_offset = 0;
530 } else
531 bytes_to_zero = io_size;
532
533 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
534
535 if (cbp_head)
536 /*
537 * if there is a current I/O chain pending
538 * then the first page of the group we just zero'd
539 * will be handled by the I/O completion if the zero
540 * fill started in the middle of the page
541 */
542 pg_count = (io_size - pg_offset) / PAGE_SIZE;
543 else {
544 /*
545 * no pending I/O to pick up that first page
546 * so, we have to make sure it gets committed
547 * here.
548 * set the pg_offset to 0 so that the upl_commit_range
549 * starts with this page
550 */
551 pg_count = (io_size + pg_offset) / PAGE_SIZE;
552 pg_offset = 0;
553 }
554 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
555 /*
556 * if we're done with the request for this UPL
557 * then we have to make sure to commit the last page
558 * even if we only partially zero-filled it
559 */
560 pg_count++;
561
562 if (pg_count) {
563 if (pg_offset)
564 pg_resid = PAGE_SIZE - pg_offset;
565 else
566 pg_resid = 0;
567
568 if (flags & CL_COMMIT)
569 ubc_upl_commit_range(upl,
570 (upl_offset + pg_resid) & ~PAGE_MASK,
571 pg_count * PAGE_SIZE,
572 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
573 }
574 upl_offset += io_size;
575 f_offset += io_size;
576 size -= io_size;
577
578 if (cbp_head && pg_count)
579 goto start_io;
580 continue;
581
582 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
583 real_bp->b_blkno = blkno;
584 }
585
586 if (pg_count > max_vectors) {
587 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
588
589 if (io_size < 0) {
590 io_size = PAGE_SIZE - pg_offset;
591 pg_count = 1;
592 } else
593 pg_count = max_vectors;
594 }
595
596 if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
597 /*
598 * if we're not targeting a virtual device i.e. a disk image
599 * it's safe to dip into the reserve pool since real devices
600 * can complete this I/O request without requiring additional
601 * bufs from the alloc_io_buf pool
602 */
603 priv = 1;
604 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
605 /*
606 * Throttle the speculative IO
607 */
608 priv = 0;
609 else
610 priv = 1;
611
612 cbp = alloc_io_buf(vp, priv);
613
614
615 if (flags & CL_PAGEOUT) {
616 for (i = 0; i < pg_count; i++) {
617 int s;
618 struct buf *bp;
619
620 s = splbio();
621 if (bp = incore(vp, lblkno + i)) {
622 if (!ISSET(bp->b_flags, B_BUSY)) {
623 bremfree(bp);
624 SET(bp->b_flags, (B_BUSY | B_INVAL));
625 splx(s);
626 brelse(bp);
627 } else
628 panic("BUSY bp found in cluster_io");
629 }
630 splx(s);
631 }
632 }
633 if (flags & CL_ASYNC) {
634 cbp->b_flags |= (B_CALL | B_ASYNC);
635 cbp->b_iodone = (void *)cluster_iodone;
636 }
637 cbp->b_flags |= io_flags;
638
639 cbp->b_lblkno = lblkno;
640 cbp->b_blkno = blkno;
641 cbp->b_bcount = io_size;
642 cbp->b_pagelist = upl;
643 cbp->b_uploffset = upl_offset;
644 cbp->b_trans_next = (struct buf *)0;
645
646 if (cbp->b_iostate = (void *)iostate)
647 /*
648 * caller wants to track the state of this
649 * io... bump the amount issued against this stream
650 */
651 iostate->io_issued += io_size;
652
653 if (flags & CL_READ)
654 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
655 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
656 else
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659
660 if (cbp_head) {
661 cbp_tail->b_trans_next = cbp;
662 cbp_tail = cbp;
663 } else {
664 cbp_head = cbp;
665 cbp_tail = cbp;
666 }
667 (struct buf *)(cbp->b_trans_head) = cbp_head;
668 buf_count++;
669
670 upl_offset += io_size;
671 f_offset += io_size;
672 size -= io_size;
673
674 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
675 /*
676 * if we have no more I/O to issue or
677 * the current I/O we've prepared fully
678 * completes the last page in this request
679 * and it's either an ASYNC request or
680 * we've already accumulated more than 8 I/O's into
681 * this transaction and it's not an I/O directed to
682 * special DEVICE memory
683 * then go ahead and issue the I/O
684 */
685 start_io:
686 if (real_bp) {
687 cbp_head->b_flags |= B_NEED_IODONE;
688 cbp_head->b_real_bp = real_bp;
689 } else
690 cbp_head->b_real_bp = (struct buf *)NULL;
691
692 if (size == 0) {
693 /*
694 * we're about to issue the last I/O for this upl
695 * if this was a read to the eof and the eof doesn't
696 * finish on a page boundary, than we need to zero-fill
697 * the rest of the page....
698 */
699 cbp_head->b_validend = zero_offset;
700 } else
701 cbp_head->b_validend = 0;
702
703 if (flags & CL_THROTTLE) {
704 while (vp->v_numoutput >= async_throttle) {
705 vp->v_flag |= VTHROTTLED;
706 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
707 }
708 }
709 for (cbp = cbp_head; cbp;) {
710 struct buf * cbp_next;
711
712 if (io_flags & B_WRITEINPROG)
713 cbp->b_vp->v_numoutput++;
714
715 cbp_next = cbp->b_trans_next;
716
717 (void) VOP_STRATEGY(cbp);
718 cbp = cbp_next;
719 }
720 if ( !(flags & CL_ASYNC)) {
721 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
722 biowait(cbp);
723
724 if (error = cluster_iodone(cbp_head)) {
725 if ((flags & CL_PAGEOUT) && (error == ENXIO))
726 retval = 0; /* drop the error */
727 else
728 retval = error;
729 error = 0;
730 }
731 }
732 cbp_head = (struct buf *)0;
733 cbp_tail = (struct buf *)0;
734
735 buf_count = 0;
736 }
737 }
738 if (error) {
739 int abort_size;
740
741 io_size = 0;
742
743 for (cbp = cbp_head; cbp;) {
744 struct buf * cbp_next;
745
746 upl_offset -= cbp->b_bcount;
747 size += cbp->b_bcount;
748 io_size += cbp->b_bcount;
749
750 cbp_next = cbp->b_trans_next;
751 free_io_buf(cbp);
752 cbp = cbp_next;
753 }
754 if (iostate) {
755 /*
756 * update the error condition for this stream
757 * since we never really issued the io
758 * just go ahead and adjust it back
759 */
760 if (iostate->io_error == 0)
761 iostate->io_error = error;
762 iostate->io_issued -= io_size;
763
764 if (iostate->io_wanted) {
765 /*
766 * someone is waiting for the state of
767 * this io stream to change
768 */
769 iostate->io_wanted = 0;
770 wakeup((caddr_t)&iostate->io_wanted);
771 }
772 }
773 pg_offset = upl_offset & PAGE_MASK;
774 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
775
776 if (flags & CL_COMMIT) {
777 int upl_abort_code;
778
779 if (flags & CL_PRESERVE) {
780 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
781 UPL_COMMIT_FREE_ON_EMPTY);
782 } else {
783 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
784 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
785 else if (flags & CL_PAGEIN)
786 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
787 else
788 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
789
790 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
791 upl_abort_code);
792 }
793 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
794 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
795 }
796 if (real_bp) {
797 real_bp->b_flags |= B_ERROR;
798 real_bp->b_error = error;
799
800 biodone(real_bp);
801 }
802 if (retval == 0)
803 retval = error;
804 }
805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
806 (int)f_offset, size, upl_offset, retval, 0);
807
808 return (retval);
809 }
810
811
812 static int
813 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
814 struct vnode *vp;
815 off_t f_offset;
816 u_int size;
817 off_t filesize;
818 int devblocksize;
819 {
820 int pages_in_prefetch;
821
822 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
823 (int)f_offset, size, (int)filesize, 0, 0);
824
825 if (f_offset >= filesize) {
826 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
827 (int)f_offset, 0, 0, 0, 0);
828 return(0);
829 }
830 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
831 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
832 else
833 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
834
835 if ((off_t)size > (filesize - f_offset))
836 size = filesize - f_offset;
837 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
838
839 advisory_read(vp, filesize, f_offset, size, devblocksize);
840
841 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
842 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
843
844 return (pages_in_prefetch);
845 }
846
847
848
849 static void
850 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
851 struct vnode *vp;
852 daddr_t b_lblkno;
853 daddr_t e_lblkno;
854 off_t filesize;
855 int devblocksize;
856 {
857 daddr_t r_lblkno;
858 off_t f_offset;
859 int size_of_prefetch;
860
861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
862 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
863
864 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
866 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
867 return;
868 }
869 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
870 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
871 vp->v_ralen = 0;
872 vp->v_maxra = 0;
873
874 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
875 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
876
877 return;
878 }
879 if (e_lblkno < vp->v_maxra) {
880 if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {
881
882 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
883 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
884 return;
885 }
886 }
887 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
888 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
889
890 size_of_prefetch = 0;
891
892 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
893
894 if (size_of_prefetch) {
895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
896 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
897 return;
898 }
899 if (f_offset < filesize) {
900 vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;
901
902 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
903 vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);
904
905 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
906
907 if (size_of_prefetch)
908 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
909 }
910 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
911 vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
912 }
913
914 int
915 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
916 struct vnode *vp;
917 upl_t upl;
918 vm_offset_t upl_offset;
919 off_t f_offset;
920 int size;
921 off_t filesize;
922 int devblocksize;
923 int flags;
924 {
925 int io_size;
926 int rounded_size;
927 off_t max_size;
928 int local_flags;
929
930 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
931 /*
932 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
933 * then we don't want to enforce this throttle... if we do, we can
934 * potentially deadlock since we're stalling the pageout thread at a time
935 * when the disk image might need additional memory (which won't be available
936 * if the pageout thread can't run)... instead we'll just depend on the throttle
937 * that the pageout thread now has in place to deal with external files
938 */
939 local_flags = CL_PAGEOUT;
940 else
941 local_flags = CL_PAGEOUT | CL_THROTTLE;
942
943 if ((flags & UPL_IOSYNC) == 0)
944 local_flags |= CL_ASYNC;
945 if ((flags & UPL_NOCOMMIT) == 0)
946 local_flags |= CL_COMMIT;
947
948
949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
950 (int)f_offset, size, (int)filesize, local_flags, 0);
951
952 /*
953 * If they didn't specify any I/O, then we are done...
954 * we can't issue an abort because we don't know how
955 * big the upl really is
956 */
957 if (size <= 0)
958 return (EINVAL);
959
960 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
961 if (local_flags & CL_COMMIT)
962 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
963 return (EROFS);
964 }
965 /*
966 * can't page-in from a negative offset
967 * or if we're starting beyond the EOF
968 * or if the file offset isn't page aligned
969 * or the size requested isn't a multiple of PAGE_SIZE
970 */
971 if (f_offset < 0 || f_offset >= filesize ||
972 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
973 if (local_flags & CL_COMMIT)
974 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
975 return (EINVAL);
976 }
977 max_size = filesize - f_offset;
978
979 if (size < max_size)
980 io_size = size;
981 else
982 io_size = max_size;
983
984 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
985
986 if (size > rounded_size) {
987 if (local_flags & CL_COMMIT)
988 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
989 UPL_ABORT_FREE_ON_EMPTY);
990 }
991 vp->v_flag |= VHASBEENPAGED;
992
993 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
994 local_flags, (struct buf *)0, (struct clios *)0));
995 }
996
997 int
998 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
999 struct vnode *vp;
1000 upl_t upl;
1001 vm_offset_t upl_offset;
1002 off_t f_offset;
1003 int size;
1004 off_t filesize;
1005 int devblocksize;
1006 int flags;
1007 {
1008 u_int io_size;
1009 int rounded_size;
1010 off_t max_size;
1011 int retval;
1012 int local_flags = 0;
1013
1014 if (upl == NULL || size < 0)
1015 panic("cluster_pagein: NULL upl passed in");
1016
1017 if ((flags & UPL_IOSYNC) == 0)
1018 local_flags |= CL_ASYNC;
1019 if ((flags & UPL_NOCOMMIT) == 0)
1020 local_flags |= CL_COMMIT;
1021
1022
1023 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1024 (int)f_offset, size, (int)filesize, local_flags, 0);
1025
1026 /*
1027 * can't page-in from a negative offset
1028 * or if we're starting beyond the EOF
1029 * or if the file offset isn't page aligned
1030 * or the size requested isn't a multiple of PAGE_SIZE
1031 */
1032 if (f_offset < 0 || f_offset >= filesize ||
1033 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1034 if (local_flags & CL_COMMIT)
1035 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1036 return (EINVAL);
1037 }
1038 max_size = filesize - f_offset;
1039
1040 if (size < max_size)
1041 io_size = size;
1042 else
1043 io_size = max_size;
1044
1045 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1046
1047 if (size > rounded_size && (local_flags & CL_COMMIT))
1048 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1049 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1050
1051 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1052 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1053
1054 if (retval == 0) {
1055 int b_lblkno;
1056 int e_lblkno;
1057
1058 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1059 e_lblkno = (int)
1060 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1061
1062 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1063 /*
1064 * we haven't read the last page in of the file yet
1065 * so let's try to read ahead if we're in
1066 * a sequential access pattern
1067 */
1068 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1069 }
1070 vp->v_lastr = e_lblkno;
1071 }
1072 return (retval);
1073 }
1074
1075 int
1076 cluster_bp(bp)
1077 struct buf *bp;
1078 {
1079 off_t f_offset;
1080 int flags;
1081
1082 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1083 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1084
1085 if (bp->b_pagelist == (upl_t) 0)
1086 panic("cluster_bp: can't handle NULL upl yet\n");
1087 if (bp->b_flags & B_READ)
1088 flags = CL_ASYNC | CL_READ;
1089 else
1090 flags = CL_ASYNC;
1091
1092 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1093
1094 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1095 }
1096
1097 int
1098 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1099 struct vnode *vp;
1100 struct uio *uio;
1101 off_t oldEOF;
1102 off_t newEOF;
1103 off_t headOff;
1104 off_t tailOff;
1105 int devblocksize;
1106 int flags;
1107 {
1108 int prev_resid;
1109 int clip_size;
1110 off_t max_io_size;
1111 struct iovec *iov;
1112 int upl_size;
1113 int upl_flags;
1114 upl_t upl;
1115 int retval = 0;
1116
1117
1118 if (vp->v_flag & VHASBEENPAGED)
1119 {
1120 /*
1121 * this vnode had pages cleaned to it by
1122 * the pager which indicates that either
1123 * it's not very 'hot', or the system is
1124 * being overwhelmed by a lot of dirty
1125 * data being delayed in the VM cache...
1126 * in either event, we'll push our remaining
1127 * delayed data at this point... this will
1128 * be more efficient than paging out 1 page at
1129 * a time, and will also act as a throttle
1130 * by delaying this client from writing any
1131 * more data until all his delayed data has
1132 * at least been queued to the uderlying driver.
1133 */
1134 cluster_push(vp);
1135
1136 vp->v_flag &= ~VHASBEENPAGED;
1137 }
1138
1139 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1140 {
1141 /*
1142 * go do a write through the cache if one of the following is true....
1143 * NOCACHE is not true
1144 * there is no uio structure or it doesn't target USERSPACE
1145 */
1146 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
1147 }
1148
1149 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1150 {
1151 /*
1152 * we know we have a resid, so this is safe
1153 * skip over any emtpy vectors
1154 */
1155 iov = uio->uio_iov;
1156
1157 while (iov->iov_len == 0) {
1158 uio->uio_iov++;
1159 uio->uio_iovcnt--;
1160 iov = uio->uio_iov;
1161 }
1162 upl_size = PAGE_SIZE;
1163 upl_flags = UPL_QUERY_OBJECT_TYPE;
1164
1165 if ((vm_map_get_upl(current_map(),
1166 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1167 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
1168 {
1169 /*
1170 * the user app must have passed in an invalid address
1171 */
1172 return (EFAULT);
1173 }
1174
1175 /*
1176 * We check every vector target but if it is physically
1177 * contiguous space, we skip the sanity checks.
1178 */
1179 if (upl_flags & UPL_PHYS_CONTIG)
1180 {
1181 if (flags & IO_HEADZEROFILL)
1182 {
1183 flags &= ~IO_HEADZEROFILL;
1184
1185 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1186 return(retval);
1187 }
1188
1189 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1190
1191 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1192 {
1193 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
1194 }
1195 }
1196 else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1197 {
1198 /*
1199 * we're here because we're don't have a physically contiguous target buffer
1200 * go do a write through the cache if one of the following is true....
1201 * the total xfer size is less than a page...
1202 * we're being asked to ZEROFILL either the head or the tail of the I/O...
1203 */
1204 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
1205 }
1206 else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
1207 {
1208 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
1209 {
1210 /*
1211 * Bring the file offset write up to a pagesize boundary
1212 * this will also bring the base address to a page boundary
1213 * since they both are currently on the same offset within a page
1214 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1215 * so the computed clip_size must always be less than the current uio_resid
1216 */
1217 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1218
1219 /*
1220 * Fake the resid going into the cluster_write_x call
1221 * and restore it on the way out.
1222 */
1223 prev_resid = uio->uio_resid;
1224 uio->uio_resid = clip_size;
1225 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1226 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1227 }
1228 else
1229 {
1230 /*
1231 * can't get both the file offset and the buffer offset aligned to a page boundary
1232 * so fire an I/O through the cache for this entire vector
1233 */
1234 clip_size = iov->iov_len;
1235 prev_resid = uio->uio_resid;
1236 uio->uio_resid = clip_size;
1237 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1238 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1239 }
1240 }
1241 else
1242 {
1243 /*
1244 * If we come in here, we know the offset into
1245 * the file is on a pagesize boundary and the
1246 * target buffer address is also on a page boundary
1247 */
1248 max_io_size = newEOF - uio->uio_offset;
1249 clip_size = uio->uio_resid;
1250 if (iov->iov_len < clip_size)
1251 clip_size = iov->iov_len;
1252 if (max_io_size < clip_size)
1253 clip_size = max_io_size;
1254
1255 if (clip_size < PAGE_SIZE)
1256 {
1257 /*
1258 * Take care of tail end of write in this vector
1259 */
1260 prev_resid = uio->uio_resid;
1261 uio->uio_resid = clip_size;
1262 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1263 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1264 }
1265 else
1266 {
1267 /* round clip_size down to a multiple of pagesize */
1268 clip_size = clip_size & ~(PAGE_MASK);
1269 prev_resid = uio->uio_resid;
1270 uio->uio_resid = clip_size;
1271 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1272 if ((retval == 0) && uio->uio_resid)
1273 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1274 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1275 }
1276 } /* end else */
1277 } /* end while */
1278 return(retval);
1279 }
1280
1281
1282 static int
1283 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1284 struct vnode *vp;
1285 struct uio *uio;
1286 off_t newEOF;
1287 int devblocksize;
1288 int flags;
1289 {
1290 upl_t upl;
1291 upl_page_info_t *pl;
1292 off_t upl_f_offset;
1293 vm_offset_t upl_offset;
1294 off_t max_io_size;
1295 int io_size;
1296 int io_flag;
1297 int upl_size;
1298 int upl_needed_size;
1299 int pages_in_pl;
1300 int upl_flags;
1301 kern_return_t kret;
1302 struct iovec *iov;
1303 int i;
1304 int force_data_sync;
1305 int error = 0;
1306 struct clios iostate;
1307
1308 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1309 (int)uio->uio_offset, (int)uio->uio_resid,
1310 (int)newEOF, devblocksize, 0);
1311
1312 /*
1313 * When we enter this routine, we know
1314 * -- the offset into the file is on a pagesize boundary
1315 * -- the resid is a page multiple
1316 * -- the resid will not exceed iov_len
1317 */
1318 cluster_try_push(vp, newEOF, 0, 1);
1319
1320 iostate.io_completed = 0;
1321 iostate.io_issued = 0;
1322 iostate.io_error = 0;
1323 iostate.io_wanted = 0;
1324
1325 iov = uio->uio_iov;
1326
1327 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1328 io_size = uio->uio_resid;
1329
1330 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1331 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1332
1333 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
1334 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1335
1336 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1337 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1338
1339 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1340 pages_in_pl = 0;
1341 upl_size = upl_needed_size;
1342 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1343 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1344
1345 kret = vm_map_get_upl(current_map(),
1346 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1347 &upl_size,
1348 &upl,
1349 NULL,
1350 &pages_in_pl,
1351 &upl_flags,
1352 force_data_sync);
1353
1354 if (kret != KERN_SUCCESS) {
1355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1356 0, 0, 0, kret, 0);
1357 /*
1358 * cluster_nocopy_write: failed to get pagelist
1359 *
1360 * we may have already spun some portion of this request
1361 * off as async requests... we need to wait for the I/O
1362 * to complete before returning
1363 */
1364 goto wait_for_writes;
1365 }
1366 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1367 pages_in_pl = upl_size / PAGE_SIZE;
1368
1369 for (i = 0; i < pages_in_pl; i++) {
1370 if (!upl_valid_page(pl, i))
1371 break;
1372 }
1373 if (i == pages_in_pl)
1374 break;
1375
1376 /*
1377 * didn't get all the pages back that we
1378 * needed... release this upl and try again
1379 */
1380 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1381 UPL_ABORT_FREE_ON_EMPTY);
1382 }
1383 if (force_data_sync >= 3) {
1384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1385 i, pages_in_pl, upl_size, kret, 0);
1386 /*
1387 * for some reason, we couldn't acquire a hold on all
1388 * the pages needed in the user's address space
1389 *
1390 * we may have already spun some portion of this request
1391 * off as async requests... we need to wait for the I/O
1392 * to complete before returning
1393 */
1394 goto wait_for_writes;
1395 }
1396
1397 /*
1398 * Consider the possibility that upl_size wasn't satisfied.
1399 */
1400 if (upl_size != upl_needed_size)
1401 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1402
1403 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1404 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1405
1406 if (io_size == 0) {
1407 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1408 UPL_ABORT_FREE_ON_EMPTY);
1409 /*
1410 * we may have already spun some portion of this request
1411 * off as async requests... we need to wait for the I/O
1412 * to complete before returning
1413 */
1414 goto wait_for_writes;
1415 }
1416 /*
1417 * Now look for pages already in the cache
1418 * and throw them away.
1419 * uio->uio_offset is page aligned within the file
1420 * io_size is a multiple of PAGE_SIZE
1421 */
1422 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1423
1424 /*
1425 * we want push out these writes asynchronously so that we can overlap
1426 * the preparation of the next I/O
1427 * if there are already too many outstanding writes
1428 * wait until some complete before issuing the next
1429 */
1430 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1431 iostate.io_wanted = 1;
1432 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1433 }
1434 if (iostate.io_error) {
1435 /*
1436 * one of the earlier writes we issued ran into a hard error
1437 * don't issue any more writes, cleanup the UPL
1438 * that was just created but not used, then
1439 * go wait for all writes that are part of this stream
1440 * to complete before returning the error to the caller
1441 */
1442 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1443 UPL_ABORT_FREE_ON_EMPTY);
1444
1445 goto wait_for_writes;
1446 }
1447 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1448
1449 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1450 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1451
1452 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1453 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1454
1455 iov->iov_len -= io_size;
1456 iov->iov_base += io_size;
1457 uio->uio_resid -= io_size;
1458 uio->uio_offset += io_size;
1459
1460 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1461 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1462
1463 } /* end while */
1464
1465 wait_for_writes:
1466 /*
1467 * make sure all async writes issued as part of this stream
1468 * have completed before we return
1469 */
1470 while (iostate.io_issued != iostate.io_completed) {
1471 iostate.io_wanted = 1;
1472 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1473 }
1474 if (iostate.io_error)
1475 error = iostate.io_error;
1476
1477 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1478 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1479
1480 return (error);
1481 }
1482
1483
1484 static int
1485 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1486 struct vnode *vp;
1487 struct uio *uio;
1488 off_t newEOF;
1489 int devblocksize;
1490 int flags;
1491 {
1492 upl_page_info_t *pl;
1493 addr64_t src_paddr;
1494 upl_t upl;
1495 vm_offset_t upl_offset;
1496 int tail_size;
1497 int io_size;
1498 int upl_size;
1499 int upl_needed_size;
1500 int pages_in_pl;
1501 int upl_flags;
1502 kern_return_t kret;
1503 struct iovec *iov;
1504 int error = 0;
1505
1506 /*
1507 * When we enter this routine, we know
1508 * -- the resid will not exceed iov_len
1509 * -- the vector target address is physcially contiguous
1510 */
1511 cluster_try_push(vp, newEOF, 0, 1);
1512
1513 iov = uio->uio_iov;
1514 io_size = iov->iov_len;
1515 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
1516 upl_needed_size = upl_offset + io_size;
1517
1518 pages_in_pl = 0;
1519 upl_size = upl_needed_size;
1520 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1521 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1522
1523 kret = vm_map_get_upl(current_map(),
1524 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1525 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1526
1527 if (kret != KERN_SUCCESS) {
1528 /*
1529 * cluster_phys_write: failed to get pagelist
1530 * note: return kret here
1531 */
1532 return(EINVAL);
1533 }
1534 /*
1535 * Consider the possibility that upl_size wasn't satisfied.
1536 * This is a failure in the physical memory case.
1537 */
1538 if (upl_size < upl_needed_size) {
1539 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1540 return(EINVAL);
1541 }
1542 pl = ubc_upl_pageinfo(upl);
1543
1544 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
1545
1546 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1547 int head_size;
1548
1549 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1550
1551 if (head_size > io_size)
1552 head_size = io_size;
1553
1554 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1555
1556 if (error) {
1557 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1558
1559 return(EINVAL);
1560 }
1561 upl_offset += head_size;
1562 src_paddr += head_size;
1563 io_size -= head_size;
1564 }
1565 tail_size = io_size & (devblocksize - 1);
1566 io_size -= tail_size;
1567
1568 if (io_size) {
1569 /*
1570 * issue a synchronous write to cluster_io
1571 */
1572 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1573 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1574 }
1575 if (error == 0) {
1576 /*
1577 * The cluster_io write completed successfully,
1578 * update the uio structure
1579 */
1580 uio->uio_resid -= io_size;
1581 iov->iov_len -= io_size;
1582 iov->iov_base += io_size;
1583 uio->uio_offset += io_size;
1584 src_paddr += io_size;
1585
1586 if (tail_size)
1587 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1588 }
1589 /*
1590 * just release our hold on the physically contiguous
1591 * region without changing any state
1592 */
1593 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1594
1595 return (error);
1596 }
1597
1598
1599 static int
1600 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1601 struct vnode *vp;
1602 struct uio *uio;
1603 off_t oldEOF;
1604 off_t newEOF;
1605 off_t headOff;
1606 off_t tailOff;
1607 int devblocksize;
1608 int flags;
1609 {
1610 upl_page_info_t *pl;
1611 upl_t upl;
1612 vm_offset_t upl_offset;
1613 int upl_size;
1614 off_t upl_f_offset;
1615 int pages_in_upl;
1616 int start_offset;
1617 int xfer_resid;
1618 int io_size;
1619 int io_flags;
1620 int io_offset;
1621 int bytes_to_zero;
1622 int bytes_to_move;
1623 kern_return_t kret;
1624 int retval = 0;
1625 int uio_resid;
1626 long long total_size;
1627 long long zero_cnt;
1628 off_t zero_off;
1629 long long zero_cnt1;
1630 off_t zero_off1;
1631 daddr_t start_blkno;
1632 daddr_t last_blkno;
1633 int intersection;
1634
1635
1636 if (uio) {
1637 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1638 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1639
1640 uio_resid = uio->uio_resid;
1641 } else {
1642 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1643 0, 0, (int)oldEOF, (int)newEOF, 0);
1644
1645 uio_resid = 0;
1646 }
1647 zero_cnt = 0;
1648 zero_cnt1 = 0;
1649
1650 if (flags & IO_HEADZEROFILL) {
1651 /*
1652 * some filesystems (HFS is one) don't support unallocated holes within a file...
1653 * so we zero fill the intervening space between the old EOF and the offset
1654 * where the next chunk of real data begins.... ftruncate will also use this
1655 * routine to zero fill to the new EOF when growing a file... in this case, the
1656 * uio structure will not be provided
1657 */
1658 if (uio) {
1659 if (headOff < uio->uio_offset) {
1660 zero_cnt = uio->uio_offset - headOff;
1661 zero_off = headOff;
1662 }
1663 } else if (headOff < newEOF) {
1664 zero_cnt = newEOF - headOff;
1665 zero_off = headOff;
1666 }
1667 }
1668 if (flags & IO_TAILZEROFILL) {
1669 if (uio) {
1670 zero_off1 = uio->uio_offset + uio->uio_resid;
1671
1672 if (zero_off1 < tailOff)
1673 zero_cnt1 = tailOff - zero_off1;
1674 }
1675 }
1676 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1677 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1678 retval, 0, 0, 0, 0);
1679 return (0);
1680 }
1681
1682 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1683 /*
1684 * for this iteration of the loop, figure out where our starting point is
1685 */
1686 if (zero_cnt) {
1687 start_offset = (int)(zero_off & PAGE_MASK_64);
1688 upl_f_offset = zero_off - start_offset;
1689 } else if (uio_resid) {
1690 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1691 upl_f_offset = uio->uio_offset - start_offset;
1692 } else {
1693 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1694 upl_f_offset = zero_off1 - start_offset;
1695 }
1696 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1697 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1698
1699 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1700 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1701
1702 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1703
1704 if (uio && !(vp->v_flag & VNOCACHE_DATA) &&
1705 (flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0) {
1706 /*
1707 * assumption... total_size <= uio_resid
1708 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1709 */
1710 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1711 total_size -= start_offset;
1712 xfer_resid = total_size;
1713
1714 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1715
1716 if (retval)
1717 break;
1718
1719 uio_resid -= (total_size - xfer_resid);
1720 total_size = xfer_resid;
1721 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1722 upl_f_offset = uio->uio_offset - start_offset;
1723
1724 if (total_size == 0) {
1725 if (start_offset) {
1726 /*
1727 * the write did not finish on a page boundary
1728 * which will leave upl_f_offset pointing to the
1729 * beginning of the last page written instead of
1730 * the page beyond it... bump it in this case
1731 * so that the cluster code records the last page
1732 * written as dirty
1733 */
1734 upl_f_offset += PAGE_SIZE_64;
1735 }
1736 upl_size = 0;
1737
1738 goto check_cluster;
1739 }
1740 }
1741 /*
1742 * compute the size of the upl needed to encompass
1743 * the requested write... limit each call to cluster_io
1744 * to the maximum UPL size... cluster_io will clip if
1745 * this exceeds the maximum io_size for the device,
1746 * make sure to account for
1747 * a starting offset that's not page aligned
1748 */
1749 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1750
1751 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1752 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1753
1754 pages_in_upl = upl_size / PAGE_SIZE;
1755 io_size = upl_size - start_offset;
1756
1757 if ((long long)io_size > total_size)
1758 io_size = total_size;
1759
1760 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
1761
1762
1763 kret = ubc_create_upl(vp,
1764 upl_f_offset,
1765 upl_size,
1766 &upl,
1767 &pl,
1768 UPL_SET_LITE);
1769 if (kret != KERN_SUCCESS)
1770 panic("cluster_write: failed to get pagelist");
1771
1772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
1773 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1774
1775 if (start_offset && !upl_valid_page(pl, 0)) {
1776 int read_size;
1777
1778 /*
1779 * we're starting in the middle of the first page of the upl
1780 * and the page isn't currently valid, so we're going to have
1781 * to read it in first... this is a synchronous operation
1782 */
1783 read_size = PAGE_SIZE;
1784
1785 if ((upl_f_offset + read_size) > newEOF)
1786 read_size = newEOF - upl_f_offset;
1787
1788 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1789 CL_READ, (struct buf *)0, (struct clios *)0);
1790 if (retval) {
1791 /*
1792 * we had an error during the read which causes us to abort
1793 * the current cluster_write request... before we do, we need
1794 * to release the rest of the pages in the upl without modifying
1795 * there state and mark the failed page in error
1796 */
1797 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1798 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1799
1800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1801 (int)upl, 0, 0, retval, 0);
1802 break;
1803 }
1804 }
1805 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1806 /*
1807 * the last offset we're writing to in this upl does not end on a page
1808 * boundary... if it's not beyond the old EOF, then we'll also need to
1809 * pre-read this page in if it isn't already valid
1810 */
1811 upl_offset = upl_size - PAGE_SIZE;
1812
1813 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1814 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1815 int read_size;
1816
1817 read_size = PAGE_SIZE;
1818
1819 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1820 read_size = newEOF - (upl_f_offset + upl_offset);
1821
1822 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1823 CL_READ, (struct buf *)0, (struct clios *)0);
1824 if (retval) {
1825 /*
1826 * we had an error during the read which causes us to abort
1827 * the current cluster_write request... before we do, we
1828 * need to release the rest of the pages in the upl without
1829 * modifying there state and mark the failed page in error
1830 */
1831 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1832 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1833
1834 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1835 (int)upl, 0, 0, retval, 0);
1836 break;
1837 }
1838 }
1839 }
1840 xfer_resid = io_size;
1841 io_offset = start_offset;
1842
1843 while (zero_cnt && xfer_resid) {
1844
1845 if (zero_cnt < (long long)xfer_resid)
1846 bytes_to_zero = zero_cnt;
1847 else
1848 bytes_to_zero = xfer_resid;
1849
1850 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1851 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1852 } else {
1853 int zero_pg_index;
1854
1855 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1856 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1857
1858 if ( !upl_valid_page(pl, zero_pg_index)) {
1859 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1860
1861 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1862 !upl_dirty_page(pl, zero_pg_index)) {
1863 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1864 }
1865 }
1866 xfer_resid -= bytes_to_zero;
1867 zero_cnt -= bytes_to_zero;
1868 zero_off += bytes_to_zero;
1869 io_offset += bytes_to_zero;
1870 }
1871 if (xfer_resid && uio_resid) {
1872 bytes_to_move = min(uio_resid, xfer_resid);
1873
1874 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
1875
1876 if (retval) {
1877
1878 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1879
1880 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1881 (int)upl, 0, 0, retval, 0);
1882 } else {
1883 uio_resid -= bytes_to_move;
1884 xfer_resid -= bytes_to_move;
1885 io_offset += bytes_to_move;
1886 }
1887 }
1888 while (xfer_resid && zero_cnt1 && retval == 0) {
1889
1890 if (zero_cnt1 < (long long)xfer_resid)
1891 bytes_to_zero = zero_cnt1;
1892 else
1893 bytes_to_zero = xfer_resid;
1894
1895 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1896 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1897 } else {
1898 int zero_pg_index;
1899
1900 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1901 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1902
1903 if ( !upl_valid_page(pl, zero_pg_index)) {
1904 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1905 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1906 !upl_dirty_page(pl, zero_pg_index)) {
1907 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1908 }
1909 }
1910 xfer_resid -= bytes_to_zero;
1911 zero_cnt1 -= bytes_to_zero;
1912 zero_off1 += bytes_to_zero;
1913 io_offset += bytes_to_zero;
1914 }
1915
1916 if (retval == 0) {
1917 int cl_index;
1918 int can_delay;
1919
1920 io_size += start_offset;
1921
1922 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1923 /*
1924 * if we're extending the file with this write
1925 * we'll zero fill the rest of the page so that
1926 * if the file gets extended again in such a way as to leave a
1927 * hole starting at this EOF, we'll have zero's in the correct spot
1928 */
1929 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1930 }
1931 if (flags & IO_SYNC)
1932 /*
1933 * if the IO_SYNC flag is set than we need to
1934 * bypass any clusters and immediately issue
1935 * the I/O
1936 */
1937 goto issue_io;
1938 check_cluster:
1939 /*
1940 * calculate the last logical block number
1941 * that this delayed I/O encompassed
1942 */
1943 last_blkno = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
1944
1945 if (vp->v_flag & VHASDIRTY) {
1946
1947 if ( !(vp->v_flag & VNOCACHE_DATA)) {
1948 /*
1949 * we've fallen into the sparse
1950 * cluster method of delaying dirty pages
1951 * first, we need to release the upl if we hold one
1952 * since pages in it may be present in the sparse cluster map
1953 * and may span 2 separate buckets there... if they do and
1954 * we happen to have to flush a bucket to make room and it intersects
1955 * this upl, a deadlock may result on page BUSY
1956 */
1957 if (upl_size)
1958 ubc_upl_commit_range(upl, 0, upl_size,
1959 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1960
1961 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
1962
1963 continue;
1964 }
1965 /*
1966 * must have done cached writes that fell into
1967 * the sparse cluster mechanism... we've switched
1968 * to uncached writes on the file, so go ahead
1969 * and push whatever's in the sparse map
1970 * and switch back to normal clustering
1971 *
1972 * see the comment above concerning a possible deadlock...
1973 */
1974 if (upl_size) {
1975 ubc_upl_commit_range(upl, 0, upl_size,
1976 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1977 /*
1978 * setting upl_size to 0 keeps us from committing a
1979 * second time in the start_new_cluster path
1980 */
1981 upl_size = 0;
1982 }
1983 sparse_cluster_push(vp, ubc_getsize(vp), 1);
1984
1985 /*
1986 * no clusters of either type present at this point
1987 * so just go directly to start_new_cluster since
1988 * we know we need to delay this I/O since we've
1989 * already released the pages back into the cache
1990 * to avoid the deadlock with sparse_cluster_push
1991 */
1992 goto start_new_cluster;
1993 }
1994 upl_offset = 0;
1995
1996 if (vp->v_clen == 0)
1997 /*
1998 * no clusters currently present
1999 */
2000 goto start_new_cluster;
2001
2002 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
2003 /*
2004 * check each cluster that we currently hold
2005 * try to merge some or all of this write into
2006 * one or more of the existing clusters... if
2007 * any portion of the write remains, start a
2008 * new cluster
2009 */
2010 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
2011 /*
2012 * the current write starts at or after the current cluster
2013 */
2014 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
2015 /*
2016 * we have a write that fits entirely
2017 * within the existing cluster limits
2018 */
2019 if (last_blkno > vp->v_clusters[cl_index].last_pg)
2020 /*
2021 * update our idea of where the cluster ends
2022 */
2023 vp->v_clusters[cl_index].last_pg = last_blkno;
2024 break;
2025 }
2026 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
2027 /*
2028 * we have a write that starts in the middle of the current cluster
2029 * but extends beyond the cluster's limit... we know this because
2030 * of the previous checks
2031 * we'll extend the current cluster to the max
2032 * and update the start_blkno for the current write to reflect that
2033 * the head of it was absorbed into this cluster...
2034 * note that we'll always have a leftover tail in this case since
2035 * full absorbtion would have occurred in the clause above
2036 */
2037 vp->v_clusters[cl_index].last_pg = vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER;
2038
2039 if (upl_size) {
2040 int start_pg_in_upl;
2041
2042 start_pg_in_upl = upl_f_offset / PAGE_SIZE_64;
2043
2044 if (start_pg_in_upl < vp->v_clusters[cl_index].last_pg) {
2045 intersection = (vp->v_clusters[cl_index].last_pg - start_pg_in_upl) * PAGE_SIZE;
2046
2047 ubc_upl_commit_range(upl, upl_offset, intersection,
2048 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2049 upl_f_offset += intersection;
2050 upl_offset += intersection;
2051 upl_size -= intersection;
2052 }
2053 }
2054 start_blkno = vp->v_clusters[cl_index].last_pg;
2055 }
2056 /*
2057 * we come here for the case where the current write starts
2058 * beyond the limit of the existing cluster or we have a leftover
2059 * tail after a partial absorbtion
2060 *
2061 * in either case, we'll check the remaining clusters before
2062 * starting a new one
2063 */
2064 } else {
2065 /*
2066 * the current write starts in front of the cluster we're currently considering
2067 */
2068 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
2069 /*
2070 * we can just merge the new request into
2071 * this cluster and leave it in the cache
2072 * since the resulting cluster is still
2073 * less than the maximum allowable size
2074 */
2075 vp->v_clusters[cl_index].start_pg = start_blkno;
2076
2077 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
2078 /*
2079 * the current write completely
2080 * envelops the existing cluster and since
2081 * each write is limited to at most MAX_UPL_TRANSFER bytes
2082 * we can just use the start and last blocknos of the write
2083 * to generate the cluster limits
2084 */
2085 vp->v_clusters[cl_index].last_pg = last_blkno;
2086 }
2087 break;
2088 }
2089
2090 /*
2091 * if we were to combine this write with the current cluster
2092 * we would exceed the cluster size limit.... so,
2093 * let's see if there's any overlap of the new I/O with
2094 * the cluster we're currently considering... in fact, we'll
2095 * stretch the cluster out to its full limit and see if we
2096 * get an intersection with the current write
2097 *
2098 */
2099 if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
2100 /*
2101 * the current write extends into the proposed cluster
2102 * clip the length of the current write after first combining its
2103 * tail with the newly shaped cluster
2104 */
2105 vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;
2106
2107 if (upl_size) {
2108 intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;
2109
2110 if (intersection > upl_size)
2111 /*
2112 * because the current write may consist of a number of pages found in the cache
2113 * which are not part of the UPL, we may have an intersection that exceeds
2114 * the size of the UPL that is also part of this write
2115 */
2116 intersection = upl_size;
2117
2118 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2119 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2120 upl_size -= intersection;
2121 }
2122 last_blkno = vp->v_clusters[cl_index].start_pg;
2123 }
2124 /*
2125 * if we get here, there was no way to merge
2126 * any portion of this write with this cluster
2127 * or we could only merge part of it which
2128 * will leave a tail...
2129 * we'll check the remaining clusters before starting a new one
2130 */
2131 }
2132 }
2133 if (cl_index < vp->v_clen)
2134 /*
2135 * we found an existing cluster(s) that we
2136 * could entirely merge this I/O into
2137 */
2138 goto delay_io;
2139
2140 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2141 /*
2142 * we didn't find an existing cluster to
2143 * merge into, but there's room to start
2144 * a new one
2145 */
2146 goto start_new_cluster;
2147
2148 /*
2149 * no existing cluster to merge with and no
2150 * room to start a new one... we'll try
2151 * pushing one of the existing ones... if none of
2152 * them are able to be pushed, we'll switch
2153 * to the sparse cluster mechanism
2154 * cluster_try_push updates v_clen to the
2155 * number of remaining clusters... and
2156 * returns the number of currently unused clusters
2157 */
2158 if (vp->v_flag & VNOCACHE_DATA)
2159 can_delay = 0;
2160 else
2161 can_delay = 1;
2162
2163 if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
2164 /*
2165 * no more room in the normal cluster mechanism
2166 * so let's switch to the more expansive but expensive
2167 * sparse mechanism....
2168 * first, we need to release the upl if we hold one
2169 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2170 * and may span 2 separate buckets there... if they do and
2171 * we happen to have to flush a bucket to make room and it intersects
2172 * this upl, a deadlock may result on page BUSY
2173 */
2174 if (upl_size)
2175 ubc_upl_commit_range(upl, upl_offset, upl_size,
2176 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2177
2178 sparse_cluster_switch(vp, newEOF);
2179 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
2180
2181 continue;
2182 }
2183 /*
2184 * we pushed one cluster successfully, so we must be sequentially writing this file
2185 * otherwise, we would have failed and fallen into the sparse cluster support
2186 * so let's take the opportunity to push out additional clusters as long as we
2187 * remain below the throttle... this will give us better I/O locality if we're
2188 * in a copy loop (i.e. we won't jump back and forth between the read and write points)
2189 * however, we don't want to push so much out that the write throttle kicks in and
2190 * hangs this thread up until some of the I/O completes...
2191 */
2192 while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
2193 cluster_try_push(vp, newEOF, 0, 0);
2194
2195 start_new_cluster:
2196 if (vp->v_clen == 0)
2197 vp->v_ciosiz = devblocksize;
2198
2199 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2200 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2201 vp->v_clen++;
2202
2203 delay_io:
2204 if (upl_size)
2205 ubc_upl_commit_range(upl, upl_offset, upl_size,
2206 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2207 continue;
2208 issue_io:
2209 /*
2210 * in order to maintain some semblance of coherency with mapped writes
2211 * we need to write the cluster back out as a multiple of the PAGESIZE
2212 * unless the cluster encompasses the last page of the file... in this
2213 * case we'll round out to the nearest device block boundary
2214 */
2215 io_size = upl_size;
2216
2217 if ((upl_f_offset + io_size) > newEOF) {
2218 io_size = newEOF - upl_f_offset;
2219 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2220 }
2221
2222 if (flags & IO_SYNC)
2223 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
2224 else
2225 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;
2226
2227 if (vp->v_flag & VNOCACHE_DATA)
2228 io_flags |= CL_DUMP;
2229
2230 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2231 io_flags, (struct buf *)0, (struct clios *)0);
2232 }
2233 }
2234 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2235 retval, 0, uio_resid, 0, 0);
2236
2237 return (retval);
2238 }
2239
/*
 * cluster_read
 *
 * Entry point for a clustered read of 'vp' as described by 'uio'.
 *
 * Unless the vnode has VNOCACHE_DATA set AND the request targets user
 * space, the whole request is handed to cluster_read_x.  Otherwise the
 * request is worked through piece by piece, and each piece is routed to
 * cluster_phys_read, cluster_nocopy_read or cluster_read_x depending on
 * the UPL_PHYS_CONTIG flag reported for the target address and on the
 * page alignment of the file offset and of the buffer address.
 *
 * vp            vnode being read
 * uio           target buffer(s), file offset and residual byte count
 * filesize      the loop stops once uio_offset reaches this value
 * devblocksize  passed through unchanged to the helper routines
 * flags         passed through unchanged to the helper routines
 *
 * Returns 0, EFAULT when vm_map_get_upl rejects the target address, or
 * whatever the helper routine that handled the request returned.
 */
2240 int
2241 cluster_read(vp, uio, filesize, devblocksize, flags)
2242 struct vnode *vp;
2243 struct uio *uio;
2244 off_t filesize;
2245 int devblocksize;
2246 int flags;
2247 {
2248 int prev_resid;
2249 int clip_size;
2250 off_t max_io_size;
2251 struct iovec *iov;
2252 int upl_size;
2253 int upl_flags;
2254 upl_t upl;
2255 int retval = 0;
2256
2257
2258 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2259 {
2260 /*
2261 * go do a read through the cache if one of the following is true....
2262 * NOCACHE is not true
2263 * the uio request doesn't target USERSPACE
2264 */
2265 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
2266 }
2267
2268 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2269 {
2270 /*
2271 * we know we have a resid, so this is safe
2272 * skip over any empty vectors
2273 */
2274 iov = uio->uio_iov;
2275
2276 while (iov->iov_len == 0) {
2277 uio->uio_iov++;
2278 uio->uio_iovcnt--;
2279 iov = uio->uio_iov;
2280 }
/*
 * ask about the memory behind the page that holds the start of the
 * target buffer... only upl_flags is examined afterwards; 'upl' is
 * never used or released in this function
 * NOTE(review): presumably UPL_QUERY_OBJECT_TYPE means no upl is
 * actually created by this call -- confirm against vm_map_get_upl
 */
2281 upl_size = PAGE_SIZE;
2282 upl_flags = UPL_QUERY_OBJECT_TYPE;
2283
2284 if ((vm_map_get_upl(current_map(),
2285 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2286 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
2287 {
2288 /*
2289 * the user app must have passed in an invalid address
2290 */
2291 return (EFAULT);
2292 }
2293
2294 /*
2295 * We check every vector target but if it is physically
2296 * contiguous space, we skip the sanity checks.
2297 */
2298 if (upl_flags & UPL_PHYS_CONTIG)
2299 {
2300 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2301 }
2302 else if (uio->uio_resid < PAGE_SIZE)
2303 {
2304 /*
2305 * we're here because we don't have a physically contiguous target buffer
2306 * go do a read through the cache if
2307 * the total xfer size is less than a page...
2308 */
2309 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
2310 }
2311 else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
2312 {
2313 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
2314 {
2315 /*
2316 * Bring the file offset read up to a pagesize boundary
2317 * this will also bring the base address to a page boundary
2318 * since they both are currently on the same offset within a page
2319 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2320 * so the computed clip_size must always be less than the current uio_resid
2321 */
2322 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2323
2324 /*
2325 * Fake the resid going into the cluster_read_x call
2326 * and restore it on the way out.
2327 */
2328 prev_resid = uio->uio_resid;
2329 uio->uio_resid = clip_size;
2330 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
/*
 * (clip_size - uio_resid) is what the call consumed out of the
 * clipped amount... take that off the residual saved above
 */
2331 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2332 }
2333 else
2334 {
2335 /*
2336 * can't get both the file offset and the buffer offset aligned to a page boundary
2337 * so fire an I/O through the cache for this entire vector
2338 */
2339 clip_size = iov->iov_len;
2340 prev_resid = uio->uio_resid;
2341 uio->uio_resid = clip_size;
2342 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2343 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2344 }
2345 }
2346 else
2347 {
2348 /*
2349 * If we come in here, we know the offset into
2350 * the file is on a pagesize boundary
2351 */
2352
/*
 * clip_size = smallest of: the residual count, the length of the
 * current vector, and the bytes left before filesize
 */
2353 max_io_size = filesize - uio->uio_offset;
2354 clip_size = uio->uio_resid;
2355 if (iov->iov_len < clip_size)
2356 clip_size = iov->iov_len;
2357 if (max_io_size < clip_size)
2358 clip_size = (int)max_io_size;
2359
2360 if (clip_size < PAGE_SIZE)
2361 {
2362 /*
2363 * Take care of the tail end of the read in this vector.
2364 */
2365 prev_resid = uio->uio_resid;
2366 uio->uio_resid = clip_size;
2367 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2368 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2369 }
2370 else
2371 {
2372 /* round clip_size down to a multiple of pagesize */
2373 clip_size = clip_size & ~(PAGE_MASK);
2374 prev_resid = uio->uio_resid;
2375 uio->uio_resid = clip_size;
2376 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
/*
 * if the direct read came back clean but left part of the clipped
 * amount untransferred, finish that part through the cache
 */
2377 if ((retval==0) && uio->uio_resid)
2378 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2379 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2380 }
2381 } /* end else */
2382 } /* end while */
2383
2384 return(retval);
2385 }
2386
/*
 * cluster_read_x
 *
 * Read through the cache.  Each pass of the main loop
 *   - (unless VNOCACHE_DATA is set) copies whatever it can directly
 *     with cluster_copy_ubc_data, issuing prefetch / read-ahead as it goes
 *   - otherwise creates a upl over the file range, issues an async
 *     cluster_io read for the first run of pages that are not valid,
 *     waits for it, and copies the data out with cluster_copy_upl_data
 *   - commits or aborts every page of the upl before looping
 *
 * vp            vnode being read; v_lastr / v_maxra / v_ralen are consulted
 *               and v_lastr / v_maxra are updated here
 * uio           target buffer(s), file offset and residual byte count
 * filesize      reads are clipped to this size
 * devblocksize  passed through to cluster_io and the prefetch routines
 * flags         not referenced in the body of this routine
 *
 * Returns 0, or the first non-zero value seen from the copy routines or
 * from the I/O (a copy error takes precedence over an I/O error).
 */
2387 static int
2388 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2389 struct vnode *vp;
2390 struct uio *uio;
2391 off_t filesize;
2392 int devblocksize;
2393 int flags;
2394 {
2395 upl_page_info_t *pl;
2396 upl_t upl;
2397 vm_offset_t upl_offset;
2398 int upl_size;
2399 off_t upl_f_offset;
2400 int start_offset;
2401 int start_pg;
2402 int last_pg;
2403 int uio_last;
2404 int pages_in_upl;
2405 off_t max_size;
2406 off_t last_ioread_offset;
2407 off_t last_request_offset;
2408 u_int size_of_prefetch;
2409 int io_size;
2410 kern_return_t kret;
2411 int error = 0;
2412 int retval = 0;
2413 u_int b_lblkno;
2414 u_int e_lblkno;
2415 struct clios iostate;
2416 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2417 u_int rd_ahead_enabled = 1;
2418 u_int prefetch_enabled = 1;
2419
2420
2421 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2422 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2423
/*
 * under a hard throttle: no read-ahead, no prefetch, and a smaller
 * cap on the size of each read
 */
2424 if (cluster_hard_throttle_on(vp)) {
2425 rd_ahead_enabled = 0;
2426 prefetch_enabled = 0;
2427
2428 max_rd_size = HARD_THROTTLE_MAXSIZE;
2429 }
2430 if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
2431 rd_ahead_enabled = 0;
2432
/*
 * last_request_offset is the file offset just past the end of this
 * request, clipped to filesize... b_lblkno / e_lblkno are the first
 * and last page-sized blocks the request touches
 */
2433 last_request_offset = uio->uio_offset + uio->uio_resid;
2434
2435 if (last_request_offset > filesize)
2436 last_request_offset = filesize;
2437 b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
2438 e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);
2439
2440 if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
2441 /*
2442 * determine if we already have a read-ahead in the pipe courtesy of the
2443 * last read systemcall that was issued...
2444 * if so, pick up its extent to determine where we should start
2445 * with respect to any read-ahead that might be necessary to
2446 * garner all the data needed to complete this read systemcall
2447 */
2448 last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
2449
2450 if (last_ioread_offset < uio->uio_offset)
2451 last_ioread_offset = (off_t)0;
2452 else if (last_ioread_offset > last_request_offset)
2453 last_ioread_offset = last_request_offset;
2454 } else
2455 last_ioread_offset = (off_t)0;
2456
2457 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2458 /*
2459 * compute the size of the upl needed to encompass
2460 * the requested read... limit each call to cluster_io
2461 * to the maximum UPL size... cluster_io will clip if
2462 * this exceeds the maximum io_size for the device,
2463 * make sure to account for
2464 * a starting offset that's not page aligned
2465 */
2466 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2467 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2468 max_size = filesize - uio->uio_offset;
2469
2470 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2471 io_size = uio->uio_resid;
2472 else
2473 io_size = max_size;
2474
2475 if (!(vp->v_flag & VNOCACHE_DATA)) {
2476
2477 while (io_size) {
2478 u_int io_resid;
2479 u_int io_requested;
2480
2481 /*
2482 * if we keep finding the pages we need already in the cache, then
2483 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2484 * to determine that we have all the pages we need... once we miss in
2485 * the cache and have issued an I/O, then we'll assume that we're likely
2486 * to continue to miss in the cache and it's to our advantage to try and prefetch
2487 */
2488 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2489 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2490 /*
2491 * we've already issued I/O for this request and
2492 * there's still work to do and
2493 * our prefetch stream is running dry, so issue a
2494 * pre-fetch I/O... the I/O latency will overlap
2495 * with the copying of the data
2496 */
2497 if (size_of_prefetch > max_rd_size)
2498 size_of_prefetch = max_rd_size;
2499
2500 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
2501
/*
 * the value returned by cluster_rd_prefetch is scaled by PAGE_SIZE
 * here, i.e. it is used as a page count
 */
2502 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2503
2504 if (last_ioread_offset > last_request_offset)
2505 last_ioread_offset = last_request_offset;
2506 }
2507 }
2508 /*
2509 * limit the size of the copy we're about to do so that
2510 * we can notice that our I/O pipe is running dry and
2511 * get the next I/O issued before it does go dry
2512 */
2513 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2514 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2515 else
2516 io_resid = io_size;
2517
2518 io_requested = io_resid;
2519
2520 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
2521
/*
 * io_resid was updated in place by the call... the difference from
 * what we asked for is the amount that was copied
 */
2522 io_size -= (io_requested - io_resid);
2523
2524 if (retval || io_resid)
2525 /*
2526 * if we run into a real error or
2527 * a page that is not in the cache
2528 * we need to leave streaming mode
2529 */
2530 break;
2531
2532 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2533 /*
2534 * we've already finished the I/O for this read request
2535 * let's see if we should do a read-ahead
2536 */
2537 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2538 }
2539 }
2540 if (retval)
2541 break;
2542 if (io_size == 0) {
/*
 * everything came out of the cache... record where this read ended
 * and, if it ended in front of the previous one, reset v_maxra
 */
2543 if (e_lblkno < vp->v_lastr)
2544 vp->v_maxra = 0;
2545 vp->v_lastr = e_lblkno;
2546
2547 break;
2548 }
/*
 * the uio moved while we were copying, so recompute the values
 * that were derived from uio_offset at the top of the loop
 */
2549 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2550 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2551 max_size = filesize - uio->uio_offset;
2552 }
2553 if (io_size > max_rd_size)
2554 io_size = max_rd_size;
2555
2556 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2557
2558 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2559 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
2560 pages_in_upl = upl_size / PAGE_SIZE;
2561
/*
 * NOTE(review): 'upl' is logged here before ubc_create_upl assigns
 * it... on the first pass through the loop it has no assigned value
 */
2562 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2563 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2564
2565 kret = ubc_create_upl(vp,
2566 upl_f_offset,
2567 upl_size,
2568 &upl,
2569 &pl,
2570 UPL_SET_LITE);
/*
 * NOTE(review): the panic text names cluster_read although this is
 * cluster_read_x
 */
2571 if (kret != KERN_SUCCESS)
2572 panic("cluster_read: failed to get pagelist");
2573
2574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2575 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2576
2577 /*
2578 * scan from the beginning of the upl looking for the first
2579 * non-valid page.... this will become the first page in
2580 * the request we're going to make to 'cluster_io'... if all
2581 * of the pages are valid, we won't call through to 'cluster_io'
2582 */
2583 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2584 if (!upl_valid_page(pl, start_pg))
2585 break;
2586 }
2587
2588 /*
2589 * scan from the starting invalid page looking for a valid
2590 * page before the end of the upl is reached, if we
2591 * find one, then it will be the last page of the request to
2592 * 'cluster_io'
2593 */
2594 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2595 if (upl_valid_page(pl, last_pg))
2596 break;
2597 }
/*
 * fresh I/O tracking state for this pass... it is handed to
 * cluster_io below and polled in the wait loop further down
 */
2598 iostate.io_completed = 0;
2599 iostate.io_issued = 0;
2600 iostate.io_error = 0;
2601 iostate.io_wanted = 0;
2602
2603 if (start_pg < last_pg) {
2604 /*
2605 * we found a range of 'invalid' pages that must be filled
2606 * if the last page in this range is the last page of the file
2607 * we may have to clip the size of it to keep from reading past
2608 * the end of the last physical block associated with the file
2609 */
2610 upl_offset = start_pg * PAGE_SIZE;
2611 io_size = (last_pg - start_pg) * PAGE_SIZE;
2612
2613 if ((upl_f_offset + upl_offset + io_size) > filesize)
2614 io_size = filesize - (upl_f_offset + upl_offset);
2615
2616 /*
2617 * issue an asynchronous read to cluster_io
2618 */
2619
2620 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2621 io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
2622 }
2623 if (error == 0) {
2624 /*
2625 * if the read completed successfully, or there was no I/O request
2626 * issued, then copy the data into user land via 'cluster_copy_upl_data'
2627 * we'll first add on any 'valid'
2628 * pages that were present in the upl when we acquired it.
2629 */
2630 u_int val_size;
2631
2632 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2633 if (!upl_valid_page(pl, uio_last))
2634 break;
2635 }
2636 /*
2637 * compute size to transfer this round, if uio->uio_resid is
2638 * still non-zero after this attempt, we'll loop around and
2639 * set up for another I/O.
2640 */
/*
 * uio_last is one past the last usable page of the upl, so this is
 * the number of usable bytes starting at the current file offset
 */
2641 val_size = (uio_last * PAGE_SIZE) - start_offset;
2642
2643 if (val_size > max_size)
2644 val_size = max_size;
2645
2646 if (val_size > uio->uio_resid)
2647 val_size = uio->uio_resid;
2648
2649 if (last_ioread_offset == 0)
2650 last_ioread_offset = uio->uio_offset + val_size;
2651
2652 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
2653 /*
2654 * if there's still I/O left to do for this request, and...
2655 * we're not in hard throttle mode, then issue a
2656 * pre-fetch I/O... the I/O latency will overlap
2657 * with the copying of the data
2658 */
2659 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
2660
2661 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2662
2663 if (last_ioread_offset > last_request_offset)
2664 last_ioread_offset = last_request_offset;
2665
2666 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2667 /*
2668 * this transfer will finish this request, so...
2669 * let's try to read ahead if we're in
2670 * a sequential access pattern and we haven't
2671 * explicitly disabled it
2672 */
2673 if (rd_ahead_enabled)
2674 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2675
2676 if (e_lblkno < vp->v_lastr)
2677 vp->v_maxra = 0;
2678 vp->v_lastr = e_lblkno;
2679 }
/*
 * sleep until the read issued above (if any) is accounted for as
 * complete
 * NOTE(review): presumably the I/O completion path updates
 * io_completed and wakes sleepers on io_wanted -- that code is
 * not in this part of the file
 */
2680 while (iostate.io_issued != iostate.io_completed) {
2681 iostate.io_wanted = 1;
2682 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
2683 }
2684 if (iostate.io_error)
2685 error = iostate.io_error;
2686 else
2687 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
2688 }
2689 if (start_pg < last_pg) {
2690 /*
2691 * compute the range of pages that we actually issued an I/O for
2692 * and either commit them as valid if the I/O succeeded
2693 * or abort them if the I/O failed
2694 */
2695 io_size = (last_pg - start_pg) * PAGE_SIZE;
2696
2697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2698 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2699
/*
 * the pages are also dumped, rather than committed, when the vnode
 * is marked VNOCACHE_DATA
 */
2700 if (error || (vp->v_flag & VNOCACHE_DATA))
2701 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2702 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2703 else
2704 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2705 UPL_COMMIT_CLEAR_DIRTY |
2706 UPL_COMMIT_FREE_ON_EMPTY |
2707 UPL_COMMIT_INACTIVATE);
2708
2709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2710 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2711 }
2712 if ((last_pg - start_pg) < pages_in_upl) {
2713 int cur_pg;
2714 int commit_flags;
2715
2716 /*
2717 * the set of pages that we issued an I/O for did not encompass
2718 * the entire upl... so just release these without modifying
2719 * their state
2720 */
/*
 * on error the abort below is issued over the whole upl; uio_last
 * is only assigned when error was 0, and it is only read in the
 * else branch
 */
2721 if (error)
2722 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2723 else {
2724 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2725 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2726
2727 if (start_pg) {
2728 /*
2729 * we found some already valid pages at the beginning of
2730 * the upl commit these back to the inactive list with
2731 * reference cleared
2732 */
2733 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2734 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2735 | UPL_COMMIT_INACTIVATE;
2736
2737 if (upl_dirty_page(pl, cur_pg))
2738 commit_flags |= UPL_COMMIT_SET_DIRTY;
2739
/*
 * a page that is not dirty is dumped instead of committed when the
 * vnode is marked VNOCACHE_DATA
 */
2740 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2741 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2742 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2743 else
2744 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2745 PAGE_SIZE, commit_flags);
2746 }
2747 }
2748 if (last_pg < uio_last) {
2749 /*
2750 * we found some already valid pages immediately after the
2751 * pages we issued I/O for, commit these back to the
2752 * inactive list with reference cleared
2753 */
2754 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2755 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2756 | UPL_COMMIT_INACTIVATE;
2757
2758 if (upl_dirty_page(pl, cur_pg))
2759 commit_flags |= UPL_COMMIT_SET_DIRTY;
2760
2761 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2762 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2763 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2764 else
2765 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2766 PAGE_SIZE, commit_flags);
2767 }
2768 }
2769 if (uio_last < pages_in_upl) {
2770 /*
2771 * there were some invalid pages beyond the valid pages
2772 * that we didn't issue an I/O for, just release them
2773 * unchanged
2774 */
2775 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2776 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2777 }
2778
2779 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2780 (int)upl, -1, -1, 0, 0);
2781 }
2782 }
/*
 * a copy error (retval) takes precedence... otherwise report any I/O
 * error, which also ends the loop via the retval == 0 test
 */
2783 if (retval == 0)
2784 retval = error;
2785 }
2786 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2787 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2788
2789 return (retval);
2790 }
2791
2792
/*
 * cluster_nocopy_read
 *
 * Read directly into the caller's buffer.  Each pass of the loop first
 * moves anything it can with cluster_copy_ubc_data, then obtains a upl
 * over the target buffer through vm_map_get_upl and issues an
 * asynchronous cluster_io read into it.  All reads issued here are
 * tracked through a single 'iostate', and the routine does not return
 * until every one of them is accounted for as complete.
 *
 * vp            vnode being read
 * uio           target buffer, file offset and residual byte count; only
 *               the iovec that is current on entry is used
 * filesize      reads are clipped to this size
 * devblocksize  passed through to cluster_io
 * flags         not referenced in the body of this routine
 *
 * Returns 0, the value returned by cluster_copy_ubc_data or cluster_io,
 * or iostate.io_error when one of the asynchronous reads failed (the
 * latter overrides the others).
 */
2793 static int
2794 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2795 struct vnode *vp;
2796 struct uio *uio;
2797 off_t filesize;
2798 int devblocksize;
2799 int flags;
2800 {
2801 upl_t upl;
2802 upl_page_info_t *pl;
2803 vm_offset_t upl_offset;
2804 off_t max_io_size;
2805 int io_size;
2806 int upl_size;
2807 int upl_needed_size;
2808 int pages_in_pl;
2809 int upl_flags;
2810 kern_return_t kret;
2811 struct iovec *iov;
2812 int i;
2813 int force_data_sync;
2814 int retval = 0;
2815 struct clios iostate;
2816 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2817 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
2818
2819
2820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2821 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2822
2823 /*
2824 * When we enter this routine, we know
2825 * -- the offset into the file is on a pagesize boundary
2826 * -- the resid is a page multiple
2827 * -- the resid will not exceed iov_len
2828 */
2829
2830 iostate.io_completed = 0;
2831 iostate.io_issued = 0;
2832 iostate.io_error = 0;
2833 iostate.io_wanted = 0;
2834
2835 iov = uio->uio_iov;
2836
/*
 * under a hard throttle, shrink both the size of each read and the
 * amount we allow to be outstanding at once
 */
2837 if (cluster_hard_throttle_on(vp)) {
2838 max_rd_size = HARD_THROTTLE_MAXSIZE;
2839 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
2840 }
2841 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2842
2843 max_io_size = filesize - uio->uio_offset;
2844
2845 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2846 io_size = max_io_size;
2847 else
2848 io_size = uio->uio_resid;
2849
2850 /*
2851 * First look for pages already in the cache
2852 * and move them to user space.
2853 */
/*
 * io_size is updated in place by the call... the code below treats
 * the new value as the amount that still has to be read
 */
2854 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
2855
2856 if (retval) {
2857 /*
2858 * we may have already spun some portion of this request
2859 * off as async requests... we need to wait for the I/O
2860 * to complete before returning
2861 */
2862 goto wait_for_reads;
2863 }
2864 /*
2865 * If we are already finished with this read, then return
2866 */
2867 if (io_size == 0) {
2868 /*
2869 * we may have already spun some portion of this request
2870 * off as async requests... we need to wait for the I/O
2871 * to complete before returning
2872 */
2873 goto wait_for_reads;
2874 }
2875 max_io_size = io_size;
2876
2877 if (max_io_size > max_rd_size)
2878 max_io_size = max_rd_size;
2879
2880 io_size = 0;
2881
/*
 * NOTE(review): presumably UPL_ROP_ABSENT makes ubc_range_op report
 * in io_size how much of [uio_offset, uio_offset + max_io_size) is
 * absent from the cache starting at uio_offset -- confirm against
 * ubc_range_op
 */
2882 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
2883
2884 if (io_size == 0)
2885 /*
2886 * we may have already spun some portion of this request
2887 * off as async requests... we need to wait for the I/O
2888 * to complete before returning
2889 */
2890 goto wait_for_reads;
2891
/*
 * upl_offset is the offset of the buffer within its first page...
 * the upl must cover whole pages from that page through the end of
 * the transfer
 */
2892 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
2893 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2894
2895 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2896 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2897
/*
 * make up to 3 attempts to get a upl in which every page is valid,
 * passing the attempt number (0, 1, 2) as the last argument of
 * vm_map_get_upl... a upl that contains a non-valid page is aborted
 * before the next attempt
 */
2898 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2899 pages_in_pl = 0;
2900 upl_size = upl_needed_size;
2901 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
2902
2903 kret = vm_map_get_upl(current_map(),
2904 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2905 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2906
2907 if (kret != KERN_SUCCESS) {
2908 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2909 (int)upl_offset, upl_size, io_size, kret, 0);
2910 /*
2911 * cluster_nocopy_read: failed to get pagelist
2912 *
2913 * we may have already spun some portion of this request
2914 * off as async requests... we need to wait for the I/O
2915 * to complete before returning
2916 */
2917 goto wait_for_reads;
2918 }
2919 pages_in_pl = upl_size / PAGE_SIZE;
2920 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2921
2922 for (i = 0; i < pages_in_pl; i++) {
2923 if (!upl_valid_page(pl, i))
2924 break;
2925 }
2926 if (i == pages_in_pl)
2927 break;
2928
/*
 * upl_offset was masked with PAGE_MASK above, so
 * (upl_offset & ~PAGE_MASK) evaluates to 0 here
 */
2929 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2930 UPL_ABORT_FREE_ON_EMPTY);
2931 }
/*
 * all 3 attempts produced a upl with a non-valid page... give up on
 * this pass without setting an error in retval
 */
2932 if (force_data_sync >= 3) {
2933 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2934 (int)upl_offset, upl_size, io_size, kret, 0);
2935
2936 goto wait_for_reads;
2937 }
2938 /*
2939 * Consider the possibility that upl_size wasn't satisfied.
2940 */
2941 if (upl_size != upl_needed_size)
2942 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2943
2944 if (io_size == 0) {
2945 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2946 UPL_ABORT_FREE_ON_EMPTY);
2947 goto wait_for_reads;
2948 }
2949 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2950 (int)upl_offset, upl_size, io_size, kret, 0);
2951
2952 /*
2953 * request asynchronously so that we can overlap
2954 * the preparation of the next I/O
2955 * if there are already too many outstanding reads
2956 * wait until some have completed before issuing the next read
2957 */
/*
 * NOTE(review): (io_issued - io_completed) is compared against
 * max_rd_ahead, which is built from byte sizes... presumably both
 * counters are byte counts -- confirm against cluster_io
 */
2958 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
2959 iostate.io_wanted = 1;
2960 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2961 }
2962 if (iostate.io_error) {
2963 /*
2964 * one of the earlier reads we issued ran into a hard error
2965 * don't issue any more reads, cleanup the UPL
2966 * that was just created but not used, then
2967 * go wait for any other reads to complete before
2968 * returning the error to the caller
2969 */
2970 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2971 UPL_ABORT_FREE_ON_EMPTY);
2972
2973 goto wait_for_reads;
2974 }
2975 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2976 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
2977
2978 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2979 io_size, devblocksize,
2980 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2981 (struct buf *)0, &iostate);
2982
2983 /*
2984 * update the uio structure
2985 */
/*
 * NOTE(review): the uio is advanced by io_size without looking at
 * the value cluster_io just returned... a non-zero retval only
 * stops the loop at its next test
 */
2986 iov->iov_base += io_size;
2987 iov->iov_len -= io_size;
2988 uio->uio_resid -= io_size;
2989 uio->uio_offset += io_size;
2990
2991 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2992 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2993
2994 } /* end while */
2995
2996 wait_for_reads:
2997 /*
2998 * make sure all async reads that are part of this stream
2999 * have completed before we return
3000 */
3001 while (iostate.io_issued != iostate.io_completed) {
3002 iostate.io_wanted = 1;
3003 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
3004 }
/*
 * an error recorded against any of the async reads overrides
 * whatever retval held on the way in
 */
3005 if (iostate.io_error)
3006 retval = iostate.io_error;
3007
3008 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3009 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
3010
3011 return (retval);
3012 }
3013
3014
3015 static int
3016 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
3017 struct vnode *vp;
3018 struct uio *uio;
3019 off_t filesize;
3020 int devblocksize;
3021 int flags;
3022 {
3023 upl_page_info_t *pl;
3024 upl_t upl;
3025 vm_offset_t upl_offset;
3026 addr64_t dst_paddr;
3027 off_t max_size;
3028 int io_size;
3029 int tail_size;
3030 int upl_size;
3031 int upl_needed_size;
3032 int pages_in_pl;
3033 int upl_flags;
3034 kern_return_t kret;
3035 struct iovec *iov;
3036 struct clios iostate;
3037 int error;
3038
3039 /*
3040 * When we enter this routine, we know
3041 * -- the resid will not exceed iov_len
3042 * -- the target address is physically contiguous
3043 */
3044
3045 iov = uio->uio_iov;
3046
3047 max_size = filesize - uio->uio_offset;
3048
3049 if (max_size > (off_t)((unsigned int)iov->iov_len))
3050 io_size = iov->iov_len;
3051 else
3052 io_size = max_size;
3053
3054 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
3055 upl_needed_size = upl_offset + io_size;
3056
3057 error = 0;
3058 pages_in_pl = 0;
3059 upl_size = upl_needed_size;
3060 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
3061
3062 kret = vm_map_get_upl(current_map(),
3063 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
3064 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3065
3066 if (kret != KERN_SUCCESS) {
3067 /*
3068 * cluster_phys_read: failed to get pagelist
3069 */
3070 return(EINVAL);
3071 }
3072 if (upl_size < upl_needed_size) {
3073 /*
3074 * The upl_size wasn't satisfied.
3075 */
3076 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3077
3078 return(EINVAL);
3079 }
3080 pl = ubc_upl_pageinfo(upl);
3081
3082 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
3083
3084 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3085 int head_size;
3086
3087 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3088
3089 if (head_size > io_size)
3090 head_size = io_size;
3091
3092 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
3093
3094 if (error) {
3095 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3096
3097 return(EINVAL);
3098 }
3099 upl_offset += head_size;
3100 dst_paddr += head_size;
3101 io_size -= head_size;
3102 }
3103 tail_size = io_size & (devblocksize - 1);
3104 io_size -= tail_size;
3105
3106 iostate.io_completed = 0;
3107 iostate.io_issued = 0;
3108 iostate.io_error = 0;
3109 iostate.io_wanted = 0;
3110
3111 while (io_size && error == 0) {
3112 int xsize;
3113
3114 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3115 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3116 else
3117 xsize = io_size;
3118 /*
3119 * request asynchronously so that we can overlap
3120 * the preparation of the next I/O... we'll do
3121 * the commit after all the I/O has completed
3122 * since its all issued against the same UPL
3123 * if there are already too many outstanding reads
3124 * wait until some have completed before issuing the next
3125 */
3126 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3127 iostate.io_wanted = 1;
3128 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3129 }
3130
3131 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3132 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3133 (struct buf *)0, &iostate);
3134 /*
3135 * The cluster_io read was issued successfully,
3136 * update the uio structure
3137 */
3138 if (error == 0) {
3139 uio->uio_resid -= xsize;
3140 iov->iov_len -= xsize;
3141 iov->iov_base += xsize;
3142 uio->uio_offset += xsize;
3143 dst_paddr += xsize;
3144 upl_offset += xsize;
3145 io_size -= xsize;
3146 }
3147 }
3148 /*
3149 * make sure all async reads that are part of this stream
3150 * have completed before we proceed
3151 */
3152 while (iostate.io_issued != iostate.io_completed) {
3153 iostate.io_wanted = 1;
3154 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3155 }
3156 if (iostate.io_error) {
3157 error = iostate.io_error;
3158 }
3159 if (error == 0 && tail_size)
3160 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3161
3162 /*
3163 * just release our hold on the physically contiguous
3164 * region without changing any state
3165 */
3166 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3167
3168 return (error);
3169 }
3170
3171
3172 /*
3173 * generate advisory I/O's in the largest chunks possible
3174 * the completed pages will be released into the VM cache
3175 */
3176 int
3177 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3178 struct vnode *vp;
3179 off_t filesize;
3180 off_t f_offset;
3181 int resid;
3182 int devblocksize;
3183 {
3184 upl_page_info_t *pl;
3185 upl_t upl;
3186 vm_offset_t upl_offset;
3187 int upl_size;
3188 off_t upl_f_offset;
3189 int start_offset;
3190 int start_pg;
3191 int last_pg;
3192 int pages_in_upl;
3193 off_t max_size;
3194 int io_size;
3195 kern_return_t kret;
3196 int retval = 0;
3197 int issued_io;
3198 int skip_range;
3199
3200 if (!UBCINFOEXISTS(vp))
3201 return(EINVAL);
3202
3203 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3204 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3205
3206 while (resid && f_offset < filesize && retval == 0) {
3207 /*
3208 * compute the size of the upl needed to encompass
3209 * the requested read... limit each call to cluster_io
3210 * to the maximum UPL size... cluster_io will clip if
3211 * this exceeds the maximum io_size for the device,
3212 * make sure to account for
3213 * a starting offset that's not page aligned
3214 */
3215 start_offset = (int)(f_offset & PAGE_MASK_64);
3216 upl_f_offset = f_offset - (off_t)start_offset;
3217 max_size = filesize - f_offset;
3218
3219 if (resid < max_size)
3220 io_size = resid;
3221 else
3222 io_size = max_size;
3223
3224 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3225 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3226 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3227
3228 skip_range = 0;
3229 /*
3230 * return the number of contiguously present pages in the cache
3231 * starting at upl_f_offset within the file
3232 */
3233 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3234
3235 if (skip_range) {
3236 /*
3237 * skip over pages already present in the cache
3238 */
3239 io_size = skip_range - start_offset;
3240
3241 f_offset += io_size;
3242 resid -= io_size;
3243
3244 if (skip_range == upl_size)
3245 continue;
3246 /*
3247 * have to issue some real I/O
3248 * at this point, we know it's starting on a page boundary
3249 * because we've skipped over at least the first page in the request
3250 */
3251 start_offset = 0;
3252 upl_f_offset += skip_range;
3253 upl_size -= skip_range;
3254 }
3255 pages_in_upl = upl_size / PAGE_SIZE;
3256
3257 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3258 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3259
3260 kret = ubc_create_upl(vp,
3261 upl_f_offset,
3262 upl_size,
3263 &upl,
3264 &pl,
3265 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
3266 if (kret != KERN_SUCCESS)
3267 return(retval);
3268 issued_io = 0;
3269
3270 /*
3271 * before we start marching forward, we must make sure we end on
3272 * a present page, otherwise we will be working with a freed
3273 * upl
3274 */
3275 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3276 if (upl_page_present(pl, last_pg))
3277 break;
3278 }
3279 pages_in_upl = last_pg + 1;
3280
3281
3282 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
3283 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3284
3285
3286 for (last_pg = 0; last_pg < pages_in_upl; ) {
3287 /*
3288 * scan from the beginning of the upl looking for the first
3289 * page that is present.... this will become the first page in
3290 * the request we're going to make to 'cluster_io'... if all
3291 * of the pages are absent, we won't call through to 'cluster_io'
3292 */
3293 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3294 if (upl_page_present(pl, start_pg))
3295 break;
3296 }
3297
3298 /*
3299 * scan from the starting present page looking for an absent
3300 * page before the end of the upl is reached, if we
3301 * find one, then it will terminate the range of pages being
3302 * presented to 'cluster_io'
3303 */
3304 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3305 if (!upl_page_present(pl, last_pg))
3306 break;
3307 }
3308
3309 if (last_pg > start_pg) {
3310 /*
3311 * we found a range of pages that must be filled
3312 * if the last page in this range is the last page of the file
3313 * we may have to clip the size of it to keep from reading past
3314 * the end of the last physical block associated with the file
3315 */
3316 upl_offset = start_pg * PAGE_SIZE;
3317 io_size = (last_pg - start_pg) * PAGE_SIZE;
3318
3319 if ((upl_f_offset + upl_offset + io_size) > filesize)
3320 io_size = filesize - (upl_f_offset + upl_offset);
3321
3322 /*
3323 * issue an asynchronous read to cluster_io
3324 */
3325 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3326 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3327
3328 issued_io = 1;
3329 }
3330 }
3331 if (issued_io == 0)
3332 ubc_upl_abort(upl, 0);
3333
3334 io_size = upl_size - start_offset;
3335
3336 if (io_size > resid)
3337 io_size = resid;
3338 f_offset += io_size;
3339 resid -= io_size;
3340 }
3341
3342 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3343 (int)f_offset, resid, retval, 0, 0);
3344
3345 return(retval);
3346 }
3347
3348
3349 int
3350 cluster_push(vp)
3351 struct vnode *vp;
3352 {
3353 int retval;
3354
3355 if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))
3356 return(0);
3357
3358 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3359 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3360
3361 if (vp->v_flag & VHASDIRTY) {
3362 sparse_cluster_push(vp, ubc_getsize(vp), 1);
3363
3364 vp->v_clen = 0;
3365 retval = 1;
3366 } else
3367 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3368
3369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3370 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3371
3372 return (retval);
3373 }
3374
3375
3376 int
3377 cluster_release(vp)
3378 struct vnode *vp;
3379 {
3380 off_t offset;
3381 u_int length;
3382
3383 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3384
3385 if (vp->v_flag & VHASDIRTY) {
3386 vfs_drt_control(&(vp->v_scmap), 0);
3387
3388 vp->v_flag &= ~VHASDIRTY;
3389 }
3390 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3391 }
3392
3393
3394 static int
3395 cluster_try_push(vp, EOF, can_delay, push_all)
3396 struct vnode *vp;
3397 off_t EOF;
3398 int can_delay;
3399 int push_all;
3400 {
3401 int cl_index;
3402 int cl_index1;
3403 int min_index;
3404 int cl_len;
3405 int cl_total;
3406 int cl_pushed = 0;
3407 struct v_cluster l_clusters[MAX_CLUSTERS];
3408
3409 /*
3410 * make a local 'sorted' copy of the clusters
3411 * and clear vp->v_clen so that new clusters can
3412 * be developed
3413 */
3414 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3415 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3416 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3417 continue;
3418 if (min_index == -1)
3419 min_index = cl_index1;
3420 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3421 min_index = cl_index1;
3422 }
3423 if (min_index == -1)
3424 break;
3425 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3426 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3427
3428 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3429 }
3430 cl_len = cl_index;
3431 vp->v_clen = 0;
3432
3433 if (can_delay && cl_len == MAX_CLUSTERS) {
3434 int i;
3435
3436 /*
3437 * determine if we appear to be writing the file sequentially
3438 * if not, by returning without having pushed any clusters
3439 * we will cause this vnode to be pushed into the sparse cluster mechanism
3440 * used for managing more random I/O patterns
3441 *
3442 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3443 * that's why we're in try_push with can_delay true...
3444 *
3445 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3446 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3447 * so we can just make a simple pass through up, to but not including the last one...
3448 * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
3449 * are sequential
3450 *
3451 * we let the last one be partial as long as it was adjacent to the previous one...
3452 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3453 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3454 */
3455 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3456 if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
3457 goto dont_try;
3458 if (l_clusters[i].last_pg != l_clusters[i+1].start_pg)
3459 goto dont_try;
3460 }
3461 }
3462 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3463 /*
3464 * try to push each cluster in turn... cluster_push_x may not
3465 * push the cluster if can_delay is TRUE and the cluster doesn't
3466 * meet the critera for an immediate push
3467 */
3468 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3469 l_clusters[cl_index].start_pg = 0;
3470 l_clusters[cl_index].last_pg = 0;
3471
3472 cl_pushed++;
3473
3474 if (push_all == 0)
3475 break;
3476 }
3477 }
3478 dont_try:
3479 if (cl_len > cl_pushed) {
3480 /*
3481 * we didn't push all of the clusters, so
3482 * lets try to merge them back in to the vnode
3483 */
3484 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3485 /*
3486 * we picked up some new clusters while we were trying to
3487 * push the old ones (I don't think this can happen because
3488 * I'm holding the lock, but just in case)... the sum of the
3489 * leftovers plus the new cluster count exceeds our ability
3490 * to represent them, so switch to the sparse cluster mechanism
3491 */
3492
3493 /*
3494 * first collect the new clusters sitting in the vp
3495 */
3496 sparse_cluster_switch(vp, EOF);
3497
3498 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
3499 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3500 continue;
3501 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3502 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3503
3504 cl_index1++;
3505 }
3506 /*
3507 * update the cluster count
3508 */
3509 vp->v_clen = cl_index1;
3510
3511 /*
3512 * and collect the original clusters that were moved into the
3513 * local storage for sorting purposes
3514 */
3515 sparse_cluster_switch(vp, EOF);
3516
3517 } else {
3518 /*
3519 * we've got room to merge the leftovers back in
3520 * just append them starting at the next 'hole'
3521 * represented by vp->v_clen
3522 */
3523 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3524 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3525 continue;
3526
3527 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3528 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3529
3530 cl_index1++;
3531 }
3532 /*
3533 * update the cluster count
3534 */
3535 vp->v_clen = cl_index1;
3536 }
3537 }
3538 return(MAX_CLUSTERS - vp->v_clen);
3539 }
3540
3541
3542
3543 static int
3544 cluster_push_x(vp, EOF, first, last, can_delay)
3545 struct vnode *vp;
3546 off_t EOF;
3547 unsigned int first;
3548 unsigned int last;
3549 int can_delay;
3550 {
3551 upl_page_info_t *pl;
3552 upl_t upl;
3553 vm_offset_t upl_offset;
3554 int upl_size;
3555 off_t upl_f_offset;
3556 int pages_in_upl;
3557 int start_pg;
3558 int last_pg;
3559 int io_size;
3560 int io_flags;
3561 int upl_flags;
3562 int size;
3563 kern_return_t kret;
3564
3565
3566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3567 vp->v_clen, first, last, EOF, 0);
3568
3569 if ((pages_in_upl = last - first) == 0) {
3570 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3571
3572 return (1);
3573 }
3574 upl_size = pages_in_upl * PAGE_SIZE;
3575 upl_f_offset = (off_t)((unsigned long long)first * PAGE_SIZE_64);
3576
3577 if (upl_f_offset + upl_size >= EOF) {
3578
3579 if (upl_f_offset >= EOF) {
3580 /*
3581 * must have truncated the file and missed
3582 * clearing a dangling cluster (i.e. it's completely
3583 * beyond the new EOF
3584 */
3585 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3586
3587 return(1);
3588 }
3589 size = EOF - upl_f_offset;
3590
3591 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3592 pages_in_upl = upl_size / PAGE_SIZE;
3593 } else
3594 size = upl_size;
3595
3596 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
3597
3598 if (vp->v_flag & VNOCACHE_DATA)
3599 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
3600 else
3601 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
3602
3603 kret = ubc_create_upl(vp,
3604 upl_f_offset,
3605 upl_size,
3606 &upl,
3607 &pl,
3608 upl_flags);
3609 if (kret != KERN_SUCCESS)
3610 panic("cluster_push: failed to get pagelist");
3611
3612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
3613
3614 /*
3615 * since we only asked for the dirty pages back
3616 * it's possible that we may only get a few or even none, so...
3617 * before we start marching forward, we must make sure we know
3618 * where the last present page is in the UPL, otherwise we could
3619 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
3620 * employed by commit_range and abort_range.
3621 */
3622 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3623 if (upl_page_present(pl, last_pg))
3624 break;
3625 }
3626 pages_in_upl = last_pg + 1;
3627
3628 if (pages_in_upl == 0) {
3629 ubc_upl_abort(upl, 0);
3630
3631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
3632 return(1);
3633 }
3634
3635 for (last_pg = 0; last_pg < pages_in_upl; ) {
3636 /*
3637 * find the next dirty page in the UPL
3638 * this will become the first page in the
3639 * next I/O to generate
3640 */
3641 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3642 if (upl_dirty_page(pl, start_pg))
3643 break;
3644 if (upl_page_present(pl, start_pg))
3645 /*
3646 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
3647 * just release these unchanged since we're not going
3648 * to steal them or change their state
3649 */
3650 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
3651 }
3652 if (start_pg >= pages_in_upl)
3653 /*
3654 * done... no more dirty pages to push
3655 */
3656 break;
3657 if (start_pg > last_pg)
3658 /*
3659 * skipped over some non-dirty pages
3660 */
3661 size -= ((start_pg - last_pg) * PAGE_SIZE);
3662
3663 /*
3664 * find a range of dirty pages to write
3665 */
3666 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3667 if (!upl_dirty_page(pl, last_pg))
3668 break;
3669 }
3670 upl_offset = start_pg * PAGE_SIZE;
3671
3672 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3673
3674 if (vp->v_flag & VNOCACHE_DATA)
3675 io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
3676 else
3677 io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;
3678
3679 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3680
3681 size -= io_size;
3682 }
3683 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3684
3685 return(1);
3686 }
3687
3688
3689 static int
3690 sparse_cluster_switch(struct vnode *vp, off_t EOF)
3691 {
3692 int cl_index;
3693
3694 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3695
3696 if ( !(vp->v_flag & VHASDIRTY)) {
3697 vp->v_flag |= VHASDIRTY;
3698 vp->v_scdirty = 0;
3699 vp->v_scmap = 0;
3700 }
3701 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3702 int flags;
3703 int start_pg;
3704 int last_pg;
3705
3706 for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {
3707
3708 if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
3709 if (flags & UPL_POP_DIRTY)
3710 sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
3711 }
3712 }
3713 }
3714 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3715 }
3716
3717
3718 static int
3719 sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)
3720 {
3721 unsigned int first;
3722 unsigned int last;
3723 off_t offset;
3724 u_int length;
3725
3726 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);
3727
3728 if (push_all)
3729 vfs_drt_control(&(vp->v_scmap), 1);
3730
3731 for (;;) {
3732 if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
3733 vp->v_flag &= ~VHASDIRTY;
3734 vp->v_clen = 0;
3735 break;
3736 }
3737 first = (unsigned int)(offset / PAGE_SIZE_64);
3738 last = (unsigned int)((offset + length) / PAGE_SIZE_64);
3739
3740 cluster_push_x(vp, EOF, first, last, 0);
3741
3742 vp->v_scdirty -= (last - first);
3743
3744 if (push_all == 0)
3745 break;
3746 }
3747 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3748 }
3749
3750
3751 static int
3752 sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)
3753 {
3754 u_int new_dirty;
3755 u_int length;
3756 off_t offset;
3757
3758 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);
3759
3760 offset = (off_t)first * PAGE_SIZE_64;
3761 length = (last - first) * PAGE_SIZE;
3762
3763 while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
3764 /*
3765 * no room left in the map
3766 * only a partial update was done
3767 * push out some pages and try again
3768 */
3769 vp->v_scdirty += new_dirty;
3770
3771 sparse_cluster_push(vp, EOF, 0);
3772
3773 offset += (new_dirty * PAGE_SIZE_64);
3774 length -= (new_dirty * PAGE_SIZE);
3775 }
3776 vp->v_scdirty += new_dirty;
3777
3778 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3779 }
3780
3781
3782 static int
3783 cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
3784 {
3785 struct iovec *iov;
3786 upl_page_info_t *pl;
3787 upl_t upl;
3788 addr64_t ubc_paddr;
3789 kern_return_t kret;
3790 int error = 0;
3791
3792 iov = uio->uio_iov;
3793
3794 kret = ubc_create_upl(vp,
3795 uio->uio_offset & ~PAGE_MASK_64,
3796 PAGE_SIZE,
3797 &upl,
3798 &pl,
3799 UPL_SET_LITE);
3800
3801 if (kret != KERN_SUCCESS)
3802 return(EINVAL);
3803
3804 if (!upl_valid_page(pl, 0)) {
3805 /*
3806 * issue a synchronous read to cluster_io
3807 */
3808 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3809 CL_READ, (struct buf *)0, (struct clios *)0);
3810 if (error) {
3811 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3812
3813 return(error);
3814 }
3815 }
3816 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
3817
3818 /*
3819 * NOTE: There is no prototype for the following in BSD. It, and the definitions
3820 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
3821 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
3822 * way to do so without exporting them to kexts as well.
3823 */
3824 if (flags & CL_READ)
3825 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
3826 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
3827 else
3828 // copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
3829 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
3830
3831 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
3832 /*
3833 * issue a synchronous write to cluster_io
3834 */
3835 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3836 0, (struct buf *)0, (struct clios *)0);
3837 }
3838 if (error == 0) {
3839 uio->uio_offset += xsize;
3840 iov->iov_base += xsize;
3841 iov->iov_len -= xsize;
3842 uio->uio_resid -= xsize;
3843 }
3844 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3845
3846 return (error);
3847 }
3848
3849
3850
3851 int
3852 cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
3853 {
3854 int pg_offset;
3855 int pg_index;
3856 int csize;
3857 int segflg;
3858 int retval = 0;
3859 upl_page_info_t *pl;
3860 boolean_t funnel_state = FALSE;
3861
3862
3863 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3864 (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);
3865
3866 if (xsize >= (16 * 1024))
3867 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3868
3869 segflg = uio->uio_segflg;
3870
3871 switch(segflg) {
3872
3873 case UIO_USERSPACE:
3874 case UIO_USERISPACE:
3875 uio->uio_segflg = UIO_PHYS_USERSPACE;
3876 break;
3877
3878 case UIO_SYSSPACE:
3879 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3880 break;
3881 }
3882 pl = ubc_upl_pageinfo(upl);
3883
3884 pg_index = upl_offset / PAGE_SIZE;
3885 pg_offset = upl_offset & PAGE_MASK;
3886 csize = min(PAGE_SIZE - pg_offset, xsize);
3887
3888 while (xsize && retval == 0) {
3889 addr64_t paddr;
3890
3891 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
3892
3893 retval = uiomove64(paddr, csize, uio);
3894
3895 pg_index += 1;
3896 pg_offset = 0;
3897 xsize -= csize;
3898 csize = min(PAGE_SIZE, xsize);
3899 }
3900 uio->uio_segflg = segflg;
3901
3902 if (funnel_state == TRUE)
3903 thread_funnel_set(kernel_flock, TRUE);
3904
3905 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3906 (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
3907
3908 return (retval);
3909 }
3910
3911
3912 int
3913 cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)
3914 {
3915 int segflg;
3916 int io_size;
3917 int xsize;
3918 int start_offset;
3919 off_t f_offset;
3920 int retval = 0;
3921 memory_object_control_t control;
3922 int op_flags = UPL_POP_SET | UPL_POP_BUSY;
3923 boolean_t funnel_state = FALSE;
3924
3925
3926 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3927 (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);
3928
3929 control = ubc_getobject(vp, UBC_FLAGS_NONE);
3930 if (control == MEMORY_OBJECT_CONTROL_NULL) {
3931 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3932 (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
3933
3934 return(0);
3935 }
3936 if (mark_dirty)
3937 op_flags |= UPL_POP_DIRTY;
3938
3939 segflg = uio->uio_segflg;
3940
3941 switch(segflg) {
3942
3943 case UIO_USERSPACE:
3944 case UIO_USERISPACE:
3945 uio->uio_segflg = UIO_PHYS_USERSPACE;
3946 break;
3947
3948 case UIO_SYSSPACE:
3949 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3950 break;
3951 }
3952 io_size = *io_resid;
3953 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3954 f_offset = uio->uio_offset - start_offset;
3955 xsize = min(PAGE_SIZE - start_offset, io_size);
3956
3957 while (io_size && retval == 0) {
3958 ppnum_t pgframe;
3959
3960 if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
3961 break;
3962
3963 if (funnel_state == FALSE && io_size >= (16 * 1024))
3964 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3965
3966 retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);
3967
3968 ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
3969
3970 io_size -= xsize;
3971 start_offset = 0;
3972 f_offset = uio->uio_offset;
3973 xsize = min(PAGE_SIZE, io_size);
3974 }
3975 uio->uio_segflg = segflg;
3976 *io_resid = io_size;
3977
3978 if (funnel_state == TRUE)
3979 thread_funnel_set(kernel_flock, TRUE);
3980
3981 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3982 (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
3983
3984 return(retval);
3985 }
3986
3987
3988 int
3989 is_file_clean(struct vnode *vp, off_t filesize)
3990 {
3991 off_t f_offset;
3992 int flags;
3993 int total_dirty = 0;
3994
3995 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
3996 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
3997 if (flags & UPL_POP_DIRTY) {
3998 total_dirty++;
3999 }
4000 }
4001 }
4002 if (total_dirty)
4003 return(EINVAL);
4004
4005 return (0);
4006 }
4007
4008
4009
/*
 * Dirty region tracking/clustering mechanism.
 *
 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
 * dirty regions within a larger space (file).  It is primarily intended to
 * support clustering in large files with many dirty areas.
 *
 * The implementation assumes that the dirty regions are pages.
 *
 * To represent dirty pages within the file, we store bit vectors in a
 * variable-size circular hash.
 */

/*
 * Bitvector size.  This determines the number of pages we group in a
 * single hashtable entry.  Each hashtable entry is aligned to this
 * size within the file.
 */
#define DRT_BITVECTOR_PAGES		256

/*
 * File offset handling.
 *
 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1).
 * The hard-coded (1 << 20) below matches that formula for
 * DRT_BITVECTOR_PAGES == 256 and a page size of 4096 bytes.
 */
#define DRT_ADDRESS_MASK		(~((1 << 20) - 1))
#define DRT_ALIGN_ADDRESS(addr)		((addr) & DRT_ADDRESS_MASK)

/*
 * Hashtable address field handling.
 *
 * The low-order bits of the hashtable address are used to conserve
 * space: dhe_control holds the aligned file offset in the bits selected
 * by DRT_ADDRESS_MASK and a page count in the bits selected by
 * DRT_HASH_COUNT_MASK.
 *
 * DRT_HASH_COUNT_MASK must be large enough to store the range
 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
 * to indicate that the bucket is actually unoccupied.  That value is
 * DRT_HASH_COUNT_MASK itself (see DRT_HASH_VACATE / DRT_HASH_VACANT).
 */
#define DRT_HASH_GET_ADDRESS(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
#define DRT_HASH_SET_ADDRESS(scm, i, a)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control =							\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a);	\
	} while (0)
#define DRT_HASH_COUNT_MASK		0x1ff
#define DRT_HASH_GET_COUNT(scm, i)	((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
#define DRT_HASH_SET_COUNT(scm, i, c)											\
	do {														\
		(scm)->scm_hashtable[(i)].dhe_control =									\
		    ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK);	\
	} while (0)
#define DRT_HASH_CLEAR(scm, i)								\
	do {										\
		(scm)->scm_hashtable[(i)].dhe_control = 0;				\
	} while (0)
#define DRT_HASH_VACATE(scm, i)		DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
#define DRT_HASH_VACANT(scm, i)		(DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/*
 * Copy entry 'oi' of map 'oscm' (control word and bitvector) to entry 'i'
 * of map 'scm'.  As with the other statement macros, the caller supplies
 * the terminating semicolon; the expansion must not end in one, otherwise
 * the macro cannot be used as the body of an if that has an else.
 */
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
4073
4074
/*
 * Hash table moduli.
 *
 * A modulus has to be prime, and that many hashtable entries have to fit
 * inside the allocation it is paired with.  Because the size of an entry
 * follows from the size of the bitvector, the numbers below are worked
 * out by hand and must be revisited if the bitvector size changes.
 *
 * When DRT_BITVECTOR_PAGES is 256, one entry occupies 40 bytes:
 *
 *	small allocation,  1024 bytes: modulus 23
 *	large allocation, 16384 bytes: modulus 401
 */
#define DRT_HASH_SMALL_MODULUS	23
#define DRT_HASH_LARGE_MODULUS	401

#define DRT_SMALL_ALLOCATION	1024	/* 104 bytes spare */
#define DRT_LARGE_ALLOCATION	16384	/* 344 bytes spare */
4093
4094 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4095
/*
 * Hashtable bitvector handling.
 *
 * Bitvector fields are 32 bits long: bit (bit) of bucket (i) lives in
 * word (bit) / 32 at position (bit) % 32.
 *
 * The single-bit mask is built from an unsigned 1 so that selecting
 * position 31 does not shift into the sign bit of a signed int.  Each
 * bit macro is fully parenthesised so it can appear inside a larger
 * expression.
 */

#define DRT_HASH_SET_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1U << ((bit) % 32)))

#define DRT_HASH_CLEAR_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1U << ((bit) % 32)))

/* non-zero when the bit is set (the value is the mask, not 1) */
#define DRT_HASH_TEST_BIT(scm, i, bit)				\
	((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1U << ((bit) % 32)))

/* zero every word of bucket (i)'s bitvector */
#define DRT_BITVECTOR_CLEAR(scm, i)				\
	bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))

/* copy the bitvector of bucket (oi) in (oscm) over that of bucket (i) in (scm) */
#define DRT_BITVECTOR_COPY(oscm, oi, scm, i)			\
	bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],	\
	    &(scm)->scm_hashtable[(i)].dhe_bitvector[0],	\
	    (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4118
4119
4120
4121 /*
4122 * Hashtable entry.
4123 */
4124 struct vfs_drt_hashentry {
4125 u_int64_t dhe_control;
4126 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4127 };
4128
4129 /*
4130 * Dirty Region Tracking structure.
4131 *
4132 * The hashtable is allocated entirely inside the DRT structure.
4133 *
4134 * The hash is a simple circular prime modulus arrangement, the structure
4135 * is resized from small to large if it overflows.
4136 */
4137
4138 struct vfs_drt_clustermap {
4139 u_int32_t scm_magic; /* sanity/detection */
4140 #define DRT_SCM_MAGIC 0x12020003
4141 u_int32_t scm_modulus; /* current ring size */
4142 u_int32_t scm_buckets; /* number of occupied buckets */
4143 u_int32_t scm_lastclean; /* last entry we cleaned */
4144 u_int32_t scm_iskips; /* number of slot skips */
4145
4146 struct vfs_drt_hashentry scm_hashtable[0];
4147 };
4148
4149
/* home bucket for an address: the address reduced modulo the ring size */
#define DRT_HASH(scm, addr)		((addr) % (scm)->scm_modulus)
/* bucket that follows (slot), wrapping back to 0 at the end of the ring */
#define DRT_HASH_NEXT(scm, slot)	(((slot) + 1) % (scm)->scm_modulus)
4152
/*
 * Trace codes passed to vfs_drt_trace().  The comment beside each code
 * lists the arguments logged with it.
 */
#define DRT_DEBUG_EMPTYFREE	(FSDBG_CODE(DBG_FSRW, 82))	/* nil */
#define DRT_DEBUG_RETCLUSTER	(FSDBG_CODE(DBG_FSRW, 83))	/* offset, length */
#define DRT_DEBUG_ALLOC		(FSDBG_CODE(DBG_FSRW, 84))	/* copycount */
#define DRT_DEBUG_INSERT	(FSDBG_CODE(DBG_FSRW, 85))	/* offset, iskip */
#define DRT_DEBUG_MARK		(FSDBG_CODE(DBG_FSRW, 86))	/* start: offset, length, dirty */
								/* end:   0, setcount */
								/*        1 (clean, no map) */
								/*        2 (map alloc fail) */
								/*        3, resid (partial) */
#define DRT_DEBUG_6		(FSDBG_CODE(DBG_FSRW, 87))
#define DRT_DEBUG_SCMDATA	(FSDBG_CODE(DBG_FSRW, 88))	/* modulus, buckets, lastclean, iskips */
4169
4170
4171 static void vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
4172 static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4173 static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4174 static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4175 u_int64_t offset, int *indexp);
4176 static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4177 u_int64_t offset,
4178 int *indexp,
4179 int recursed);
4180 static kern_return_t vfs_drt_do_mark_pages(
4181 void **cmapp,
4182 u_int64_t offset,
4183 u_int length,
4184 int *setcountp,
4185 int dirty);
4186 static void vfs_drt_trace(
4187 struct vfs_drt_clustermap *cmap,
4188 int code,
4189 int arg1,
4190 int arg2,
4191 int arg3,
4192 int arg4);
4193
4194
4195 /*
4196 * Allocate and initialise a sparse cluster map.
4197 *
4198 * Will allocate a new map, resize or compact an existing map.
4199 *
4200 * XXX we should probably have at least one intermediate map size,
4201 * as the 1:16 ratio seems a bit drastic.
4202 */
4203 static kern_return_t
4204 vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4205 {
4206 struct vfs_drt_clustermap *cmap, *ocmap;
4207 kern_return_t kret;
4208 u_int64_t offset;
4209 int nsize, i, active_buckets, index, copycount;
4210
4211 ocmap = NULL;
4212 if (cmapp != NULL)
4213 ocmap = *cmapp;
4214
4215 /*
4216 * Decide on the size of the new map.
4217 */
4218 if (ocmap == NULL) {
4219 nsize = DRT_HASH_SMALL_MODULUS;
4220 } else {
4221 /* count the number of active buckets in the old map */
4222 active_buckets = 0;
4223 for (i = 0; i < ocmap->scm_modulus; i++) {
4224 if (!DRT_HASH_VACANT(ocmap, i) &&
4225 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4226 active_buckets++;
4227 }
4228 /*
4229 * If we're currently using the small allocation, check to
4230 * see whether we should grow to the large one.
4231 */
4232 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4233 /* if the ring is nearly full */
4234 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4235 nsize = DRT_HASH_LARGE_MODULUS;
4236 } else {
4237 nsize = DRT_HASH_SMALL_MODULUS;
4238 }
4239 } else {
4240 /* already using the large modulus */
4241 nsize = DRT_HASH_LARGE_MODULUS;
4242 /*
4243 * If the ring is completely full, there's
4244 * nothing useful for us to do. Behave as
4245 * though we had compacted into the new
4246 * array and return.
4247 */
4248 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4249 return(KERN_SUCCESS);
4250 }
4251 }
4252
4253 /*
4254 * Allocate and initialise the new map.
4255 */
4256
4257 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4258 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4259 if (kret != KERN_SUCCESS)
4260 return(kret);
4261 cmap->scm_magic = DRT_SCM_MAGIC;
4262 cmap->scm_modulus = nsize;
4263 cmap->scm_buckets = 0;
4264 cmap->scm_lastclean = 0;
4265 cmap->scm_iskips = 0;
4266 for (i = 0; i < cmap->scm_modulus; i++) {
4267 DRT_HASH_CLEAR(cmap, i);
4268 DRT_HASH_VACATE(cmap, i);
4269 DRT_BITVECTOR_CLEAR(cmap, i);
4270 }
4271
4272 /*
4273 * If there's an old map, re-hash entries from it into the new map.
4274 */
4275 copycount = 0;
4276 if (ocmap != NULL) {
4277 for (i = 0; i < ocmap->scm_modulus; i++) {
4278 /* skip empty buckets */
4279 if (DRT_HASH_VACANT(ocmap, i) ||
4280 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4281 continue;
4282 /* get new index */
4283 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4284 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4285 if (kret != KERN_SUCCESS) {
4286 /* XXX need to bail out gracefully here */
4287 panic("vfs_drt: new cluster map mysteriously too small");
4288 }
4289 /* copy */
4290 DRT_HASH_COPY(ocmap, i, cmap, index);
4291 copycount++;
4292 }
4293 }
4294
4295 /* log what we've done */
4296 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4297
4298 /*
4299 * It's important to ensure that *cmapp always points to
4300 * a valid map, so we must overwrite it before freeing
4301 * the old map.
4302 */
4303 *cmapp = cmap;
4304 if (ocmap != NULL) {
4305 /* emit stats into trace buffer */
4306 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4307 ocmap->scm_modulus,
4308 ocmap->scm_buckets,
4309 ocmap->scm_lastclean,
4310 ocmap->scm_iskips);
4311
4312 vfs_drt_free_map(ocmap);
4313 }
4314 return(KERN_SUCCESS);
4315 }
4316
4317
4318 /*
4319 * Free a sparse cluster map.
4320 */
4321 static kern_return_t
4322 vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4323 {
4324 kern_return_t ret;
4325
4326 kmem_free(kernel_map, (vm_offset_t)cmap,
4327 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4328 return(KERN_SUCCESS);
4329 }
4330
4331
4332 /*
4333 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4334 */
4335 static kern_return_t
4336 vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4337 {
4338 kern_return_t kret;
4339 int index, i, tries;
4340
4341 offset = DRT_ALIGN_ADDRESS(offset);
4342 index = DRT_HASH(cmap, offset);
4343
4344 /* traverse the hashtable */
4345 for (i = 0; i < cmap->scm_modulus; i++) {
4346
4347 /*
4348 * If the slot is vacant, we can stop.
4349 */
4350 if (DRT_HASH_VACANT(cmap, index))
4351 break;
4352
4353 /*
4354 * If the address matches our offset, we have success.
4355 */
4356 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4357 *indexp = index;
4358 return(KERN_SUCCESS);
4359 }
4360
4361 /*
4362 * Move to the next slot, try again.
4363 */
4364 index = DRT_HASH_NEXT(cmap, index);
4365 }
4366 /*
4367 * It's not there.
4368 */
4369 return(KERN_FAILURE);
4370 }
4371
4372 /*
4373 * Find the hashtable slot for the supplied offset. If we haven't allocated
4374 * one yet, allocate one and populate the address field. Note that it will
4375 * not have a nonzero page count and thus will still technically be free, so
4376 * in the case where we are called to clean pages, the slot will remain free.
4377 */
4378 static kern_return_t
4379 vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4380 {
4381 struct vfs_drt_clustermap *cmap;
4382 kern_return_t kret;
4383 int index, i;
4384
4385 cmap = *cmapp;
4386
4387 /* look for an existing entry */
4388 kret = vfs_drt_search_index(cmap, offset, indexp);
4389 if (kret == KERN_SUCCESS)
4390 return(kret);
4391
4392 /* need to allocate an entry */
4393 offset = DRT_ALIGN_ADDRESS(offset);
4394 index = DRT_HASH(cmap, offset);
4395
4396 /* scan from the index forwards looking for a vacant slot */
4397 for (i = 0; i < cmap->scm_modulus; i++) {
4398 /* slot vacant? */
4399 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4400 cmap->scm_buckets++;
4401 if (index < cmap->scm_lastclean)
4402 cmap->scm_lastclean = index;
4403 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4404 DRT_HASH_SET_COUNT(cmap, index, 0);
4405 DRT_BITVECTOR_CLEAR(cmap, index);
4406 *indexp = index;
4407 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4408 return(KERN_SUCCESS);
4409 }
4410 cmap->scm_iskips += i;
4411 index = DRT_HASH_NEXT(cmap, index);
4412 }
4413
4414 /*
4415 * We haven't found a vacant slot, so the map is full. If we're not
4416 * already recursed, try reallocating/compacting it.
4417 */
4418 if (recursed)
4419 return(KERN_FAILURE);
4420 kret = vfs_drt_alloc_map(cmapp);
4421 if (kret == KERN_SUCCESS) {
4422 /* now try to insert again */
4423 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4424 }
4425 return(kret);
4426 }
4427
4428 /*
4429 * Implementation of set dirty/clean.
4430 *
4431 * In the 'clean' case, not finding a map is OK.
4432 */
4433 static kern_return_t
4434 vfs_drt_do_mark_pages(
4435 void **private,
4436 u_int64_t offset,
4437 u_int length,
4438 int *setcountp,
4439 int dirty)
4440 {
4441 struct vfs_drt_clustermap *cmap, **cmapp;
4442 kern_return_t kret;
4443 int i, index, pgoff, pgcount, setcount, ecount;
4444
4445 cmapp = (struct vfs_drt_clustermap **)private;
4446 cmap = *cmapp;
4447
4448 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4449
4450 if (setcountp != NULL)
4451 *setcountp = 0;
4452
4453 /* allocate a cluster map if we don't already have one */
4454 if (cmap == NULL) {
4455 /* no cluster map, nothing to clean */
4456 if (!dirty) {
4457 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4458 return(KERN_SUCCESS);
4459 }
4460 kret = vfs_drt_alloc_map(cmapp);
4461 if (kret != KERN_SUCCESS) {
4462 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4463 return(kret);
4464 }
4465 }
4466 setcount = 0;
4467
4468 /*
4469 * Iterate over the length of the region.
4470 */
4471 while (length > 0) {
4472 /*
4473 * Get the hashtable index for this offset.
4474 *
4475 * XXX this will add blank entries if we are clearing a range
4476 * that hasn't been dirtied.
4477 */
4478 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4479 cmap = *cmapp; /* may have changed! */
4480 /* this may be a partial-success return */
4481 if (kret != KERN_SUCCESS) {
4482 if (setcountp != NULL)
4483 *setcountp = setcount;
4484 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4485
4486 return(kret);
4487 }
4488
4489 /*
4490 * Work out how many pages we're modifying in this
4491 * hashtable entry.
4492 */
4493 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4494 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4495
4496 /*
4497 * Iterate over pages, dirty/clearing as we go.
4498 */
4499 ecount = DRT_HASH_GET_COUNT(cmap, index);
4500 for (i = 0; i < pgcount; i++) {
4501 if (dirty) {
4502 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4503 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4504 ecount++;
4505 setcount++;
4506 }
4507 } else {
4508 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4509 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4510 ecount--;
4511 setcount++;
4512 }
4513 }
4514 }
4515 DRT_HASH_SET_COUNT(cmap, index, ecount);
4516 next:
4517 offset += pgcount * PAGE_SIZE;
4518 length -= pgcount * PAGE_SIZE;
4519 }
4520 if (setcountp != NULL)
4521 *setcountp = setcount;
4522
4523 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
4524
4525 return(KERN_SUCCESS);
4526 }
4527
4528 /*
4529 * Mark a set of pages as dirty/clean.
4530 *
4531 * This is a public interface.
4532 *
4533 * cmapp
4534 * Pointer to storage suitable for holding a pointer. Note that
4535 * this must either be NULL or a value set by this function.
4536 *
4537 * size
4538 * Current file size in bytes.
4539 *
4540 * offset
4541 * Offset of the first page to be marked as dirty, in bytes. Must be
4542 * page-aligned.
4543 *
4544 * length
4545 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
4546 *
4547 * setcountp
4548 * Number of pages newly marked dirty by this call (optional).
4549 *
4550 * Returns KERN_SUCCESS if all the pages were successfully marked.
4551 */
4552 static kern_return_t
4553 vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
4554 {
4555 /* XXX size unused, drop from interface */
4556 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
4557 }
4558
4559 static kern_return_t
4560 vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
4561 {
4562 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
4563 }
4564
4565 /*
4566 * Get a cluster of dirty pages.
4567 *
4568 * This is a public interface.
4569 *
4570 * cmapp
4571 * Pointer to storage managed by drt_mark_pages. Note that this must
4572 * be NULL or a value set by drt_mark_pages.
4573 *
4574 * offsetp
4575 * Returns the byte offset into the file of the first page in the cluster.
4576 *
4577 * lengthp
4578 * Returns the length in bytes of the cluster of dirty pages.
4579 *
4580 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
4581 * are no dirty pages meeting the minmum size criteria. Private storage will
4582 * be released if there are no more dirty pages left in the map
4583 *
4584 */
4585 static kern_return_t
4586 vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
4587 {
4588 struct vfs_drt_clustermap *cmap;
4589 u_int64_t offset;
4590 u_int length;
4591 int index, i, j, fs, ls;
4592
4593 /* sanity */
4594 if ((cmapp == NULL) || (*cmapp == NULL))
4595 return(KERN_FAILURE);
4596 cmap = *cmapp;
4597
4598 /* walk the hashtable */
4599 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
4600 index = DRT_HASH(cmap, offset);
4601
4602 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
4603 continue;
4604
4605 /* scan the bitfield for a string of bits */
4606 fs = -1;
4607
4608 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4609 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
4610 fs = i;
4611 break;
4612 }
4613 }
4614 if (fs == -1) {
4615 /* didn't find any bits set */
4616 panic("vfs_drt: entry summary count > 0 but no bits set in map");
4617 }
4618 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
4619 if (!DRT_HASH_TEST_BIT(cmap, index, i))
4620 break;
4621 }
4622
4623 /* compute offset and length, mark pages clean */
4624 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
4625 length = ls * PAGE_SIZE;
4626 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
4627 cmap->scm_lastclean = index;
4628
4629 /* return successful */
4630 *offsetp = (off_t)offset;
4631 *lengthp = length;
4632
4633 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
4634 return(KERN_SUCCESS);
4635 }
4636 /*
4637 * We didn't find anything... hashtable is empty
4638 * emit stats into trace buffer and
4639 * then free it
4640 */
4641 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4642 cmap->scm_modulus,
4643 cmap->scm_buckets,
4644 cmap->scm_lastclean,
4645 cmap->scm_iskips);
4646
4647 vfs_drt_free_map(cmap);
4648 *cmapp = NULL;
4649
4650 return(KERN_FAILURE);
4651 }
4652
4653
4654 static kern_return_t
4655 vfs_drt_control(void **cmapp, int op_type)
4656 {
4657 struct vfs_drt_clustermap *cmap;
4658
4659 /* sanity */
4660 if ((cmapp == NULL) || (*cmapp == NULL))
4661 return(KERN_FAILURE);
4662 cmap = *cmapp;
4663
4664 switch (op_type) {
4665 case 0:
4666 /* emit stats into trace buffer */
4667 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4668 cmap->scm_modulus,
4669 cmap->scm_buckets,
4670 cmap->scm_lastclean,
4671 cmap->scm_iskips);
4672
4673 vfs_drt_free_map(cmap);
4674 *cmapp = NULL;
4675 break;
4676
4677 case 1:
4678 cmap->scm_lastclean = 0;
4679 break;
4680 }
4681 return(KERN_SUCCESS);
4682 }
4683
4684
4685
/*
 * Emit one record into the trace buffer: 'code' together with the four
 * caller-provided arguments.  The cmap argument is accepted but is not
 * examined here.
 */
static void
vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int		code,
	int		arg1,
	int		arg2,
	int		arg3,
	int		arg4)
{
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
4695
4696 /*
4697 * Perform basic sanity check on the hash entry summary count
4698 * vs. the actual bits set in the entry.
4699 */
4700 static void
4701 vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
4702 {
4703 int index, i;
4704 int bits_on;
4705
4706 for (index = 0; index < cmap->scm_modulus; index++) {
4707 if (DRT_HASH_VACANT(cmap, index))
4708 continue;
4709
4710 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4711 if (DRT_HASH_TEST_BIT(cmap, index, i))
4712 bits_on++;
4713 }
4714 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
4715 panic("bits_on = %d, index = %d\n", bits_on, index);
4716 }
4717 }