1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*
27 * Copyright (c) 1993
28 * The Regents of the University of California. All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
59 */
60
61 #include <sys/param.h>
62 #include <sys/proc.h>
63 #include <sys/buf.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/trace.h>
67 #include <sys/malloc.h>
68 #include <sys/resourcevar.h>
69 #include <libkern/libkern.h>
70
71 #include <sys/ubc.h>
72 #include <vm/vm_pageout.h>
73
74 #include <sys/kdebug.h>
75
76 #define CL_READ 0x01
77 #define CL_ASYNC 0x02
78 #define CL_COMMIT 0x04
79 #define CL_PAGEOUT 0x10
80 #define CL_AGE 0x20
81 #define CL_DUMP 0x40
82 #define CL_NOZERO 0x80
83 #define CL_PAGEIN 0x100
84 #define CL_DEV_MEMORY 0x200
85 #define CL_PRESERVE 0x400
86
87
88 struct clios {
89 u_int io_completed; /* amount of io that has currently completed */
90 u_int io_issued; /* amount of io that was successfully issued */
91 int io_error; /* error code of first error encountered */
92 int io_wanted; /* someone is sleeping waiting for a change in state */
93 };
94
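/*
 * added note: the clios state above is the handshake used by the streaming
 * (nocopy) paths in this file... cluster_io() bumps io_issued as each chunk
 * is handed to the driver, cluster_iodone() bumps io_completed and wakes any
 * sleeper that set io_wanted, and the issuing thread throttles itself with a
 * loop of roughly this shape (this mirrors the code in cluster_nocopy_write
 * below; it is not additional original source):
 *
 *	while (iostate.io_issued != iostate.io_completed) {
 *		iostate.io_wanted = 1;
 *		tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
 *	}
 *	if (iostate.io_error)
 *		error = iostate.io_error;
 */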
95
96 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
97 int size, struct buf *bp);
98 static int cluster_read_x(struct vnode *vp, struct uio *uio,
99 off_t filesize, int devblocksize, int flags);
100 static int cluster_write_x(struct vnode *vp, struct uio *uio,
101 off_t oldEOF, off_t newEOF, off_t headOff,
102 off_t tailOff, int devblocksize, int flags);
103 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
104 off_t filesize, int devblocksize, int flags);
105 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
106 off_t newEOF, int devblocksize, int flags);
107 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
108 off_t filesize, int devblocksize, int flags);
109 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
110 off_t newEOF, int devblocksize, int flags);
111 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
112 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
113 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
114 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
115
116
117 /*
118 * throttle the number of async writes that
119 * can be outstanding on a single vnode
120 * before we issue a synchronous write
121 */
122 #define ASYNC_THROTTLE 9
123
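/*
 * added note: cluster_iodone() is the completion routine for one clustered
 * transaction... a transaction is a chain of component buffers linked through
 * b_trans_next, with b_trans_head pointing at the first one.  until every
 * buffer in the chain is marked B_DONE this routine just returns.  once the
 * last component finishes, it accumulates the error and residual counts,
 * zero-fills the tail of the EOF page if b_validend was set, wakes any writer
 * throttled on v_numoutput, updates the clios stream (if any), calls biodone()
 * on the original buffer when B_NEED_IODONE is set, and finally commits or
 * aborts the UPL range covered by the transfer.
 */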
124 static int
125 cluster_iodone(bp)
126 struct buf *bp;
127 {
128 int b_flags;
129 int error;
130 int total_size;
131 int total_resid;
132 int upl_offset;
133 int zero_offset;
134 upl_t upl;
135 struct buf *cbp;
136 struct buf *cbp_head;
137 struct buf *cbp_next;
138 struct buf *real_bp;
139 struct vnode *vp;
140 struct clios *iostate;
141 int commit_size;
142 int pg_offset;
143
144
145 cbp_head = (struct buf *)(bp->b_trans_head);
146
147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
148 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
149
150 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
151 /*
152 * all I/O requests that are part of this transaction
153 * have to complete before we can process it
154 */
155 if ( !(cbp->b_flags & B_DONE)) {
156
157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
158 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
159
160 return 0;
161 }
162 }
163 error = 0;
164 total_size = 0;
165 total_resid = 0;
166
167 cbp = cbp_head;
168 upl_offset = cbp->b_uploffset;
169 upl = cbp->b_pagelist;
170 b_flags = cbp->b_flags;
171 real_bp = cbp->b_real_bp;
172 vp = cbp->b_vp;
173 zero_offset= cbp->b_validend;
174 iostate = (struct clios *)cbp->b_iostate;
175
176 while (cbp) {
177 if (cbp->b_vectorcount > 1)
178 _FREE(cbp->b_vectorlist, M_SEGMENT);
179
180 if ((cbp->b_flags & B_ERROR) && error == 0)
181 error = cbp->b_error;
182
183 total_resid += cbp->b_resid;
184 total_size += cbp->b_bcount;
185
186 cbp_next = cbp->b_trans_next;
187
188 free_io_buf(cbp);
189
190 cbp = cbp_next;
191 }
192 if (zero_offset)
193 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
194
195 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
196 vp->v_flag &= ~VTHROTTLED;
197 wakeup((caddr_t)&vp->v_numoutput);
198 }
199 if (iostate) {
200 /*
 201          * someone has issued multiple I/Os asynchronously
202 * and is waiting for them to complete (streaming)
203 */
204 if (error && iostate->io_error == 0)
205 iostate->io_error = error;
206
207 iostate->io_completed += total_size;
208
209 if (iostate->io_wanted) {
210 /*
211 * someone is waiting for the state of
212 * this io stream to change
213 */
214 iostate->io_wanted = 0;
215 wakeup((caddr_t)&iostate->io_wanted);
216 }
217 }
218 if ((b_flags & B_NEED_IODONE) && real_bp) {
219 if (error) {
220 real_bp->b_flags |= B_ERROR;
221 real_bp->b_error = error;
222 }
223 real_bp->b_resid = total_resid;
224
225 biodone(real_bp);
226 }
227 if (error == 0 && total_resid)
228 error = EIO;
229
230 if (b_flags & B_COMMIT_UPL) {
231 pg_offset = upl_offset & PAGE_MASK;
232 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
233
234 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
235 int upl_abort_code;
236
237 if (b_flags & B_PHYS)
238 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
239 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
240 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
241 else if (b_flags & B_PGIN)
242 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
243 else
244 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
245
246 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
247 upl_abort_code);
248
249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
250 (int)upl, upl_offset - pg_offset, commit_size,
251 0x80000000|upl_abort_code, 0);
252
253 } else {
254 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
255
256 if (b_flags & B_PHYS)
257 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
258 else if ( !(b_flags & B_PAGEOUT))
259 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
260 if (b_flags & B_AGE)
261 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
262
263 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
264 upl_commit_flags);
265
266 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
267 (int)upl, upl_offset - pg_offset, commit_size,
268 upl_commit_flags, 0);
269 }
270 } else
271 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
272 (int)upl, upl_offset, 0, error, 0);
273
274 return (error);
275 }
276
277
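/*
 * added note: cluster_zero() zeroes 'size' bytes starting at 'upl_offset'
 * within the given upl.  if the caller supplied a buffer header whose data
 * is already mapped (bp->b_data), that mapping is used directly; otherwise
 * the upl is temporarily mapped into the kernel with ubc_upl_map() and
 * unmapped again when the zeroing is done.
 */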
278 static void
279 cluster_zero(upl, upl_offset, size, bp)
280 upl_t upl;
281 vm_offset_t upl_offset;
282 int size;
283 struct buf *bp;
284 {
285 vm_offset_t io_addr = 0;
286 int must_unmap = 0;
287 kern_return_t kret;
288
289 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
290 upl_offset, size, (int)bp, 0, 0);
291
292 if (bp == NULL || bp->b_data == NULL) {
293 kret = ubc_upl_map(upl, &io_addr);
294
295 if (kret != KERN_SUCCESS)
296 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
297 if (io_addr == 0)
298 panic("cluster_zero: ubc_upl_map() mapped 0");
299
300 must_unmap = 1;
301 } else
302 io_addr = (vm_offset_t)bp->b_data;
303 bzero((caddr_t)(io_addr + upl_offset), size);
304
305 if (must_unmap) {
306 kret = ubc_upl_unmap(upl);
307
308 if (kret != KERN_SUCCESS)
309 panic("cluster_zero: kernel_upl_unmap failed");
310 }
311 }
312
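/*
 * added note: cluster_io() turns a upl-backed request into one or more device
 * transfers.  for each chunk it asks the filesystem where the data lives via
 * VOP_CMAP(), zero-fills read 'holes', builds an I/O vector (falling back to
 * the vector reserved in the buffer header when a single page suffices, so a
 * failed _MALLOC can't deadlock the pager), chains the component buffers into
 * a transaction and hands them to VOP_STRATEGY().  synchronous callers wait
 * via biowait()/cluster_iodone(); CL_ASYNC callers are completed through the
 * B_CALL/b_iodone hook.
 */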
313 static int
314 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
315 struct vnode *vp;
316 upl_t upl;
317 vm_offset_t upl_offset;
318 off_t f_offset;
319 int non_rounded_size;
320 int devblocksize;
321 int flags;
322 struct buf *real_bp;
323 struct clios *iostate;
324 {
325 struct buf *cbp;
326 struct iovec *iovp;
327 u_int size;
328 u_int io_size;
329 int io_flags;
330 int error = 0;
331 int retval = 0;
332 struct buf *cbp_head = 0;
333 struct buf *cbp_tail = 0;
334 upl_page_info_t *pl;
335 int buf_count = 0;
336 int pg_count;
337 int pg_offset;
338 u_int max_iosize;
339 u_int max_vectors;
340 int priv;
341 int zero_offset = 0;
342 u_int first_lblkno;
343
344 if (flags & CL_READ) {
345 io_flags = (B_VECTORLIST | B_READ);
346
347 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
348 } else {
349 io_flags = (B_VECTORLIST | B_WRITEINPROG);
350
351 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
352 }
353 pl = ubc_upl_pageinfo(upl);
354
355 if (flags & CL_AGE)
356 io_flags |= B_AGE;
357 if (flags & CL_DUMP)
358 io_flags |= B_NOCACHE;
359 if (flags & CL_PAGEIN)
360 io_flags |= B_PGIN;
361 if (flags & CL_PAGEOUT)
362 io_flags |= B_PAGEOUT;
363 if (flags & CL_COMMIT)
364 io_flags |= B_COMMIT_UPL;
365 if (flags & CL_PRESERVE)
366 io_flags |= B_PHYS;
367
368 if (devblocksize)
369 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
370 else
371 size = non_rounded_size;
372
373
374 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
375 (int)f_offset, size, upl_offset, flags, 0);
376
377 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
378 /*
 379          * then we are going to end up
 380          * with a page that we can't complete (the file size wasn't a multiple
 381          * of PAGE_SIZE and we're trying to read to the end of the file),
 382          * so we'll go ahead and zero out the portion of the page we can't
 383          * read in from the file
384 */
385 zero_offset = upl_offset + non_rounded_size;
386 }
387 while (size) {
388 int vsize;
389 int i;
390 int pl_index;
391 int pg_resid;
392 int num_contig;
393 daddr_t lblkno;
394 daddr_t blkno;
395
396 if (size > max_iosize)
397 io_size = max_iosize;
398 else
399 io_size = size;
400
401 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
402 if (error == EOPNOTSUPP)
403 panic("VOP_CMAP Unimplemented");
404 break;
405 }
406
407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
408 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
409
410 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
411 if (flags & CL_PAGEOUT) {
412 error = EINVAL;
413 break;
414 };
415
416 /* Try paging out the page individually before
417 giving up entirely and dumping it (it could
418 be mapped in a "hole" and require allocation
 419            before the I/O)
420 */
421 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
422 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
423 error = EINVAL;
424 break;
425 };
426
427 upl_offset += PAGE_SIZE_64;
428 f_offset += PAGE_SIZE_64;
429 size -= PAGE_SIZE_64;
430 continue;
431 }
432 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
433 /*
434 * we have now figured out how much I/O we can do - this is in 'io_size'
435 * pl_index represents the first page in the 'upl' that the I/O will occur for
436 * pg_offset is the starting point in the first page for the I/O
437 * pg_count is the number of full and partial pages that 'io_size' encompasses
438 */
439 pl_index = upl_offset / PAGE_SIZE;
440 pg_offset = upl_offset & PAGE_MASK;
441 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
442
443 if (flags & CL_DEV_MEMORY) {
444 /*
445 * currently, can't deal with reading 'holes' in file
446 */
447 if ((long)blkno == -1) {
448 error = EINVAL;
449 break;
450 }
451 /*
452 * treat physical requests as one 'giant' page
453 */
454 pg_count = 1;
455 }
456 if ((flags & CL_READ) && (long)blkno == -1) {
457 int bytes_to_zero;
458
459 /*
460 * if we're reading and blkno == -1, then we've got a
461 * 'hole' in the file that we need to deal with by zeroing
462 * out the affected area in the upl
463 */
464 if (zero_offset && io_size == size) {
465 /*
466 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
 467                          * then 'zero_offset' will be non-zero
468 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
469 * (indicated by the io_size finishing off the I/O request for this UPL)
 470                          * then we're not going to issue an I/O for the
471 * last page in this upl... we need to zero both the hole and the tail
472 * of the page beyond the EOF, since the delayed zero-fill won't kick in
473 */
474 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
475
476 zero_offset = 0;
477 } else
478 bytes_to_zero = io_size;
479
480 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
481
482 if (cbp_head)
483 /*
484 * if there is a current I/O chain pending
485 * then the first page of the group we just zero'd
486 * will be handled by the I/O completion if the zero
487 * fill started in the middle of the page
488 */
489 pg_count = (io_size - pg_offset) / PAGE_SIZE;
490 else {
491 /*
492 * no pending I/O to pick up that first page
493 * so, we have to make sure it gets committed
494 * here.
495 * set the pg_offset to 0 so that the upl_commit_range
496 * starts with this page
497 */
498 pg_count = (io_size + pg_offset) / PAGE_SIZE;
499 pg_offset = 0;
500 }
501 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
502 /*
503 * if we're done with the request for this UPL
504 * then we have to make sure to commit the last page
505 * even if we only partially zero-filled it
506 */
507 pg_count++;
508
509 if (pg_count) {
510 if (pg_offset)
511 pg_resid = PAGE_SIZE - pg_offset;
512 else
513 pg_resid = 0;
514
515 if (flags & CL_COMMIT)
516 ubc_upl_commit_range(upl,
517 (upl_offset + pg_resid) & ~PAGE_MASK,
518 pg_count * PAGE_SIZE,
519 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
520 }
521 upl_offset += io_size;
522 f_offset += io_size;
523 size -= io_size;
524
525 if (cbp_head && pg_count)
526 goto start_io;
527 continue;
528
529 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
530 real_bp->b_blkno = blkno;
531 }
532
533 if (pg_count > 1) {
534 if (pg_count > max_vectors) {
535 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
536
 537                          if ((int)io_size < 0) { /* io_size is u_int; cast so the clamp above can't wrap past zero */
538 io_size = PAGE_SIZE - pg_offset;
539 pg_count = 1;
540 } else
541 pg_count = max_vectors;
542 }
543 /*
544 * we need to allocate space for the vector list
545 */
546 if (pg_count > 1) {
547 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
548 M_SEGMENT, M_NOWAIT);
549
550 if (iovp == (struct iovec *) 0) {
551 /*
552 * if the allocation fails, then throttle down to a single page
553 */
554 io_size = PAGE_SIZE - pg_offset;
555 pg_count = 1;
556 }
557 }
558 }
559
560 /* Throttle the speculative IO */
561 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
562 priv = 0;
563 else
564 priv = 1;
565
566 cbp = alloc_io_buf(vp, priv);
567
568 if (pg_count == 1)
569 /*
570 * we use the io vector that's reserved in the buffer header
 571                  * this ensures we can always issue an I/O even in a low memory
572 * condition that prevents the _MALLOC from succeeding... this
573 * is necessary to prevent deadlocks with the pager
574 */
575 iovp = (struct iovec *)(&cbp->b_vects[0]);
576
577 cbp->b_vectorlist = (void *)iovp;
578 cbp->b_vectorcount = pg_count;
579
580 if (flags & CL_DEV_MEMORY) {
581
582 iovp->iov_len = io_size;
583 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
584
585 if (iovp->iov_base == (caddr_t) 0) {
586 free_io_buf(cbp);
587 error = EINVAL;
588 } else
589 iovp->iov_base += upl_offset;
590 } else {
591
592 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
593 int psize;
594
595 psize = PAGE_SIZE - pg_offset;
596
597 if (psize > vsize)
598 psize = vsize;
599
600 iovp->iov_len = psize;
601 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
602
603 if (iovp->iov_base == (caddr_t) 0) {
604 if (pg_count > 1)
605 _FREE(cbp->b_vectorlist, M_SEGMENT);
606 free_io_buf(cbp);
607
608 error = EINVAL;
609 break;
610 }
611 iovp->iov_base += pg_offset;
612 pg_offset = 0;
613
614 if (flags & CL_PAGEOUT) {
615 int s;
616 struct buf *bp;
617
618 s = splbio();
619 if (bp = incore(vp, lblkno + i)) {
620 if (!ISSET(bp->b_flags, B_BUSY)) {
621 bremfree(bp);
622 SET(bp->b_flags, (B_BUSY | B_INVAL));
623 splx(s);
624 brelse(bp);
625 } else
626 panic("BUSY bp found in cluster_io");
627 }
628 splx(s);
629 }
630 vsize -= psize;
631 }
632 }
633 if (error)
634 break;
635
636 if (flags & CL_ASYNC) {
637 cbp->b_flags |= (B_CALL | B_ASYNC);
638 cbp->b_iodone = (void *)cluster_iodone;
639 }
640 cbp->b_flags |= io_flags;
641
642 cbp->b_lblkno = lblkno;
643 cbp->b_blkno = blkno;
644 cbp->b_bcount = io_size;
645 cbp->b_pagelist = upl;
646 cbp->b_uploffset = upl_offset;
647 cbp->b_trans_next = (struct buf *)0;
648
649 if (cbp->b_iostate = (void *)iostate)
650 /*
651 * caller wants to track the state of this
652 * io... bump the amount issued against this stream
653 */
654 iostate->io_issued += io_size;
655
656 if (flags & CL_READ)
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659 else
660 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
661 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
662
663 if (cbp_head) {
664 cbp_tail->b_trans_next = cbp;
665 cbp_tail = cbp;
666 } else {
667 cbp_head = cbp;
668 cbp_tail = cbp;
669 }
670 (struct buf *)(cbp->b_trans_head) = cbp_head;
671 buf_count++;
672
673 upl_offset += io_size;
674 f_offset += io_size;
675 size -= io_size;
676
677 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
678 /*
679 * if we have no more I/O to issue or
680 * the current I/O we've prepared fully
681 * completes the last page in this request
682 * and it's either an ASYNC request or
683 * we've already accumulated more than 8 I/O's into
684 * this transaction and it's not an I/O directed to
685 * special DEVICE memory
686 * then go ahead and issue the I/O
687 */
688 start_io:
689 if (real_bp) {
690 cbp_head->b_flags |= B_NEED_IODONE;
691 cbp_head->b_real_bp = real_bp;
692 } else
693 cbp_head->b_real_bp = (struct buf *)NULL;
694
695 if (size == 0) {
696 /*
697 * we're about to issue the last I/O for this upl
698 * if this was a read to the eof and the eof doesn't
 699                          * finish on a page boundary, then we need to zero-fill
700 * the rest of the page....
701 */
702 cbp_head->b_validend = zero_offset;
703 } else
704 cbp_head->b_validend = 0;
705
706 for (cbp = cbp_head; cbp;) {
707 struct buf * cbp_next;
708
709 if (io_flags & B_WRITEINPROG)
710 cbp->b_vp->v_numoutput++;
711
712 cbp_next = cbp->b_trans_next;
713
714 (void) VOP_STRATEGY(cbp);
715 cbp = cbp_next;
716 }
717 if ( !(flags & CL_ASYNC)) {
718 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
719 biowait(cbp);
720
721 if (error = cluster_iodone(cbp_head)) {
722 if ((flags & CL_PAGEOUT) && (error == ENXIO))
723 retval = 0; /* drop the error */
724 else
725 retval = error;
726 error = 0;
727 }
728 }
729 cbp_head = (struct buf *)0;
730 cbp_tail = (struct buf *)0;
731
732 buf_count = 0;
733 }
734 }
735 if (error) {
736 int abort_size;
737
738 io_size = 0;
739
740 for (cbp = cbp_head; cbp;) {
741 struct buf * cbp_next;
742
743 if (cbp->b_vectorcount > 1)
744 _FREE(cbp->b_vectorlist, M_SEGMENT);
745 upl_offset -= cbp->b_bcount;
746 size += cbp->b_bcount;
747 io_size += cbp->b_bcount;
748
749 cbp_next = cbp->b_trans_next;
750 free_io_buf(cbp);
751 cbp = cbp_next;
752 }
753 if (iostate) {
754 /*
755 * update the error condition for this stream
756 * since we never really issued the io
757 * just go ahead and adjust it back
758 */
759 if (iostate->io_error == 0)
760 iostate->io_error = error;
761 iostate->io_issued -= io_size;
762
763 if (iostate->io_wanted) {
764 /*
765 * someone is waiting for the state of
766 * this io stream to change
767 */
768 iostate->io_wanted = 0;
769 wakeup((caddr_t)&iostate->io_wanted);
770 }
771 }
772 pg_offset = upl_offset & PAGE_MASK;
773 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
774
775 if (flags & CL_COMMIT) {
776 int upl_abort_code;
777
778 if (flags & CL_PRESERVE)
779 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
780 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
781 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
782 else if (flags & CL_PAGEIN)
783 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
784 else
785 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
786
787 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
788 upl_abort_code);
789
790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
791 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
792 }
793 if (real_bp) {
794 real_bp->b_flags |= B_ERROR;
795 real_bp->b_error = error;
796
797 biodone(real_bp);
798 }
799 if (retval == 0)
800 retval = error;
801 }
802 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
803 (int)f_offset, size, upl_offset, retval, 0);
804
805 return (retval);
806 }
807
808
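/*
 * added note: cluster_rd_prefetch() clips the requested prefetch to
 * MAX_UPL_TRANSFER pages and to the end of the file, skips over pages that
 * are already resident (ubc_page_op) and issues an advisory_read() for the
 * remainder.  it returns the number of pages the request spanned so the
 * caller can advance the vnode's read-ahead state.
 */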
809 static int
810 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
811 struct vnode *vp;
812 off_t f_offset;
813 u_int size;
814 off_t filesize;
815 int devblocksize;
816 {
817 int pages_to_fetch;
818 int skipped_pages;
819
820 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
821 (int)f_offset, size, (int)filesize, 0, 0);
822
823 if (f_offset >= filesize) {
824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
825 (int)f_offset, 0, 0, 0, 0);
826 return(0);
827 }
828 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
829 size = MAX_UPL_TRANSFER * PAGE_SIZE;
830 else
831 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
832
833 if ((off_t)size > (filesize - f_offset))
834 size = filesize - f_offset;
835
836 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
837
838 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
839 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
840 break;
841 f_offset += PAGE_SIZE;
842 size -= PAGE_SIZE;
843 }
844 if (skipped_pages < pages_to_fetch)
845 advisory_read(vp, filesize, f_offset, size, devblocksize);
846
847 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
848 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
849
850 return (pages_to_fetch);
851 }
852
853
854
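/*
 * added note: cluster_rd_ahead() is the sequential read-ahead heuristic.
 * v_lastr remembers the last logical block read, v_ralen is the current
 * read-ahead window (doubled on each sequential hit, capped at
 * MAX_UPL_TRANSFER pages) and v_maxra is the furthest block already
 * prefetched.  non-sequential access resets the window; otherwise the next
 * window is prefetched via cluster_rd_prefetch() starting just past
 * max(e_lblkno, v_maxra).
 */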
855 static void
856 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
857 struct vnode *vp;
858 daddr_t b_lblkno;
859 daddr_t e_lblkno;
860 off_t filesize;
861 int devblocksize;
862 {
863 daddr_t r_lblkno;
864 off_t f_offset;
865 int size_of_prefetch;
866 int max_pages;
867
868 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
869 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
870
871 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
873 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
874 return;
875 }
876
877 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
878 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
879 vp->v_ralen = 0;
880 vp->v_maxra = 0;
881
882 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
883 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
884
885 return;
886 }
887 max_pages = MAX_UPL_TRANSFER;
888
889 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
890
891 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
892 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
893
894 if (e_lblkno < vp->v_maxra) {
895 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
896
897 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
898 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
899 return;
900 }
901 }
902 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
903 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
904
905 if (f_offset < filesize) {
906 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
907
908 if (size_of_prefetch)
909 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
910 }
911 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
912 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
913 }
914
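/*
 * added note: cluster_pageout() is the VM pageout entry point.  it validates
 * the request (positive size, page aligned, within the file, not a read-only
 * mount), aborts the portion of the upl that lies beyond the EOF, throttles
 * the caller while ASYNC_THROTTLE or more writes are already in flight on
 * the vnode, and then passes the clipped range to cluster_io() with
 * CL_PAGEOUT set.
 */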
915 int
916 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
917 struct vnode *vp;
918 upl_t upl;
919 vm_offset_t upl_offset;
920 off_t f_offset;
921 int size;
922 off_t filesize;
923 int devblocksize;
924 int flags;
925 {
926 int io_size;
927 int pg_size;
928 off_t max_size;
929 int local_flags = CL_PAGEOUT;
930
931 if ((flags & UPL_IOSYNC) == 0)
932 local_flags |= CL_ASYNC;
933 if ((flags & UPL_NOCOMMIT) == 0)
934 local_flags |= CL_COMMIT;
935
936
937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
938 (int)f_offset, size, (int)filesize, local_flags, 0);
939
940 /*
941 * If they didn't specify any I/O, then we are done...
942 * we can't issue an abort because we don't know how
943 * big the upl really is
944 */
945 if (size <= 0)
946 return (EINVAL);
947
948 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
949 if (local_flags & CL_COMMIT)
950 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
951 return (EROFS);
952 }
953 /*
 954   * can't page-out from a negative offset
955 * or if we're starting beyond the EOF
956 * or if the file offset isn't page aligned
957 * or the size requested isn't a multiple of PAGE_SIZE
958 */
959 if (f_offset < 0 || f_offset >= filesize ||
960 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
961 if (local_flags & CL_COMMIT)
962 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
963 return (EINVAL);
964 }
965 max_size = filesize - f_offset;
966
967 if (size < max_size)
968 io_size = size;
969 else
970 io_size = max_size;
971
972 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
973
974 if (size > pg_size) {
975 if (local_flags & CL_COMMIT)
976 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
977 UPL_ABORT_FREE_ON_EMPTY);
978 }
979 while (vp->v_numoutput >= ASYNC_THROTTLE) {
980 vp->v_flag |= VTHROTTLED;
981 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
982 }
983
984 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
985 local_flags, (struct buf *)0, (struct clios *)0));
986 }
987
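/*
 * added note: cluster_pagein() is the page-in counterpart of
 * cluster_pageout().  after validating and clipping the request it calls
 * cluster_io() with CL_READ | CL_PAGEIN, and on success feeds the block range
 * just read into cluster_rd_ahead() (unless read-ahead is disabled) so that
 * sequential faults keep the read-ahead window growing.
 */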
988 int
989 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
990 struct vnode *vp;
991 upl_t upl;
992 vm_offset_t upl_offset;
993 off_t f_offset;
994 int size;
995 off_t filesize;
996 int devblocksize;
997 int flags;
998 {
999 u_int io_size;
1000 int rounded_size;
1001 off_t max_size;
1002 int retval;
1003 int local_flags = 0;
1004
1005 if (upl == NULL || size < 0)
1006 panic("cluster_pagein: NULL upl passed in");
1007
1008 if ((flags & UPL_IOSYNC) == 0)
1009 local_flags |= CL_ASYNC;
1010 if ((flags & UPL_NOCOMMIT) == 0)
1011 local_flags |= CL_COMMIT;
1012
1013
1014 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1015 (int)f_offset, size, (int)filesize, local_flags, 0);
1016
1017 /*
1018 * can't page-in from a negative offset
1019 * or if we're starting beyond the EOF
1020 * or if the file offset isn't page aligned
1021 * or the size requested isn't a multiple of PAGE_SIZE
1022 */
1023 if (f_offset < 0 || f_offset >= filesize ||
1024 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1025 if (local_flags & CL_COMMIT)
1026 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1027 return (EINVAL);
1028 }
1029 max_size = filesize - f_offset;
1030
1031 if (size < max_size)
1032 io_size = size;
1033 else
1034 io_size = max_size;
1035
1036 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1037
1038 if (size > rounded_size && (local_flags & CL_COMMIT))
1039 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1040 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1041
1042 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1043 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1044
1045 if (retval == 0) {
1046 int b_lblkno;
1047 int e_lblkno;
1048
1049 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1050 e_lblkno = (int)
1051 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1052
1053 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1054 /*
1055                  * we haven't read in the last page of the file yet
1056 * so let's try to read ahead if we're in
1057 * a sequential access pattern
1058 */
1059 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1060 }
1061 vp->v_lastr = e_lblkno;
1062 }
1063 return (retval);
1064 }
1065
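/*
 * added note: cluster_bp() lets a filesystem push an already-built buffer
 * (with an attached upl) through cluster_io(); the logical block number is
 * converted to a file offset with ubc_blktooff() and the transfer is always
 * issued asynchronously.
 */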
1066 int
1067 cluster_bp(bp)
1068 struct buf *bp;
1069 {
1070 off_t f_offset;
1071 int flags;
1072
1073 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1074 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1075
1076 if (bp->b_pagelist == (upl_t) 0)
1077 panic("cluster_bp: can't handle NULL upl yet\n");
1078 if (bp->b_flags & B_READ)
1079 flags = CL_ASYNC | CL_READ;
1080 else
1081 flags = CL_ASYNC;
1082
1083 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1084
1085 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1086 }
1087
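/*
 * added note: cluster_write() only dispatches.  vnodes without VNOCACHE_DATA
 * (or non user-space uios) go straight to the buffered path, cluster_write_x().
 * otherwise each iovec is probed with vm_map_get_upl(): physically contiguous
 * targets use cluster_phys_write(), small or unaligned transfers fall back to
 * cluster_write_x(), and large page-aligned transfers take the nocopy path,
 * cluster_nocopy_write().
 */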
1088 int
1089 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1090 struct vnode *vp;
1091 struct uio *uio;
1092 off_t oldEOF;
1093 off_t newEOF;
1094 off_t headOff;
1095 off_t tailOff;
1096 int devblocksize;
1097 int flags;
1098 {
1099 int prev_resid;
1100 int clip_size;
1101 off_t max_io_size;
1102 struct iovec *iov;
1103 vm_offset_t upl_offset;
1104 int upl_size;
1105 int pages_in_pl;
1106 upl_page_info_t *pl;
1107 int upl_flags;
1108 upl_t upl;
1109 int retval = 0;
1110
1111
1112 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1113 {
1114 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1115 return(retval);
1116 }
1117
1118 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1119 {
1120 /* we know we have a resid, so this is safe */
1121 iov = uio->uio_iov;
1122 while (iov->iov_len == 0) {
1123 uio->uio_iov++;
1124 uio->uio_iovcnt--;
1125 iov = uio->uio_iov;
1126 }
1127
1128 /*
1129 * We check every vector target and if it is physically
1130 * contiguous space, we skip the sanity checks.
1131 */
1132
1133 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1134 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1135 pages_in_pl = 0;
1136 upl_flags = UPL_QUERY_OBJECT_TYPE;
1137 if ((vm_map_get_upl(current_map(),
1138 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1139 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1140 {
1141 /*
1142 * the user app must have passed in an invalid address
1143 */
1144 return (EFAULT);
1145 }
1146
1147 if (upl_flags & UPL_PHYS_CONTIG)
1148 {
1149 if (flags & IO_HEADZEROFILL)
1150 {
1151 flags &= ~IO_HEADZEROFILL;
1152
1153 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1154 return(retval);
1155 }
1156
1157 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1158
1159 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1160 {
1161 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1162 return(retval);
1163 }
1164 }
1165 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1166 {
1167 /*
1168            * We set a threshold of 4 pages to decide if the nocopy
1169 * write loop is worth the trouble...
1170 * we also come here if we're trying to zero the head and/or tail
1171 * of a partially written page, and the user source is not a physically contiguous region
1172 */
1173 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1174 return(retval);
1175 }
1176 else if (uio->uio_offset & PAGE_MASK_64)
1177 {
1178            /* Bring the write's file offset up to a pagesize boundary */
1179 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1180 if (uio->uio_resid < clip_size)
1181 clip_size = uio->uio_resid;
1182 /*
1183 * Fake the resid going into the cluster_write_x call
1184 * and restore it on the way out.
1185 */
1186 prev_resid = uio->uio_resid;
1187 uio->uio_resid = clip_size;
1188 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1189 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1190 }
1191 else if ((int)iov->iov_base & PAGE_MASK_64)
1192 {
1193 clip_size = iov->iov_len;
1194 prev_resid = uio->uio_resid;
1195 uio->uio_resid = clip_size;
1196 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1197 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1198 }
1199 else
1200 {
1201 /*
1202 * If we come in here, we know the offset into
1203 * the file is on a pagesize boundary
1204 */
1205
1206 max_io_size = newEOF - uio->uio_offset;
1207 clip_size = uio->uio_resid;
1208 if (iov->iov_len < clip_size)
1209 clip_size = iov->iov_len;
1210 if (max_io_size < clip_size)
1211 clip_size = max_io_size;
1212
1213 if (clip_size < PAGE_SIZE)
1214 {
1215 /*
1216 * Take care of tail end of write in this vector
1217 */
1218 prev_resid = uio->uio_resid;
1219 uio->uio_resid = clip_size;
1220 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1221 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1222 }
1223 else
1224 {
1225 /* round clip_size down to a multiple of pagesize */
1226 clip_size = clip_size & ~(PAGE_MASK);
1227 prev_resid = uio->uio_resid;
1228 uio->uio_resid = clip_size;
1229 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1230 if ((retval == 0) && uio->uio_resid)
1231 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1232 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1233 }
1234 } /* end else */
1235 } /* end while */
1236 return(retval);
1237 }
1238
1239
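/*
 * added note: cluster_nocopy_write() wires the user's buffer with
 * vm_map_get_upl() and writes directly from it, bypassing the buffer cache.
 * cached pages covering the same file range are dumped first (UPL_POP_DUMP),
 * the writes are issued asynchronously through cluster_io() with a clios to
 * track them, and the routine sleeps whenever more than
 * 2 * MAX_UPL_TRANSFER * PAGE_SIZE bytes are outstanding, as well as before
 * returning.
 */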
1240 static int
1241 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1242 struct vnode *vp;
1243 struct uio *uio;
1244 off_t newEOF;
1245 int devblocksize;
1246 int flags;
1247 {
1248 upl_t upl;
1249 upl_page_info_t *pl;
1250 off_t upl_f_offset;
1251 vm_offset_t upl_offset;
1252 off_t max_io_size;
1253 int io_size;
1254 int io_flag;
1255 int upl_size;
1256 int upl_needed_size;
1257 int pages_in_pl;
1258 int upl_flags;
1259 kern_return_t kret;
1260 struct iovec *iov;
1261 int i;
1262 int first = 1;
1263 int force_data_sync;
1264 int error = 0;
1265 struct clios iostate;
1266
1267 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1268 (int)uio->uio_offset, (int)uio->uio_resid,
1269 (int)newEOF, devblocksize, 0);
1270
1271 /*
1272 * When we enter this routine, we know
1273 * -- the offset into the file is on a pagesize boundary
1274 * -- the resid is a page multiple
1275 * -- the resid will not exceed iov_len
1276 */
1277 cluster_try_push(vp, newEOF, 0, 1);
1278
1279 iostate.io_completed = 0;
1280 iostate.io_issued = 0;
1281 iostate.io_error = 0;
1282 iostate.io_wanted = 0;
1283
1284 iov = uio->uio_iov;
1285
1286 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1287 io_size = uio->uio_resid;
1288
1289 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1290 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1291
1292 if (first) {
1293 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1294 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1295 first = 0;
1296 }
1297 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1298 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1299
1300 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1301 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1302
1303 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1304 pages_in_pl = 0;
1305 upl_size = upl_needed_size;
1306 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1307 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1308
1309 kret = vm_map_get_upl(current_map(),
1310 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1311 &upl_size,
1312 &upl,
1313 NULL,
1314 &pages_in_pl,
1315 &upl_flags,
1316 force_data_sync);
1317
1318 if (kret != KERN_SUCCESS) {
1319 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1320 0, 0, 0, kret, 0);
1321
1322 /*
1323 * cluster_nocopy_write: failed to get pagelist
1324 *
1325 * we may have already spun some portion of this request
1326 * off as async requests... we need to wait for the I/O
1327 * to complete before returning
1328 */
1329 goto wait_for_writes;
1330 }
1331 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1332 pages_in_pl = upl_size / PAGE_SIZE;
1333
1334 for (i = 0; i < pages_in_pl; i++) {
1335 if (!upl_valid_page(pl, i))
1336 break;
1337 }
1338 if (i == pages_in_pl)
1339 break;
1340
1341 /*
1342 * didn't get all the pages back that we
1343 * needed... release this upl and try again
1344 */
1345 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1346 UPL_ABORT_FREE_ON_EMPTY);
1347 }
1348 if (force_data_sync >= 3) {
1349 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1350 i, pages_in_pl, upl_size, kret, 0);
1351
1352 /*
1353 * for some reason, we couldn't acquire a hold on all
1354 * the pages needed in the user's address space
1355 *
1356 * we may have already spun some portion of this request
1357 * off as async requests... we need to wait for the I/O
1358 * to complete before returning
1359 */
1360 goto wait_for_writes;
1361 }
1362
1363 /*
1364 * Consider the possibility that upl_size wasn't satisfied.
1365 */
1366 if (upl_size != upl_needed_size)
1367 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1368
1369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1370 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1371
1372 if (io_size == 0) {
1373 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1374 UPL_ABORT_FREE_ON_EMPTY);
1375
1376 /*
1377 * we may have already spun some portion of this request
1378 * off as async requests... we need to wait for the I/O
1379 * to complete before returning
1380 */
1381 goto wait_for_writes;
1382 }
1383 /*
1384 * Now look for pages already in the cache
1385 * and throw them away.
1386 */
1387
1388 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1389 max_io_size = io_size;
1390
1391 while (max_io_size) {
1392 /*
1393 * Flag UPL_POP_DUMP says if the page is found
1394 * in the page cache it must be thrown away.
1395 */
1396 ubc_page_op(vp,
1397 upl_f_offset,
1398 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1399 0, 0);
1400 max_io_size -= PAGE_SIZE_64;
1401 upl_f_offset += PAGE_SIZE_64;
1402 }
1403 /*
1404            * we want to push out these writes asynchronously so that we can overlap
1405 * the preparation of the next I/O
1406 * if there are already too many outstanding writes
1407 * wait until some complete before issuing the next
1408 */
1409 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1410 iostate.io_wanted = 1;
1411 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1412 }
1413 if (iostate.io_error) {
1414 /*
1415 * one of the earlier writes we issued ran into a hard error
1416 * don't issue any more writes, cleanup the UPL
1417 * that was just created but not used, then
1418 * go wait for all writes that are part of this stream
1419 * to complete before returning the error to the caller
1420 */
1421 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1422 UPL_ABORT_FREE_ON_EMPTY);
1423
1424 goto wait_for_writes;
1425 }
1426 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1427
1428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1429 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1430
1431 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1432 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1433
1434 iov->iov_len -= io_size;
1435 iov->iov_base += io_size;
1436 uio->uio_resid -= io_size;
1437 uio->uio_offset += io_size;
1438
1439 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1440 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1441
1442 } /* end while */
1443
1444 wait_for_writes:
1445 /*
1446 * make sure all async writes issued as part of this stream
1447 * have completed before we return
1448 */
1449 while (iostate.io_issued != iostate.io_completed) {
1450 iostate.io_wanted = 1;
1451 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1452 }
1453 if (iostate.io_error)
1454 error = iostate.io_error;
1455
1456 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1457 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1458
1459 return (error);
1460 }
1461
1462
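/*
 * added note: cluster_phys_write() handles a single iovec whose source is
 * physically contiguous memory.  any leading or trailing fragment that isn't
 * devblocksize aligned goes through cluster_align_phys_io(); the aligned
 * middle is issued as one synchronous CL_DEV_MEMORY transfer, and the upl is
 * released without committing since no page state needs to change.
 */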
1463 static int
1464 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1465 struct vnode *vp;
1466 struct uio *uio;
1467 off_t newEOF;
1468 int devblocksize;
1469 int flags;
1470 {
1471 upl_page_info_t *pl;
1472 vm_offset_t src_paddr;
1473 upl_t upl;
1474 vm_offset_t upl_offset;
1475 int tail_size;
1476 int io_size;
1477 int upl_size;
1478 int upl_needed_size;
1479 int pages_in_pl;
1480 int upl_flags;
1481 kern_return_t kret;
1482 struct iovec *iov;
1483 int error = 0;
1484
1485 /*
1486 * When we enter this routine, we know
1487 * -- the resid will not exceed iov_len
1488    *  -- the vector target address is physically contiguous
1489 */
1490 cluster_try_push(vp, newEOF, 0, 1);
1491
1492 iov = uio->uio_iov;
1493 io_size = iov->iov_len;
1494 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1495 upl_needed_size = upl_offset + io_size;
1496
1497 pages_in_pl = 0;
1498 upl_size = upl_needed_size;
1499 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1500 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1501
1502 kret = vm_map_get_upl(current_map(),
1503 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1504 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1505
1506 if (kret != KERN_SUCCESS) {
1507 /*
1508 * cluster_phys_write: failed to get pagelist
1509 * note: return kret here
1510 */
1511 return(EINVAL);
1512 }
1513 /*
1514 * Consider the possibility that upl_size wasn't satisfied.
1515 * This is a failure in the physical memory case.
1516 */
1517 if (upl_size < upl_needed_size) {
1518 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1519 return(EINVAL);
1520 }
1521 pl = ubc_upl_pageinfo(upl);
1522
1523 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1524
1525 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1526 int head_size;
1527
1528 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1529
1530 if (head_size > io_size)
1531 head_size = io_size;
1532
1533 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1534
1535 if (error) {
1536 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1537
1538 return(EINVAL);
1539 }
1540 upl_offset += head_size;
1541 src_paddr += head_size;
1542 io_size -= head_size;
1543 }
1544 tail_size = io_size & (devblocksize - 1);
1545 io_size -= tail_size;
1546
1547 if (io_size) {
1548 /*
1549 * issue a synchronous write to cluster_io
1550 */
1551 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1552 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1553 }
1554 if (error == 0) {
1555 /*
1556 * The cluster_io write completed successfully,
1557 * update the uio structure
1558 */
1559 uio->uio_resid -= io_size;
1560 iov->iov_len -= io_size;
1561 iov->iov_base += io_size;
1562 uio->uio_offset += io_size;
1563 src_paddr += io_size;
1564
1565 if (tail_size)
1566 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1567 }
1568 /*
1569 * just release our hold on the physically contiguous
1570 * region without changing any state
1571 */
1572 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1573
1574 return (error);
1575 }
1576
1577
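/*
 * added note: cluster_write_x() is the buffered write path.  each pass maps a
 * upl covering up to MAX_UPL_TRANSFER pages, pre-reads edge pages that are
 * only partially overwritten and not already valid, copies or zero-fills the
 * data (honoring IO_HEADZEROFILL/IO_TAILZEROFILL), and then either pushes the
 * pages immediately (IO_SYNC) or records the dirty page run in the vnode's
 * cluster list for a later cluster_push.
 */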
1578 static int
1579 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1580 struct vnode *vp;
1581 struct uio *uio;
1582 off_t oldEOF;
1583 off_t newEOF;
1584 off_t headOff;
1585 off_t tailOff;
1586 int devblocksize;
1587 int flags;
1588 {
1589 upl_page_info_t *pl;
1590 upl_t upl;
1591 vm_offset_t upl_offset;
1592 int upl_size;
1593 off_t upl_f_offset;
1594 int pages_in_upl;
1595 int start_offset;
1596 int xfer_resid;
1597 int io_size;
1598 int io_flags;
1599 vm_offset_t io_address;
1600 int io_offset;
1601 int bytes_to_zero;
1602 int bytes_to_move;
1603 kern_return_t kret;
1604 int retval = 0;
1605 int uio_resid;
1606 long long total_size;
1607 long long zero_cnt;
1608 off_t zero_off;
1609 long long zero_cnt1;
1610 off_t zero_off1;
1611 daddr_t start_blkno;
1612 daddr_t last_blkno;
1613
1614 if (uio) {
1615 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1616 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1617
1618 uio_resid = uio->uio_resid;
1619 } else {
1620 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1621 0, 0, (int)oldEOF, (int)newEOF, 0);
1622
1623 uio_resid = 0;
1624 }
1625 zero_cnt = 0;
1626 zero_cnt1 = 0;
1627
1628 if (flags & IO_HEADZEROFILL) {
1629 /*
1630 * some filesystems (HFS is one) don't support unallocated holes within a file...
1631 * so we zero fill the intervening space between the old EOF and the offset
1632 * where the next chunk of real data begins.... ftruncate will also use this
1633 * routine to zero fill to the new EOF when growing a file... in this case, the
1634 * uio structure will not be provided
1635 */
1636 if (uio) {
1637 if (headOff < uio->uio_offset) {
1638 zero_cnt = uio->uio_offset - headOff;
1639 zero_off = headOff;
1640 }
1641 } else if (headOff < newEOF) {
1642 zero_cnt = newEOF - headOff;
1643 zero_off = headOff;
1644 }
1645 }
1646 if (flags & IO_TAILZEROFILL) {
1647 if (uio) {
1648 zero_off1 = uio->uio_offset + uio->uio_resid;
1649
1650 if (zero_off1 < tailOff)
1651 zero_cnt1 = tailOff - zero_off1;
1652 }
1653 }
1654 if (zero_cnt == 0 && uio == (struct uio *) 0)
1655 {
1656 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1657 retval, 0, 0, 0, 0);
1658 return (0);
1659 }
1660
1661 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1662 /*
1663 * for this iteration of the loop, figure out where our starting point is
1664 */
1665 if (zero_cnt) {
1666 start_offset = (int)(zero_off & PAGE_MASK_64);
1667 upl_f_offset = zero_off - start_offset;
1668 } else if (uio_resid) {
1669 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1670 upl_f_offset = uio->uio_offset - start_offset;
1671 } else {
1672 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1673 upl_f_offset = zero_off1 - start_offset;
1674 }
1675 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1676 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1677
1678 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1679 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1680
1681 /*
1682 * compute the size of the upl needed to encompass
1683 * the requested write... limit each call to cluster_io
1684 * to the maximum UPL size... cluster_io will clip if
1685            * this exceeds the maximum io_size for the device...
1686            * make sure to account for
1687 * a starting offset that's not page aligned
1688 */
1689 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1690
1691 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1692 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1693
1694 pages_in_upl = upl_size / PAGE_SIZE;
1695 io_size = upl_size - start_offset;
1696
1697 if ((long long)io_size > total_size)
1698 io_size = total_size;
1699
1700 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1701 last_blkno = start_blkno + pages_in_upl;
1702
1703 kret = ubc_create_upl(vp,
1704 upl_f_offset,
1705 upl_size,
1706 &upl,
1707 &pl,
1708 UPL_FLAGS_NONE);
1709 if (kret != KERN_SUCCESS)
1710 panic("cluster_write: failed to get pagelist");
1711
1712 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1713 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1714
1715 if (start_offset && !upl_valid_page(pl, 0)) {
1716 int read_size;
1717
1718 /*
1719 * we're starting in the middle of the first page of the upl
1720 * and the page isn't currently valid, so we're going to have
1721 * to read it in first... this is a synchronous operation
1722 */
1723 read_size = PAGE_SIZE;
1724
1725 if ((upl_f_offset + read_size) > newEOF)
1726 read_size = newEOF - upl_f_offset;
1727
1728 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1729 CL_READ, (struct buf *)0, (struct clios *)0);
1730 if (retval) {
1731 /*
1732 * we had an error during the read which causes us to abort
1733 * the current cluster_write request... before we do, we need
1734 * to release the rest of the pages in the upl without modifying
1735                          * their state and mark the failed page in error
1736 */
1737 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1738 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1739
1740 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1741 (int)upl, 0, 0, retval, 0);
1742 break;
1743 }
1744 }
1745 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1746 /*
1747 * the last offset we're writing to in this upl does not end on a page
1748 * boundary... if it's not beyond the old EOF, then we'll also need to
1749 * pre-read this page in if it isn't already valid
1750 */
1751 upl_offset = upl_size - PAGE_SIZE;
1752
1753 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1754 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1755 int read_size;
1756
1757 read_size = PAGE_SIZE;
1758
1759 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1760 read_size = newEOF - (upl_f_offset + upl_offset);
1761
1762 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1763 CL_READ, (struct buf *)0, (struct clios *)0);
1764 if (retval) {
1765 /*
1766 * we had an error during the read which causes us to abort
1767 * the current cluster_write request... before we do, we
1768 * need to release the rest of the pages in the upl without
1769                                  * modifying their state and mark the failed page in error
1770 */
1771 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1772 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1773
1774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1775 (int)upl, 0, 0, retval, 0);
1776 break;
1777 }
1778 }
1779 }
1780 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1781 panic("cluster_write: ubc_upl_map failed\n");
1782 xfer_resid = io_size;
1783 io_offset = start_offset;
1784
1785 while (zero_cnt && xfer_resid) {
1786
1787 if (zero_cnt < (long long)xfer_resid)
1788 bytes_to_zero = zero_cnt;
1789 else
1790 bytes_to_zero = xfer_resid;
1791
1792 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1793 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1794
1795 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1796 (int)upl_f_offset + io_offset, bytes_to_zero,
1797 (int)io_offset, xfer_resid, 0);
1798 } else {
1799 int zero_pg_index;
1800
1801 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1802 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1803
1804 if ( !upl_valid_page(pl, zero_pg_index)) {
1805 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1806
1807 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1808 (int)upl_f_offset + io_offset, bytes_to_zero,
1809 (int)io_offset, xfer_resid, 0);
1810
1811 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1812 !upl_dirty_page(pl, zero_pg_index)) {
1813 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1814
1815 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1816 (int)upl_f_offset + io_offset, bytes_to_zero,
1817 (int)io_offset, xfer_resid, 0);
1818 }
1819 }
1820 xfer_resid -= bytes_to_zero;
1821 zero_cnt -= bytes_to_zero;
1822 zero_off += bytes_to_zero;
1823 io_offset += bytes_to_zero;
1824 }
1825 if (xfer_resid && uio_resid) {
1826 bytes_to_move = min(uio_resid, xfer_resid);
1827
1828 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1829 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1830
1831 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1832
1833
1834 if (retval) {
1835 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1836 panic("cluster_write: kernel_upl_unmap failed\n");
1837
1838 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1839
1840 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1841 (int)upl, 0, 0, retval, 0);
1842 } else {
1843 uio_resid -= bytes_to_move;
1844 xfer_resid -= bytes_to_move;
1845 io_offset += bytes_to_move;
1846 }
1847 }
1848 while (xfer_resid && zero_cnt1 && retval == 0) {
1849
1850 if (zero_cnt1 < (long long)xfer_resid)
1851 bytes_to_zero = zero_cnt1;
1852 else
1853 bytes_to_zero = xfer_resid;
1854
1855 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1856 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1857
1858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1859 (int)upl_f_offset + io_offset,
1860 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1861 } else {
1862 int zero_pg_index;
1863
1864 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1865 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1866
1867 if ( !upl_valid_page(pl, zero_pg_index)) {
1868 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1869
1870 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1871 (int)upl_f_offset + io_offset,
1872 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1873
1874 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1875 !upl_dirty_page(pl, zero_pg_index)) {
1876 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1877
1878 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1879 (int)upl_f_offset + io_offset,
1880 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1881 }
1882 }
1883 xfer_resid -= bytes_to_zero;
1884 zero_cnt1 -= bytes_to_zero;
1885 zero_off1 += bytes_to_zero;
1886 io_offset += bytes_to_zero;
1887 }
1888
1889 if (retval == 0) {
1890 int cl_index;
1891 int can_delay;
1892
1893 io_size += start_offset;
1894
1895 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1896 /*
1897 * if we're extending the file with this write
1898 * we'll zero fill the rest of the page so that
1899 * if the file gets extended again in such a way as to leave a
1900 * hole starting at this EOF, we'll have zeros in the correct spot
1901 */
1902 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1903
1904 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1905 (int)upl_f_offset + io_size,
1906 upl_size - io_size, 0, 0, 0);
1907 }
1908 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1909 panic("cluster_write: ubc_upl_unmap failed\n");
1910
1911 if (flags & IO_SYNC)
1912 /*
1913 * if the IO_SYNC flag is set then we need to
1914 * bypass any clusters and immediately issue
1915 * the I/O
1916 */
1917 goto issue_io;
1918
1919 if (vp->v_clen == 0)
1920 /*
1921 * no clusters currently present
1922 */
1923 goto start_new_cluster;
1924
1925 /*
1926 * keep track of the overall dirty page
1927 * range we've developed
1928 * in case we have to fall back to the
1929 * VHASDIRTY method of flushing
1930 */
1931 if (vp->v_flag & VHASDIRTY)
1932 goto delay_io;
1933
1934 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1935 /*
1936 * we have an existing cluster... see if this write will extend it nicely
1937 */
1938 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1939 /*
1940 * the current write starts at or after the current cluster
1941 */
1942 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1943 /*
1944 * we have a write that fits entirely
1945 * within the existing cluster limits
1946 */
1947 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1948 /*
1949 * update our idea of where the cluster ends
1950 */
1951 vp->v_clusters[cl_index].last_pg = last_blkno;
1952 break;
1953 }
1954 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1955 /*
1956 * we have a write that starts in the middle of the current cluster
1957 * but extends beyond the cluster's limit
1958 * we'll clip the current cluster if we actually
1959 * overlap with the new write
1960 * and start a new cluster with the current write
1961 */
1962 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1963 vp->v_clusters[cl_index].last_pg = start_blkno;
1964 }
1965 /*
1966 * we also get here for the case where the current write starts
1967 * beyond the limit of the existing cluster
1968 *
1969 * in either case, we'll check the remaining clusters before
1970 * starting a new one
1971 */
1972 } else {
1973 /*
1974 * the current write starts in front of the current cluster
1975 */
1976 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1977 /*
1978 * we can just merge the old cluster
1979 * with the new request and leave it
1980 * in the cache
1981 */
1982 vp->v_clusters[cl_index].start_pg = start_blkno;
1983
1984 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1985 /*
1986 * the current write completely
1987 * envelops the existing cluster
1988 */
1989 vp->v_clusters[cl_index].last_pg = last_blkno;
1990 }
1991 break;
1992 }
1993
1994 /*
1995 * if we were to combine this write with the current cluster
1996 * we would exceed the cluster size limit.... so,
1997 * let's see if there's any overlap of the new I/O with
1998 * the existing cluster...
1999 *
2000 */
2001 if (last_blkno > vp->v_clusters[cl_index].start_pg)
2002 /*
2003 * the current write extends into the existing cluster
2004 * clip the current cluster by moving the start position
2005 * to where the current write ends
2006 */
2007 vp->v_clusters[cl_index].start_pg = last_blkno;
2008 /*
2009 * if we get here, there was no way to merge
2010 * the new I/O with this cluster and
2011 * keep it under our maximum cluster length...
2012 * we'll check the remaining clusters before starting a new one
2013 */
2014 }
2015 }
2016 if (cl_index < vp->v_clen)
2017 /*
2018 * we found an existing cluster that we
2019 * could merge this I/O into
2020 */
2021 goto delay_io;
2022
2023 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2024 /*
2025 * we didn't find an existing cluster to
2026 * merge into, but there's room to start
2027 * a new one
2028 */
2029 goto start_new_cluster;
2030
2031 /*
2032 * no existing cluster to merge with and no
2033 * room to start a new one... we'll try
2034 * pushing the existing ones... if none of
2035 * them are able to be pushed, we'll have
2036 * to fall back on the VHASDIRTY mechanism
2037 * cluster_try_push will set v_clen to the
2038 * number of remaining clusters if it is
2039 * unable to push all of them
2040 */
2041 if (vp->v_flag & VNOCACHE_DATA)
2042 can_delay = 0;
2043 else
2044 can_delay = 1;
2045
2046 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
2047 vp->v_flag |= VHASDIRTY;
2048 goto delay_io;
2049 }
2050 start_new_cluster:
2051 if (vp->v_clen == 0) {
2052 vp->v_ciosiz = devblocksize;
2053 vp->v_cstart = start_blkno;
2054 vp->v_lastw = last_blkno;
2055 }
2056 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2057 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2058 vp->v_clen++;
2059 delay_io:
2060 /*
2061 * make sure we keep v_cstart and v_lastw up to
2062 * date in case we have to fall back on the
2063 * VHASDIRTY mechanism (or we've already entered it)
2064 */
2065 if (start_blkno < vp->v_cstart)
2066 vp->v_cstart = start_blkno;
2067 if (last_blkno > vp->v_lastw)
2068 vp->v_lastw = last_blkno;
2069
2070 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2071 continue;
2072 issue_io:
2073 /*
2074 * in order to maintain some semblance of coherency with mapped writes
2075 * we need to write the cluster back out as a multiple of the PAGESIZE
2076 * unless the cluster encompasses the last page of the file... in this
2077 * case we'll round out to the nearest device block boundary
2078 */
2079 io_size = upl_size;
2080
2081 if ((upl_f_offset + io_size) > newEOF) {
2082 io_size = newEOF - upl_f_offset;
2083 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2084 }
2085
2086 if (flags & IO_SYNC)
2087 io_flags = CL_COMMIT | CL_AGE;
2088 else
2089 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2090
2091 if (vp->v_flag & VNOCACHE_DATA)
2092 io_flags |= CL_DUMP;
2093
2094 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2095 vp->v_flag |= VTHROTTLED;
2096 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2097 }
2098 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2099 io_flags, (struct buf *)0, (struct clios *)0);
2100 }
2101 }
2102 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2103 retval, 0, 0, 0, 0);
2104
2105 return (retval);
2106 }
2107
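/*
 * top level entry point for cluster based reads...
 * decides, one iovec at a time, which strategy to use:
 * the normal cached path (cluster_read_x) unless the vnode is marked
 * VNOCACHE_DATA and the request comes from user space,
 * cluster_phys_read if the target buffer is physically contiguous,
 * and cluster_nocopy_read for large page aligned uncached requests...
 * small or unaligned pieces are clipped and fed to cluster_read_x
 */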
2108 int
2109 cluster_read(vp, uio, filesize, devblocksize, flags)
2110 struct vnode *vp;
2111 struct uio *uio;
2112 off_t filesize;
2113 int devblocksize;
2114 int flags;
2115 {
2116 int prev_resid;
2117 int clip_size;
2118 off_t max_io_size;
2119 struct iovec *iov;
2120 vm_offset_t upl_offset;
2121 int upl_size;
2122 int pages_in_pl;
2123 upl_page_info_t *pl;
2124 int upl_flags;
2125 upl_t upl;
2126 int retval = 0;
2127
2128 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2129 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2130
2131 /*
2132 * the nocopy and contiguous paths are only considered when the vnode
2133 * has VNOCACHE_DATA set and the request comes from user space...
2134 * everything else takes the normal cached read path
2135 */
2135
2136 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2137 {
2138 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2139 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2140 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2141 return(retval);
2142 }
2143
2144 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2145 {
2146 /* we know we have a resid, so this is safe */
2147 iov = uio->uio_iov;
2148 while (iov->iov_len == 0) {
2149 uio->uio_iov++;
2150 uio->uio_iovcnt--;
2151 iov = uio->uio_iov;
2152 }
2153
2154 /*
2155 * We check every vector target and if it is physically
2156 * contiguous space, we skip the sanity checks.
2157 */
2158
2159 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2160 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2161 pages_in_pl = 0;
2162 upl_flags = UPL_QUERY_OBJECT_TYPE;
2163 if((vm_map_get_upl(current_map(),
2164 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2165 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2166 {
2167 /*
2168 * the user app must have passed in an invalid address
2169 */
2170 return (EFAULT);
2171 }
2172
2173 if (upl_flags & UPL_PHYS_CONTIG)
2174 {
2175 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2176 }
2177 else if (uio->uio_resid < 4 * PAGE_SIZE)
2178 {
2179 /*
2180 * We set a threshold of 4 pages to decide if the nocopy
2181 * read loop is worth the trouble...
2182 */
2183 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2184 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2185 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2186 return(retval);
2187 }
2188 else if (uio->uio_offset & PAGE_MASK_64)
2189 {
2190 /* Bring the file offset read up to a pagesize boundary */
2191 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2192 if (uio->uio_resid < clip_size)
2193 clip_size = uio->uio_resid;
2194 /*
2195 * Fake the resid going into the cluster_read_x call
2196 * and restore it on the way out.
2197 */
2198 prev_resid = uio->uio_resid;
2199 uio->uio_resid = clip_size;
2200 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2201 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2202 }
2203 else if ((int)iov->iov_base & PAGE_MASK_64)
2204 {
2205 clip_size = iov->iov_len;
2206 prev_resid = uio->uio_resid;
2207 uio->uio_resid = clip_size;
2208 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2209 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2210 }
2211 else
2212 {
2213 /*
2214 * If we come in here, we know the offset into
2215 * the file is on a pagesize boundary
2216 */
2217
2218 max_io_size = filesize - uio->uio_offset;
2219 clip_size = uio->uio_resid;
2220 if (iov->iov_len < clip_size)
2221 clip_size = iov->iov_len;
2222 if (max_io_size < clip_size)
2223 clip_size = (int)max_io_size;
2224
2225 if (clip_size < PAGE_SIZE)
2226 {
2227 /*
2228 * Take care of the tail end of the read in this vector.
2229 */
2230 prev_resid = uio->uio_resid;
2231 uio->uio_resid = clip_size;
2232 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2233 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2234 }
2235 else
2236 {
2237 /* round clip_size down to a multiple of pagesize */
2238 clip_size = clip_size & ~(PAGE_MASK);
2239 prev_resid = uio->uio_resid;
2240 uio->uio_resid = clip_size;
2241 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2242 if ((retval==0) && uio->uio_resid)
2243 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2244 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2245 }
2246 } /* end else */
2247 } /* end while */
2248
2249 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2250 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2251
2252 return(retval);
2253 }
2254
2255
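/*
 * normal buffered read path...
 * pages already resident in the cache are copied straight out via
 * ubc_page_op... otherwise a UPL is created over the request, any
 * invalid pages are filled with a single synchronous cluster_io,
 * the data is copied to the caller, and a prefetch or read-ahead is
 * issued if the access pattern looks sequential
 */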
2256 static int
2257 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2258 struct vnode *vp;
2259 struct uio *uio;
2260 off_t filesize;
2261 int devblocksize;
2262 int flags;
2263 {
2264 upl_page_info_t *pl;
2265 upl_t upl;
2266 vm_offset_t upl_offset;
2267 int upl_size;
2268 off_t upl_f_offset;
2269 int start_offset;
2270 int start_pg;
2271 int last_pg;
2272 int uio_last;
2273 int pages_in_upl;
2274 off_t max_size;
2275 int io_size;
2276 vm_offset_t io_address;
2277 kern_return_t kret;
2278 int segflg;
2279 int error = 0;
2280 int retval = 0;
2281 int b_lblkno;
2282 int e_lblkno;
2283
2284 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2285
2286 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2287 /*
2288 * compute the size of the upl needed to encompass
2289 * the requested read... limit each call to cluster_io
2290 * to the maximum UPL size... cluster_io will clip if
2291 * this exceeds the maximum io_size for the device...
2292 * make sure to account for a starting offset
2293 * that's not page aligned
2294 */
2295 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2296 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2297 max_size = filesize - uio->uio_offset;
2298
2299 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2300 io_size = uio->uio_resid;
2301 else
2302 io_size = max_size;
2303
2304 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2305 segflg = uio->uio_segflg;
2306
2307 uio->uio_segflg = UIO_PHYS_USERSPACE;
2308
2309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2310 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2311
2312 while (io_size && retval == 0) {
2313 int xsize;
2314 vm_offset_t paddr;
2315
2316 if (ubc_page_op(vp,
2317 upl_f_offset,
2318 UPL_POP_SET | UPL_POP_BUSY,
2319 &paddr, 0) != KERN_SUCCESS)
2320 break;
2321
2322 xsize = PAGE_SIZE - start_offset;
2323
2324 if (xsize > io_size)
2325 xsize = io_size;
2326
2327 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2328
2329 ubc_page_op(vp, upl_f_offset,
2330 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2331
2332 io_size -= xsize;
2333 start_offset = (int)
2334 (uio->uio_offset & PAGE_MASK_64);
2335 upl_f_offset = uio->uio_offset - start_offset;
2336 }
2337 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2338 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2339
2340 uio->uio_segflg = segflg;
2341
2342 if (retval)
2343 break;
2344
2345 if (io_size == 0) {
2346 /*
2347 * we're already finished with this read request
2348 * let's see if we should do a read-ahead
2349 */
2350 e_lblkno = (int)
2351 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2352
2353 if (!(vp->v_flag & VRAOFF))
2354 /*
2355 * let's try to read ahead if we're in
2356 * a sequential access pattern
2357 */
2358 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2359 vp->v_lastr = e_lblkno;
2360
2361 break;
2362 }
2363 max_size = filesize - uio->uio_offset;
2364 }
2365 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2366 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2367 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2368 pages_in_upl = upl_size / PAGE_SIZE;
2369
2370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2371 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2372
2373 kret = ubc_create_upl(vp,
2374 upl_f_offset,
2375 upl_size,
2376 &upl,
2377 &pl,
2378 UPL_FLAGS_NONE);
2379 if (kret != KERN_SUCCESS)
2380 panic("cluster_read: failed to get pagelist");
2381
2382 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2383 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2384
2385 /*
2386 * scan from the beginning of the upl looking for the first
2387 * non-valid page.... this will become the first page in
2388 * the request we're going to make to 'cluster_io'... if all
2389 * of the pages are valid, we won't call through to 'cluster_io'
2390 */
2391 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2392 if (!upl_valid_page(pl, start_pg))
2393 break;
2394 }
2395
2396 /*
2397 * scan from the starting invalid page looking for a valid
2398 * page before the end of the upl is reached, if we
2399 * find one, then it will be the last page of the request to
2400 * 'cluster_io'
2401 */
2402 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2403 if (upl_valid_page(pl, last_pg))
2404 break;
2405 }
2406
2407 if (start_pg < last_pg) {
2408 /*
2409 * we found a range of 'invalid' pages that must be filled
2410 * if the last page in this range is the last page of the file
2411 * we may have to clip the size of it to keep from reading past
2412 * the end of the last physical block associated with the file
2413 */
2414 upl_offset = start_pg * PAGE_SIZE;
2415 io_size = (last_pg - start_pg) * PAGE_SIZE;
2416
2417 if ((upl_f_offset + upl_offset + io_size) > filesize)
2418 io_size = filesize - (upl_f_offset + upl_offset);
2419
2420 /*
2421 * issue a synchronous read to cluster_io
2422 */
2423
2424 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2425 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2426 }
2427 if (error == 0) {
2428 /*
2429 * if the read completed successfully, or there was no I/O request
2430 * issued, then map the upl into kernel address space and
2431 * move the data into user land.... we'll first add on any 'valid'
2432 * pages that were present in the upl when we acquired it.
2433 */
2434 u_int val_size;
2435 u_int size_of_prefetch;
2436
2437 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2438 if (!upl_valid_page(pl, uio_last))
2439 break;
2440 }
2441 /*
2442 * compute size to transfer this round... if uio->uio_resid is
2443 * still non-zero after this uiomove, we'll loop around and
2444 * set up for another I/O.
2445 */
2446 val_size = (uio_last * PAGE_SIZE) - start_offset;
2447
2448 if (max_size < val_size)
2449 val_size = max_size;
2450
2451 if (uio->uio_resid < val_size)
2452 val_size = uio->uio_resid;
2453
2454 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2455
2456 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2457 /*
2458 * if there's still I/O left to do for this request, then issue a
2459 * pre-fetch I/O... the I/O wait time will overlap
2460 * with the copying of the data
2461 */
2462 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2463 } else {
2464 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2465 /*
2466 * let's try to read ahead if we're in
2467 * a sequential access pattern
2468 */
2469 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2470 vp->v_lastr = e_lblkno;
2471 }
2472 if (uio->uio_segflg == UIO_USERSPACE) {
2473 int offset;
2474
2475 segflg = uio->uio_segflg;
2476
2477 uio->uio_segflg = UIO_PHYS_USERSPACE;
2478
2479
2480 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2481 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2482
2483 offset = start_offset;
2484
2485 while (val_size && retval == 0) {
2486 int csize;
2487 int i;
2488 caddr_t paddr;
2489
2490 i = offset / PAGE_SIZE;
2491 csize = min(PAGE_SIZE - start_offset, val_size);
2492
2493 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2494
2495 retval = uiomove(paddr, csize, uio);
2496
2497 val_size -= csize;
2498 offset += csize;
2499 start_offset = offset & PAGE_MASK;
2500 }
2501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2502 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2503
2504 uio->uio_segflg = segflg;
2505 }
2506 else
2507 {
2508 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2509 panic("cluster_read: ubc_upl_map() failed\n");
2510
2511 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2512
2513 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2514 panic("cluster_read: ubc_upl_unmap() failed\n");
2515 }
2516 }
2517 if (start_pg < last_pg) {
2518 /*
2519 * compute the range of pages that we actually issued an I/O for
2520 * and either commit them as valid if the I/O succeeded
2521 * or abort them if the I/O failed
2522 */
2523 io_size = (last_pg - start_pg) * PAGE_SIZE;
2524
2525 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2526 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2527
2528 if (error || (vp->v_flag & VNOCACHE_DATA))
2529 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2530 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2531 else
2532 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2533 UPL_COMMIT_CLEAR_DIRTY
2534 | UPL_COMMIT_FREE_ON_EMPTY
2535 | UPL_COMMIT_INACTIVATE);
2536
2537 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2538 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2539 }
2540 if ((last_pg - start_pg) < pages_in_upl) {
2541 int cur_pg;
2542 int commit_flags;
2543
2544 /*
2545 * the set of pages that we issued an I/O for did not encompass
2546 * the entire upl... so just release these without modifying
2547 * their state
2548 */
2549 if (error)
2550 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2551 else {
2552 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2553 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2554
2555 if (start_pg) {
2556 /*
2557 * we found some already valid pages at the beginning of
2558 * the upl... commit these back to the inactive list with
2559 * reference cleared
2560 */
2561 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2562 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2563 | UPL_COMMIT_INACTIVATE;
2564
2565 if (upl_dirty_page(pl, cur_pg))
2566 commit_flags |= UPL_COMMIT_SET_DIRTY;
2567
2568 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2569 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2570 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2571 else
2572 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2573 PAGE_SIZE, commit_flags);
2574 }
2575 }
2576 if (last_pg < uio_last) {
2577 /*
2578 * we found some already valid pages immediately after the
2579 * pages we issued I/O for, commit these back to the
2580 * inactive list with reference cleared
2581 */
2582 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2583 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2584 | UPL_COMMIT_INACTIVATE;
2585
2586 if (upl_dirty_page(pl, cur_pg))
2587 commit_flags |= UPL_COMMIT_SET_DIRTY;
2588
2589 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2590 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2591 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2592 else
2593 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2594 PAGE_SIZE, commit_flags);
2595 }
2596 }
2597 if (uio_last < pages_in_upl) {
2598 /*
2599 * there were some invalid pages beyond the valid pages
2600 * that we didn't issue an I/O for, just release them
2601 * unchanged
2602 */
2603 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2604 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2605 }
2606
2607 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2608 (int)upl, -1, -1, 0, 0);
2609 }
2610 }
2611 if (retval == 0)
2612 retval = error;
2613 }
2614
2615 return (retval);
2616 }
2617
2618
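/*
 * uncached read path... data is transferred directly into the user's
 * buffer without a copy through the kernel... any pages already in
 * the cache are copied out first, then the user pages are wired via
 * vm_map_get_upl and reads are issued asynchronously against them,
 * with 'iostate' tracking completion of the whole stream
 */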
2619 static int
2620 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2621 struct vnode *vp;
2622 struct uio *uio;
2623 off_t filesize;
2624 int devblocksize;
2625 int flags;
2626 {
2627 upl_t upl;
2628 upl_page_info_t *pl;
2629 off_t upl_f_offset;
2630 vm_offset_t upl_offset;
2631 off_t start_upl_f_offset;
2632 off_t max_io_size;
2633 int io_size;
2634 int upl_size;
2635 int upl_needed_size;
2636 int pages_in_pl;
2637 vm_offset_t paddr;
2638 int upl_flags;
2639 kern_return_t kret;
2640 int segflg;
2641 struct iovec *iov;
2642 int i;
2643 int force_data_sync;
2644 int retval = 0;
2645 int first = 1;
2646 struct clios iostate;
2647
2648 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2649 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2650
2651 /*
2652 * When we enter this routine, we know
2653 * -- the offset into the file is on a pagesize boundary
2654 * -- the resid is a page multiple
2655 * -- the resid will not exceed iov_len
2656 */
2657
2658 iostate.io_completed = 0;
2659 iostate.io_issued = 0;
2660 iostate.io_error = 0;
2661 iostate.io_wanted = 0;
2662
2663 iov = uio->uio_iov;
2664
2665 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2666
2667 max_io_size = filesize - uio->uio_offset;
2668
2669 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2670 io_size = max_io_size;
2671 else
2672 io_size = uio->uio_resid;
2673
2674 /*
2675 * We don't come into this routine unless
2676 * UIO_USERSPACE is set.
2677 */
2678 segflg = uio->uio_segflg;
2679
2680 uio->uio_segflg = UIO_PHYS_USERSPACE;
2681
2682 /*
2683 * First look for pages already in the cache
2684 * and move them to user space.
2685 */
2686 while (io_size && (retval == 0)) {
2687 upl_f_offset = uio->uio_offset;
2688
2689 /*
2690 * If this call fails, it means the page is not
2691 * in the page cache.
2692 */
2693 if (ubc_page_op(vp, upl_f_offset,
2694 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2695 break;
2696
2697 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2698
2699 ubc_page_op(vp, upl_f_offset,
2700 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2701
2702 io_size -= PAGE_SIZE;
2703 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2704 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2705 }
2706 uio->uio_segflg = segflg;
2707
2708 if (retval) {
2709 /*
2710 * we may have already spun some portion of this request
2711 * off as async requests... we need to wait for the I/O
2712 * to complete before returning
2713 */
2714 goto wait_for_reads;
2715 }
2716 /*
2717 * If we are already finished with this read, then return
2718 */
2719 if (io_size == 0) {
2720 /*
2721 * we may have already spun some portion of this request
2722 * off as async requests... we need to wait for the I/O
2723 * to complete before returning
2724 */
2725 goto wait_for_reads;
2726 }
2727 max_io_size = io_size;
2728
2729 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2730 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
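/*
 * on the first pass, a large request is clamped to 1/8 of the
 * maximum UPL size (note the test is against 1/4)... presumably
 * this gets the first chunk of data back to the caller sooner,
 * with subsequent passes using full sized transfers
 */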
2731 if (first) {
2732 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2733 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2734 first = 0;
2735 }
2736 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2737 upl_f_offset = start_upl_f_offset;
2738 io_size = 0;
2739
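/*
 * scan forward from the current file offset, one page at a time,
 * extending the request for as long as the pages are absent from
 * the cache... stop at the first resident page so this pass only
 * reads data that isn't already cached
 */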
2740 while (io_size < max_io_size) {
2741 if (ubc_page_op(vp, upl_f_offset,
2742 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2743 ubc_page_op(vp, upl_f_offset,
2744 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2745 break;
2746 }
2747 /*
2748 * Build up the io request parameters.
2749 */
2750 io_size += PAGE_SIZE_64;
2751 upl_f_offset += PAGE_SIZE_64;
2752 }
2753 if (io_size == 0)
2754 /*
2755 * we may have already spun some portion of this request
2756 * off as async requests... we need to wait for the I/O
2757 * to complete before returning
2758 */
2759 goto wait_for_reads;
2760
2761 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2762 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2763
2764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2765 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2766
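/*
 * try up to 3 times to get a UPL in which every page of the user's
 * buffer is valid, escalating force_data_sync on each attempt...
 * if any page comes back invalid, the UPL is aborted and the map
 * is queried again
 */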
2767 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2768 pages_in_pl = 0;
2769 upl_size = upl_needed_size;
2770 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2771
2772 kret = vm_map_get_upl(current_map(),
2773 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2774 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2775
2776 if (kret != KERN_SUCCESS) {
2777 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2778 (int)upl_offset, upl_size, io_size, kret, 0);
2779
2780 /*
2781 * cluster_nocopy_read: failed to get pagelist
2782 *
2783 * we may have already spun some portion of this request
2784 * off as async requests... we need to wait for the I/O
2785 * to complete before returning
2786 */
2787 goto wait_for_reads;
2788 }
2789 pages_in_pl = upl_size / PAGE_SIZE;
2790 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2791
2792 for (i = 0; i < pages_in_pl; i++) {
2793 if (!upl_valid_page(pl, i))
2794 break;
2795 }
2796 if (i == pages_in_pl)
2797 break;
2798
2799 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2800 UPL_ABORT_FREE_ON_EMPTY);
2801 }
2802 if (force_data_sync >= 3) {
2803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2804 (int)upl_offset, upl_size, io_size, kret, 0);
2805
2806 goto wait_for_reads;
2807 }
2808 /*
2809 * Consider the possibility that upl_size wasn't satisfied.
2810 */
2811 if (upl_size != upl_needed_size)
2812 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2813
2814 if (io_size == 0) {
2815 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2816 UPL_ABORT_FREE_ON_EMPTY);
2817 goto wait_for_reads;
2818 }
2819 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2820 (int)upl_offset, upl_size, io_size, kret, 0);
2821
2822 /*
2823 * request asynchronously so that we can overlap
2824 * the preparation of the next I/O
2825 * if there are already too many outstanding reads
2826 * wait until some have completed before issuing the next read
2827 */
2828 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2829 iostate.io_wanted = 1;
2830 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2831 }
2832 if (iostate.io_error) {
2833 /*
2834 * one of the earlier reads we issued ran into a hard error
2835 * don't issue any more reads, cleanup the UPL
2836 * that was just created but not used, then
2837 * go wait for any other reads to complete before
2838 * returning the error to the caller
2839 */
2840 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2841 UPL_ABORT_FREE_ON_EMPTY);
2842
2843 goto wait_for_reads;
2844 }
2845 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2846 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2847
2848 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2849 io_size, devblocksize,
2850 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2851 (struct buf *)0, &iostate);
2852
2853 /*
2854 * update the uio structure
2855 */
2856 iov->iov_base += io_size;
2857 iov->iov_len -= io_size;
2858 uio->uio_resid -= io_size;
2859 uio->uio_offset += io_size;
2860
2861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2862 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2863
2864 } /* end while */
2865
2866 wait_for_reads:
2867 /*
2868 * make sure all async reads that are part of this stream
2869 * have completed before we return
2870 */
2871 while (iostate.io_issued != iostate.io_completed) {
2872 iostate.io_wanted = 1;
2873 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2874 }
2875 if (iostate.io_error)
2876 retval = iostate.io_error;
2877
2878 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2879 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2880
2881 return (retval);
2882 }
2883
2884
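/*
 * read into a physically contiguous user buffer...
 * any head or tail fragment that isn't a multiple of the device
 * block size is handled by cluster_align_phys_io, while the block
 * aligned middle is issued asynchronously in MAX_UPL_TRANSFER sized
 * chunks directly against the wired user pages... the UPL is simply
 * released at the end since no page state needs to change
 */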
2885 static int
2886 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2887 struct vnode *vp;
2888 struct uio *uio;
2889 off_t filesize;
2890 int devblocksize;
2891 int flags;
2892 {
2893 upl_page_info_t *pl;
2894 upl_t upl;
2895 vm_offset_t upl_offset;
2896 vm_offset_t dst_paddr;
2897 off_t max_size;
2898 int io_size;
2899 int tail_size;
2900 int upl_size;
2901 int upl_needed_size;
2902 int pages_in_pl;
2903 int upl_flags;
2904 kern_return_t kret;
2905 struct iovec *iov;
2906 struct clios iostate;
2907 int error;
2908
2909 /*
2910 * When we enter this routine, we know
2911 * -- the resid will not exceed iov_len
2912 * -- the target address is physically contiguous
2913 */
2914
2915 iov = uio->uio_iov;
2916
2917 max_size = filesize - uio->uio_offset;
2918
2919 if (max_size > (off_t)((unsigned int)iov->iov_len))
2920 io_size = iov->iov_len;
2921 else
2922 io_size = max_size;
2923
2924 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2925 upl_needed_size = upl_offset + io_size;
2926
2927 error = 0;
2928 pages_in_pl = 0;
2929 upl_size = upl_needed_size;
2930 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2931
2932 kret = vm_map_get_upl(current_map(),
2933 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2934 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2935
2936 if (kret != KERN_SUCCESS) {
2937 /*
2938 * cluster_phys_read: failed to get pagelist
2939 */
2940 return(EINVAL);
2941 }
2942 if (upl_size < upl_needed_size) {
2943 /*
2944 * The upl_size wasn't satisfied.
2945 */
2946 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2947
2948 return(EINVAL);
2949 }
2950 pl = ubc_upl_pageinfo(upl);
2951
2952 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2953
2954 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2955 int head_size;
2956
2957 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2958
2959 if (head_size > io_size)
2960 head_size = io_size;
2961
2962 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2963
2964 if (error) {
2965 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2966
2967 return(EINVAL);
2968 }
2969 upl_offset += head_size;
2970 dst_paddr += head_size;
2971 io_size -= head_size;
2972 }
2973 tail_size = io_size & (devblocksize - 1);
2974 io_size -= tail_size;
2975
2976 iostate.io_completed = 0;
2977 iostate.io_issued = 0;
2978 iostate.io_error = 0;
2979 iostate.io_wanted = 0;
2980
2981 while (io_size && error == 0) {
2982 int xsize;
2983
2984 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2985 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2986 else
2987 xsize = io_size;
2988 /*
2989 * request asynchronously so that we can overlap
2990 * the preparation of the next I/O... we'll do
2991 * the commit after all the I/O has completed
2992 * since it's all issued against the same UPL
2993 * if there are already too many outstanding reads
2994 * wait until some have completed before issuing the next
2995 */
2996 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2997 iostate.io_wanted = 1;
2998 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2999 }
3000
3001 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3002 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3003 (struct buf *)0, &iostate);
3004 /*
3005 * The cluster_io read was issued successfully,
3006 * update the uio structure
3007 */
3008 if (error == 0) {
3009 uio->uio_resid -= xsize;
3010 iov->iov_len -= xsize;
3011 iov->iov_base += xsize;
3012 uio->uio_offset += xsize;
3013 dst_paddr += xsize;
3014 upl_offset += xsize;
3015 io_size -= xsize;
3016 }
3017 }
3018 /*
3019 * make sure all async reads that are part of this stream
3020 * have completed before we proceed
3021 */
3022 while (iostate.io_issued != iostate.io_completed) {
3023 iostate.io_wanted = 1;
3024 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3025 }
3026 if (iostate.io_error) {
3027 error = iostate.io_error;
3028 }
3029 if (error == 0 && tail_size)
3030 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3031
3032 /*
3033 * just release our hold on the physically contiguous
3034 * region without changing any state
3035 */
3036 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3037
3038 return (error);
3039 }
3040
3041
3042 /*
3043 * generate advisory I/O's in the largest chunks possible
3044 * the completed pages will be released into the VM cache
3045 */
3046 int
3047 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3048 struct vnode *vp;
3049 off_t filesize;
3050 off_t f_offset;
3051 int resid;
3052 int devblocksize;
3053 {
3054 upl_page_info_t *pl;
3055 upl_t upl;
3056 vm_offset_t upl_offset;
3057 int upl_size;
3058 off_t upl_f_offset;
3059 int start_offset;
3060 int start_pg;
3061 int last_pg;
3062 int pages_in_upl;
3063 off_t max_size;
3064 int io_size;
3065 kern_return_t kret;
3066 int retval = 0;
3067 int issued_io;
3068
3069 if (!UBCINFOEXISTS(vp))
3070 return(EINVAL);
3071
3072 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3073 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3074
3075 while (resid && f_offset < filesize && retval == 0) {
3076 /*
3077 * compute the size of the upl needed to encompass
3078 * the requested read... limit each call to cluster_io
3079 * to the maximum UPL size... cluster_io will clip if
3080 * this exceeds the maximum io_size for the device...
3081 * make sure to account for a starting offset
3082 * that's not page aligned
3083 */
3084 start_offset = (int)(f_offset & PAGE_MASK_64);
3085 upl_f_offset = f_offset - (off_t)start_offset;
3086 max_size = filesize - f_offset;
3087
3088 if (resid < max_size)
3089 io_size = resid;
3090 else
3091 io_size = max_size;
3092
3093 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3094 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3095 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3096 pages_in_upl = upl_size / PAGE_SIZE;
3097
3098 kret = ubc_create_upl(vp,
3099 upl_f_offset,
3100 upl_size,
3101 &upl,
3102 &pl,
3103 UPL_RET_ONLY_ABSENT);
3104 if (kret != KERN_SUCCESS)
3105 return(retval);
3106 issued_io = 0;
3107
3108 /*
3109 * before we start marching forward, we must make sure we end on
3110 * a present page, otherwise we will be working with a freed
3111 * upl
3112 */
3113 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3114 if (upl_page_present(pl, last_pg))
3115 break;
3116 }
3117 pages_in_upl = last_pg + 1;
3118
3119
3120 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3121 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3122
3123
3124 for (last_pg = 0; last_pg < pages_in_upl; ) {
3125 /*
3126 * scan from the beginning of the upl looking for the first
3127 * page that is present.... this will become the first page in
3128 * the request we're going to make to 'cluster_io'... if all
3129 * of the pages are absent, we won't call through to 'cluster_io'
3130 */
3131 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3132 if (upl_page_present(pl, start_pg))
3133 break;
3134 }
3135
3136 /*
3137 * scan from the starting present page looking for an absent
3138 * page before the end of the upl is reached, if we
3139 * find one, then it will terminate the range of pages being
3140 * presented to 'cluster_io'
3141 */
3142 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3143 if (!upl_page_present(pl, last_pg))
3144 break;
3145 }
3146
3147 if (last_pg > start_pg) {
3148 /*
3149 * we found a range of pages that must be filled
3150 * if the last page in this range is the last page of the file
3151 * we may have to clip the size of it to keep from reading past
3152 * the end of the last physical block associated with the file
3153 */
3154 upl_offset = start_pg * PAGE_SIZE;
3155 io_size = (last_pg - start_pg) * PAGE_SIZE;
3156
3157 if ((upl_f_offset + upl_offset + io_size) > filesize)
3158 io_size = filesize - (upl_f_offset + upl_offset);
3159
3160 /*
3161 * issue an asynchronous read to cluster_io
3162 */
3163 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3164 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3165
3166 issued_io = 1;
3167 }
3168 }
3169 if (issued_io == 0)
3170 ubc_upl_abort(upl, 0);
3171
3172 io_size = upl_size - start_offset;
3173
3174 if (io_size > resid)
3175 io_size = resid;
3176 f_offset += io_size;
3177 resid -= io_size;
3178 }
3179
3180 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3181 (int)f_offset, resid, retval, 0, 0);
3182
3183 return(retval);
3184 }
3185
3186
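/*
 * flush any delayed-write clusters held on the vnode...
 * if we've fallen back to the VHASDIRTY mechanism, the entire dirty
 * range (v_cstart to v_lastw) is pushed in MAX_UPL_TRANSFER sized
 * pieces... otherwise cluster_try_push is asked to push all of the
 * currently recorded clusters
 */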
3187 int
3188 cluster_push(vp)
3189 struct vnode *vp;
3190 {
3191 int retval;
3192
3193 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3194 vp->v_flag &= ~VHASDIRTY;
3195 return(0);
3196 }
3197
3198 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3199 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3200
3201 if (vp->v_flag & VHASDIRTY) {
3202 daddr_t start_pg;
3203 daddr_t last_pg;
3204 daddr_t end_pg;
3205
3206 start_pg = vp->v_cstart;
3207 end_pg = vp->v_lastw;
3208
3209 vp->v_flag &= ~VHASDIRTY;
3210 vp->v_clen = 0;
3211
3212 while (start_pg < end_pg) {
3213 last_pg = start_pg + MAX_UPL_TRANSFER;
3214
3215 if (last_pg > end_pg)
3216 last_pg = end_pg;
3217
3218 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3219
3220 start_pg = last_pg;
3221 }
3222 return (1);
3223 }
3224 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3225
3226 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3227 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3228
3229 return (retval);
3230 }
3231
3232
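/*
 * sort a local copy of the vnode's clusters by starting page and try
 * to push each one via cluster_push_x... if push_all is zero we stop
 * after the first cluster that gets pushed... any clusters that
 * couldn't be pushed are merged back into v_clusters if there's room,
 * otherwise the vnode falls back to the VHASDIRTY mechanism...
 * returns the number of free cluster slots remaining
 */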
3233 static int
3234 cluster_try_push(vp, EOF, can_delay, push_all)
3235 struct vnode *vp;
3236 off_t EOF;
3237 int can_delay;
3238 int push_all;
3239 {
3240 int cl_index;
3241 int cl_index1;
3242 int min_index;
3243 int cl_len;
3244 int cl_total;
3245 int cl_pushed;
3246 struct v_cluster l_clusters[MAX_CLUSTERS];
3247
3248 /*
3249 * make a local 'sorted' copy of the clusters
3250 * and clear vp->v_clen so that new clusters can
3251 * be developed
3252 */
3253 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3254 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3255 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3256 continue;
3257 if (min_index == -1)
3258 min_index = cl_index1;
3259 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3260 min_index = cl_index1;
3261 }
3262 if (min_index == -1)
3263 break;
3264 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3265 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3266
3267 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3268 }
3269 cl_len = cl_index;
3270 vp->v_clen = 0;
3271
3272 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3273 /*
3274 * try to push each cluster in turn... cluster_push_x may not
3275 * push the cluster if can_delay is TRUE and the cluster doesn't
3276 * meet the criteria for an immediate push
3277 */
3278 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3279 l_clusters[cl_index].start_pg = 0;
3280 l_clusters[cl_index].last_pg = 0;
3281
3282 cl_pushed++;
3283
3284 if (push_all == 0)
3285 break;
3286 }
3287 }
3288 if (cl_len > cl_pushed) {
3289 /*
3290 * we didn't push all of the clusters, so
3291 * lets try to merge them back in to the vnode
3292 */
3293 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3294 /*
3295 * we picked up some new clusters while we were trying to
3296 * push the old ones (I don't think this can happen because
3297 * I'm holding the lock, but just in case)... the sum of the
3298 * leftovers plus the new cluster count exceeds our ability
3299 * to represent them, so fall back to the VHASDIRTY mechanism
3300 */
3301 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3302 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3303 continue;
3304
3305 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3306 vp->v_cstart = l_clusters[cl_index].start_pg;
3307 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3308 vp->v_lastw = l_clusters[cl_index].last_pg;
3309 }
3310 vp->v_flag |= VHASDIRTY;
3311 } else {
3312 /*
3313 * we've got room to merge the leftovers back in
3314 * just append them starting at the next 'hole'
3315 * represented by vp->v_clen
3316 */
3317 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3318 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3319 continue;
3320
3321 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3322 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3323
3324 if (cl_index1 == 0) {
3325 vp->v_cstart = l_clusters[cl_index].start_pg;
3326 vp->v_lastw = l_clusters[cl_index].last_pg;
3327 } else {
3328 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3329 vp->v_cstart = l_clusters[cl_index].start_pg;
3330 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3331 vp->v_lastw = l_clusters[cl_index].last_pg;
3332 }
3333 cl_index1++;
3334 }
3335 /*
3336 * update the cluster count
3337 */
3338 vp->v_clen = cl_index1;
3339 }
3340 }
3341 return(MAX_CLUSTERS - vp->v_clen);
3342 }
3343
3344
3345
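/*
 * push the dirty pages in the page range [first, last) out to disk...
 * if can_delay is set and the cluster is less than half the maximum
 * size, or less than half of its pages are dirty, the push is
 * declined (return 0) so the cluster can continue to grow...
 * otherwise each run of valid dirty pages is written via cluster_io
 * and 1 is returned
 */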
3346 static int
3347 cluster_push_x(vp, EOF, first, last, can_delay)
3348 struct vnode *vp;
3349 off_t EOF;
3350 daddr_t first;
3351 daddr_t last;
3352 int can_delay;
3353 {
3354 upl_page_info_t *pl;
3355 upl_t upl;
3356 vm_offset_t upl_offset;
3357 int upl_size;
3358 off_t upl_f_offset;
3359 int pages_in_upl;
3360 int start_pg;
3361 int last_pg;
3362 int io_size;
3363 int io_flags;
3364 int size;
3365 kern_return_t kret;
3366
3367
3368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3369 vp->v_clen, first, last, EOF, 0);
3370
3371 if ((pages_in_upl = last - first) == 0) {
3372 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3373
3374 return (1);
3375 }
3376 upl_size = pages_in_upl * PAGE_SIZE;
3377 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3378
3379 if (upl_f_offset + upl_size >= EOF) {
3380
3381 if (upl_f_offset >= EOF) {
3382 /*
3383 * must have truncated the file and missed
3384 * clearing a dangling cluster (i.e. it's completely
3385 * beyond the new EOF)
3386 */
3387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3388
3389 return(1);
3390 }
3391 size = EOF - upl_f_offset;
3392
3393 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3394 pages_in_upl = upl_size / PAGE_SIZE;
3395 } else {
3396 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3397 return(0);
3398 size = upl_size;
3399 }
3400 kret = ubc_create_upl(vp,
3401 upl_f_offset,
3402 upl_size,
3403 &upl,
3404 &pl,
3405 UPL_RET_ONLY_DIRTY);
3406 if (kret != KERN_SUCCESS)
3407 panic("cluster_push: failed to get pagelist");
3408
3409 if (can_delay) {
3410 int num_of_dirty;
3411
3412 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3413 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3414 num_of_dirty++;
3415 }
3416 if (num_of_dirty < pages_in_upl / 2) {
3417 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3418
3419 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3420
3421 return(0);
3422 }
3423 }
3424 last_pg = 0;
3425
3426 while (size) {
3427
3428 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3429 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3430 break;
3431 }
3432 if (start_pg > last_pg) {
3433 io_size = (start_pg - last_pg) * PAGE_SIZE;
3434
3435 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3436 UPL_ABORT_FREE_ON_EMPTY);
3437
3438 if (io_size < size)
3439 size -= io_size;
3440 else
3441 break;
3442 }
3443 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3444 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3445 break;
3446 }
3447 upl_offset = start_pg * PAGE_SIZE;
3448
3449 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3450
3451 if (vp->v_flag & VNOCACHE_DATA)
3452 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3453 else
3454 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3455
3456 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3457 vp->v_flag |= VTHROTTLED;
3458 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3459 }
3460 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3461
3462 size -= io_size;
3463 }
3464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3465
3466 return(1);
3467 }
3468
3469
3470
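/*
 * handle the non block aligned head or tail of a physically
 * contiguous transfer... the file page containing uio_offset is
 * brought into a single page UPL (read in if not already valid),
 * the fragment is copied between that page and the user's physical
 * address with copyp2p, and for writes (or if the page was already
 * dirty) the page is written back out synchronously
 */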
3471 static int
3472 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3473 {
3474 struct iovec *iov;
3475 upl_page_info_t *pl;
3476 upl_t upl;
3477 vm_offset_t ubc_paddr;
3478 kern_return_t kret;
3479 int error = 0;
3480
3481 iov = uio->uio_iov;
3482
3483 kret = ubc_create_upl(vp,
3484 uio->uio_offset & ~PAGE_MASK_64,
3485 PAGE_SIZE,
3486 &upl,
3487 &pl,
3488 UPL_FLAGS_NONE);
3489
3490 if (kret != KERN_SUCCESS)
3491 return(EINVAL);
3492
3493 if (!upl_valid_page(pl, 0)) {
3494 /*
3495 * issue a synchronous read to cluster_io
3496 */
3497 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3498 CL_READ, (struct buf *)0, (struct clios *)0);
3499 if (error) {
3500 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3501
3502 return(error);
3503 }
3504 }
3505 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3506
3507 if (flags & CL_READ)
3508 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3509 else
3510 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3511
3512 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3513 /*
3514 * issue a synchronous write to cluster_io
3515 */
3516 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3517 0, (struct buf *)0, (struct clios *)0);
3518 }
3519 if (error == 0) {
3520 uio->uio_offset += xsize;
3521 iov->iov_base += xsize;
3522 iov->iov_len -= xsize;
3523 uio->uio_resid -= xsize;
3524 }
3525 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3526
3527 return (error);
3528 }