1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_PAGEOUT 0x10
77 #define CL_AGE 0x20
78 #define CL_DUMP 0x40
79 #define CL_NOZERO 0x80
80 #define CL_PAGEIN 0x100
81 #define CL_DEV_MEMORY 0x200
82 #define CL_PRESERVE 0x400
83
84
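/*
 * a 'struct clios' tracks a stream of asynchronous cluster I/Os issued
 * against a single request: cluster_io bumps io_issued as each chunk is
 * queued, cluster_iodone bumps io_completed (and latches the first error
 * seen in io_error) as transactions finish, and a waiter that has set
 * io_wanted is woken whenever this state changes
 */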
85 struct clios {
86 u_int io_completed; /* amount of io that has currently completed */
87 u_int io_issued; /* amount of io that was successfully issued */
88 int io_error; /* error code of first error encountered */
89 int io_wanted; /* someone is sleeping waiting for a change in state */
90 };
91
92
93 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
94 int size, struct buf *bp);
95 static int cluster_read_x(struct vnode *vp, struct uio *uio,
96 off_t filesize, int devblocksize, int flags);
97 static int cluster_write_x(struct vnode *vp, struct uio *uio,
98 off_t oldEOF, off_t newEOF, off_t headOff,
99 off_t tailOff, int devblocksize, int flags);
100 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
101 off_t filesize, int devblocksize, int flags);
102 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
103 off_t newEOF, int devblocksize, int flags);
104 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
105 off_t filesize, int devblocksize, int flags);
106 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
107 off_t newEOF, int devblocksize, int flags);
108 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
109 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
110 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
111 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
112
113
114 /*
115 * throttle the number of async writes that
116 * can be outstanding on a single vnode
117 * before we issue a synchronous write
118 */
119 #define ASYNC_THROTTLE 9
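/*
 * the throttle is enforced with a simple sleep/wakeup pair on
 * v_numoutput; a writer that finds too many writes outstanding blocks
 * roughly like this (see cluster_pageout below):
 *
 *	while (vp->v_numoutput >= ASYNC_THROTTLE) {
 *		vp->v_flag |= VTHROTTLED;
 *		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
 *	}
 *
 * and cluster_iodone issues the matching wakeup once the count of
 * outstanding writes has drained to a third of the limit
 */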
120
121 static int
122 cluster_iodone(bp)
123 struct buf *bp;
124 {
125 int b_flags;
126 int error;
127 int total_size;
128 int total_resid;
129 int upl_offset;
130 int zero_offset;
131 upl_t upl;
132 struct buf *cbp;
133 struct buf *cbp_head;
134 struct buf *cbp_next;
135 struct buf *real_bp;
136 struct vnode *vp;
137 struct clios *iostate;
138 int commit_size;
139 int pg_offset;
140
141
142 cbp_head = (struct buf *)(bp->b_trans_head);
143
144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
145 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
146
147 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
148 /*
149 * all I/O requests that are part of this transaction
150 * have to complete before we can process it
151 */
152 if ( !(cbp->b_flags & B_DONE)) {
153
154 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
155 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
156
157 return 0;
158 }
159 }
160 error = 0;
161 total_size = 0;
162 total_resid = 0;
163
164 cbp = cbp_head;
165 upl_offset = cbp->b_uploffset;
166 upl = cbp->b_pagelist;
167 b_flags = cbp->b_flags;
168 real_bp = cbp->b_real_bp;
169 vp = cbp->b_vp;
170 zero_offset= cbp->b_validend;
171 iostate = (struct clios *)cbp->b_iostate;
172
173 while (cbp) {
174 if (cbp->b_vectorcount > 1)
175 _FREE(cbp->b_vectorlist, M_SEGMENT);
176
177 if ((cbp->b_flags & B_ERROR) && error == 0)
178 error = cbp->b_error;
179
180 total_resid += cbp->b_resid;
181 total_size += cbp->b_bcount;
182
183 cbp_next = cbp->b_trans_next;
184
185 free_io_buf(cbp);
186
187 cbp = cbp_next;
188 }
189 if (zero_offset)
190 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
191
192 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
193 vp->v_flag &= ~VTHROTTLED;
194 wakeup((caddr_t)&vp->v_numoutput);
195 }
196 if (iostate) {
197 /*
198 * someone has issued multiple I/Os asynchronously
199 * and is waiting for them to complete (streaming)
200 */
201 if (error && iostate->io_error == 0)
202 iostate->io_error = error;
203
204 iostate->io_completed += total_size;
205
206 if (iostate->io_wanted) {
207 /*
208 * someone is waiting for the state of
209 * this io stream to change
210 */
211 iostate->io_wanted = 0;
212 wakeup((caddr_t)&iostate->io_wanted);
213 }
214 }
215 if ((b_flags & B_NEED_IODONE) && real_bp) {
216 if (error) {
217 real_bp->b_flags |= B_ERROR;
218 real_bp->b_error = error;
219 }
220 real_bp->b_resid = total_resid;
221
222 biodone(real_bp);
223 }
224 if (error == 0 && total_resid)
225 error = EIO;
226
227 if (b_flags & B_COMMIT_UPL) {
228 pg_offset = upl_offset & PAGE_MASK;
229 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
230
231 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
232 int upl_abort_code;
233
234 if (b_flags & B_PHYS)
235 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
236 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
237 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
238 else if (b_flags & B_PGIN)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
240 else
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
242
243 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
244 upl_abort_code);
245
246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
247 (int)upl, upl_offset - pg_offset, commit_size,
248 0x80000000|upl_abort_code, 0);
249
250 } else {
251 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
252
253 if (b_flags & B_PHYS)
254 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
255 else if ( !(b_flags & B_PAGEOUT))
256 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
257 if (b_flags & B_AGE)
258 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
259
260 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
261 upl_commit_flags);
262
263 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
264 (int)upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags, 0);
266 }
267 } else
268 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
269 (int)upl, upl_offset, 0, error, 0);
270
271 return (error);
272 }
273
274
275 static void
276 cluster_zero(upl, upl_offset, size, bp)
277 upl_t upl;
278 vm_offset_t upl_offset;
279 int size;
280 struct buf *bp;
281 {
282 vm_offset_t io_addr = 0;
283 int must_unmap = 0;
284 kern_return_t kret;
285
286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
287 upl_offset, size, (int)bp, 0, 0);
288
289 if (bp == NULL || bp->b_data == NULL) {
290 kret = ubc_upl_map(upl, &io_addr);
291
292 if (kret != KERN_SUCCESS)
293 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
294 if (io_addr == 0)
295 panic("cluster_zero: ubc_upl_map() mapped 0");
296
297 must_unmap = 1;
298 } else
299 io_addr = (vm_offset_t)bp->b_data;
300 bzero((caddr_t)(io_addr + upl_offset), size);
301
302 if (must_unmap) {
303 kret = ubc_upl_unmap(upl);
304
305 if (kret != KERN_SUCCESS)
306 panic("cluster_zero: kernel_upl_unmap failed");
307 }
308 }
309
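/*
 * cluster_io carves the byte range [f_offset, f_offset + size) of 'upl'
 * into one or more buf's (using VOP_CMAP to find contiguous disk extents),
 * chains them into a transaction via b_trans_head/b_trans_next and issues
 * them with VOP_STRATEGY; cluster_iodone finishes the transaction and,
 * when CL_COMMIT was passed, commits or aborts the upl.  a typical
 * page-in request, roughly as issued by cluster_pagein below, looks like:
 *
 *	error = cluster_io(vp, upl, upl_offset, f_offset, io_size,
 *			   devblocksize, CL_READ | CL_PAGEIN | CL_COMMIT,
 *			   (struct buf *)0, (struct clios *)0);
 */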
310 static int
311 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
312 struct vnode *vp;
313 upl_t upl;
314 vm_offset_t upl_offset;
315 off_t f_offset;
316 int non_rounded_size;
317 int devblocksize;
318 int flags;
319 struct buf *real_bp;
320 struct clios *iostate;
321 {
322 struct buf *cbp;
323 struct iovec *iovp;
324 u_int size;
325 u_int io_size;
326 int io_flags;
327 int error = 0;
328 int retval = 0;
329 struct buf *cbp_head = 0;
330 struct buf *cbp_tail = 0;
331 upl_page_info_t *pl;
332 int buf_count = 0;
333 int pg_count;
334 int pg_offset;
335 u_int max_iosize;
336 u_int max_vectors;
337 int priv;
338 int zero_offset = 0;
339 u_int first_lblkno;
340
341 if (flags & CL_READ) {
342 io_flags = (B_VECTORLIST | B_READ);
343
344 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
345 } else {
346 io_flags = (B_VECTORLIST | B_WRITEINPROG);
347
348 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
349 }
350 pl = ubc_upl_pageinfo(upl);
351
352 if (flags & CL_AGE)
353 io_flags |= B_AGE;
354 if (flags & CL_DUMP)
355 io_flags |= B_NOCACHE;
356 if (flags & CL_PAGEIN)
357 io_flags |= B_PGIN;
358 if (flags & CL_PAGEOUT)
359 io_flags |= B_PAGEOUT;
360 if (flags & CL_COMMIT)
361 io_flags |= B_COMMIT_UPL;
362 if (flags & CL_PRESERVE)
363 io_flags |= B_PHYS;
364
365 if (devblocksize)
366 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
367 else
368 size = non_rounded_size;
369
370
371 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
372 (int)f_offset, size, upl_offset, flags, 0);
373
374 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
375 /*
376 * we are going to end up with a page that we can't complete
377 * (the file size wasn't a multiple of PAGE_SIZE and we're
378 * trying to read to the end of the file), so we'll go ahead
379 * and zero out the portion of the page we can't
380 * read in from the file
381 */
382 zero_offset = upl_offset + non_rounded_size;
383 }
384 while (size) {
385 int vsize;
386 int i;
387 int pl_index;
388 int pg_resid;
389 int num_contig;
390 daddr_t lblkno;
391 daddr_t blkno;
392
393 if (size > max_iosize)
394 io_size = max_iosize;
395 else
396 io_size = size;
397
398 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
399 if (error == EOPNOTSUPP)
400 panic("VOP_CMAP Unimplemented");
401 break;
402 }
403
404 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
405 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
406
407 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
408 if (flags & CL_PAGEOUT) {
409 error = EINVAL;
410 break;
411 };
412
413 /* Try paging out the page individually before
414 giving up entirely and dumping it (it could
415 be mapped in a "hole" and require allocation
416 before the I/O).
417 */
418 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
419 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
420 error = EINVAL;
421 break;
422 };
423
424 upl_offset += PAGE_SIZE_64;
425 f_offset += PAGE_SIZE_64;
426 size -= PAGE_SIZE_64;
427 continue;
428 }
429 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
430 /*
431 * we have now figured out how much I/O we can do - this is in 'io_size'
432 * pl_index represents the first page in the 'upl' that the I/O will occur for
433 * pg_offset is the starting point in the first page for the I/O
434 * pg_count is the number of full and partial pages that 'io_size' encompasses
435 */
436 pl_index = upl_offset / PAGE_SIZE;
437 pg_offset = upl_offset & PAGE_MASK;
438 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
439
440 if (flags & CL_DEV_MEMORY) {
441 /*
442 * currently, can't deal with reading 'holes' in file
443 */
444 if ((long)blkno == -1) {
445 error = EINVAL;
446 break;
447 }
448 /*
449 * treat physical requests as one 'giant' page
450 */
451 pg_count = 1;
452 }
453 if ((flags & CL_READ) && (long)blkno == -1) {
454 int bytes_to_zero;
455
456 /*
457 * if we're reading and blkno == -1, then we've got a
458 * 'hole' in the file that we need to deal with by zeroing
459 * out the affected area in the upl
460 */
461 if (zero_offset && io_size == size) {
462 /*
463 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
464 * then 'zero_offset' will be non-zero
465 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
466 * (indicated by the io_size finishing off the I/O request for this UPL)
467 * then we're not going to issue an I/O for the
468 * last page in this upl... we need to zero both the hole and the tail
469 * of the page beyond the EOF, since the delayed zero-fill won't kick in
470 */
471 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
472
473 zero_offset = 0;
474 } else
475 bytes_to_zero = io_size;
476
477 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
478
479 if (cbp_head)
480 /*
481 * if there is a current I/O chain pending
482 * then the first page of the group we just zero'd
483 * will be handled by the I/O completion if the zero
484 * fill started in the middle of the page
485 */
486 pg_count = (io_size - pg_offset) / PAGE_SIZE;
487 else {
488 /*
489 * no pending I/O to pick up that first page
490 * so, we have to make sure it gets committed
491 * here.
492 * set the pg_offset to 0 so that the upl_commit_range
493 * starts with this page
494 */
495 pg_count = (io_size + pg_offset) / PAGE_SIZE;
496 pg_offset = 0;
497 }
498 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
499 /*
500 * if we're done with the request for this UPL
501 * then we have to make sure to commit the last page
502 * even if we only partially zero-filled it
503 */
504 pg_count++;
505
506 if (pg_count) {
507 if (pg_offset)
508 pg_resid = PAGE_SIZE - pg_offset;
509 else
510 pg_resid = 0;
511
512 if (flags & CL_COMMIT)
513 ubc_upl_commit_range(upl,
514 (upl_offset + pg_resid) & ~PAGE_MASK,
515 pg_count * PAGE_SIZE,
516 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
517 }
518 upl_offset += io_size;
519 f_offset += io_size;
520 size -= io_size;
521
522 if (cbp_head && pg_count)
523 goto start_io;
524 continue;
525
526 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
527 real_bp->b_blkno = blkno;
528 }
529
530 if (pg_count > 1) {
531 if (pg_count > max_vectors) {
532 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
533
534 if (io_size < 0) {
535 io_size = PAGE_SIZE - pg_offset;
536 pg_count = 1;
537 } else
538 pg_count = max_vectors;
539 }
540 /*
541 * we need to allocate space for the vector list
542 */
543 if (pg_count > 1) {
544 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
545 M_SEGMENT, M_NOWAIT);
546
547 if (iovp == (struct iovec *) 0) {
548 /*
549 * if the allocation fails, then throttle down to a single page
550 */
551 io_size = PAGE_SIZE - pg_offset;
552 pg_count = 1;
553 }
554 }
555 }
556
557 /* Throttle the speculative IO */
558 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
559 priv = 0;
560 else
561 priv = 1;
562
563 cbp = alloc_io_buf(vp, priv);
564
565 if (pg_count == 1)
566 /*
567 * we use the io vector that's reserved in the buffer header
568 * this ensures we can always issue an I/O even in a low memory
569 * condition that prevents the _MALLOC from succeeding... this
570 * is necessary to prevent deadlocks with the pager
571 */
572 iovp = (struct iovec *)(&cbp->b_vects[0]);
573
574 cbp->b_vectorlist = (void *)iovp;
575 cbp->b_vectorcount = pg_count;
576
577 if (flags & CL_DEV_MEMORY) {
578
579 iovp->iov_len = io_size;
580 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
581
582 if (iovp->iov_base == (caddr_t) 0) {
583 free_io_buf(cbp);
584 error = EINVAL;
585 } else
586 iovp->iov_base += upl_offset;
587 } else {
588
589 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
590 int psize;
591
592 psize = PAGE_SIZE - pg_offset;
593
594 if (psize > vsize)
595 psize = vsize;
596
597 iovp->iov_len = psize;
598 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
599
600 if (iovp->iov_base == (caddr_t) 0) {
601 if (pg_count > 1)
602 _FREE(cbp->b_vectorlist, M_SEGMENT);
603 free_io_buf(cbp);
604
605 error = EINVAL;
606 break;
607 }
608 iovp->iov_base += pg_offset;
609 pg_offset = 0;
610
611 if (flags & CL_PAGEOUT) {
612 int s;
613 struct buf *bp;
614
615 s = splbio();
616 if (bp = incore(vp, lblkno + i)) {
617 if (!ISSET(bp->b_flags, B_BUSY)) {
618 bremfree(bp);
619 SET(bp->b_flags, (B_BUSY | B_INVAL));
620 splx(s);
621 brelse(bp);
622 } else
623 panic("BUSY bp found in cluster_io");
624 }
625 splx(s);
626 }
627 vsize -= psize;
628 }
629 }
630 if (error)
631 break;
632
633 if (flags & CL_ASYNC) {
634 cbp->b_flags |= (B_CALL | B_ASYNC);
635 cbp->b_iodone = (void *)cluster_iodone;
636 }
637 cbp->b_flags |= io_flags;
638
639 cbp->b_lblkno = lblkno;
640 cbp->b_blkno = blkno;
641 cbp->b_bcount = io_size;
642 cbp->b_pagelist = upl;
643 cbp->b_uploffset = upl_offset;
644 cbp->b_trans_next = (struct buf *)0;
645
646 if (cbp->b_iostate = (void *)iostate)
647 /*
648 * caller wants to track the state of this
649 * io... bump the amount issued against this stream
650 */
651 iostate->io_issued += io_size;
652
653 if (flags & CL_READ)
654 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
655 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
656 else
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659
660 if (cbp_head) {
661 cbp_tail->b_trans_next = cbp;
662 cbp_tail = cbp;
663 } else {
664 cbp_head = cbp;
665 cbp_tail = cbp;
666 }
667 (struct buf *)(cbp->b_trans_head) = cbp_head;
668 buf_count++;
669
670 upl_offset += io_size;
671 f_offset += io_size;
672 size -= io_size;
673
674 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
675 /*
676 * if we have no more I/O to issue or
677 * the current I/O we've prepared fully
678 * completes the last page in this request
679 * and it's either an ASYNC request or
680 * we've already accumulated more than 8 I/O's into
681 * this transaction and it's not an I/O directed to
682 * special DEVICE memory
683 * then go ahead and issue the I/O
684 */
685 start_io:
686 if (real_bp) {
687 cbp_head->b_flags |= B_NEED_IODONE;
688 cbp_head->b_real_bp = real_bp;
689 } else
690 cbp_head->b_real_bp = (struct buf *)NULL;
691
692 if (size == 0) {
693 /*
694 * we're about to issue the last I/O for this upl
695 * if this was a read to the eof and the eof doesn't
696 * finish on a page boundary, then we need to zero-fill
697 * the rest of the page....
698 */
699 cbp_head->b_validend = zero_offset;
700 } else
701 cbp_head->b_validend = 0;
702
703 for (cbp = cbp_head; cbp;) {
704 struct buf * cbp_next;
705
706 if (io_flags & B_WRITEINPROG)
707 cbp->b_vp->v_numoutput++;
708
709 cbp_next = cbp->b_trans_next;
710
711 (void) VOP_STRATEGY(cbp);
712 cbp = cbp_next;
713 }
714 if ( !(flags & CL_ASYNC)) {
715 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
716 biowait(cbp);
717
718 if (error = cluster_iodone(cbp_head)) {
719 if ((flags & CL_PAGEOUT) && (error == ENXIO))
720 retval = 0; /* drop the error */
721 else
722 retval = error;
723 error = 0;
724 }
725 }
726 cbp_head = (struct buf *)0;
727 cbp_tail = (struct buf *)0;
728
729 buf_count = 0;
730 }
731 }
732 if (error) {
733 int abort_size;
734
735 io_size = 0;
736
737 for (cbp = cbp_head; cbp;) {
738 struct buf * cbp_next;
739
740 if (cbp->b_vectorcount > 1)
741 _FREE(cbp->b_vectorlist, M_SEGMENT);
742 upl_offset -= cbp->b_bcount;
743 size += cbp->b_bcount;
744 io_size += cbp->b_bcount;
745
746 cbp_next = cbp->b_trans_next;
747 free_io_buf(cbp);
748 cbp = cbp_next;
749 }
750 if (iostate) {
751 /*
752 * update the error condition for this stream
753 * since we never really issued the io
754 * just go ahead and adjust it back
755 */
756 if (iostate->io_error == 0)
757 iostate->io_error = error;
758 iostate->io_issued -= io_size;
759
760 if (iostate->io_wanted) {
761 /*
762 * someone is waiting for the state of
763 * this io stream to change
764 */
765 iostate->io_wanted = 0;
766 wakeup((caddr_t)&iostate->io_wanted);
767 }
768 }
769 pg_offset = upl_offset & PAGE_MASK;
770 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
771
772 if (flags & CL_COMMIT) {
773 int upl_abort_code;
774
775 if (flags & CL_PRESERVE)
776 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
777 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
778 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
779 else if (flags & CL_PAGEIN)
780 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
781 else
782 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
783
784 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
785 upl_abort_code);
786
787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
788 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
789 }
790 if (real_bp) {
791 real_bp->b_flags |= B_ERROR;
792 real_bp->b_error = error;
793
794 biodone(real_bp);
795 }
796 if (retval == 0)
797 retval = error;
798 }
799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
800 (int)f_offset, size, upl_offset, retval, 0);
801
802 return (retval);
803 }
804
805
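/*
 * cluster_rd_prefetch: kick off an advisory read of up to
 * MAX_UPL_TRANSFER pages starting at f_offset, clipped to the EOF and
 * skipping over any leading pages that are already resident in the
 * ubc; returns the number of pages covered so that cluster_rd_ahead
 * can advance v_maxra
 */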
806 static int
807 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
808 struct vnode *vp;
809 off_t f_offset;
810 u_int size;
811 off_t filesize;
812 int devblocksize;
813 {
814 int pages_to_fetch;
815 int skipped_pages;
816
817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
818 (int)f_offset, size, (int)filesize, 0, 0);
819
820 if (f_offset >= filesize) {
821 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
822 (int)f_offset, 0, 0, 0, 0);
823 return(0);
824 }
825 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
826 size = MAX_UPL_TRANSFER * PAGE_SIZE;
827 else
828 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
829
830 if ((off_t)size > (filesize - f_offset))
831 size = filesize - f_offset;
832
833 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
834
835 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
836 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
837 break;
838 f_offset += PAGE_SIZE;
839 size -= PAGE_SIZE;
840 }
841 if (skipped_pages < pages_to_fetch)
842 advisory_read(vp, filesize, f_offset, size, devblocksize);
843
844 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
845 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
846
847 return (pages_to_fetch);
848 }
849
850
851
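/*
 * cluster_rd_ahead: sequential read-ahead driver.  if the current
 * request doesn't follow on from v_lastr (or from the last block
 * already prefetched, v_maxra), the read-ahead window is reset;
 * otherwise v_ralen is doubled, up to MAX_UPL_TRANSFER pages, and
 * cluster_rd_prefetch is called for the blocks just beyond
 * max(e_lblkno, v_maxra)
 */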
852 static void
853 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
854 struct vnode *vp;
855 daddr_t b_lblkno;
856 daddr_t e_lblkno;
857 off_t filesize;
858 int devblocksize;
859 {
860 daddr_t r_lblkno;
861 off_t f_offset;
862 int size_of_prefetch;
863 int max_pages;
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
866 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
867
868 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
869 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
870 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
871 return;
872 }
873
874 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
875 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
876 vp->v_ralen = 0;
877 vp->v_maxra = 0;
878
879 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
880 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
881
882 return;
883 }
884 max_pages = MAX_UPL_TRANSFER;
885
886 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
887
888 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
889 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
890
891 if (e_lblkno < vp->v_maxra) {
892 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
893
894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
895 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
896 return;
897 }
898 }
899 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
900 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
901
902 if (f_offset < filesize) {
903 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
904
905 if (size_of_prefetch)
906 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
907 }
908 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
909 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
910 }
911
912 int
913 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
914 struct vnode *vp;
915 upl_t upl;
916 vm_offset_t upl_offset;
917 off_t f_offset;
918 int size;
919 off_t filesize;
920 int devblocksize;
921 int flags;
922 {
923 int io_size;
924 int pg_size;
925 off_t max_size;
926 int local_flags = CL_PAGEOUT;
927
928 if ((flags & UPL_IOSYNC) == 0)
929 local_flags |= CL_ASYNC;
930 if ((flags & UPL_NOCOMMIT) == 0)
931 local_flags |= CL_COMMIT;
932
933
934 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
935 (int)f_offset, size, (int)filesize, local_flags, 0);
936
937 /*
938 * If they didn't specify any I/O, then we are done...
939 * we can't issue an abort because we don't know how
940 * big the upl really is
941 */
942 if (size <= 0)
943 return (EINVAL);
944
945 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
946 if (local_flags & CL_COMMIT)
947 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
948 return (EROFS);
949 }
950 /*
951 * can't page-out to a negative offset
952 * or if we're starting beyond the EOF
953 * or if the file offset isn't page aligned
954 * or the size requested isn't a multiple of PAGE_SIZE
955 */
956 if (f_offset < 0 || f_offset >= filesize ||
957 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
958 if (local_flags & CL_COMMIT)
959 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
960 return (EINVAL);
961 }
962 max_size = filesize - f_offset;
963
964 if (size < max_size)
965 io_size = size;
966 else
967 io_size = max_size;
968
969 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
970
971 if (size > pg_size) {
972 if (local_flags & CL_COMMIT)
973 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
974 UPL_ABORT_FREE_ON_EMPTY);
975 }
976 while (vp->v_numoutput >= ASYNC_THROTTLE) {
977 vp->v_flag |= VTHROTTLED;
978 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
979 }
980
981 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
982 local_flags, (struct buf *)0, (struct clios *)0));
983 }
984
985 int
986 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
987 struct vnode *vp;
988 upl_t upl;
989 vm_offset_t upl_offset;
990 off_t f_offset;
991 int size;
992 off_t filesize;
993 int devblocksize;
994 int flags;
995 {
996 u_int io_size;
997 int rounded_size;
998 off_t max_size;
999 int retval;
1000 int local_flags = 0;
1001
1002 if (upl == NULL || size < 0)
1003 panic("cluster_pagein: NULL upl passed in");
1004
1005 if ((flags & UPL_IOSYNC) == 0)
1006 local_flags |= CL_ASYNC;
1007 if ((flags & UPL_NOCOMMIT) == 0)
1008 local_flags |= CL_COMMIT;
1009
1010
1011 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1012 (int)f_offset, size, (int)filesize, local_flags, 0);
1013
1014 /*
1015 * can't page-in from a negative offset
1016 * or if we're starting beyond the EOF
1017 * or if the file offset isn't page aligned
1018 * or the size requested isn't a multiple of PAGE_SIZE
1019 */
1020 if (f_offset < 0 || f_offset >= filesize ||
1021 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1022 if (local_flags & CL_COMMIT)
1023 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1024 return (EINVAL);
1025 }
1026 max_size = filesize - f_offset;
1027
1028 if (size < max_size)
1029 io_size = size;
1030 else
1031 io_size = max_size;
1032
1033 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1034
1035 if (size > rounded_size && (local_flags & CL_COMMIT))
1036 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1037 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1038
1039 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1040 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1041
1042 if (retval == 0) {
1043 int b_lblkno;
1044 int e_lblkno;
1045
1046 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1047 e_lblkno = (int)
1048 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1049
1050 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1051 /*
1052 * we haven't read in the last page of the file yet
1053 * so let's try to read ahead if we're in
1054 * a sequential access pattern
1055 */
1056 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1057 }
1058 vp->v_lastr = e_lblkno;
1059 }
1060 return (retval);
1061 }
1062
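/*
 * cluster_bp: issue a conventional buf (which must already carry a upl
 * in b_pagelist) through cluster_io as a single async transaction
 */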
1063 int
1064 cluster_bp(bp)
1065 struct buf *bp;
1066 {
1067 off_t f_offset;
1068 int flags;
1069
1070 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1071 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1072
1073 if (bp->b_pagelist == (upl_t) 0)
1074 panic("cluster_bp: can't handle NULL upl yet\n");
1075 if (bp->b_flags & B_READ)
1076 flags = CL_ASYNC | CL_READ;
1077 else
1078 flags = CL_ASYNC;
1079
1080 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1081
1082 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1083 }
1084
1085 int
1086 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1087 struct vnode *vp;
1088 struct uio *uio;
1089 off_t oldEOF;
1090 off_t newEOF;
1091 off_t headOff;
1092 off_t tailOff;
1093 int devblocksize;
1094 int flags;
1095 {
1096 int prev_resid;
1097 int clip_size;
1098 off_t max_io_size;
1099 struct iovec *iov;
1100 vm_offset_t upl_offset;
1101 int upl_size;
1102 int pages_in_pl;
1103 upl_page_info_t *pl;
1104 int upl_flags;
1105 upl_t upl;
1106 int retval = 0;
1107
1108
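	/*
	 * dispatch: a vnode that is caching data (or a non-user uio) takes
	 * the buffered path in cluster_write_x; otherwise each iovec is
	 * examined and routed to cluster_phys_write (physically contiguous
	 * source), cluster_write_x (small writes or head/tail zero-fill),
	 * or cluster_nocopy_write (large page-aligned writes), clipping
	 * uio_resid so that each helper sees a well-formed request
	 */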
1109 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1110 {
1111 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1112 return(retval);
1113 }
1114
1115 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1116 {
1117 /* we know we have a resid, so this is safe */
1118 iov = uio->uio_iov;
1119 while (iov->iov_len == 0) {
1120 uio->uio_iov++;
1121 uio->uio_iovcnt--;
1122 iov = uio->uio_iov;
1123 }
1124
1125 /*
1126 * We check every vector target and if it is physically
1127 * contiguous space, we skip the sanity checks.
1128 */
1129
1130 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1131 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1132 pages_in_pl = 0;
1133 upl_flags = UPL_QUERY_OBJECT_TYPE;
1134 if ((vm_map_get_upl(current_map(),
1135 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1136 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1137 {
1138 /*
1139 * the user app must have passed in an invalid address
1140 */
1141 return (EFAULT);
1142 }
1143
1144 if (upl_flags & UPL_PHYS_CONTIG)
1145 {
1146 if (flags & IO_HEADZEROFILL)
1147 {
1148 flags &= ~IO_HEADZEROFILL;
1149
1150 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1151 return(retval);
1152 }
1153
1154 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1155
1156 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1157 {
1158 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1159 return(retval);
1160 }
1161 }
1162 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1163 {
1164 /*
1165 * We set a threshold of 4 pages to decide if the nocopy
1166 * write loop is worth the trouble...
1167 * we also come here if we're trying to zero the head and/or tail
1168 * of a partially written page, and the user source is not a physically contiguous region
1169 */
1170 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1171 return(retval);
1172 }
1173 else if (uio->uio_offset & PAGE_MASK_64)
1174 {
1175 /* Bring the write's file offset up to a pagesize boundary */
1176 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1177 if (uio->uio_resid < clip_size)
1178 clip_size = uio->uio_resid;
1179 /*
1180 * Fake the resid going into the cluster_write_x call
1181 * and restore it on the way out.
1182 */
1183 prev_resid = uio->uio_resid;
1184 uio->uio_resid = clip_size;
1185 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1186 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1187 }
1188 else if ((int)iov->iov_base & PAGE_MASK_64)
1189 {
1190 clip_size = iov->iov_len;
1191 prev_resid = uio->uio_resid;
1192 uio->uio_resid = clip_size;
1193 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1194 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1195 }
1196 else
1197 {
1198 /*
1199 * If we come in here, we know the offset into
1200 * the file is on a pagesize boundary
1201 */
1202
1203 max_io_size = newEOF - uio->uio_offset;
1204 clip_size = uio->uio_resid;
1205 if (iov->iov_len < clip_size)
1206 clip_size = iov->iov_len;
1207 if (max_io_size < clip_size)
1208 clip_size = max_io_size;
1209
1210 if (clip_size < PAGE_SIZE)
1211 {
1212 /*
1213 * Take care of tail end of write in this vector
1214 */
1215 prev_resid = uio->uio_resid;
1216 uio->uio_resid = clip_size;
1217 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1218 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1219 }
1220 else
1221 {
1222 /* round clip_size down to a multiple of pagesize */
1223 clip_size = clip_size & ~(PAGE_MASK);
1224 prev_resid = uio->uio_resid;
1225 uio->uio_resid = clip_size;
1226 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1227 if ((retval == 0) && uio->uio_resid)
1228 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1229 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1230 }
1231 } /* end else */
1232 } /* end while */
1233 return(retval);
1234 }
1235
1236
1237 static int
1238 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1239 struct vnode *vp;
1240 struct uio *uio;
1241 off_t newEOF;
1242 int devblocksize;
1243 int flags;
1244 {
1245 upl_t upl;
1246 upl_page_info_t *pl;
1247 off_t upl_f_offset;
1248 vm_offset_t upl_offset;
1249 off_t max_io_size;
1250 int io_size;
1251 int io_flag;
1252 int upl_size;
1253 int upl_needed_size;
1254 int pages_in_pl;
1255 int upl_flags;
1256 kern_return_t kret;
1257 struct iovec *iov;
1258 int i;
1259 int first = 1;
1260 int force_data_sync;
1261 int error = 0;
1262 struct clios iostate;
1263
1264 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1265 (int)uio->uio_offset, (int)uio->uio_resid,
1266 (int)newEOF, devblocksize, 0);
1267
1268 /*
1269 * When we enter this routine, we know
1270 * -- the offset into the file is on a pagesize boundary
1271 * -- the resid is a page multiple
1272 * -- the resid will not exceed iov_len
1273 */
1274 cluster_try_push(vp, newEOF, 0, 1);
1275
1276 iostate.io_completed = 0;
1277 iostate.io_issued = 0;
1278 iostate.io_error = 0;
1279 iostate.io_wanted = 0;
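	/*
	 * all of the async writes issued in the loop below share this
	 * iostate, so roughly two full UPL transfers worth of writes can
	 * be in flight while the next request is being prepared; the
	 * wait_for_writes label drains the stream before we return
	 */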
1280
1281 iov = uio->uio_iov;
1282
1283 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1284 io_size = uio->uio_resid;
1285
1286 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1287 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1288
1289 if (first) {
1290 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1291 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1292 first = 0;
1293 }
1294 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1295 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1296
1297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1298 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1299
1300 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1301 pages_in_pl = 0;
1302 upl_size = upl_needed_size;
1303 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1304 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1305
1306 kret = vm_map_get_upl(current_map(),
1307 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1308 &upl_size,
1309 &upl,
1310 NULL,
1311 &pages_in_pl,
1312 &upl_flags,
1313 force_data_sync);
1314
1315 if (kret != KERN_SUCCESS) {
1316 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1317 0, 0, 0, kret, 0);
1318
1319 /*
1320 * cluster_nocopy_write: failed to get pagelist
1321 *
1322 * we may have already spun some portion of this request
1323 * off as async requests... we need to wait for the I/O
1324 * to complete before returning
1325 */
1326 goto wait_for_writes;
1327 }
1328 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1329 pages_in_pl = upl_size / PAGE_SIZE;
1330
1331 for (i = 0; i < pages_in_pl; i++) {
1332 if (!upl_valid_page(pl, i))
1333 break;
1334 }
1335 if (i == pages_in_pl)
1336 break;
1337
1338 /*
1339 * didn't get all the pages back that we
1340 * needed... release this upl and try again
1341 */
1342 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1343 UPL_ABORT_FREE_ON_EMPTY);
1344 }
1345 if (force_data_sync >= 3) {
1346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1347 i, pages_in_pl, upl_size, kret, 0);
1348
1349 /*
1350 * for some reason, we couldn't acquire a hold on all
1351 * the pages needed in the user's address space
1352 *
1353 * we may have already spun some portion of this request
1354 * off as async requests... we need to wait for the I/O
1355 * to complete before returning
1356 */
1357 goto wait_for_writes;
1358 }
1359
1360 /*
1361 * Consider the possibility that upl_size wasn't satisfied.
1362 */
1363 if (upl_size != upl_needed_size)
1364 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1365
1366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1367 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1368
1369 if (io_size == 0) {
1370 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1371 UPL_ABORT_FREE_ON_EMPTY);
1372
1373 /*
1374 * we may have already spun some portion of this request
1375 * off as async requests... we need to wait for the I/O
1376 * to complete before returning
1377 */
1378 goto wait_for_writes;
1379 }
1380 /*
1381 * Now look for pages already in the cache
1382 * and throw them away.
1383 */
1384
1385 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1386 max_io_size = io_size;
1387
1388 while (max_io_size) {
1389 /*
1390 * Flag UPL_POP_DUMP says if the page is found
1391 * in the page cache it must be thrown away.
1392 */
1393 ubc_page_op(vp,
1394 upl_f_offset,
1395 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1396 0, 0);
1397 max_io_size -= PAGE_SIZE_64;
1398 upl_f_offset += PAGE_SIZE_64;
1399 }
1400 /*
1401 * we want to push out these writes asynchronously so that we can overlap
1402 * the preparation of the next I/O
1403 * if there are already too many outstanding writes
1404 * wait until some complete before issuing the next
1405 */
1406 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1407 iostate.io_wanted = 1;
1408 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1409 }
1410 if (iostate.io_error) {
1411 /*
1412 * one of the earlier writes we issued ran into a hard error
1413 * don't issue any more writes, clean up the UPL
1414 * that was just created but not used, then
1415 * go wait for all writes that are part of this stream
1416 * to complete before returning the error to the caller
1417 */
1418 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1419 UPL_ABORT_FREE_ON_EMPTY);
1420
1421 goto wait_for_writes;
1422 }
1423 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1424
1425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1426 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1427
1428 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1429 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1430
1431 iov->iov_len -= io_size;
1432 iov->iov_base += io_size;
1433 uio->uio_resid -= io_size;
1434 uio->uio_offset += io_size;
1435
1436 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1437 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1438
1439 } /* end while */
1440
1441 wait_for_writes:
1442 /*
1443 * make sure all async writes issued as part of this stream
1444 * have completed before we return
1445 */
1446 while (iostate.io_issued != iostate.io_completed) {
1447 iostate.io_wanted = 1;
1448 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1449 }
1450 if (iostate.io_error)
1451 error = iostate.io_error;
1452
1453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1454 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1455
1456 return (error);
1457 }
1458
1459
1460 static int
1461 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1462 struct vnode *vp;
1463 struct uio *uio;
1464 off_t newEOF;
1465 int devblocksize;
1466 int flags;
1467 {
1468 upl_page_info_t *pl;
1469 vm_offset_t src_paddr;
1470 upl_t upl;
1471 vm_offset_t upl_offset;
1472 int tail_size;
1473 int io_size;
1474 int upl_size;
1475 int upl_needed_size;
1476 int pages_in_pl;
1477 int upl_flags;
1478 kern_return_t kret;
1479 struct iovec *iov;
1480 int error = 0;
1481
1482 /*
1483 * When we enter this routine, we know
1484 * -- the resid will not exceed iov_len
1485 * -- the vector target address is physically contiguous
1486 */
1487 cluster_try_push(vp, newEOF, 0, 1);
1488
1489 iov = uio->uio_iov;
1490 io_size = iov->iov_len;
1491 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1492 upl_needed_size = upl_offset + io_size;
1493
1494 pages_in_pl = 0;
1495 upl_size = upl_needed_size;
1496 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1497 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1498
1499 kret = vm_map_get_upl(current_map(),
1500 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1501 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1502
1503 if (kret != KERN_SUCCESS) {
1504 /*
1505 * cluster_phys_write: failed to get pagelist
1506 * note: return kret here
1507 */
1508 return(EINVAL);
1509 }
1510 /*
1511 * Consider the possibility that upl_size wasn't satisfied.
1512 * This is a failure in the physical memory case.
1513 */
1514 if (upl_size < upl_needed_size) {
1515 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1516 return(EINVAL);
1517 }
1518 pl = ubc_upl_pageinfo(upl);
1519
1520 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1521
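	/*
	 * the transfer may start and/or end part way into a device block;
	 * those unaligned head and tail pieces are handled one at a time
	 * by cluster_align_phys_io, and only the device-block aligned
	 * middle is issued below as a single CL_DEV_MEMORY write
	 */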
1522 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1523 int head_size;
1524
1525 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1526
1527 if (head_size > io_size)
1528 head_size = io_size;
1529
1530 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1531
1532 if (error) {
1533 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1534
1535 return(EINVAL);
1536 }
1537 upl_offset += head_size;
1538 src_paddr += head_size;
1539 io_size -= head_size;
1540 }
1541 tail_size = io_size & (devblocksize - 1);
1542 io_size -= tail_size;
1543
1544 if (io_size) {
1545 /*
1546 * issue a synchronous write to cluster_io
1547 */
1548 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1549 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1550 }
1551 if (error == 0) {
1552 /*
1553 * The cluster_io write completed successfully,
1554 * update the uio structure
1555 */
1556 uio->uio_resid -= io_size;
1557 iov->iov_len -= io_size;
1558 iov->iov_base += io_size;
1559 uio->uio_offset += io_size;
1560 src_paddr += io_size;
1561
1562 if (tail_size)
1563 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1564 }
1565 /*
1566 * just release our hold on the physically contiguous
1567 * region without changing any state
1568 */
1569 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1570
1571 return (error);
1572 }
1573
1574
1575 static int
1576 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1577 struct vnode *vp;
1578 struct uio *uio;
1579 off_t oldEOF;
1580 off_t newEOF;
1581 off_t headOff;
1582 off_t tailOff;
1583 int devblocksize;
1584 int flags;
1585 {
1586 upl_page_info_t *pl;
1587 upl_t upl;
1588 vm_offset_t upl_offset;
1589 int upl_size;
1590 off_t upl_f_offset;
1591 int pages_in_upl;
1592 int start_offset;
1593 int xfer_resid;
1594 int io_size;
1595 int io_flags;
1596 vm_offset_t io_address;
1597 int io_offset;
1598 int bytes_to_zero;
1599 int bytes_to_move;
1600 kern_return_t kret;
1601 int retval = 0;
1602 int uio_resid;
1603 long long total_size;
1604 long long zero_cnt;
1605 off_t zero_off;
1606 long long zero_cnt1;
1607 off_t zero_off1;
1608 daddr_t start_blkno;
1609 daddr_t last_blkno;
1610
1611 if (uio) {
1612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1613 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1614
1615 uio_resid = uio->uio_resid;
1616 } else {
1617 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1618 0, 0, (int)oldEOF, (int)newEOF, 0);
1619
1620 uio_resid = 0;
1621 }
1622 zero_cnt = 0;
1623 zero_cnt1 = 0;
1624
1625 if (flags & IO_HEADZEROFILL) {
1626 /*
1627 * some filesystems (HFS is one) don't support unallocated holes within a file...
1628 * so we zero fill the intervening space between the old EOF and the offset
1629 * where the next chunk of real data begins.... ftruncate will also use this
1630 * routine to zero fill to the new EOF when growing a file... in this case, the
1631 * uio structure will not be provided
1632 */
1633 if (uio) {
1634 if (headOff < uio->uio_offset) {
1635 zero_cnt = uio->uio_offset - headOff;
1636 zero_off = headOff;
1637 }
1638 } else if (headOff < newEOF) {
1639 zero_cnt = newEOF - headOff;
1640 zero_off = headOff;
1641 }
1642 }
1643 if (flags & IO_TAILZEROFILL) {
1644 if (uio) {
1645 zero_off1 = uio->uio_offset + uio->uio_resid;
1646
1647 if (zero_off1 < tailOff)
1648 zero_cnt1 = tailOff - zero_off1;
1649 }
1650 }
1651 if (zero_cnt == 0 && uio == (struct uio *) 0)
1652 {
1653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1654 retval, 0, 0, 0, 0);
1655 return (0);
1656 }
1657
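	/*
	 * at this point zero_cnt/zero_off describe any head zero-fill range
	 * and zero_cnt1/zero_off1 any tail zero-fill range; the loop below
	 * maps a upl covering the head fill, the user data and the tail
	 * fill in pieces of at most MAX_UPL_TRANSFER pages
	 */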
1658 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1659 /*
1660 * for this iteration of the loop, figure out where our starting point is
1661 */
1662 if (zero_cnt) {
1663 start_offset = (int)(zero_off & PAGE_MASK_64);
1664 upl_f_offset = zero_off - start_offset;
1665 } else if (uio_resid) {
1666 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1667 upl_f_offset = uio->uio_offset - start_offset;
1668 } else {
1669 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1670 upl_f_offset = zero_off1 - start_offset;
1671 }
1672 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1673 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1674
1675 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1676 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1677
1678 /*
1679 * compute the size of the upl needed to encompass
1680 * the requested write... limit each call to cluster_io
1681 * to the maximum UPL size... cluster_io will clip if
1682 * this exceeds the maximum io_size for the device...
1683 * make sure to account for
1684 * a starting offset that's not page aligned
1685 */
1686 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1687
1688 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1689 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1690
1691 pages_in_upl = upl_size / PAGE_SIZE;
1692 io_size = upl_size - start_offset;
1693
1694 if ((long long)io_size > total_size)
1695 io_size = total_size;
1696
1697 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1698 last_blkno = start_blkno + pages_in_upl;
1699
1700 kret = ubc_create_upl(vp,
1701 upl_f_offset,
1702 upl_size,
1703 &upl,
1704 &pl,
1705 UPL_FLAGS_NONE);
1706 if (kret != KERN_SUCCESS)
1707 panic("cluster_write: failed to get pagelist");
1708
1709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1710 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1711
1712 if (start_offset && !upl_valid_page(pl, 0)) {
1713 int read_size;
1714
1715 /*
1716 * we're starting in the middle of the first page of the upl
1717 * and the page isn't currently valid, so we're going to have
1718 * to read it in first... this is a synchronous operation
1719 */
1720 read_size = PAGE_SIZE;
1721
1722 if ((upl_f_offset + read_size) > newEOF)
1723 read_size = newEOF - upl_f_offset;
1724
1725 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1726 CL_READ, (struct buf *)0, (struct clios *)0);
1727 if (retval) {
1728 /*
1729 * we had an error during the read which causes us to abort
1730 * the current cluster_write request... before we do, we need
1731 * to release the rest of the pages in the upl without modifying
1732 * their state and mark the failed page in error
1733 */
1734 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1735 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1736
1737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1738 (int)upl, 0, 0, retval, 0);
1739 break;
1740 }
1741 }
1742 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1743 /*
1744 * the last offset we're writing to in this upl does not end on a page
1745 * boundary... if it's not beyond the old EOF, then we'll also need to
1746 * pre-read this page in if it isn't already valid
1747 */
1748 upl_offset = upl_size - PAGE_SIZE;
1749
1750 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1751 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1752 int read_size;
1753
1754 read_size = PAGE_SIZE;
1755
1756 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1757 read_size = newEOF - (upl_f_offset + upl_offset);
1758
1759 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1760 CL_READ, (struct buf *)0, (struct clios *)0);
1761 if (retval) {
1762 /*
1763 * we had an error during the read which causes us to abort
1764 * the current cluster_write request... before we do, we
1765 * need to release the rest of the pages in the upl without
1766 * modifying their state and mark the failed page in error
1767 */
1768 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1769 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1770
1771 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1772 (int)upl, 0, 0, retval, 0);
1773 break;
1774 }
1775 }
1776 }
1777 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1778 panic("cluster_write: ubc_upl_map failed\n");
1779 xfer_resid = io_size;
1780 io_offset = start_offset;
1781
1782 while (zero_cnt && xfer_resid) {
1783
1784 if (zero_cnt < (long long)xfer_resid)
1785 bytes_to_zero = zero_cnt;
1786 else
1787 bytes_to_zero = xfer_resid;
1788
1789 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1790 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1791
1792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1793 (int)upl_f_offset + io_offset, bytes_to_zero,
1794 (int)io_offset, xfer_resid, 0);
1795 } else {
1796 int zero_pg_index;
1797
1798 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1799 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1800
1801 if ( !upl_valid_page(pl, zero_pg_index)) {
1802 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1803
1804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1805 (int)upl_f_offset + io_offset, bytes_to_zero,
1806 (int)io_offset, xfer_resid, 0);
1807
1808 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1809 !upl_dirty_page(pl, zero_pg_index)) {
1810 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1811
1812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1813 (int)upl_f_offset + io_offset, bytes_to_zero,
1814 (int)io_offset, xfer_resid, 0);
1815 }
1816 }
1817 xfer_resid -= bytes_to_zero;
1818 zero_cnt -= bytes_to_zero;
1819 zero_off += bytes_to_zero;
1820 io_offset += bytes_to_zero;
1821 }
1822 if (xfer_resid && uio_resid) {
1823 bytes_to_move = min(uio_resid, xfer_resid);
1824
1825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1826 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1827
1828 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1829
1830
1831 if (retval) {
1832 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1833 panic("cluster_write: kernel_upl_unmap failed\n");
1834
1835 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1836
1837 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1838 (int)upl, 0, 0, retval, 0);
1839 } else {
1840 uio_resid -= bytes_to_move;
1841 xfer_resid -= bytes_to_move;
1842 io_offset += bytes_to_move;
1843 }
1844 }
1845 while (xfer_resid && zero_cnt1 && retval == 0) {
1846
1847 if (zero_cnt1 < (long long)xfer_resid)
1848 bytes_to_zero = zero_cnt1;
1849 else
1850 bytes_to_zero = xfer_resid;
1851
1852 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1853 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1854
1855 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1856 (int)upl_f_offset + io_offset,
1857 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1858 } else {
1859 int zero_pg_index;
1860
1861 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1862 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1863
1864 if ( !upl_valid_page(pl, zero_pg_index)) {
1865 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1866
1867 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1868 (int)upl_f_offset + io_offset,
1869 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1870
1871 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1872 !upl_dirty_page(pl, zero_pg_index)) {
1873 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1874
1875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1876 (int)upl_f_offset + io_offset,
1877 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1878 }
1879 }
1880 xfer_resid -= bytes_to_zero;
1881 zero_cnt1 -= bytes_to_zero;
1882 zero_off1 += bytes_to_zero;
1883 io_offset += bytes_to_zero;
1884 }
1885
1886 if (retval == 0) {
1887 int cl_index;
1888 int can_delay;
1889
1890 io_size += start_offset;
1891
1892 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1893 /*
1894 * if we're extending the file with this write
1895 * we'll zero fill the rest of the page so that
1896 * if the file gets extended again in such a way as to leave a
1897 * hole starting at this EOF, we'll have zeros in the correct spot
1898 */
1899 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1900
1901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1902 (int)upl_f_offset + io_size,
1903 upl_size - io_size, 0, 0, 0);
1904 }
1905 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1906 panic("cluster_write: kernel_upl_unmap failed\n");
1907
1908 if (flags & IO_SYNC)
1909 /*
1910 * if the IO_SYNC flag is set, then we need to
1911 * bypass any clusters and immediately issue
1912 * the I/O
1913 */
1914 goto issue_io;
1915
1916 if (vp->v_clen == 0)
1917 /*
1918 * no clusters currently present
1919 */
1920 goto start_new_cluster;
1921
1922 /*
1923 * keep track of the overall dirty page
1924 * range we've developed
1925 * in case we have to fall back to the
1926 * VHASDIRTY method of flushing
1927 */
1928 if (vp->v_flag & VHASDIRTY)
1929 goto delay_io;
1930
1931 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1932 /*
1933 * we have an existing cluster... see if this write will extend it nicely
1934 */
1935 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1936 /*
1937 * the current write starts at or after the current cluster
1938 */
1939 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1940 /*
1941 * we have a write that fits entirely
1942 * within the existing cluster limits
1943 */
1944 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1945 /*
1946 * update our idea of where the cluster ends
1947 */
1948 vp->v_clusters[cl_index].last_pg = last_blkno;
1949 break;
1950 }
1951 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1952 /*
1953 * we have a write that starts in the middle of the current cluster
1954 * but extends beyond the cluster's limit
1955 * we'll clip the current cluster if we actually
1956 * overlap with the new write
1957 * and start a new cluster with the current write
1958 */
1959 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1960 vp->v_clusters[cl_index].last_pg = start_blkno;
1961 }
1962 /*
1963 * we also get here for the case where the current write starts
1964 * beyond the limit of the existing cluster
1965 *
1966 * in either case, we'll check the remaining clusters before
1967 * starting a new one
1968 */
1969 } else {
1970 /*
1971 * the current write starts in front of the current cluster
1972 */
1973 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1974 /*
1975 * we can just merge the old cluster
1976 * with the new request and leave it
1977 * in the cache
1978 */
1979 vp->v_clusters[cl_index].start_pg = start_blkno;
1980
1981 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1982 /*
1983 * the current write completely
1984 * envelops the existing cluster
1985 */
1986 vp->v_clusters[cl_index].last_pg = last_blkno;
1987 }
1988 break;
1989 }
1990
1991 /*
1992 * if we were to combine this write with the current cluster
1993 * we would exceed the cluster size limit.... so,
1994 * let's see if there's any overlap of the new I/O with
1995 * the existing cluster...
1996 *
1997 */
1998 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1999 /*
2000 * the current write extends into the existing cluster
2001 * clip the current cluster by moving the start position
2002 * to where the current write ends
2003 */
2004 vp->v_clusters[cl_index].start_pg = last_blkno;
2005 /*
2006 * if we get here, there was no way to merge
2007 * the new I/O with this cluster and
2008 * keep it under our maximum cluster length...
2009 * we'll check the remaining clusters before starting a new one
2010 */
2011 }
2012 }
2013 if (cl_index < vp->v_clen)
2014 /*
2015 * we found an existing cluster that we
2016 * could merge this I/O into
2017 */
2018 goto delay_io;
2019
2020 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2021 /*
2022 * we didn't find an existing cluster to
2023 * merge into, but there's room to start
2024 * a new one
2025 */
2026 goto start_new_cluster;
2027
2028 /*
2029 * no existing cluster to merge with and no
2030 * room to start a new one... we'll try
2031 * pushing the existing ones... if none of
2032 * them are able to be pushed, we'll have
2033 * to fall back on the VHASDIRTY mechanism...
2034 * cluster_try_push will set v_clen to the
2035 * number of remaining clusters if it is
2036 * unable to push all of them
2037 */
2038 if (vp->v_flag & VNOCACHE_DATA)
2039 can_delay = 0;
2040 else
2041 can_delay = 1;
2042
2043 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
2044 vp->v_flag |= VHASDIRTY;
2045 goto delay_io;
2046 }
2047 start_new_cluster:
2048 if (vp->v_clen == 0) {
2049 vp->v_ciosiz = devblocksize;
2050 vp->v_cstart = start_blkno;
2051 vp->v_lastw = last_blkno;
2052 }
2053 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2054 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2055 vp->v_clen++;
2056 delay_io:
2057 /*
2058 * make sure we keep v_cstart and v_lastw up to
2059 * date in case we have to fall back on the
2060 * VHASDIRTY mechanism (or we've already entered it)
2061 */
2062 if (start_blkno < vp->v_cstart)
2063 vp->v_cstart = start_blkno;
2064 if (last_blkno > vp->v_lastw)
2065 vp->v_lastw = last_blkno;
2066
2067 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2068 continue;
2069 issue_io:
2070 /*
2071 * in order to maintain some semblance of coherency with mapped writes
2072 * we need to write the cluster back out as a multiple of the PAGESIZE
2073 * unless the cluster encompasses the last page of the file... in this
2074 * case we'll round out to the nearest device block boundary
2075 */
2076 io_size = upl_size;
2077
2078 if ((upl_f_offset + io_size) > newEOF) {
2079 io_size = newEOF - upl_f_offset;
2080 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2081 }
2082
2083 if (flags & IO_SYNC)
2084 io_flags = CL_COMMIT | CL_AGE;
2085 else
2086 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2087
2088 if (vp->v_flag & VNOCACHE_DATA)
2089 io_flags |= CL_DUMP;
2090
2091 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2092 vp->v_flag |= VTHROTTLED;
2093 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2094 }
2095 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2096 io_flags, (struct buf *)0, (struct clios *)0);
2097 }
2098 }
2099 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2100 retval, 0, 0, 0, 0);
2101
2102 return (retval);
2103 }
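/*
 * Illustrative sketch (not part of the build): the cluster merge test used
 * in the write path above, restated over a minimal stand-in structure.  The
 * names example_cluster, example_try_merge and EXAMPLE_MAX_CLUSTER_PAGES are
 * hypothetical; the real code operates on vp->v_clusters[] with a limit of
 * MAX_UPL_TRANSFER pages per cluster.
 */
#if 0
struct example_cluster {
	daddr_t start_pg;	/* first page of the dirty cluster */
	daddr_t last_pg;	/* page just past the last dirty page */
};

#define EXAMPLE_MAX_CLUSTER_PAGES 256	/* stands in for MAX_UPL_TRANSFER */

/*
 * returns 1 if the write [start_blkno, last_blkno) was absorbed by this
 * cluster (possibly after growing it), 0 if the caller must look at the
 * next cluster or start a new one... on 0, the cluster may be clipped so
 * it no longer overlaps the write
 */
static int
example_try_merge(struct example_cluster *cl, daddr_t start_blkno, daddr_t last_blkno)
{
	if (start_blkno >= cl->start_pg) {
		/* the write starts at or after the cluster */
		if (last_blkno <= cl->start_pg + EXAMPLE_MAX_CLUSTER_PAGES) {
			/* fits entirely within the cluster's maximum span */
			if (last_blkno > cl->last_pg)
				cl->last_pg = last_blkno;
			return (1);
		}
		/* spills past the limit... clip the cluster if they overlap */
		if (start_blkno < cl->start_pg + EXAMPLE_MAX_CLUSTER_PAGES &&
		    cl->last_pg > start_blkno)
			cl->last_pg = start_blkno;
		return (0);
	}
	/* the write starts in front of the cluster */
	if ((cl->last_pg - start_blkno) <= EXAMPLE_MAX_CLUSTER_PAGES) {
		/* grow the cluster backwards (and forwards if enveloped) */
		cl->start_pg = start_blkno;
		if (last_blkno > cl->last_pg)
			cl->last_pg = last_blkno;
		return (1);
	}
	/* too big to combine... clip the front of the cluster on overlap */
	if (last_blkno > cl->start_pg)
		cl->start_pg = last_blkno;
	return (0);
}
#endif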
2104
2105 int
2106 cluster_read(vp, uio, filesize, devblocksize, flags)
2107 struct vnode *vp;
2108 struct uio *uio;
2109 off_t filesize;
2110 int devblocksize;
2111 int flags;
2112 {
2113 int prev_resid;
2114 int clip_size;
2115 off_t max_io_size;
2116 struct iovec *iov;
2117 vm_offset_t upl_offset;
2118 int upl_size;
2119 int pages_in_pl;
2120 upl_page_info_t *pl;
2121 int upl_flags;
2122 upl_t upl;
2123 int retval = 0;
2124
2125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2126 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2127
2128 /*
2129 * if this vnode isn't marked for no-cache data or the request
2130 * isn't coming from user space, just use the normal cached read path
2131 */
2132
2133 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2134 {
2135 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2137 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2138 return(retval);
2139 }
2140
2141 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2142 {
2143 /* we know we have a resid, so this is safe */
2144 iov = uio->uio_iov;
2145 while (iov->iov_len == 0) {
2146 uio->uio_iov++;
2147 uio->uio_iovcnt--;
2148 iov = uio->uio_iov;
2149 }
2150
2151 /*
2152 * We check every vector target and if it is physically
2153 * contiguous space, we skip the sanity checks.
2154 */
2155
2156 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2157 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2158 pages_in_pl = 0;
2159 upl_flags = UPL_QUERY_OBJECT_TYPE;
2160 if((vm_map_get_upl(current_map(),
2161 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2162 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2163 {
2164 /*
2165 * the user app must have passed in an invalid address
2166 */
2167 return (EFAULT);
2168 }
2169
2170 if (upl_flags & UPL_PHYS_CONTIG)
2171 {
2172 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2173 }
2174 else if (uio->uio_resid < 4 * PAGE_SIZE)
2175 {
2176 /*
2177 * We set a threshold of 4 pages to decide if the nocopy
2178 * read loop is worth the trouble...
2179 */
2180 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2181 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2182 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2183 return(retval);
2184 }
2185 else if (uio->uio_offset & PAGE_MASK_64)
2186 {
2187 /* Bring the file offset read up to a pagesize boundary */
2188 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2189 if (uio->uio_resid < clip_size)
2190 clip_size = uio->uio_resid;
2191 /*
2192 * Fake the resid going into the cluster_read_x call
2193 * and restore it on the way out.
2194 */
2195 prev_resid = uio->uio_resid;
2196 uio->uio_resid = clip_size;
2197 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2198 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2199 }
2200 else if ((int)iov->iov_base & PAGE_MASK_64)
2201 {
2202 clip_size = iov->iov_len;
2203 prev_resid = uio->uio_resid;
2204 uio->uio_resid = clip_size;
2205 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2206 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2207 }
2208 else
2209 {
2210 /*
2211 * If we come in here, we know the offset into
2212 * the file is on a pagesize boundary
2213 */
2214
2215 max_io_size = filesize - uio->uio_offset;
2216 clip_size = uio->uio_resid;
2217 if (iov->iov_len < clip_size)
2218 clip_size = iov->iov_len;
2219 if (max_io_size < clip_size)
2220 clip_size = (int)max_io_size;
2221
2222 if (clip_size < PAGE_SIZE)
2223 {
2224 /*
2225 * Take care of the tail end of the read in this vector.
2226 */
2227 prev_resid = uio->uio_resid;
2228 uio->uio_resid = clip_size;
2229 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2230 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2231 }
2232 else
2233 {
2234 /* round clip_size down to a multiple of pagesize */
2235 clip_size = clip_size & ~(PAGE_MASK);
2236 prev_resid = uio->uio_resid;
2237 uio->uio_resid = clip_size;
2238 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2239 if ((retval==0) && uio->uio_resid)
2240 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2241 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2242 }
2243 } /* end else */
2244 } /* end while */
2245
2246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2247 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2248
2249 return(retval);
2250 }
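/*
 * Illustrative sketch (not compiled): the resid-clipping pattern used
 * repeatedly above.  The uio is temporarily limited to clip_size bytes so
 * the callee only sees that much work, then the caller's view of the
 * remaining resid is reconstructed from how much the callee consumed.
 * example_clipped_read and its function pointer are hypothetical stand-ins
 * for the calls into cluster_read_x / cluster_nocopy_read.
 */
#if 0
static int
example_clipped_read(struct uio *uio, int clip_size,
		     int (*read_fn)(struct uio *))
{
	int prev_resid;
	int retval;

	prev_resid = uio->uio_resid;
	uio->uio_resid = clip_size;		/* callee sees only the clipped amount */

	retval = read_fn(uio);

	/*
	 * (clip_size - uio->uio_resid) is what the callee actually moved,
	 * so subtract only that from the original resid
	 */
	uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);

	return (retval);
}
#endif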
2251
2252
2253 static int
2254 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2255 struct vnode *vp;
2256 struct uio *uio;
2257 off_t filesize;
2258 int devblocksize;
2259 int flags;
2260 {
2261 upl_page_info_t *pl;
2262 upl_t upl;
2263 vm_offset_t upl_offset;
2264 int upl_size;
2265 off_t upl_f_offset;
2266 int start_offset;
2267 int start_pg;
2268 int last_pg;
2269 int uio_last;
2270 int pages_in_upl;
2271 off_t max_size;
2272 int io_size;
2273 vm_offset_t io_address;
2274 kern_return_t kret;
2275 int segflg;
2276 int error = 0;
2277 int retval = 0;
2278 int b_lblkno;
2279 int e_lblkno;
2280
2281 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2282
2283 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2284 /*
2285 * compute the size of the upl needed to encompass
2286 * the requested read... limit each call to cluster_io
2287 * to the maximum UPL size... cluster_io will clip if
2288 * this exceeds the maximum io_size for the device...
2289 * make sure to account for
2290 * a starting offset that's not page aligned
2291 */
2292 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2293 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2294 max_size = filesize - uio->uio_offset;
2295
2296 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2297 io_size = uio->uio_resid;
2298 else
2299 io_size = max_size;
2300
2301 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2302 segflg = uio->uio_segflg;
2303
2304 uio->uio_segflg = UIO_PHYS_USERSPACE;
2305
2306 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2307 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2308
2309 while (io_size && retval == 0) {
2310 int xsize;
2311 vm_offset_t paddr;
2312
2313 if (ubc_page_op(vp,
2314 upl_f_offset,
2315 UPL_POP_SET | UPL_POP_BUSY,
2316 &paddr, 0) != KERN_SUCCESS)
2317 break;
2318
2319 xsize = PAGE_SIZE - start_offset;
2320
2321 if (xsize > io_size)
2322 xsize = io_size;
2323
2324 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2325
2326 ubc_page_op(vp, upl_f_offset,
2327 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2328
2329 io_size -= xsize;
2330 start_offset = (int)
2331 (uio->uio_offset & PAGE_MASK_64);
2332 upl_f_offset = uio->uio_offset - start_offset;
2333 }
2334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2335 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2336
2337 uio->uio_segflg = segflg;
2338
2339 if (retval)
2340 break;
2341
2342 if (io_size == 0) {
2343 /*
2344 * we're already finished with this read request
2345 * let's see if we should do a read-ahead
2346 */
2347 e_lblkno = (int)
2348 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2349
2350 if (!(vp->v_flag & VRAOFF))
2351 /*
2352 * let's try to read ahead if we're in
2353 * a sequential access pattern
2354 */
2355 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2356 vp->v_lastr = e_lblkno;
2357
2358 break;
2359 }
2360 max_size = filesize - uio->uio_offset;
2361 }
2362 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2363 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2364 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2365 pages_in_upl = upl_size / PAGE_SIZE;
2366
2367 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2368 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2369
2370 kret = ubc_create_upl(vp,
2371 upl_f_offset,
2372 upl_size,
2373 &upl,
2374 &pl,
2375 UPL_FLAGS_NONE);
2376 if (kret != KERN_SUCCESS)
2377 panic("cluster_read: failed to get pagelist");
2378
2379 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2380 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2381
2382 /*
2383 * scan from the beginning of the upl looking for the first
2384 * non-valid page.... this will become the first page in
2385 * the request we're going to make to 'cluster_io'... if all
2386 * of the pages are valid, we won't call through to 'cluster_io'
2387 */
2388 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2389 if (!upl_valid_page(pl, start_pg))
2390 break;
2391 }
2392
2393 /*
2394 * scan from the starting invalid page looking for a valid
2395 * page before the end of the upl is reached, if we
2396 * find one, then it will be the last page of the request to
2397 * 'cluster_io'
2398 */
2399 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2400 if (upl_valid_page(pl, last_pg))
2401 break;
2402 }
2403
2404 if (start_pg < last_pg) {
2405 /*
2406 * we found a range of 'invalid' pages that must be filled
2407 * if the last page in this range is the last page of the file
2408 * we may have to clip the size of it to keep from reading past
2409 * the end of the last physical block associated with the file
2410 */
2411 upl_offset = start_pg * PAGE_SIZE;
2412 io_size = (last_pg - start_pg) * PAGE_SIZE;
2413
2414 if ((upl_f_offset + upl_offset + io_size) > filesize)
2415 io_size = filesize - (upl_f_offset + upl_offset);
2416
2417 /*
2418 * issue a synchronous read to cluster_io
2419 */
2420
2421 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2422 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2423 }
2424 if (error == 0) {
2425 /*
2426 * if the read completed successfully, or there was no I/O request
2427 * issued, then map the upl into kernel address space and
2428 * move the data into user land.... we'll first add on any 'valid'
2429 * pages that were present in the upl when we acquired it.
2430 */
2431 u_int val_size;
2432 u_int size_of_prefetch;
2433
2434 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2435 if (!upl_valid_page(pl, uio_last))
2436 break;
2437 }
2438 /*
2439 * compute size to transfer this round, if uio->uio_resid is
2440 * still non-zero after this uiomove, we'll loop around and
2441 * set up for another I/O.
2442 */
2443 val_size = (uio_last * PAGE_SIZE) - start_offset;
2444
2445 if (max_size < val_size)
2446 val_size = max_size;
2447
2448 if (uio->uio_resid < val_size)
2449 val_size = uio->uio_resid;
2450
2451 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2452
2453 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2454 /*
2455 * if there's still I/O left to do for this request, then issue a
2456 * pre-fetch I/O... the I/O wait time will overlap
2457 * with the copying of the data
2458 */
2459 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2460 } else {
2461 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2462 /*
2463 * let's try to read ahead if we're in
2464 * a sequential access pattern
2465 */
2466 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2467 vp->v_lastr = e_lblkno;
2468 }
2469 if (uio->uio_segflg == UIO_USERSPACE) {
2470 int offset;
2471
2472 segflg = uio->uio_segflg;
2473
2474 uio->uio_segflg = UIO_PHYS_USERSPACE;
2475
2476
2477 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2478 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2479
2480 offset = start_offset;
2481
2482 while (val_size && retval == 0) {
2483 int csize;
2484 int i;
2485 caddr_t paddr;
2486
2487 i = offset / PAGE_SIZE;
2488 csize = min(PAGE_SIZE - start_offset, val_size);
2489
2490 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2491
2492 retval = uiomove(paddr, csize, uio);
2493
2494 val_size -= csize;
2495 offset += csize;
2496 start_offset = offset & PAGE_MASK;
2497 }
2498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2499 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2500
2501 uio->uio_segflg = segflg;
2502 }
2503 else
2504 {
2505 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2506 panic("cluster_read: ubc_upl_map() failed\n");
2507
2508 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2509
2510 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2511 panic("cluster_read: ubc_upl_unmap() failed\n");
2512 }
2513 }
2514 if (start_pg < last_pg) {
2515 /*
2516 * compute the range of pages that we actually issued an I/O for
2517 * and either commit them as valid if the I/O succeeded
2518 * or abort them if the I/O failed
2519 */
2520 io_size = (last_pg - start_pg) * PAGE_SIZE;
2521
2522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2523 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2524
2525 if (error || (vp->v_flag & VNOCACHE_DATA))
2526 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2527 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2528 else
2529 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2530 UPL_COMMIT_CLEAR_DIRTY
2531 | UPL_COMMIT_FREE_ON_EMPTY
2532 | UPL_COMMIT_INACTIVATE);
2533
2534 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2535 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2536 }
2537 if ((last_pg - start_pg) < pages_in_upl) {
2538 int cur_pg;
2539 int commit_flags;
2540
2541 /*
2542 * the set of pages that we issued an I/O for did not encompass
2543 * the entire upl... so just release these without modifying
2544 * their state
2545 */
2546 if (error)
2547 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2548 else {
2549 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2550 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2551
2552 if (start_pg) {
2553 /*
2554 * we found some already valid pages at the beginning of
2555 * the upl... commit these back to the inactive list with
2556 * reference cleared
2557 */
2558 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2559 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2560 | UPL_COMMIT_INACTIVATE;
2561
2562 if (upl_dirty_page(pl, cur_pg))
2563 commit_flags |= UPL_COMMIT_SET_DIRTY;
2564
2565 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2566 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2567 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2568 else
2569 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2570 PAGE_SIZE, commit_flags);
2571 }
2572 }
2573 if (last_pg < uio_last) {
2574 /*
2575 * we found some already valid pages immediately after the
2576 * pages we issued I/O for, commit these back to the
2577 * inactive list with reference cleared
2578 */
2579 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2580 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2581 | UPL_COMMIT_INACTIVATE;
2582
2583 if (upl_dirty_page(pl, cur_pg))
2584 commit_flags |= UPL_COMMIT_SET_DIRTY;
2585
2586 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2587 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2588 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2589 else
2590 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2591 PAGE_SIZE, commit_flags);
2592 }
2593 }
2594 if (uio_last < pages_in_upl) {
2595 /*
2596 * there were some invalid pages beyond the valid pages
2597 * that we didn't issue an I/O for, just release them
2598 * unchanged
2599 */
2600 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2601 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2602 }
2603
2604 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2605 (int)upl, -1, -1, 0, 0);
2606 }
2607 }
2608 if (retval == 0)
2609 retval = error;
2610 }
2611
2612 return (retval);
2613 }
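/*
 * Illustrative sketch (not compiled): how cluster_read_x picks the range of
 * pages it hands to cluster_io... the first non-valid page starts the I/O
 * and the next valid page (or the end of the upl) terminates it.  'valid'
 * is a hypothetical boolean array standing in for upl_valid_page(pl, pg).
 */
#if 0
static void
example_find_io_range(const char *valid, int pages_in_upl,
		      int *start_pg, int *last_pg)
{
	int pg;

	/* first non-valid page becomes the start of the request */
	for (pg = 0; pg < pages_in_upl; pg++)
		if (!valid[pg])
			break;
	*start_pg = pg;

	/* the next valid page (or the end of the upl) terminates it */
	for (; pg < pages_in_upl; pg++)
		if (valid[pg])
			break;
	*last_pg = pg;

	/* if *start_pg == *last_pg every page was already valid... no I/O needed */
}
#endif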
2614
2615
2616 static int
2617 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2618 struct vnode *vp;
2619 struct uio *uio;
2620 off_t filesize;
2621 int devblocksize;
2622 int flags;
2623 {
2624 upl_t upl;
2625 upl_page_info_t *pl;
2626 off_t upl_f_offset;
2627 vm_offset_t upl_offset;
2628 off_t start_upl_f_offset;
2629 off_t max_io_size;
2630 int io_size;
2631 int upl_size;
2632 int upl_needed_size;
2633 int pages_in_pl;
2634 vm_offset_t paddr;
2635 int upl_flags;
2636 kern_return_t kret;
2637 int segflg;
2638 struct iovec *iov;
2639 int i;
2640 int force_data_sync;
2641 int retval = 0;
2642 int first = 1;
2643 struct clios iostate;
2644
2645 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2646 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2647
2648 /*
2649 * When we enter this routine, we know
2650 * -- the offset into the file is on a pagesize boundary
2651 * -- the resid is a page multiple
2652 * -- the resid will not exceed iov_len
2653 */
2654
2655 iostate.io_completed = 0;
2656 iostate.io_issued = 0;
2657 iostate.io_error = 0;
2658 iostate.io_wanted = 0;
2659
2660 iov = uio->uio_iov;
2661
2662 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2663
2664 max_io_size = filesize - uio->uio_offset;
2665
2666 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2667 io_size = max_io_size;
2668 else
2669 io_size = uio->uio_resid;
2670
2671 /*
2672 * We don't come into this routine unless
2673 * UIO_USERSPACE is set.
2674 */
2675 segflg = uio->uio_segflg;
2676
2677 uio->uio_segflg = UIO_PHYS_USERSPACE;
2678
2679 /*
2680 * First look for pages already in the cache
2681 * and move them to user space.
2682 */
2683 while (io_size && (retval == 0)) {
2684 upl_f_offset = uio->uio_offset;
2685
2686 /*
2687 * If this call fails, it means the page is not
2688 * in the page cache.
2689 */
2690 if (ubc_page_op(vp, upl_f_offset,
2691 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2692 break;
2693
2694 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2695
2696 ubc_page_op(vp, upl_f_offset,
2697 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2698
2699 io_size -= PAGE_SIZE;
2700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2701 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2702 }
2703 uio->uio_segflg = segflg;
2704
2705 if (retval) {
2706 /*
2707 * we may have already spun some portion of this request
2708 * off as async requests... we need to wait for the I/O
2709 * to complete before returning
2710 */
2711 goto wait_for_reads;
2712 }
2713 /*
2714 * If we are already finished with this read, then return
2715 */
2716 if (io_size == 0) {
2717 /*
2718 * we may have already spun some portion of this request
2719 * off as async requests... we need to wait for the I/O
2720 * to complete before returning
2721 */
2722 goto wait_for_reads;
2723 }
2724 max_io_size = io_size;
2725
2726 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2727 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2728 if (first) {
2729 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2730 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2731 first = 0;
2732 }
2733 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2734 upl_f_offset = start_upl_f_offset;
2735 io_size = 0;
2736
2737 while (io_size < max_io_size) {
2738 if (ubc_page_op(vp, upl_f_offset,
2739 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2740 ubc_page_op(vp, upl_f_offset,
2741 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2742 break;
2743 }
2744 /*
2745 * Build up the io request parameters.
2746 */
2747 io_size += PAGE_SIZE_64;
2748 upl_f_offset += PAGE_SIZE_64;
2749 }
2750 if (io_size == 0)
2751 /*
2752 * we may have already spun some portion of this request
2753 * off as async requests... we need to wait for the I/O
2754 * to complete before returning
2755 */
2756 goto wait_for_reads;
2757
2758 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2759 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2760
2761 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2762 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2763
2764 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2765 pages_in_pl = 0;
2766 upl_size = upl_needed_size;
2767 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2768
2769 kret = vm_map_get_upl(current_map(),
2770 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2771 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2772
2773 if (kret != KERN_SUCCESS) {
2774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2775 (int)upl_offset, upl_size, io_size, kret, 0);
2776
2777 /*
2778 * cluster_nocopy_read: failed to get pagelist
2779 *
2780 * we may have already spun some portion of this request
2781 * off as async requests... we need to wait for the I/O
2782 * to complete before returning
2783 */
2784 goto wait_for_reads;
2785 }
2786 pages_in_pl = upl_size / PAGE_SIZE;
2787 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2788
2789 for (i = 0; i < pages_in_pl; i++) {
2790 if (!upl_valid_page(pl, i))
2791 break;
2792 }
2793 if (i == pages_in_pl)
2794 break;
2795
2796 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2797 UPL_ABORT_FREE_ON_EMPTY);
2798 }
2799 if (force_data_sync >= 3) {
2800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2801 (int)upl_offset, upl_size, io_size, kret, 0);
2802
2803 goto wait_for_reads;
2804 }
2805 /*
2806 * Consider the possibility that upl_size wasn't satisfied.
2807 */
2808 if (upl_size != upl_needed_size)
2809 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2810
2811 if (io_size == 0) {
2812 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2813 UPL_ABORT_FREE_ON_EMPTY);
2814 goto wait_for_reads;
2815 }
2816 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2817 (int)upl_offset, upl_size, io_size, kret, 0);
2818
2819 /*
2820 * request asynchronously so that we can overlap
2821 * the preparation of the next I/O...
2822 * if there are already too many outstanding reads
2823 * wait until some have completed before issuing the next read
2824 */
2825 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2826 iostate.io_wanted = 1;
2827 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2828 }
2829 if (iostate.io_error) {
2830 /*
2831 * one of the earlier reads we issued ran into a hard error
2832 * don't issue any more reads, cleanup the UPL
2833 * that was just created but not used, then
2834 * go wait for any other reads to complete before
2835 * returning the error to the caller
2836 */
2837 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2838 UPL_ABORT_FREE_ON_EMPTY);
2839
2840 goto wait_for_reads;
2841 }
2842 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2843 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2844
2845 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2846 io_size, devblocksize,
2847 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2848 (struct buf *)0, &iostate);
2849
2850 /*
2851 * update the uio structure
2852 */
2853 iov->iov_base += io_size;
2854 iov->iov_len -= io_size;
2855 uio->uio_resid -= io_size;
2856 uio->uio_offset += io_size;
2857
2858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2859 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2860
2861 } /* end while */
2862
2863 wait_for_reads:
2864 /*
2865 * make sure all async reads that are part of this stream
2866 * have completed before we return
2867 */
2868 while (iostate.io_issued != iostate.io_completed) {
2869 iostate.io_wanted = 1;
2870 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2871 }
2872 if (iostate.io_error)
2873 retval = iostate.io_error;
2874
2875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2876 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2877
2878 return (retval);
2879 }
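/*
 * Illustrative sketch (not compiled): the in-flight throttle and final drain
 * used with 'struct clios' above.  example_window stands in for
 * (2 * MAX_UPL_TRANSFER * PAGE_SIZE); the real code sleeps on
 * iostate.io_wanted and is woken by the I/O completion path when
 * io_completed advances.
 */
#if 0
static void
example_throttle_then_drain(struct clios *iostate, int example_window, int drain)
{
	/* while draining we wait for everything issued to complete */
	int limit = drain ? 0 : example_window;

	while ((iostate->io_issued - iostate->io_completed) > limit) {
		iostate->io_wanted = 1;
		tsleep((caddr_t)&iostate->io_wanted, PRIBIO + 1, "example_nocopy", 0);
	}
}
#endif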
2880
2881
2882 static int
2883 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2884 struct vnode *vp;
2885 struct uio *uio;
2886 off_t filesize;
2887 int devblocksize;
2888 int flags;
2889 {
2890 upl_page_info_t *pl;
2891 upl_t upl;
2892 vm_offset_t upl_offset;
2893 vm_offset_t dst_paddr;
2894 off_t max_size;
2895 int io_size;
2896 int tail_size;
2897 int upl_size;
2898 int upl_needed_size;
2899 int pages_in_pl;
2900 int upl_flags;
2901 kern_return_t kret;
2902 struct iovec *iov;
2903 struct clios iostate;
2904 int error;
2905
2906 /*
2907 * When we enter this routine, we know
2908 * -- the resid will not exceed iov_len
2909 * -- the target address is physically contiguous
2910 */
2911
2912 iov = uio->uio_iov;
2913
2914 max_size = filesize - uio->uio_offset;
2915
2916 if (max_size > (off_t)((unsigned int)iov->iov_len))
2917 io_size = iov->iov_len;
2918 else
2919 io_size = max_size;
2920
2921 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2922 upl_needed_size = upl_offset + io_size;
2923
2924 error = 0;
2925 pages_in_pl = 0;
2926 upl_size = upl_needed_size;
2927 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2928
2929 kret = vm_map_get_upl(current_map(),
2930 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2931 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2932
2933 if (kret != KERN_SUCCESS) {
2934 /*
2935 * cluster_phys_read: failed to get pagelist
2936 */
2937 return(EINVAL);
2938 }
2939 if (upl_size < upl_needed_size) {
2940 /*
2941 * The upl_size wasn't satisfied.
2942 */
2943 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2944
2945 return(EINVAL);
2946 }
2947 pl = ubc_upl_pageinfo(upl);
2948
2949 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2950
2951 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2952 int head_size;
2953
2954 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2955
2956 if (head_size > io_size)
2957 head_size = io_size;
2958
2959 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2960
2961 if (error) {
2962 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2963
2964 return(EINVAL);
2965 }
2966 upl_offset += head_size;
2967 dst_paddr += head_size;
2968 io_size -= head_size;
2969 }
2970 tail_size = io_size & (devblocksize - 1);
2971 io_size -= tail_size;
2972
2973 iostate.io_completed = 0;
2974 iostate.io_issued = 0;
2975 iostate.io_error = 0;
2976 iostate.io_wanted = 0;
2977
2978 while (io_size && error == 0) {
2979 int xsize;
2980
2981 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2982 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2983 else
2984 xsize = io_size;
2985 /*
2986 * request asynchronously so that we can overlap
2987 * the preparation of the next I/O... we'll do
2988 * the commit after all the I/O has completed
2989 * since it's all issued against the same UPL...
2990 * if there are already too many outstanding reads
2991 * wait until some have completed before issuing the next
2992 */
2993 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2994 iostate.io_wanted = 1;
2995 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2996 }
2997
2998 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2999 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3000 (struct buf *)0, &iostate);
3001 /*
3002 * The cluster_io read was issued successfully,
3003 * update the uio structure
3004 */
3005 if (error == 0) {
3006 uio->uio_resid -= xsize;
3007 iov->iov_len -= xsize;
3008 iov->iov_base += xsize;
3009 uio->uio_offset += xsize;
3010 dst_paddr += xsize;
3011 upl_offset += xsize;
3012 io_size -= xsize;
3013 }
3014 }
3015 /*
3016 * make sure all async reads that are part of this stream
3017 * have completed before we proceed
3018 */
3019 while (iostate.io_issued != iostate.io_completed) {
3020 iostate.io_wanted = 1;
3021 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3022 }
3023 if (iostate.io_error) {
3024 error = iostate.io_error;
3025 }
3026 if (error == 0 && tail_size)
3027 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
3028
3029 /*
3030 * just release our hold on the physically contiguous
3031 * region without changing any state
3032 */
3033 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3034
3035 return (error);
3036 }
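/*
 * Illustrative sketch (not compiled): a simplified version of how the
 * physically contiguous transfer above is split into an unaligned head,
 * a device-block aligned middle, and an unaligned tail relative to
 * devblocksize (assumed to be a power of two).  The head and tail pieces
 * go through cluster_align_phys_io; the names here are hypothetical.
 */
#if 0
static void
example_split_transfer(off_t offset, int io_size, int devblocksize,
		       int *head_size, int *middle_size, int *tail_size)
{
	*head_size = 0;

	if (offset & (devblocksize - 1)) {
		/* bring the transfer up to a device block boundary */
		*head_size = devblocksize - (int)(offset & (devblocksize - 1));
		if (*head_size > io_size)
			*head_size = io_size;
		io_size -= *head_size;
	}
	/* whatever doesn't fill a whole device block at the end is the tail */
	*tail_size = io_size & (devblocksize - 1);
	*middle_size = io_size - *tail_size;
}
#endif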
3037
3038
3039 /*
3040 * generate advisory I/O's in the largest chunks possible
3041 * the completed pages will be released into the VM cache
3042 */
3043 int
3044 advisory_read(vp, filesize, f_offset, resid, devblocksize)
3045 struct vnode *vp;
3046 off_t filesize;
3047 off_t f_offset;
3048 int resid;
3049 int devblocksize;
3050 {
3051 upl_page_info_t *pl;
3052 upl_t upl;
3053 vm_offset_t upl_offset;
3054 int upl_size;
3055 off_t upl_f_offset;
3056 int start_offset;
3057 int start_pg;
3058 int last_pg;
3059 int pages_in_upl;
3060 off_t max_size;
3061 int io_size;
3062 kern_return_t kret;
3063 int retval = 0;
3064 int issued_io;
3065
3066 if (!UBCINFOEXISTS(vp))
3067 return(EINVAL);
3068
3069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3070 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3071
3072 while (resid && f_offset < filesize && retval == 0) {
3073 /*
3074 * compute the size of the upl needed to encompass
3075 * the requested read... limit each call to cluster_io
3076 * to the maximum UPL size... cluster_io will clip if
3077 * this exceeds the maximum io_size for the device...
3078 * make sure to account for
3079 * a starting offset that's not page aligned
3080 */
3081 start_offset = (int)(f_offset & PAGE_MASK_64);
3082 upl_f_offset = f_offset - (off_t)start_offset;
3083 max_size = filesize - f_offset;
3084
3085 if (resid < max_size)
3086 io_size = resid;
3087 else
3088 io_size = max_size;
3089
3090 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3091 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3092 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3093 pages_in_upl = upl_size / PAGE_SIZE;
3094
3095 kret = ubc_create_upl(vp,
3096 upl_f_offset,
3097 upl_size,
3098 &upl,
3099 &pl,
3100 UPL_RET_ONLY_ABSENT);
3101 if (kret != KERN_SUCCESS)
3102 return(retval);
3103 issued_io = 0;
3104
3105 /*
3106 * before we start marching forward, we must make sure we end on
3107 * a present page, otherwise we will be working with a freed
3108 * upl
3109 */
3110 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3111 if (upl_page_present(pl, last_pg))
3112 break;
3113 }
3114 pages_in_upl = last_pg + 1;
3115
3116
3117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3118 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3119
3120
3121 for (last_pg = 0; last_pg < pages_in_upl; ) {
3122 /*
3123 * scan from the beginning of the upl looking for the first
3124 * page that is present.... this will become the first page in
3125 * the request we're going to make to 'cluster_io'... if all
3126 * of the pages are absent, we won't call through to 'cluster_io'
3127 */
3128 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3129 if (upl_page_present(pl, start_pg))
3130 break;
3131 }
3132
3133 /*
3134 * scan from the starting present page looking for an absent
3135 * page before the end of the upl is reached, if we
3136 * find one, then it will terminate the range of pages being
3137 * presented to 'cluster_io'
3138 */
3139 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3140 if (!upl_page_present(pl, last_pg))
3141 break;
3142 }
3143
3144 if (last_pg > start_pg) {
3145 /*
3146 * we found a range of pages that must be filled
3147 * if the last page in this range is the last page of the file
3148 * we may have to clip the size of it to keep from reading past
3149 * the end of the last physical block associated with the file
3150 */
3151 upl_offset = start_pg * PAGE_SIZE;
3152 io_size = (last_pg - start_pg) * PAGE_SIZE;
3153
3154 if ((upl_f_offset + upl_offset + io_size) > filesize)
3155 io_size = filesize - (upl_f_offset + upl_offset);
3156
3157 /*
3158 * issue an asynchronous read to cluster_io
3159 */
3160 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3161 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3162
3163 issued_io = 1;
3164 }
3165 }
3166 if (issued_io == 0)
3167 ubc_upl_abort(upl, 0);
3168
3169 io_size = upl_size - start_offset;
3170
3171 if (io_size > resid)
3172 io_size = resid;
3173 f_offset += io_size;
3174 resid -= io_size;
3175 }
3176
3177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3178 (int)f_offset, resid, retval, 0, 0);
3179
3180 return(retval);
3181 }
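/*
 * Illustrative sketch (not compiled): why advisory_read trims the upl
 * before walking it.  Every I/O issued against the upl commits and frees
 * its pages, so the walk must end on the last page actually present in
 * the upl... otherwise the loop could touch a upl that has already been
 * freed.  'present' is a hypothetical stand-in for upl_page_present(pl, pg).
 */
#if 0
static int
example_trim_to_last_present(const char *present, int pages_in_upl)
{
	int last_pg;

	for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--)
		if (present[last_pg])
			break;

	/* new page count for the walk; 0 means nothing in the upl is present */
	return (last_pg + 1);
}
#endif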
3182
3183
3184 int
3185 cluster_push(vp)
3186 struct vnode *vp;
3187 {
3188 int retval;
3189
3190 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3191 vp->v_flag &= ~VHASDIRTY;
3192 return(0);
3193 }
3194
3195 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3196 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3197
3198 if (vp->v_flag & VHASDIRTY) {
3199 daddr_t start_pg;
3200 daddr_t last_pg;
3201 daddr_t end_pg;
3202
3203 start_pg = vp->v_cstart;
3204 end_pg = vp->v_lastw;
3205
3206 vp->v_flag &= ~VHASDIRTY;
3207 vp->v_clen = 0;
3208
3209 while (start_pg < end_pg) {
3210 last_pg = start_pg + MAX_UPL_TRANSFER;
3211
3212 if (last_pg > end_pg)
3213 last_pg = end_pg;
3214
3215 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3216
3217 start_pg = last_pg;
3218 }
3219 return (1);
3220 }
3221 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3222
3223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3224 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3225
3226 return (retval);
3227 }
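/*
 * Illustrative sketch (not compiled): the VHASDIRTY fallback above, which
 * walks the overall dirty page range in MAX_UPL_TRANSFER sized chunks.
 * example_push_range and chunk_pages are hypothetical stand-ins for
 * cluster_push_x and MAX_UPL_TRANSFER.
 */
#if 0
static void
example_push_dirty_range(daddr_t start_pg, daddr_t end_pg, daddr_t chunk_pages,
			 void (*example_push_range)(daddr_t, daddr_t))
{
	daddr_t last_pg;

	while (start_pg < end_pg) {
		last_pg = start_pg + chunk_pages;

		if (last_pg > end_pg)
			last_pg = end_pg;

		example_push_range(start_pg, last_pg);

		start_pg = last_pg;
	}
}
#endif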
3228
3229
3230 static int
3231 cluster_try_push(vp, EOF, can_delay, push_all)
3232 struct vnode *vp;
3233 off_t EOF;
3234 int can_delay;
3235 int push_all;
3236 {
3237 int cl_index;
3238 int cl_index1;
3239 int min_index;
3240 int cl_len;
3241 int cl_total;
3242 int cl_pushed;
3243 struct v_cluster l_clusters[MAX_CLUSTERS];
3244
3245 /*
3246 * make a local 'sorted' copy of the clusters
3247 * and clear vp->v_clen so that new clusters can
3248 * be developed
3249 */
3250 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3251 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3252 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3253 continue;
3254 if (min_index == -1)
3255 min_index = cl_index1;
3256 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3257 min_index = cl_index1;
3258 }
3259 if (min_index == -1)
3260 break;
3261 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3262 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3263
3264 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3265 }
3266 cl_len = cl_index;
3267 vp->v_clen = 0;
3268
3269 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3270 /*
3271 * try to push each cluster in turn... cluster_push_x may not
3272 * push the cluster if can_delay is TRUE and the cluster doesn't
3273 * meet the criteria for an immediate push
3274 */
3275 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3276 l_clusters[cl_index].start_pg = 0;
3277 l_clusters[cl_index].last_pg = 0;
3278
3279 cl_pushed++;
3280
3281 if (push_all == 0)
3282 break;
3283 }
3284 }
3285 if (cl_len > cl_pushed) {
3286 /*
3287 * we didn't push all of the clusters, so
3288 * lets try to merge them back in to the vnode
3289 */
3290 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3291 /*
3292 * we picked up some new clusters while we were trying to
3293 * push the old ones (I don't think this can happen because
3294 * I'm holding the lock, but just in case)... the sum of the
3295 * leftovers plus the new cluster count exceeds our ability
3296 * to represent them, so fall back to the VHASDIRTY mechanism
3297 */
3298 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3299 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3300 continue;
3301
3302 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3303 vp->v_cstart = l_clusters[cl_index].start_pg;
3304 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3305 vp->v_lastw = l_clusters[cl_index].last_pg;
3306 }
3307 vp->v_flag |= VHASDIRTY;
3308 } else {
3309 /*
3310 * we've got room to merge the leftovers back in
3311 * just append them starting at the next 'hole'
3312 * represented by vp->v_clen
3313 */
3314 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3315 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3316 continue;
3317
3318 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3319 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3320
3321 if (cl_index1 == 0) {
3322 vp->v_cstart = l_clusters[cl_index].start_pg;
3323 vp->v_lastw = l_clusters[cl_index].last_pg;
3324 } else {
3325 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3326 vp->v_cstart = l_clusters[cl_index].start_pg;
3327 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3328 vp->v_lastw = l_clusters[cl_index].last_pg;
3329 }
3330 cl_index1++;
3331 }
3332 /*
3333 * update the cluster count
3334 */
3335 vp->v_clen = cl_index1;
3336 }
3337 }
3338 return(MAX_CLUSTERS - vp->v_clen);
3339 }
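/*
 * Illustrative sketch (not compiled): the selection pass above that copies
 * the vnode's clusters into a locally sorted array.  An entry whose start_pg
 * equals its last_pg is treated as empty, which is also how consumed source
 * entries are marked.  example_cluster and example_sort_clusters are
 * hypothetical stand-ins for struct v_cluster and the inline loop.
 */
#if 0
struct example_cluster {
	daddr_t start_pg;
	daddr_t last_pg;
};

static int
example_sort_clusters(struct example_cluster *src, int src_len,
		      struct example_cluster *dst)
{
	int i;
	int j;
	int min_index;

	for (i = 0; i < src_len; i++) {
		/* pick the remaining non-empty cluster with the lowest start_pg */
		for (min_index = -1, j = 0; j < src_len; j++) {
			if (src[j].start_pg == src[j].last_pg)
				continue;	/* empty or already consumed */
			if (min_index == -1 || src[j].start_pg < src[min_index].start_pg)
				min_index = j;
		}
		if (min_index == -1)
			break;
		dst[i] = src[min_index];

		/* mark the source entry consumed */
		src[min_index].start_pg = src[min_index].last_pg;
	}
	return (i);	/* number of non-empty clusters copied */
}
#endif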
3340
3341
3342
3343 static int
3344 cluster_push_x(vp, EOF, first, last, can_delay)
3345 struct vnode *vp;
3346 off_t EOF;
3347 daddr_t first;
3348 daddr_t last;
3349 int can_delay;
3350 {
3351 upl_page_info_t *pl;
3352 upl_t upl;
3353 vm_offset_t upl_offset;
3354 int upl_size;
3355 off_t upl_f_offset;
3356 int pages_in_upl;
3357 int start_pg;
3358 int last_pg;
3359 int io_size;
3360 int io_flags;
3361 int size;
3362 kern_return_t kret;
3363
3364
3365 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3366 vp->v_clen, first, last, EOF, 0);
3367
3368 if ((pages_in_upl = last - first) == 0) {
3369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3370
3371 return (1);
3372 }
3373 upl_size = pages_in_upl * PAGE_SIZE;
3374 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3375
3376 if (upl_f_offset + upl_size >= EOF) {
3377
3378 if (upl_f_offset >= EOF) {
3379 /*
3380 * must have truncated the file and missed
3381 * clearing a dangling cluster (i.e. it's completely
3382 * beyond the new EOF)
3383 */
3384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3385
3386 return(1);
3387 }
3388 size = EOF - upl_f_offset;
3389
3390 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3391 pages_in_upl = upl_size / PAGE_SIZE;
3392 } else {
3393 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3394 return(0);
3395 size = upl_size;
3396 }
3397 kret = ubc_create_upl(vp,
3398 upl_f_offset,
3399 upl_size,
3400 &upl,
3401 &pl,
3402 UPL_RET_ONLY_DIRTY);
3403 if (kret != KERN_SUCCESS)
3404 panic("cluster_push: failed to get pagelist");
3405
3406 if (can_delay) {
3407 int num_of_dirty;
3408
3409 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3410 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3411 num_of_dirty++;
3412 }
3413 if (num_of_dirty < pages_in_upl / 2) {
3414 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3415
3416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3417
3418 return(0);
3419 }
3420 }
3421 last_pg = 0;
3422
3423 while (size) {
3424
3425 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3426 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3427 break;
3428 }
3429 if (start_pg > last_pg) {
3430 io_size = (start_pg - last_pg) * PAGE_SIZE;
3431
3432 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3433 UPL_ABORT_FREE_ON_EMPTY);
3434
3435 if (io_size < size)
3436 size -= io_size;
3437 else
3438 break;
3439 }
3440 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3441 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3442 break;
3443 }
3444 upl_offset = start_pg * PAGE_SIZE;
3445
3446 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3447
3448 if (vp->v_flag & VNOCACHE_DATA)
3449 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3450 else
3451 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3452
3453 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3454 vp->v_flag |= VTHROTTLED;
3455 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3456 }
3457 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3458
3459 size -= io_size;
3460 }
3461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3462
3463 return(1);
3464 }
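/*
 * Illustrative sketch (not compiled): the scan above that carves the upl
 * into runs, releasing clean pages and writing out runs of dirty ones.
 * 'dirty' is a hypothetical array standing in for
 * upl_valid_page(pl, pg) && upl_dirty_page(pl, pg), and the callbacks stand
 * in for ubc_upl_abort_range and cluster_io.
 */
#if 0
static void
example_walk_dirty_runs(const char *dirty, int pages_in_upl,
			void (*release_clean)(int pg, int npgs),
			void (*write_dirty)(int pg, int npgs))
{
	int start_pg;
	int last_pg = 0;

	while (last_pg < pages_in_upl) {
		/* skip (and release) the run of clean pages, if any */
		for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++)
			if (dirty[start_pg])
				break;
		if (start_pg > last_pg)
			release_clean(last_pg, start_pg - last_pg);

		/* find the end of the dirty run and write it out */
		for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++)
			if (!dirty[last_pg])
				break;
		if (last_pg > start_pg)
			write_dirty(start_pg, last_pg - start_pg);
	}
}
#endif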
3465
3466
3467
3468 static int
3469 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3470 {
3471 struct iovec *iov;
3472 upl_page_info_t *pl;
3473 upl_t upl;
3474 vm_offset_t ubc_paddr;
3475 kern_return_t kret;
3476 int error = 0;
3477
3478 iov = uio->uio_iov;
3479
3480 kret = ubc_create_upl(vp,
3481 uio->uio_offset & ~PAGE_MASK_64,
3482 PAGE_SIZE,
3483 &upl,
3484 &pl,
3485 UPL_FLAGS_NONE);
3486
3487 if (kret != KERN_SUCCESS)
3488 return(EINVAL);
3489
3490 if (!upl_valid_page(pl, 0)) {
3491 /*
3492 * issue a synchronous read to cluster_io
3493 */
3494 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3495 CL_READ, (struct buf *)0, (struct clios *)0);
3496 if (error) {
3497 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3498
3499 return(error);
3500 }
3501 }
3502 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3503
3504 if (flags & CL_READ)
3505 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3506 else
3507 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3508
3509 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3510 /*
3511 * issue a synchronous write to cluster_io
3512 */
3513 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3514 0, (struct buf *)0, (struct clios *)0);
3515 }
3516 if (error == 0) {
3517 uio->uio_offset += xsize;
3518 iov->iov_base += xsize;
3519 iov->iov_len -= xsize;
3520 uio->uio_resid -= xsize;
3521 }
3522 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3523
3524 return (error);
3525 }