1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26 /*
27 * Copyright (c) 1993
28 * The Regents of the University of California. All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
59 */
60
61 #include <sys/param.h>
62 #include <sys/proc.h>
63 #include <sys/buf.h>
64 #include <sys/vnode.h>
65 #include <sys/mount.h>
66 #include <sys/trace.h>
67 #include <sys/malloc.h>
68 #include <sys/resourcevar.h>
69 #include <libkern/libkern.h>
70
71 #include <sys/ubc.h>
72 #include <vm/vm_pageout.h>
73
74 #include <sys/kdebug.h>
75
76 #define CL_READ 0x01
77 #define CL_ASYNC 0x02
78 #define CL_COMMIT 0x04
79 #define CL_PAGEOUT 0x10
80 #define CL_AGE 0x20
81 #define CL_DUMP 0x40
82 #define CL_NOZERO 0x80
83 #define CL_PAGEIN 0x100
84 #define CL_DEV_MEMORY 0x200
85 #define CL_PRESERVE 0x400
86
87
88 struct clios {
89 u_int io_completed; /* amount of io that has currently completed */
90 u_int io_issued; /* amount of io that was successfully issued */
91 int io_error; /* error code of first error encountered */
92 int io_wanted; /* someone is sleeping waiting for a change in state */
93 };
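/*
 * the clios structure tracks a stream of async cluster I/Os...
 * cluster_io adds to io_issued for each buffer it queues against the
 * stream, cluster_iodone adds to io_completed (and records the first
 * error seen) as transactions finish, clearing io_wanted and waking any
 * thread sleeping on &io_wanted... callers compare io_issued and
 * io_completed to bound the amount of outstanding I/O
 */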
94
95
96 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
97 int size, struct buf *bp);
98 static int cluster_read_x(struct vnode *vp, struct uio *uio,
99 off_t filesize, int devblocksize, int flags);
100 static int cluster_write_x(struct vnode *vp, struct uio *uio,
101 off_t oldEOF, off_t newEOF, off_t headOff,
102 off_t tailOff, int devblocksize, int flags);
103 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
104 off_t filesize, int devblocksize, int flags);
105 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
106 off_t newEOF, int devblocksize, int flags);
107 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
108 off_t filesize, int devblocksize, int flags);
109 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
110 off_t newEOF, int devblocksize, int flags);
111 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
112 addr64_t usr_paddr, int xsize, int devblocksize, int flags);
113 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
114 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
115
116
117 /*
118 * throttle the number of async writes that
119 * can be outstanding on a single vnode
120 * before we issue a synchronous write
121 */
122 #define ASYNC_THROTTLE 9
123
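/*
 * cluster_iodone is the biodone handler for a chain of cluster buffers...
 * it returns immediately unless every buffer in the transaction has
 * completed... once the last one is in, it accumulates the error and
 * resid counts, frees the component buffers, zero-fills the tail of a
 * partial EOF page if b_validend was set, updates any associated clios
 * stream, finishes off the original buf if B_NEED_IODONE is set, and
 * then commits or aborts the underlying upl range as dictated by the
 * b_flags that were set up in cluster_io
 */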
124 static int
125 cluster_iodone(bp)
126 struct buf *bp;
127 {
128 int b_flags;
129 int error;
130 int total_size;
131 int total_resid;
132 int upl_offset;
133 int zero_offset;
134 upl_t upl;
135 struct buf *cbp;
136 struct buf *cbp_head;
137 struct buf *cbp_next;
138 struct buf *real_bp;
139 struct vnode *vp;
140 struct clios *iostate;
141 int commit_size;
142 int pg_offset;
143
144
145 cbp_head = (struct buf *)(bp->b_trans_head);
146
147 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
148 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
149
150 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
151 /*
152 * all I/O requests that are part of this transaction
153 * have to complete before we can process it
154 */
155 if ( !(cbp->b_flags & B_DONE)) {
156
157 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
158 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
159
160 return 0;
161 }
162 }
163 error = 0;
164 total_size = 0;
165 total_resid = 0;
166
167 cbp = cbp_head;
168 upl_offset = cbp->b_uploffset;
169 upl = cbp->b_pagelist;
170 b_flags = cbp->b_flags;
171 real_bp = cbp->b_real_bp;
172 vp = cbp->b_vp;
173 zero_offset= cbp->b_validend;
174 iostate = (struct clios *)cbp->b_iostate;
175
176 while (cbp) {
177 if ((cbp->b_flags & B_ERROR) && error == 0)
178 error = cbp->b_error;
179
180 total_resid += cbp->b_resid;
181 total_size += cbp->b_bcount;
182
183 cbp_next = cbp->b_trans_next;
184
185 free_io_buf(cbp);
186
187 cbp = cbp_next;
188 }
189 if (zero_offset)
190 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
191
192 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
193 vp->v_flag &= ~VTHROTTLED;
194 wakeup((caddr_t)&vp->v_numoutput);
195 }
196 if (iostate) {
197 /*
198 * someone has issued multiple I/Os asynchronously
199 * and is waiting for them to complete (streaming)
200 */
201 if (error && iostate->io_error == 0)
202 iostate->io_error = error;
203
204 iostate->io_completed += total_size;
205
206 if (iostate->io_wanted) {
207 /*
208 * someone is waiting for the state of
209 * this io stream to change
210 */
211 iostate->io_wanted = 0;
212 wakeup((caddr_t)&iostate->io_wanted);
213 }
214 }
215 if ((b_flags & B_NEED_IODONE) && real_bp) {
216 if (error) {
217 real_bp->b_flags |= B_ERROR;
218 real_bp->b_error = error;
219 }
220 real_bp->b_resid = total_resid;
221
222 biodone(real_bp);
223 }
224 if (error == 0 && total_resid)
225 error = EIO;
226
227 if (b_flags & B_COMMIT_UPL) {
228 pg_offset = upl_offset & PAGE_MASK;
229 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
230
231 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
232 int upl_abort_code;
233
234 if (b_flags & B_PHYS)
235 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
236 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
237 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
238 else if (b_flags & B_PGIN)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
240 else
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
242
243 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
244 upl_abort_code);
245
246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
247 (int)upl, upl_offset - pg_offset, commit_size,
248 0x80000000|upl_abort_code, 0);
249
250 } else {
251 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
252
253 if (b_flags & B_PHYS)
254 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
255 else if ( !(b_flags & B_PAGEOUT))
256 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
257 if (b_flags & B_AGE)
258 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
259
260 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
261 upl_commit_flags);
262
263 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
264 (int)upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags, 0);
266 }
267 } else
268 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
269 (int)upl, upl_offset, 0, error, 0);
270
271 return (error);
272 }
273
274
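/*
 * cluster_zero zeroes 'size' bytes starting at 'upl_offset' in the upl...
 * if the caller's buf doesn't already have the pages mapped (bp == NULL
 * or b_data == NULL), the upl is mapped into the kernel for the duration
 * of the bzero and unmapped again afterwards
 */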
275 static void
276 cluster_zero(upl, upl_offset, size, bp)
277 upl_t upl;
278 vm_offset_t upl_offset;
279 int size;
280 struct buf *bp;
281 {
282 vm_offset_t io_addr = 0;
283 int must_unmap = 0;
284 kern_return_t kret;
285
286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
287 upl_offset, size, (int)bp, 0, 0);
288
289 if (bp == NULL || bp->b_data == NULL) {
290 kret = ubc_upl_map(upl, &io_addr);
291
292 if (kret != KERN_SUCCESS)
293 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
294 if (io_addr == 0)
295 panic("cluster_zero: ubc_upl_map() mapped 0");
296
297 must_unmap = 1;
298 } else
299 io_addr = (vm_offset_t)bp->b_data;
300 bzero((caddr_t)(io_addr + upl_offset), size);
301
302 if (must_unmap) {
303 kret = ubc_upl_unmap(upl);
304
305 if (kret != KERN_SUCCESS)
306 panic("cluster_zero: kernel_upl_unmap failed");
307 }
308 }
309
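/*
 * cluster_io is the common I/O engine for the cluster layer...
 * it walks the requested range, using VOP_CMAP to translate file offsets
 * into device blocks and to discover how much is physically contiguous,
 * builds chains of io bufs (clipped to the device's max transfer size and
 * vector count), zero-fills read 'holes', and issues each chain through
 * VOP_STRATEGY with cluster_iodone as the completion handler... for
 * synchronous requests it waits for the chain and reaps it in line, and
 * on the way out it commits or aborts the upl according to the CL_ flags
 */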
310 static int
311 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
312 struct vnode *vp;
313 upl_t upl;
314 vm_offset_t upl_offset;
315 off_t f_offset;
316 int non_rounded_size;
317 int devblocksize;
318 int flags;
319 struct buf *real_bp;
320 struct clios *iostate;
321 {
322 struct buf *cbp;
323 u_int size;
324 u_int io_size;
325 int io_flags;
326 int error = 0;
327 int retval = 0;
328 struct buf *cbp_head = 0;
329 struct buf *cbp_tail = 0;
330 upl_page_info_t *pl;
331 int buf_count = 0;
332 int pg_count;
333 int pg_offset;
334 u_int max_iosize;
335 u_int max_vectors;
336 int priv;
337 int zero_offset = 0;
338 u_int first_lblkno;
339
340 if (flags & CL_READ) {
341 io_flags = (B_VECTORLIST | B_READ);
342
343 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
344 } else {
345 io_flags = (B_VECTORLIST | B_WRITEINPROG);
346
347 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
348 }
349 pl = ubc_upl_pageinfo(upl);
350
351 if (flags & CL_AGE)
352 io_flags |= B_AGE;
353 if (flags & CL_DUMP)
354 io_flags |= B_NOCACHE;
355 if (flags & CL_PAGEIN)
356 io_flags |= B_PGIN;
357 if (flags & CL_PAGEOUT)
358 io_flags |= B_PAGEOUT;
359 if (flags & CL_COMMIT)
360 io_flags |= B_COMMIT_UPL;
361 if (flags & CL_PRESERVE)
362 io_flags |= B_PHYS;
363
364 if (devblocksize)
365 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
366 else
367 size = non_rounded_size;
368
369
370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
371 (int)f_offset, size, upl_offset, flags, 0);
372
373 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
374 /*
375 * we're going to end up
376 * with a page that we can't complete (the file size wasn't a multiple
377 * of PAGE_SIZE and we're trying to read to the end of the file),
378 * so we'll go ahead and zero out the portion of the page we can't
379 * read in from the file
380 */
381 zero_offset = upl_offset + non_rounded_size;
382 }
383 while (size) {
384 int i;
385 int pl_index;
386 int pg_resid;
387 int num_contig;
388 daddr_t lblkno;
389 daddr_t blkno;
390
391 if (size > max_iosize)
392 io_size = max_iosize;
393 else
394 io_size = size;
395
396 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
397 if (error == EOPNOTSUPP)
398 panic("VOP_CMAP Unimplemented");
399 break;
400 }
401
402 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
403 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
404
405 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
406 if (flags & CL_PAGEOUT) {
407 error = EINVAL;
408 break;
409 };
410
411 /* Try paging out the page individually before
412 giving up entirely and dumping it (it could
413 be mapped in a "hole" and require allocation
414 before the I/O)
415 */
416 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
417 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
418 error = EINVAL;
419 break;
420 };
421
422 upl_offset += PAGE_SIZE_64;
423 f_offset += PAGE_SIZE_64;
424 size -= PAGE_SIZE_64;
425 continue;
426 }
427 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
428 /*
429 * we have now figured out how much I/O we can do - this is in 'io_size'
430 * pl_index represents the first page in the 'upl' that the I/O will occur for
431 * pg_offset is the starting point in the first page for the I/O
432 * pg_count is the number of full and partial pages that 'io_size' encompasses
433 */
434 pl_index = upl_offset / PAGE_SIZE;
435 pg_offset = upl_offset & PAGE_MASK;
436 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
437
438 if (flags & CL_DEV_MEMORY) {
439 /*
440 * currently, can't deal with reading 'holes' in file
441 */
442 if ((long)blkno == -1) {
443 error = EINVAL;
444 break;
445 }
446 /*
447 * treat physical requests as one 'giant' page
448 */
449 pg_count = 1;
450 }
451 if ((flags & CL_READ) && (long)blkno == -1) {
452 int bytes_to_zero;
453
454 /*
455 * if we're reading and blkno == -1, then we've got a
456 * 'hole' in the file that we need to deal with by zeroing
457 * out the affected area in the upl
458 */
459 if (zero_offset && io_size == size) {
460 /*
461 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
462 * then 'zero_offset' will be non-zero
463 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
464 * (indicated by the io_size finishing off the I/O request for this UPL)
465 * then we're not going to issue an I/O for the
466 * last page in this upl... we need to zero both the hole and the tail
467 * of the page beyond the EOF, since the delayed zero-fill won't kick in
468 */
469 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
470
471 zero_offset = 0;
472 } else
473 bytes_to_zero = io_size;
474
475 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
476
477 if (cbp_head)
478 /*
479 * if there is a current I/O chain pending
480 * then the first page of the group we just zero'd
481 * will be handled by the I/O completion if the zero
482 * fill started in the middle of the page
483 */
484 pg_count = (io_size - pg_offset) / PAGE_SIZE;
485 else {
486 /*
487 * no pending I/O to pick up that first page
488 * so, we have to make sure it gets committed
489 * here.
490 * set the pg_offset to 0 so that the upl_commit_range
491 * starts with this page
492 */
493 pg_count = (io_size + pg_offset) / PAGE_SIZE;
494 pg_offset = 0;
495 }
496 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
497 /*
498 * if we're done with the request for this UPL
499 * then we have to make sure to commit the last page
500 * even if we only partially zero-filled it
501 */
502 pg_count++;
503
504 if (pg_count) {
505 if (pg_offset)
506 pg_resid = PAGE_SIZE - pg_offset;
507 else
508 pg_resid = 0;
509
510 if (flags & CL_COMMIT)
511 ubc_upl_commit_range(upl,
512 (upl_offset + pg_resid) & ~PAGE_MASK,
513 pg_count * PAGE_SIZE,
514 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
515 }
516 upl_offset += io_size;
517 f_offset += io_size;
518 size -= io_size;
519
520 if (cbp_head && pg_count)
521 goto start_io;
522 continue;
523
524 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
525 real_bp->b_blkno = blkno;
526 }
527
528 if (pg_count > max_vectors) {
529 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
530
531 if (io_size < 0) {
532 io_size = PAGE_SIZE - pg_offset;
533 pg_count = 1;
534 } else
535 pg_count = max_vectors;
536 }
537
538 /* Throttle the speculative IO */
539 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
540 priv = 0;
541 else
542 priv = 1;
543
544 cbp = alloc_io_buf(vp, priv);
545
546
547 if (flags & CL_PAGEOUT) {
548 for (i = 0; i < pg_count; i++) {
549 int s;
550 struct buf *bp;
551
552 s = splbio();
553 if (bp = incore(vp, lblkno + i)) {
554 if (!ISSET(bp->b_flags, B_BUSY)) {
555 bremfree(bp);
556 SET(bp->b_flags, (B_BUSY | B_INVAL));
557 splx(s);
558 brelse(bp);
559 } else
560 panic("BUSY bp found in cluster_io");
561 }
562 splx(s);
563 }
564 }
565 if (flags & CL_ASYNC) {
566 cbp->b_flags |= (B_CALL | B_ASYNC);
567 cbp->b_iodone = (void *)cluster_iodone;
568 }
569 cbp->b_flags |= io_flags;
570
571 cbp->b_lblkno = lblkno;
572 cbp->b_blkno = blkno;
573 cbp->b_bcount = io_size;
574 cbp->b_pagelist = upl;
575 cbp->b_uploffset = upl_offset;
576 cbp->b_trans_next = (struct buf *)0;
577
578 if (cbp->b_iostate = (void *)iostate)
579 /*
580 * caller wants to track the state of this
581 * io... bump the amount issued against this stream
582 */
583 iostate->io_issued += io_size;
584
585 if (flags & CL_READ)
586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
587 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
588 else
589 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
590 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
591
592 if (cbp_head) {
593 cbp_tail->b_trans_next = cbp;
594 cbp_tail = cbp;
595 } else {
596 cbp_head = cbp;
597 cbp_tail = cbp;
598 }
599 (struct buf *)(cbp->b_trans_head) = cbp_head;
600 buf_count++;
601
602 upl_offset += io_size;
603 f_offset += io_size;
604 size -= io_size;
605
606 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
607 /*
608 * if we have no more I/O to issue or
609 * the current I/O we've prepared fully
610 * completes the last page in this request
611 * and it's either an ASYNC request or
612 * we've already accumulated more than 8 I/O's into
613 * this transaction and it's not an I/O directed to
614 * special DEVICE memory
615 * then go ahead and issue the I/O
616 */
617 start_io:
618 if (real_bp) {
619 cbp_head->b_flags |= B_NEED_IODONE;
620 cbp_head->b_real_bp = real_bp;
621 } else
622 cbp_head->b_real_bp = (struct buf *)NULL;
623
624 if (size == 0) {
625 /*
626 * we're about to issue the last I/O for this upl
627 * if this was a read to the eof and the eof doesn't
628 * finish on a page boundary, then we need to zero-fill
629 * the rest of the page....
630 */
631 cbp_head->b_validend = zero_offset;
632 } else
633 cbp_head->b_validend = 0;
634
635 for (cbp = cbp_head; cbp;) {
636 struct buf * cbp_next;
637
638 if (io_flags & B_WRITEINPROG)
639 cbp->b_vp->v_numoutput++;
640
641 cbp_next = cbp->b_trans_next;
642
643 (void) VOP_STRATEGY(cbp);
644 cbp = cbp_next;
645 }
646 if ( !(flags & CL_ASYNC)) {
647 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
648 biowait(cbp);
649
650 if (error = cluster_iodone(cbp_head)) {
651 if ((flags & CL_PAGEOUT) && (error == ENXIO))
652 retval = 0; /* drop the error */
653 else
654 retval = error;
655 error = 0;
656 }
657 }
658 cbp_head = (struct buf *)0;
659 cbp_tail = (struct buf *)0;
660
661 buf_count = 0;
662 }
663 }
664 if (error) {
665 int abort_size;
666
667 io_size = 0;
668
669 for (cbp = cbp_head; cbp;) {
670 struct buf * cbp_next;
671
672 upl_offset -= cbp->b_bcount;
673 size += cbp->b_bcount;
674 io_size += cbp->b_bcount;
675
676 cbp_next = cbp->b_trans_next;
677 free_io_buf(cbp);
678 cbp = cbp_next;
679 }
680 if (iostate) {
681 /*
682 * update the error condition for this stream...
683 * since we never really issued the io,
684 * just go ahead and adjust the issued count back
685 */
686 if (iostate->io_error == 0)
687 iostate->io_error = error;
688 iostate->io_issued -= io_size;
689
690 if (iostate->io_wanted) {
691 /*
692 * someone is waiting for the state of
693 * this io stream to change
694 */
695 iostate->io_wanted = 0;
696 wakeup((caddr_t)&iostate->io_wanted);
697 }
698 }
699 pg_offset = upl_offset & PAGE_MASK;
700 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
701
702 if (flags & CL_COMMIT) {
703 int upl_abort_code;
704
705 if (flags & CL_PRESERVE)
706 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
707 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
708 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
709 else if (flags & CL_PAGEIN)
710 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
711 else
712 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
713
714 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
715 upl_abort_code);
716
717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
718 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
719 }
720 if (real_bp) {
721 real_bp->b_flags |= B_ERROR;
722 real_bp->b_error = error;
723
724 biodone(real_bp);
725 }
726 if (retval == 0)
727 retval = error;
728 }
729 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
730 (int)f_offset, size, upl_offset, retval, 0);
731
732 return (retval);
733 }
734
735
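/*
 * cluster_rd_prefetch issues an advisory read of up to MAX_UPL_TRANSFER
 * pages starting at f_offset, clipped to the end of the file... pages
 * already present in the cache at the front of the range are skipped,
 * and the return value is the number of pages the (clipped) request spans
 */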
736 static int
737 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
738 struct vnode *vp;
739 off_t f_offset;
740 u_int size;
741 off_t filesize;
742 int devblocksize;
743 {
744 int pages_to_fetch;
745 int skipped_pages;
746
747 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
748 (int)f_offset, size, (int)filesize, 0, 0);
749
750 if (f_offset >= filesize) {
751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
752 (int)f_offset, 0, 0, 0, 0);
753 return(0);
754 }
755 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
756 size = MAX_UPL_TRANSFER * PAGE_SIZE;
757 else
758 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
759
760 if ((off_t)size > (filesize - f_offset))
761 size = filesize - f_offset;
762
763 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
764
765 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
766 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
767 break;
768 f_offset += PAGE_SIZE;
769 size -= PAGE_SIZE;
770 }
771 if (skipped_pages < pages_to_fetch)
772 advisory_read(vp, filesize, f_offset, size, devblocksize);
773
774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
775 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
776
777 return (pages_to_fetch);
778 }
779
780
781
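/*
 * cluster_rd_ahead implements the sequential read-ahead heuristic...
 * v_lastr is used to detect a sequential access pattern, v_ralen is the
 * current read-ahead window (doubled on each hit, up to MAX_UPL_TRANSFER
 * pages) and v_maxra remembers how far we've already prefetched... when
 * the pattern holds, the next window is pushed out via cluster_rd_prefetch
 */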
782 static void
783 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
784 struct vnode *vp;
785 daddr_t b_lblkno;
786 daddr_t e_lblkno;
787 off_t filesize;
788 int devblocksize;
789 {
790 daddr_t r_lblkno;
791 off_t f_offset;
792 int size_of_prefetch;
793 int max_pages;
794
795 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
796 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
797
798 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
800 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
801 return;
802 }
803
804 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
805 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
806 vp->v_ralen = 0;
807 vp->v_maxra = 0;
808
809 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
810 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
811
812 return;
813 }
814 max_pages = MAX_UPL_TRANSFER;
815
816 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
817
818 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
819 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
820
821 if (e_lblkno < vp->v_maxra) {
822 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
823
824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
825 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
826 return;
827 }
828 }
829 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
830 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
831
832 if (f_offset < filesize) {
833 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
834
835 if (size_of_prefetch)
836 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
837 }
838 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
839 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
840 }
841
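/*
 * cluster_pageout is the VM pageout entry point... it validates the
 * request, clips it to the end of the file, aborts any part of the upl
 * beyond the clipped size, throttles the caller while too many async
 * writes are outstanding on the vnode, and then hands the work to
 * cluster_io with CL_PAGEOUT set
 */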
842 int
843 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
844 struct vnode *vp;
845 upl_t upl;
846 vm_offset_t upl_offset;
847 off_t f_offset;
848 int size;
849 off_t filesize;
850 int devblocksize;
851 int flags;
852 {
853 int io_size;
854 int pg_size;
855 off_t max_size;
856 int local_flags = CL_PAGEOUT;
857
858 if ((flags & UPL_IOSYNC) == 0)
859 local_flags |= CL_ASYNC;
860 if ((flags & UPL_NOCOMMIT) == 0)
861 local_flags |= CL_COMMIT;
862
863
864 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
865 (int)f_offset, size, (int)filesize, local_flags, 0);
866
867 /*
868 * If they didn't specify any I/O, then we are done...
869 * we can't issue an abort because we don't know how
870 * big the upl really is
871 */
872 if (size <= 0)
873 return (EINVAL);
874
875 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
876 if (local_flags & CL_COMMIT)
877 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
878 return (EROFS);
879 }
880 /*
881 * can't page-out from a negative offset
882 * or if we're starting beyond the EOF
883 * or if the file offset isn't page aligned
884 * or the size requested isn't a multiple of PAGE_SIZE
885 */
886 if (f_offset < 0 || f_offset >= filesize ||
887 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
888 if (local_flags & CL_COMMIT)
889 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
890 return (EINVAL);
891 }
892 max_size = filesize - f_offset;
893
894 if (size < max_size)
895 io_size = size;
896 else
897 io_size = max_size;
898
899 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
900
901 if (size > pg_size) {
902 if (local_flags & CL_COMMIT)
903 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
904 UPL_ABORT_FREE_ON_EMPTY);
905 }
906 while (vp->v_numoutput >= ASYNC_THROTTLE) {
907 vp->v_flag |= VTHROTTLED;
908 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
909 }
910
911 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
912 local_flags, (struct buf *)0, (struct clios *)0));
913 }
914
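/*
 * cluster_pagein is the VM pagein entry point... it validates the
 * request, clips it to the end of the file, aborts any part of the upl
 * beyond the rounded transfer, issues the read through cluster_io with
 * CL_READ | CL_PAGEIN and, if the access pattern looks sequential,
 * kicks off read-ahead for the pages that follow
 */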
915 int
916 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
917 struct vnode *vp;
918 upl_t upl;
919 vm_offset_t upl_offset;
920 off_t f_offset;
921 int size;
922 off_t filesize;
923 int devblocksize;
924 int flags;
925 {
926 u_int io_size;
927 int rounded_size;
928 off_t max_size;
929 int retval;
930 int local_flags = 0;
931
932 if (upl == NULL || size < 0)
933 panic("cluster_pagein: NULL upl passed in");
934
935 if ((flags & UPL_IOSYNC) == 0)
936 local_flags |= CL_ASYNC;
937 if ((flags & UPL_NOCOMMIT) == 0)
938 local_flags |= CL_COMMIT;
939
940
941 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
942 (int)f_offset, size, (int)filesize, local_flags, 0);
943
944 /*
945 * can't page-in from a negative offset
946 * or if we're starting beyond the EOF
947 * or if the file offset isn't page aligned
948 * or the size requested isn't a multiple of PAGE_SIZE
949 */
950 if (f_offset < 0 || f_offset >= filesize ||
951 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
952 if (local_flags & CL_COMMIT)
953 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
954 return (EINVAL);
955 }
956 max_size = filesize - f_offset;
957
958 if (size < max_size)
959 io_size = size;
960 else
961 io_size = max_size;
962
963 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
964
965 if (size > rounded_size && (local_flags & CL_COMMIT))
966 ubc_upl_abort_range(upl, upl_offset + rounded_size,
967 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
968
969 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
970 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
971
972 if (retval == 0) {
973 int b_lblkno;
974 int e_lblkno;
975
976 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
977 e_lblkno = (int)
978 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
979
980 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
981 /*
982 * we haven't read in the last page of the file yet
983 * so let's try to read ahead if we're in
984 * a sequential access pattern
985 */
986 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
987 }
988 vp->v_lastr = e_lblkno;
989 }
990 return (retval);
991 }
992
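/*
 * cluster_bp handles a buf that already has a upl attached (b_pagelist)...
 * it converts the buf's logical block into a file offset and passes the
 * whole thing to cluster_io as a single async transfer
 */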
993 int
994 cluster_bp(bp)
995 struct buf *bp;
996 {
997 off_t f_offset;
998 int flags;
999
1000 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1001 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1002
1003 if (bp->b_pagelist == (upl_t) 0)
1004 panic("cluster_bp: can't handle NULL upl yet\n");
1005 if (bp->b_flags & B_READ)
1006 flags = CL_ASYNC | CL_READ;
1007 else
1008 flags = CL_ASYNC;
1009
1010 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1011
1012 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1013 }
1014
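/*
 * cluster_write is the top-level write entry point... for cached vnodes
 * (or non-user uios) everything goes through the buffered path in
 * cluster_write_x... otherwise each uio vector is examined and routed to
 * cluster_phys_write (physically contiguous source), cluster_nocopy_write
 * (page-aligned direct writes) or back to cluster_write_x for small or
 * unaligned leftovers, with uio_resid clipped so each helper sees a
 * correctly sized chunk
 */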
1015 int
1016 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1017 struct vnode *vp;
1018 struct uio *uio;
1019 off_t oldEOF;
1020 off_t newEOF;
1021 off_t headOff;
1022 off_t tailOff;
1023 int devblocksize;
1024 int flags;
1025 {
1026 int prev_resid;
1027 int clip_size;
1028 off_t max_io_size;
1029 struct iovec *iov;
1030 vm_offset_t upl_offset;
1031 int upl_size;
1032 int pages_in_pl;
1033 upl_page_info_t *pl;
1034 int upl_flags;
1035 upl_t upl;
1036 int retval = 0;
1037
1038
1039 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1040 {
1041 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1042 return(retval);
1043 }
1044
1045 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1046 {
1047 /* we know we have a resid, so this is safe */
1048 iov = uio->uio_iov;
1049 while (iov->iov_len == 0) {
1050 uio->uio_iov++;
1051 uio->uio_iovcnt--;
1052 iov = uio->uio_iov;
1053 }
1054
1055 /*
1056 * We check every vector target and if it is physically
1057 * contiguous space, we skip the sanity checks.
1058 */
1059
1060 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1061 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1062 pages_in_pl = 0;
1063 upl_flags = UPL_QUERY_OBJECT_TYPE;
1064 if ((vm_map_get_upl(current_map(),
1065 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1066 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1067 {
1068 /*
1069 * the user app must have passed in an invalid address
1070 */
1071 return (EFAULT);
1072 }
1073
1074 if (upl_flags & UPL_PHYS_CONTIG)
1075 {
1076 if (flags & IO_HEADZEROFILL)
1077 {
1078 flags &= ~IO_HEADZEROFILL;
1079
1080 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1081 return(retval);
1082 }
1083
1084 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1085
1086 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1087 {
1088 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1089 return(retval);
1090 }
1091 }
1092 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1093 {
1094 /*
1095 * We set a threshold of 4 pages to decide if the nocopy
1096 * write loop is worth the trouble...
1097 * we also come here if we're trying to zero the head and/or tail
1098 * of a partially written page, and the user source is not a physically contiguous region
1099 */
1100 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1101 return(retval);
1102 }
1103 else if (uio->uio_offset & PAGE_MASK_64)
1104 {
1105 /* Bring the file offset of the write up to a pagesize boundary */
1106 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1107 if (uio->uio_resid < clip_size)
1108 clip_size = uio->uio_resid;
1109 /*
1110 * Fake the resid going into the cluster_write_x call
1111 * and restore it on the way out.
1112 */
1113 prev_resid = uio->uio_resid;
1114 uio->uio_resid = clip_size;
1115 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1116 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1117 }
1118 else if ((int)iov->iov_base & PAGE_MASK_64)
1119 {
1120 clip_size = iov->iov_len;
1121 prev_resid = uio->uio_resid;
1122 uio->uio_resid = clip_size;
1123 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1124 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1125 }
1126 else
1127 {
1128 /*
1129 * If we come in here, we know the offset into
1130 * the file is on a pagesize boundary
1131 */
1132
1133 max_io_size = newEOF - uio->uio_offset;
1134 clip_size = uio->uio_resid;
1135 if (iov->iov_len < clip_size)
1136 clip_size = iov->iov_len;
1137 if (max_io_size < clip_size)
1138 clip_size = max_io_size;
1139
1140 if (clip_size < PAGE_SIZE)
1141 {
1142 /*
1143 * Take care of tail end of write in this vector
1144 */
1145 prev_resid = uio->uio_resid;
1146 uio->uio_resid = clip_size;
1147 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1148 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1149 }
1150 else
1151 {
1152 /* round clip_size down to a multiple of pagesize */
1153 clip_size = clip_size & ~(PAGE_MASK);
1154 prev_resid = uio->uio_resid;
1155 uio->uio_resid = clip_size;
1156 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1157 if ((retval == 0) && uio->uio_resid)
1158 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1159 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1160 }
1161 } /* end else */
1162 } /* end while */
1163 return(retval);
1164 }
1165
1166
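/*
 * cluster_nocopy_write is the direct (no data copy) write path...
 * it wires the user's pages with vm_map_get_upl, evicts any pages for
 * the same file range from the cache (UPL_POP_DUMP), and streams the
 * writes out asynchronously through cluster_io, using a clios to bound
 * the amount of outstanding I/O and to collect the first error... all
 * I/O issued against the stream is drained before returning
 */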
1167 static int
1168 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1169 struct vnode *vp;
1170 struct uio *uio;
1171 off_t newEOF;
1172 int devblocksize;
1173 int flags;
1174 {
1175 upl_t upl;
1176 upl_page_info_t *pl;
1177 off_t upl_f_offset;
1178 vm_offset_t upl_offset;
1179 off_t max_io_size;
1180 int io_size;
1181 int io_flag;
1182 int upl_size;
1183 int upl_needed_size;
1184 int pages_in_pl;
1185 int upl_flags;
1186 kern_return_t kret;
1187 struct iovec *iov;
1188 int i;
1189 int first = 1;
1190 int force_data_sync;
1191 int error = 0;
1192 struct clios iostate;
1193
1194 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1195 (int)uio->uio_offset, (int)uio->uio_resid,
1196 (int)newEOF, devblocksize, 0);
1197
1198 /*
1199 * When we enter this routine, we know
1200 * -- the offset into the file is on a pagesize boundary
1201 * -- the resid is a page multiple
1202 * -- the resid will not exceed iov_len
1203 */
1204 cluster_try_push(vp, newEOF, 0, 1);
1205
1206 iostate.io_completed = 0;
1207 iostate.io_issued = 0;
1208 iostate.io_error = 0;
1209 iostate.io_wanted = 0;
1210
1211 iov = uio->uio_iov;
1212
1213 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1214 io_size = uio->uio_resid;
1215
1216 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1217 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1218
1219 if (first) {
1220 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1221 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1222 first = 0;
1223 }
1224 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1225 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1226
1227 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1228 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1229
1230 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1231 pages_in_pl = 0;
1232 upl_size = upl_needed_size;
1233 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1234 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1235
1236 kret = vm_map_get_upl(current_map(),
1237 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1238 &upl_size,
1239 &upl,
1240 NULL,
1241 &pages_in_pl,
1242 &upl_flags,
1243 force_data_sync);
1244
1245 if (kret != KERN_SUCCESS) {
1246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1247 0, 0, 0, kret, 0);
1248
1249 /*
1250 * cluster_nocopy_write: failed to get pagelist
1251 *
1252 * we may have already spun some portion of this request
1253 * off as async requests... we need to wait for the I/O
1254 * to complete before returning
1255 */
1256 goto wait_for_writes;
1257 }
1258 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1259 pages_in_pl = upl_size / PAGE_SIZE;
1260
1261 for (i = 0; i < pages_in_pl; i++) {
1262 if (!upl_valid_page(pl, i))
1263 break;
1264 }
1265 if (i == pages_in_pl)
1266 break;
1267
1268 /*
1269 * didn't get all the pages back that we
1270 * needed... release this upl and try again
1271 */
1272 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1273 UPL_ABORT_FREE_ON_EMPTY);
1274 }
1275 if (force_data_sync >= 3) {
1276 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1277 i, pages_in_pl, upl_size, kret, 0);
1278
1279 /*
1280 * for some reason, we couldn't acquire a hold on all
1281 * the pages needed in the user's address space
1282 *
1283 * we may have already spun some portion of this request
1284 * off as async requests... we need to wait for the I/O
1285 * to complete before returning
1286 */
1287 goto wait_for_writes;
1288 }
1289
1290 /*
1291 * Consider the possibility that upl_size wasn't satisfied.
1292 */
1293 if (upl_size != upl_needed_size)
1294 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1295
1296 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1297 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1298
1299 if (io_size == 0) {
1300 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1301 UPL_ABORT_FREE_ON_EMPTY);
1302
1303 /*
1304 * we may have already spun some portion of this request
1305 * off as async requests... we need to wait for the I/O
1306 * to complete before returning
1307 */
1308 goto wait_for_writes;
1309 }
1310 /*
1311 * Now look for pages already in the cache
1312 * and throw them away.
1313 */
1314
1315 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1316 max_io_size = io_size;
1317
1318 while (max_io_size) {
1319 /*
1320 * Flag UPL_POP_DUMP says if the page is found
1321 * in the page cache it must be thrown away.
1322 */
1323 ubc_page_op(vp,
1324 upl_f_offset,
1325 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1326 0, 0);
1327 max_io_size -= PAGE_SIZE_64;
1328 upl_f_offset += PAGE_SIZE_64;
1329 }
1330 /*
1331 * we want to push out these writes asynchronously so that we can overlap
1332 * the preparation of the next I/O...
1333 * if there are already too many outstanding writes,
1334 * wait until some complete before issuing the next
1335 */
1336 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1337 iostate.io_wanted = 1;
1338 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1339 }
1340 if (iostate.io_error) {
1341 /*
1342 * one of the earlier writes we issued ran into a hard error...
1343 * don't issue any more writes, clean up the UPL
1344 * that was just created but not used, then
1345 * go wait for all writes that are part of this stream
1346 * to complete before returning the error to the caller
1347 */
1348 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1349 UPL_ABORT_FREE_ON_EMPTY);
1350
1351 goto wait_for_writes;
1352 }
1353 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1354
1355 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1356 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1357
1358 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1359 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1360
1361 iov->iov_len -= io_size;
1362 iov->iov_base += io_size;
1363 uio->uio_resid -= io_size;
1364 uio->uio_offset += io_size;
1365
1366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1367 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1368
1369 } /* end while */
1370
1371 wait_for_writes:
1372 /*
1373 * make sure all async writes issued as part of this stream
1374 * have completed before we return
1375 */
1376 while (iostate.io_issued != iostate.io_completed) {
1377 iostate.io_wanted = 1;
1378 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1379 }
1380 if (iostate.io_error)
1381 error = iostate.io_error;
1382
1383 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1384 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1385
1386 return (error);
1387 }
1388
1389
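/*
 * cluster_phys_write handles a uio whose source is physically contiguous
 * memory... the unaligned head and tail (relative to devblocksize) are
 * handled with cluster_align_phys_io, while the aligned middle is pushed
 * through cluster_io as a single synchronous CL_DEV_MEMORY transfer
 */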
1390 static int
1391 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1392 struct vnode *vp;
1393 struct uio *uio;
1394 off_t newEOF;
1395 int devblocksize;
1396 int flags;
1397 {
1398 upl_page_info_t *pl;
1399 addr64_t src_paddr;
1400 upl_t upl;
1401 vm_offset_t upl_offset;
1402 int tail_size;
1403 int io_size;
1404 int upl_size;
1405 int upl_needed_size;
1406 int pages_in_pl;
1407 int upl_flags;
1408 kern_return_t kret;
1409 struct iovec *iov;
1410 int error = 0;
1411
1412 /*
1413 * When we enter this routine, we know
1414 * -- the resid will not exceed iov_len
1415 * -- the vector target address is physically contiguous
1416 */
1417 cluster_try_push(vp, newEOF, 0, 1);
1418
1419 iov = uio->uio_iov;
1420 io_size = iov->iov_len;
1421 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1422 upl_needed_size = upl_offset + io_size;
1423
1424 pages_in_pl = 0;
1425 upl_size = upl_needed_size;
1426 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1427 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1428
1429 kret = vm_map_get_upl(current_map(),
1430 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1431 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1432
1433 if (kret != KERN_SUCCESS) {
1434 /*
1435 * cluster_phys_write: failed to get pagelist
1436 * note: return kret here
1437 */
1438 return(EINVAL);
1439 }
1440 /*
1441 * Consider the possibility that upl_size wasn't satisfied.
1442 * This is a failure in the physical memory case.
1443 */
1444 if (upl_size < upl_needed_size) {
1445 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1446 return(EINVAL);
1447 }
1448 pl = ubc_upl_pageinfo(upl);
1449
1450 src_paddr = (((addr64_t)(int)upl_phys_page(pl, 0)) << 12) + ((addr64_t)iov->iov_base & PAGE_MASK);
1451
1452 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1453 int head_size;
1454
1455 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1456
1457 if (head_size > io_size)
1458 head_size = io_size;
1459
1460 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1461
1462 if (error) {
1463 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1464
1465 return(EINVAL);
1466 }
1467 upl_offset += head_size;
1468 src_paddr += head_size;
1469 io_size -= head_size;
1470 }
1471 tail_size = io_size & (devblocksize - 1);
1472 io_size -= tail_size;
1473
1474 if (io_size) {
1475 /*
1476 * issue a synchronous write to cluster_io
1477 */
1478 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1479 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1480 }
1481 if (error == 0) {
1482 /*
1483 * The cluster_io write completed successfully,
1484 * update the uio structure
1485 */
1486 uio->uio_resid -= io_size;
1487 iov->iov_len -= io_size;
1488 iov->iov_base += io_size;
1489 uio->uio_offset += io_size;
1490 src_paddr += io_size;
1491
1492 if (tail_size)
1493 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1494 }
1495 /*
1496 * just release our hold on the physically contiguous
1497 * region without changing any state
1498 */
1499 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1500
1501 return (error);
1502 }
1503
1504
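/*
 * cluster_write_x is the buffered write path... for each chunk it creates
 * a upl over the affected file range, pre-reads any partially valid edge
 * pages, zero-fills the head and/or tail regions requested by the caller,
 * copies the user data in with uiomove and then either issues the I/O
 * immediately (IO_SYNC) or folds the dirty pages into the vnode's
 * delayed-write clusters for a later push
 */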
1505 static int
1506 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1507 struct vnode *vp;
1508 struct uio *uio;
1509 off_t oldEOF;
1510 off_t newEOF;
1511 off_t headOff;
1512 off_t tailOff;
1513 int devblocksize;
1514 int flags;
1515 {
1516 upl_page_info_t *pl;
1517 upl_t upl;
1518 vm_offset_t upl_offset;
1519 int upl_size;
1520 off_t upl_f_offset;
1521 int pages_in_upl;
1522 int start_offset;
1523 int xfer_resid;
1524 int io_size;
1525 int io_flags;
1526 vm_offset_t io_address;
1527 int io_offset;
1528 int bytes_to_zero;
1529 int bytes_to_move;
1530 kern_return_t kret;
1531 int retval = 0;
1532 int uio_resid;
1533 long long total_size;
1534 long long zero_cnt;
1535 off_t zero_off;
1536 long long zero_cnt1;
1537 off_t zero_off1;
1538 daddr_t start_blkno;
1539 daddr_t last_blkno;
1540
1541 if (uio) {
1542 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1543 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1544
1545 uio_resid = uio->uio_resid;
1546 } else {
1547 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1548 0, 0, (int)oldEOF, (int)newEOF, 0);
1549
1550 uio_resid = 0;
1551 }
1552 zero_cnt = 0;
1553 zero_cnt1 = 0;
1554
1555 if (flags & IO_HEADZEROFILL) {
1556 /*
1557 * some filesystems (HFS is one) don't support unallocated holes within a file...
1558 * so we zero fill the intervening space between the old EOF and the offset
1559 * where the next chunk of real data begins.... ftruncate will also use this
1560 * routine to zero fill to the new EOF when growing a file... in this case, the
1561 * uio structure will not be provided
1562 */
1563 if (uio) {
1564 if (headOff < uio->uio_offset) {
1565 zero_cnt = uio->uio_offset - headOff;
1566 zero_off = headOff;
1567 }
1568 } else if (headOff < newEOF) {
1569 zero_cnt = newEOF - headOff;
1570 zero_off = headOff;
1571 }
1572 }
1573 if (flags & IO_TAILZEROFILL) {
1574 if (uio) {
1575 zero_off1 = uio->uio_offset + uio->uio_resid;
1576
1577 if (zero_off1 < tailOff)
1578 zero_cnt1 = tailOff - zero_off1;
1579 }
1580 }
1581 if (zero_cnt == 0 && uio == (struct uio *) 0)
1582 {
1583 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1584 retval, 0, 0, 0, 0);
1585 return (0);
1586 }
1587
1588 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1589 /*
1590 * for this iteration of the loop, figure out where our starting point is
1591 */
1592 if (zero_cnt) {
1593 start_offset = (int)(zero_off & PAGE_MASK_64);
1594 upl_f_offset = zero_off - start_offset;
1595 } else if (uio_resid) {
1596 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1597 upl_f_offset = uio->uio_offset - start_offset;
1598 } else {
1599 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1600 upl_f_offset = zero_off1 - start_offset;
1601 }
1602 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1603 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1604
1605 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1606 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1607
1608 /*
1609 * compute the size of the upl needed to encompass
1610 * the requested write... limit each call to cluster_io
1611 * to the maximum UPL size... cluster_io will clip if
1612 * this exceeds the maximum io_size for the device,
1613 * make sure to account for
1614 * a starting offset that's not page aligned
1615 */
1616 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1617
1618 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1619 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1620
1621 pages_in_upl = upl_size / PAGE_SIZE;
1622 io_size = upl_size - start_offset;
1623
1624 if ((long long)io_size > total_size)
1625 io_size = total_size;
1626
1627 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1628 last_blkno = start_blkno + pages_in_upl;
1629
1630 kret = ubc_create_upl(vp,
1631 upl_f_offset,
1632 upl_size,
1633 &upl,
1634 &pl,
1635 UPL_FLAGS_NONE);
1636 if (kret != KERN_SUCCESS)
1637 panic("cluster_write: failed to get pagelist");
1638
1639 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1640 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1641
1642 if (start_offset && !upl_valid_page(pl, 0)) {
1643 int read_size;
1644
1645 /*
1646 * we're starting in the middle of the first page of the upl
1647 * and the page isn't currently valid, so we're going to have
1648 * to read it in first... this is a synchronous operation
1649 */
1650 read_size = PAGE_SIZE;
1651
1652 if ((upl_f_offset + read_size) > newEOF)
1653 read_size = newEOF - upl_f_offset;
1654
1655 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1656 CL_READ, (struct buf *)0, (struct clios *)0);
1657 if (retval) {
1658 /*
1659 * we had an error during the read which causes us to abort
1660 * the current cluster_write request... before we do, we need
1661 * to release the rest of the pages in the upl without modifying
1662 * their state and mark the failed page in error
1663 */
1664 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1665 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1666
1667 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1668 (int)upl, 0, 0, retval, 0);
1669 break;
1670 }
1671 }
1672 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1673 /*
1674 * the last offset we're writing to in this upl does not end on a page
1675 * boundary... if it's not beyond the old EOF, then we'll also need to
1676 * pre-read this page in if it isn't already valid
1677 */
1678 upl_offset = upl_size - PAGE_SIZE;
1679
1680 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1681 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1682 int read_size;
1683
1684 read_size = PAGE_SIZE;
1685
1686 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1687 read_size = newEOF - (upl_f_offset + upl_offset);
1688
1689 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1690 CL_READ, (struct buf *)0, (struct clios *)0);
1691 if (retval) {
1692 /*
1693 * we had an error during the read which causes us to abort
1694 * the current cluster_write request... before we do, we
1695 * need to release the rest of the pages in the upl without
1696 * modifying their state and mark the failed page in error
1697 */
1698 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1699 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1700
1701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1702 (int)upl, 0, 0, retval, 0);
1703 break;
1704 }
1705 }
1706 }
1707 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1708 panic("cluster_write: ubc_upl_map failed\n");
1709 xfer_resid = io_size;
1710 io_offset = start_offset;
1711
1712 while (zero_cnt && xfer_resid) {
1713
1714 if (zero_cnt < (long long)xfer_resid)
1715 bytes_to_zero = zero_cnt;
1716 else
1717 bytes_to_zero = xfer_resid;
1718
1719 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1720 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1721
1722 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1723 (int)upl_f_offset + io_offset, bytes_to_zero,
1724 (int)io_offset, xfer_resid, 0);
1725 } else {
1726 int zero_pg_index;
1727
1728 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1729 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1730
1731 if ( !upl_valid_page(pl, zero_pg_index)) {
1732 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1733
1734 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1735 (int)upl_f_offset + io_offset, bytes_to_zero,
1736 (int)io_offset, xfer_resid, 0);
1737
1738 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1739 !upl_dirty_page(pl, zero_pg_index)) {
1740 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1741
1742 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1743 (int)upl_f_offset + io_offset, bytes_to_zero,
1744 (int)io_offset, xfer_resid, 0);
1745 }
1746 }
1747 xfer_resid -= bytes_to_zero;
1748 zero_cnt -= bytes_to_zero;
1749 zero_off += bytes_to_zero;
1750 io_offset += bytes_to_zero;
1751 }
1752 if (xfer_resid && uio_resid) {
1753 bytes_to_move = min(uio_resid, xfer_resid);
1754
1755 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1756 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1757
1758 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1759
1760
1761 if (retval) {
1762 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1763 panic("cluster_write: kernel_upl_unmap failed\n");
1764
1765 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1766
1767 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1768 (int)upl, 0, 0, retval, 0);
1769 } else {
1770 uio_resid -= bytes_to_move;
1771 xfer_resid -= bytes_to_move;
1772 io_offset += bytes_to_move;
1773 }
1774 }
1775 while (xfer_resid && zero_cnt1 && retval == 0) {
1776
1777 if (zero_cnt1 < (long long)xfer_resid)
1778 bytes_to_zero = zero_cnt1;
1779 else
1780 bytes_to_zero = xfer_resid;
1781
1782 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1783 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1784
1785 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1786 (int)upl_f_offset + io_offset,
1787 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1788 } else {
1789 int zero_pg_index;
1790
1791 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1792 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1793
1794 if ( !upl_valid_page(pl, zero_pg_index)) {
1795 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1796
1797 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1798 (int)upl_f_offset + io_offset,
1799 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1800
1801 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1802 !upl_dirty_page(pl, zero_pg_index)) {
1803 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1804
1805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1806 (int)upl_f_offset + io_offset,
1807 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1808 }
1809 }
1810 xfer_resid -= bytes_to_zero;
1811 zero_cnt1 -= bytes_to_zero;
1812 zero_off1 += bytes_to_zero;
1813 io_offset += bytes_to_zero;
1814 }
1815
1816 if (retval == 0) {
1817 int cl_index;
1818 int can_delay;
1819
1820 io_size += start_offset;
1821
1822 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1823 /*
1824 * if we're extending the file with this write
1825 * we'll zero fill the rest of the page so that
1826 * if the file gets extended again in such a way as to leave a
1827 * hole starting at this EOF, we'll have zeros in the correct spot
1828 */
1829 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1830
1831 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1832 (int)upl_f_offset + io_size,
1833 upl_size - io_size, 0, 0, 0);
1834 }
1835 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1836 panic("cluster_write: kernel_upl_unmap failed\n");
1837
1838 if (flags & IO_SYNC)
1839 /*
1840 * if the IO_SYNC flag is set then we need to
1841 * bypass any clusters and immediately issue
1842 * the I/O
1843 */
1844 goto issue_io;
1845
1846 if (vp->v_clen == 0)
1847 /*
1848 * no clusters currently present
1849 */
1850 goto start_new_cluster;
1851
1852 /*
1853 * keep track of the overall dirty page
1854 * range we've developed
1855 * in case we have to fall back to the
1856 * VHASDIRTY method of flushing
1857 */
1858 if (vp->v_flag & VHASDIRTY)
1859 goto delay_io;
1860
1861 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1862 /*
1863 * we have an existing cluster... see if this write will extend it nicely
1864 */
1865 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1866 /*
1867 * the current write starts at or after the current cluster
1868 */
1869 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1870 /*
1871 * we have a write that fits entirely
1872 * within the existing cluster limits
1873 */
1874 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1875 /*
1876 * update our idea of where the cluster ends
1877 */
1878 vp->v_clusters[cl_index].last_pg = last_blkno;
1879 break;
1880 }
1881 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1882 /*
1883 * we have a write that starts in the middle of the current cluster
1884 * but extends beyond the cluster's limit
1885 * we'll clip the current cluster if we actually
1886 * overlap with the new write
1887 * and start a new cluster with the current write
1888 */
1889 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1890 vp->v_clusters[cl_index].last_pg = start_blkno;
1891 }
1892 /*
1893 * we also get here for the case where the current write starts
1894 * beyond the limit of the existing cluster
1895 *
1896 * in either case, we'll check the remaining clusters before
1897 * starting a new one
1898 */
1899 } else {
1900 /*
1901 * the current write starts in front of the current cluster
1902 */
1903 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1904 /*
1905 * we can just merge the old cluster
1906 * with the new request and leave it
1907 * in the cache
1908 */
1909 vp->v_clusters[cl_index].start_pg = start_blkno;
1910
1911 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1912 /*
1913 * the current write completely
1914 * envelops the existing cluster
1915 */
1916 vp->v_clusters[cl_index].last_pg = last_blkno;
1917 }
1918 break;
1919 }
1920
1921 /*
1922 * if we were to combine this write with the current cluster
1923 * we would exceed the cluster size limit.... so,
1924 * let's see if there's any overlap of the new I/O with
1925 * the existing cluster...
1926 *
1927 */
1928 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1929 /*
1930 * the current write extends into the existing cluster
1931 * clip the current cluster by moving the start position
1932 * to where the current write ends
1933 */
1934 vp->v_clusters[cl_index].start_pg = last_blkno;
1935 /*
1936 * if we get here, there was no way to merge
1937 * the new I/O with this cluster and
1938 * keep it under our maximum cluster length
1939 * we'll check the remaining clusters before starting a new one
1940 */
1941 }
1942 }
1943 if (cl_index < vp->v_clen)
1944 /*
1945 * we found an existing cluster that we
1946 * could merge this I/O into
1947 */
1948 goto delay_io;
1949
1950 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1951 /*
1952 * we didn't find an existing cluster to
1953 * merge into, but there's room to start
1954 * a new one
1955 */
1956 goto start_new_cluster;
1957
1958 /*
1959 * no existing cluster to merge with and no
1960 * room to start a new one... we'll try
1961 * pushing the existing ones... if none of
1962 * them are able to be pushed, we'll have
1963 * to fall back on the VHASDIRTY mechanism
1964 * cluster_try_push will set v_clen to the
1965 * number of remaining clusters if it is
1966 * unable to push all of them
1967 */
1968 if (vp->v_flag & VNOCACHE_DATA)
1969 can_delay = 0;
1970 else
1971 can_delay = 1;
1972
1973 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
1974 vp->v_flag |= VHASDIRTY;
1975 goto delay_io;
1976 }
1977 start_new_cluster:
1978 if (vp->v_clen == 0) {
1979 vp->v_ciosiz = devblocksize;
1980 vp->v_cstart = start_blkno;
1981 vp->v_lastw = last_blkno;
1982 }
1983 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
1984 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
1985 vp->v_clen++;
1986 delay_io:
1987 /*
1988 * make sure we keep v_cstart and v_lastw up to
1989 * date in case we have to fall back on the
1990 * VHASDIRTY mechanism (or we've already entered it)
1991 */
1992 if (start_blkno < vp->v_cstart)
1993 vp->v_cstart = start_blkno;
1994 if (last_blkno > vp->v_lastw)
1995 vp->v_lastw = last_blkno;
1996
1997 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1998 continue;
1999 issue_io:
2000 /*
2001 * in order to maintain some semblance of coherency with mapped writes
2002 * we need to write the cluster back out as a multiple of the PAGESIZE
2003 * unless the cluster encompasses the last page of the file... in this
2004 * case we'll round out to the nearest device block boundary
2005 */
2006 io_size = upl_size;
2007
2008 if ((upl_f_offset + io_size) > newEOF) {
2009 io_size = newEOF - upl_f_offset;
2010 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2011 }
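/*
 * worked example of the round-up above (assuming devblocksize is a
 * power of 2, which the mask arithmetic requires): with a 512 byte
 * device block and 1000 bytes left before newEOF,
 *     (1000 + 511) & ~511 == 1024
 * so the final write is padded out to the next device block boundary
 */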
2012
2013 if (flags & IO_SYNC)
2014 io_flags = CL_COMMIT | CL_AGE;
2015 else
2016 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2017
2018 if (vp->v_flag & VNOCACHE_DATA)
2019 io_flags |= CL_DUMP;
2020
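/*
 * simple write throttle: if this vnode already has ASYNC_THROTTLE or
 * more async writes outstanding, mark it VTHROTTLED and sleep on
 * &vp->v_numoutput... the I/O completion path is expected to clear
 * VTHROTTLED and issue the wakeup once enough of the outstanding
 * writes have drained (that side is not shown in this excerpt)
 */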
2021 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2022 vp->v_flag |= VTHROTTLED;
2023 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2024 }
2025 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2026 io_flags, (struct buf *)0, (struct clios *)0);
2027 }
2028 }
2029 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2030 retval, 0, 0, 0, 0);
2031
2032 return (retval);
2033 }
2034
2035 int
2036 cluster_read(vp, uio, filesize, devblocksize, flags)
2037 struct vnode *vp;
2038 struct uio *uio;
2039 off_t filesize;
2040 int devblocksize;
2041 int flags;
2042 {
2043 int prev_resid;
2044 int clip_size;
2045 off_t max_io_size;
2046 struct iovec *iov;
2047 vm_offset_t upl_offset;
2048 int upl_size;
2049 int pages_in_pl;
2050 upl_page_info_t *pl;
2051 int upl_flags;
2052 upl_t upl;
2053 int retval = 0;
2054
2055 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2056 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2057
2058 /*
2059 * the nocopy read path below only applies to uncached (VNOCACHE_DATA)
2060 * reads from user space... everything else goes through cluster_read_x
2061 */
2062
2063 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2064 {
2065 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2066 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2067 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2068 return(retval);
2069 }
2070
2071 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2072 {
2073 /* we know we have a resid, so this is safe */
2074 iov = uio->uio_iov;
2075 while (iov->iov_len == 0) {
2076 uio->uio_iov++;
2077 uio->uio_iovcnt--;
2078 iov = uio->uio_iov;
2079 }
2080
2081 /*
2082 * We check every vector target and if it is physically
2083 * contiguous space, we skip the sanity checks.
2084 */
2085
2086 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2087 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2088 pages_in_pl = 0;
2089 upl_flags = UPL_QUERY_OBJECT_TYPE;
2090 if((vm_map_get_upl(current_map(),
2091 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2092 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2093 {
2094 /*
2095 * the user app must have passed in an invalid address
2096 */
2097 return (EFAULT);
2098 }
2099
2100 if (upl_flags & UPL_PHYS_CONTIG)
2101 {
2102 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2103 }
2104 else if (uio->uio_resid < 4 * PAGE_SIZE)
2105 {
2106 /*
2107 * We set a threshold of 4 pages to decide if the nocopy
2108 * read loop is worth the trouble...
2109 */
2110 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2111 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2112 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2113 return(retval);
2114 }
2115 else if (uio->uio_offset & PAGE_MASK_64)
2116 {
2117 /* Bring the file offset read up to a pagesize boundary */
2118 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2119 if (uio->uio_resid < clip_size)
2120 clip_size = uio->uio_resid;
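/*
 * example: with a 4K page size and a file offset of 0x11234,
 * clip_size is 0x1000 - 0x234 == 0xdcc bytes... this first pass
 * through cluster_read_x leaves uio->uio_offset page aligned
 * for the nocopy path on the next iteration
 */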
2121 /*
2122 * Fake the resid going into the cluster_read_x call
2123 * and restore it on the way out.
2124 */
2125 prev_resid = uio->uio_resid;
2126 uio->uio_resid = clip_size;
2127 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2128 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2129 }
2130 else if ((int)iov->iov_base & PAGE_MASK_64)
2131 {
2132 clip_size = iov->iov_len;
2133 prev_resid = uio->uio_resid;
2134 uio->uio_resid = clip_size;
2135 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2136 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2137 }
2138 else
2139 {
2140 /*
2141 * If we come in here, we know the offset into
2142 * the file is on a pagesize boundary
2143 */
2144
2145 max_io_size = filesize - uio->uio_offset;
2146 clip_size = uio->uio_resid;
2147 if (iov->iov_len < clip_size)
2148 clip_size = iov->iov_len;
2149 if (max_io_size < clip_size)
2150 clip_size = (int)max_io_size;
2151
2152 if (clip_size < PAGE_SIZE)
2153 {
2154 /*
2155 * Take care of the tail end of the read in this vector.
2156 */
2157 prev_resid = uio->uio_resid;
2158 uio->uio_resid = clip_size;
2159 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2160 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2161 }
2162 else
2163 {
2164 /* round clip_size down to a multiple of pagesize */
2165 clip_size = clip_size & ~(PAGE_MASK);
2166 prev_resid = uio->uio_resid;
2167 uio->uio_resid = clip_size;
2168 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2169 if ((retval==0) && uio->uio_resid)
2170 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2171 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2172 }
2173 } /* end else */
2174 } /* end while */
2175
2176 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2177 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2178
2179 return(retval);
2180 }
2181
2182
2183 static int
2184 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2185 struct vnode *vp;
2186 struct uio *uio;
2187 off_t filesize;
2188 int devblocksize;
2189 int flags;
2190 {
2191 upl_page_info_t *pl;
2192 upl_t upl;
2193 vm_offset_t upl_offset;
2194 int upl_size;
2195 off_t upl_f_offset;
2196 int start_offset;
2197 int start_pg;
2198 int last_pg;
2199 int uio_last;
2200 int pages_in_upl;
2201 off_t max_size;
2202 int io_size;
2203 vm_offset_t io_address;
2204 kern_return_t kret;
2205 int segflg;
2206 int error = 0;
2207 int retval = 0;
2208 int b_lblkno;
2209 int e_lblkno;
2210
2211 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2212
2213 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2214 /*
2215 * compute the size of the upl needed to encompass
2216 * the requested read... limit each call to cluster_io
2217 * to the maximum UPL size... cluster_io will clip if
2218 * this exceeds the maximum io_size for the device...
2219 * make sure to account for a starting offset
2220 * that's not page aligned
2221 */
2222 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2223 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2224 max_size = filesize - uio->uio_offset;
2225
2226 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2227 io_size = uio->uio_resid;
2228 else
2229 io_size = max_size;
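/*
 * example: with a 4K page size and uio->uio_offset == 0x11234,
 * start_offset is 0x234 and upl_f_offset is 0x11000 (the start of
 * the page containing the offset)... io_size is the smaller of what
 * the caller asked for and what remains before end of file
 */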
2230
2231 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2232 segflg = uio->uio_segflg;
2233
2234 uio->uio_segflg = UIO_PHYS_USERSPACE;
2235
2236 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2237 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2238
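/*
 * fast path for pages already resident in the ubc: ubc_page_op looks
 * up the page at upl_f_offset and, if present, returns its physical
 * page number with the busy bit set... uiomove64 then copies directly
 * from that physical page into the user's buffer and the busy bit is
 * cleared... the first miss drops us out of this loop and into the
 * upl-based path below
 */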
2239 while (io_size && retval == 0) {
2240 int xsize;
2241 ppnum_t paddr;
2242
2243 if (ubc_page_op(vp,
2244 upl_f_offset,
2245 UPL_POP_SET | UPL_POP_BUSY,
2246 &paddr, 0) != KERN_SUCCESS)
2247 break;
2248
2249 xsize = PAGE_SIZE - start_offset;
2250
2251 if (xsize > io_size)
2252 xsize = io_size;
2253
2254 retval = uiomove64((addr64_t)(((addr64_t)paddr << 12) + start_offset), xsize, uio);
2255
2256 ubc_page_op(vp, upl_f_offset,
2257 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2258
2259 io_size -= xsize;
2260 start_offset = (int)
2261 (uio->uio_offset & PAGE_MASK_64);
2262 upl_f_offset = uio->uio_offset - start_offset;
2263 }
2264 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2265 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2266
2267 uio->uio_segflg = segflg;
2268
2269 if (retval)
2270 break;
2271
2272 if (io_size == 0) {
2273 /*
2274 * we're already finished with this read request
2275 * let's see if we should do a read-ahead
2276 */
2277 e_lblkno = (int)
2278 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2279
2280 if (!(vp->v_flag & VRAOFF))
2281 /*
2282 * let's try to read ahead if we're in
2283 * a sequential access pattern
2284 */
2285 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2286 vp->v_lastr = e_lblkno;
2287
2288 break;
2289 }
2290 max_size = filesize - uio->uio_offset;
2291 }
2292 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2293 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2294 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2295 pages_in_upl = upl_size / PAGE_SIZE;
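/*
 * example: with start_offset == 0x234 and io_size == 0x3000,
 * upl_size rounds up to 0x4000 (4 pages)... the upl is then capped
 * at MAX_UPL_TRANSFER pages, so a single pass never maps more than
 * that many pages at once
 */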
2296
2297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2298 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2299
2300 kret = ubc_create_upl(vp,
2301 upl_f_offset,
2302 upl_size,
2303 &upl,
2304 &pl,
2305 UPL_FLAGS_NONE);
2306 if (kret != KERN_SUCCESS)
2307 panic("cluster_read: failed to get pagelist");
2308
2309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2310 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2311
2312 /*
2313 * scan from the beginning of the upl looking for the first
2314 * non-valid page.... this will become the first page in
2315 * the request we're going to make to 'cluster_io'... if all
2316 * of the pages are valid, we won't call through to 'cluster_io'
2317 */
2318 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2319 if (!upl_valid_page(pl, start_pg))
2320 break;
2321 }
2322
2323 /*
2324 * scan from the starting invalid page looking for a valid
2325 * page before the end of the upl is reached, if we
2326 * find one, then it will be the last page of the request to
2327 * 'cluster_io'
2328 */
2329 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2330 if (upl_valid_page(pl, last_pg))
2331 break;
2332 }
2333
2334 if (start_pg < last_pg) {
2335 /*
2336 * we found a range of 'invalid' pages that must be filled
2337 * if the last page in this range is the last page of the file
2338 * we may have to clip the size of it to keep from reading past
2339 * the end of the last physical block associated with the file
2340 */
2341 upl_offset = start_pg * PAGE_SIZE;
2342 io_size = (last_pg - start_pg) * PAGE_SIZE;
2343
2344 if ((upl_f_offset + upl_offset + io_size) > filesize)
2345 io_size = filesize - (upl_f_offset + upl_offset);
2346
2347 /*
2348 * issue a synchronous read to cluster_io
2349 */
2350
2351 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2352 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2353 }
2354 if (error == 0) {
2355 /*
2356 * if the read completed successfully, or there was no I/O request
2357 * issued, then map the upl into kernel address space and
2358 * move the data into user land.... we'll first add on any 'valid'
2359 * pages that were present in the upl when we acquired it.
2360 */
2361 u_int val_size;
2362 u_int size_of_prefetch;
2363
2364 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2365 if (!upl_valid_page(pl, uio_last))
2366 break;
2367 }
2368 /*
2369 * compute size to transfer this round, if uio->uio_resid is
2370 * still non-zero after this uiomove, we'll loop around and
2371 * set up for another I/O.
2372 */
2373 val_size = (uio_last * PAGE_SIZE) - start_offset;
2374
2375 if (max_size < val_size)
2376 val_size = max_size;
2377
2378 if (uio->uio_resid < val_size)
2379 val_size = uio->uio_resid;
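/*
 * val_size is now the number of bytes we can copy out on this pass:
 * everything from the caller's offset through the last page in the
 * upl that is now valid (either just read in or already resident),
 * clipped to both the file size and the remaining resid
 */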
2380
2381 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2382
2383 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2384 /*
2385 * if there's still I/O left to do for this request, then issue a
2386 * pre-fetch I/O... the I/O wait time will overlap
2387 * with the copying of the data
2388 */
2389 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2390 } else {
2391 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2392 /*
2393 * let's try to read ahead if we're in
2394 * a sequential access pattern
2395 */
2396 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2397 vp->v_lastr = e_lblkno;
2398 }
2399 if (uio->uio_segflg == UIO_USERSPACE) {
2400 int offset;
2401
2402 segflg = uio->uio_segflg;
2403
2404 uio->uio_segflg = UIO_PHYS_USERSPACE;
2405
2406
2407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2408 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2409
2410 offset = start_offset;
2411
2412 while (val_size && retval == 0) {
2413 int csize;
2414 int i;
2415 addr64_t paddr;
2416
2417 i = offset / PAGE_SIZE;
2418 csize = min(PAGE_SIZE - start_offset, val_size);
2419
2420 paddr = ((addr64_t)upl_phys_page(pl, i) << 12) + start_offset;
2421
2422 retval = uiomove64(paddr, csize, uio);
2423
2424 val_size -= csize;
2425 offset += csize;
2426 start_offset = offset & PAGE_MASK;
2427 }
2428 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2429 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2430
2431 uio->uio_segflg = segflg;
2432 }
2433 else
2434 {
2435 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2436 panic("cluster_read: ubc_upl_map() failed\n");
2437
2438 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2439
2440 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2441 panic("cluster_read: ubc_upl_unmap() failed\n");
2442 }
2443 }
2444 if (start_pg < last_pg) {
2445 /*
2446 * compute the range of pages that we actually issued an I/O for
2447 * and either commit them as valid if the I/O succeeded
2448 * or abort them if the I/O failed
2449 */
2450 io_size = (last_pg - start_pg) * PAGE_SIZE;
2451
2452 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2453 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2454
2455 if (error || (vp->v_flag & VNOCACHE_DATA))
2456 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2457 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2458 else
2459 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2460 UPL_COMMIT_CLEAR_DIRTY
2461 | UPL_COMMIT_FREE_ON_EMPTY
2462 | UPL_COMMIT_INACTIVATE);
2463
2464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2465 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2466 }
2467 if ((last_pg - start_pg) < pages_in_upl) {
2468 int cur_pg;
2469 int commit_flags;
2470
2471 /*
2472 * the set of pages that we issued an I/O for did not encompass
2473 * the entire upl... so just release these without modifying
2474 * their state
2475 */
2476 if (error)
2477 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2478 else {
2479 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2480 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2481
2482 if (start_pg) {
2483 /*
2484 * we found some already valid pages at the beginning of
2485 * the upl... commit these back to the inactive list with
2486 * reference cleared
2487 */
2488 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2489 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2490 | UPL_COMMIT_INACTIVATE;
2491
2492 if (upl_dirty_page(pl, cur_pg))
2493 commit_flags |= UPL_COMMIT_SET_DIRTY;
2494
2495 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2496 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2497 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2498 else
2499 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2500 PAGE_SIZE, commit_flags);
2501 }
2502 }
2503 if (last_pg < uio_last) {
2504 /*
2505 * we found some already valid pages immediately after the
2506 * pages we issued I/O for, commit these back to the
2507 * inactive list with reference cleared
2508 */
2509 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2510 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2511 | UPL_COMMIT_INACTIVATE;
2512
2513 if (upl_dirty_page(pl, cur_pg))
2514 commit_flags |= UPL_COMMIT_SET_DIRTY;
2515
2516 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2517 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2518 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2519 else
2520 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2521 PAGE_SIZE, commit_flags);
2522 }
2523 }
2524 if (uio_last < pages_in_upl) {
2525 /*
2526 * there were some invalid pages beyond the valid pages
2527 * that we didn't issue an I/O for, just release them
2528 * unchanged
2529 */
2530 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2531 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2532 }
2533
2534 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2535 (int)upl, -1, -1, 0, 0);
2536 }
2537 }
2538 if (retval == 0)
2539 retval = error;
2540 }
2541
2542 return (retval);
2543 }
2544
2545
2546 static int
2547 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2548 struct vnode *vp;
2549 struct uio *uio;
2550 off_t filesize;
2551 int devblocksize;
2552 int flags;
2553 {
2554 upl_t upl;
2555 upl_page_info_t *pl;
2556 off_t upl_f_offset;
2557 vm_offset_t upl_offset;
2558 off_t start_upl_f_offset;
2559 off_t max_io_size;
2560 int io_size;
2561 int upl_size;
2562 int upl_needed_size;
2563 int pages_in_pl;
2564 ppnum_t paddr;
2565 int upl_flags;
2566 kern_return_t kret;
2567 int segflg;
2568 struct iovec *iov;
2569 int i;
2570 int force_data_sync;
2571 int retval = 0;
2572 int first = 1;
2573 struct clios iostate;
2574
2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2576 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2577
2578 /*
2579 * When we enter this routine, we know
2580 * -- the offset into the file is on a pagesize boundary
2581 * -- the resid is a page multiple
2582 * -- the resid will not exceed iov_len
2583 */
2584
2585 iostate.io_completed = 0;
2586 iostate.io_issued = 0;
2587 iostate.io_error = 0;
2588 iostate.io_wanted = 0;
2589
2590 iov = uio->uio_iov;
2591
2592 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2593
2594 max_io_size = filesize - uio->uio_offset;
2595
2596 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2597 io_size = max_io_size;
2598 else
2599 io_size = uio->uio_resid;
2600
2601 /*
2602 * We don't come into this routine unless
2603 * UIO_USERSPACE is set.
2604 */
2605 segflg = uio->uio_segflg;
2606
2607 uio->uio_segflg = UIO_PHYS_USERSPACE;
2608
2609 /*
2610 * First look for pages already in the cache
2611 * and move them to user space.
2612 */
2613 while (io_size && (retval == 0)) {
2614 upl_f_offset = uio->uio_offset;
2615
2616 /*
2617 * If this call fails, it means the page is not
2618 * in the page cache.
2619 */
2620 if (ubc_page_op(vp, upl_f_offset,
2621 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2622 break;
2623
2624 retval = uiomove64((addr64_t)paddr << 12, PAGE_SIZE, uio);
2625
2626 ubc_page_op(vp, upl_f_offset,
2627 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2628
2629 io_size -= PAGE_SIZE;
2630 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2631 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2632 }
2633 uio->uio_segflg = segflg;
2634
2635 if (retval) {
2636 /*
2637 * we may have already spun some portion of this request
2638 * off as async requests... we need to wait for the I/O
2639 * to complete before returning
2640 */
2641 goto wait_for_reads;
2642 }
2643 /*
2644 * If we are already finished with this read, then return
2645 */
2646 if (io_size == 0) {
2647 /*
2648 * we may have already spun some portion of this request
2649 * off as async requests... we need to wait for the I/O
2650 * to complete before returning
2651 */
2652 goto wait_for_reads;
2653 }
2654 max_io_size = io_size;
2655
2656 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2657 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2658 if (first) {
2659 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2660 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2661 first = 0;
2662 }
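/*
 * note: the first pass through this loop is clamped to a fraction of
 * the maximum UPL size (compared against 1/4 of it, clamped to 1/8),
 * presumably so the first bytes get back to the caller with lower
 * latency before the larger overlapped reads start
 */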
2663 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2664 upl_f_offset = start_upl_f_offset;
2665 io_size = 0;
2666
2667 while (io_size < max_io_size) {
2668 if (ubc_page_op(vp, upl_f_offset,
2669 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2670 ubc_page_op(vp, upl_f_offset,
2671 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2672 break;
2673 }
2674 /*
2675 * Build up the io request parameters.
2676 */
2677 io_size += PAGE_SIZE_64;
2678 upl_f_offset += PAGE_SIZE_64;
2679 }
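/*
 * the loop above sized the direct read to cover only the run of pages
 * that are NOT already in the cache... the first resident page
 * terminates the run, and will be copied out of the cache by the fast
 * path at the top of the next iteration of the outer loop
 */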
2680 if (io_size == 0)
2681 /*
2682 * we may have already spun some portion of this request
2683 * off as async requests... we need to wait for the I/O
2684 * to complete before returning
2685 */
2686 goto wait_for_reads;
2687
2688 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2689 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
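/*
 * example: with iov_base == 0x2345678 and io_size == 0x2000,
 * upl_offset is 0x678 and upl_needed_size rounds up to 0x3000,
 * i.e. enough whole pages to cover the user buffer even though it
 * doesn't start on a page boundary
 */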
2690
2691 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2692 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2693
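/*
 * try up to three times to get a upl that maps the user's buffer with
 * every page valid... each retry passes a larger force_data_sync value
 * to vm_map_get_upl, presumably pushing the pager harder to produce a
 * fully valid mapping... if some pages still aren't valid after the
 * third attempt, give up on this pass and just wait for the reads
 * already in flight
 */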
2694 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2695 pages_in_pl = 0;
2696 upl_size = upl_needed_size;
2697 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2698
2699 kret = vm_map_get_upl(current_map(),
2700 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2701 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2702
2703 if (kret != KERN_SUCCESS) {
2704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2705 (int)upl_offset, upl_size, io_size, kret, 0);
2706
2707 /*
2708 * cluster_nocopy_read: failed to get pagelist
2709 *
2710 * we may have already spun some portion of this request
2711 * off as async requests... we need to wait for the I/O
2712 * to complete before returning
2713 */
2714 goto wait_for_reads;
2715 }
2716 pages_in_pl = upl_size / PAGE_SIZE;
2717 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2718
2719 for (i = 0; i < pages_in_pl; i++) {
2720 if (!upl_valid_page(pl, i))
2721 break;
2722 }
2723 if (i == pages_in_pl)
2724 break;
2725
2726 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2727 UPL_ABORT_FREE_ON_EMPTY);
2728 }
2729 if (force_data_sync >= 3) {
2730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2731 (int)upl_offset, upl_size, io_size, kret, 0);
2732
2733 goto wait_for_reads;
2734 }
2735 /*
2736 * Consider the possibility that upl_size wasn't satisfied.
2737 */
2738 if (upl_size != upl_needed_size)
2739 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2740
2741 if (io_size == 0) {
2742 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2743 UPL_ABORT_FREE_ON_EMPTY);
2744 goto wait_for_reads;
2745 }
2746 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2747 (int)upl_offset, upl_size, io_size, kret, 0);
2748
2749 /*
2750 * request asynchronously so that we can overlap
2751 * the preparation of the next I/O
2752 * if there are already too many outstanding reads
2753 * wait until some have completed before issuing the next read
2754 */
2755 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2756 iostate.io_wanted = 1;
2757 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2758 }
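/*
 * note: judging by the comparison against a byte quantity
 * (2 * MAX_UPL_TRANSFER * PAGE_SIZE), io_issued and io_completed
 * track bytes of read I/O, so this bounds the amount of data in
 * flight rather than the number of outstanding requests
 */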
2759 if (iostate.io_error) {
2760 /*
2761 * one of the earlier reads we issued ran into a hard error
2762 * don't issue any more reads... clean up the UPL
2763 * that was just created but not used, then
2764 * go wait for any other reads to complete before
2765 * returning the error to the caller
2766 */
2767 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2768 UPL_ABORT_FREE_ON_EMPTY);
2769
2770 goto wait_for_reads;
2771 }
2772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2773 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2774
2775 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2776 io_size, devblocksize,
2777 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2778 (struct buf *)0, &iostate);
2779
2780 /*
2781 * update the uio structure
2782 */
2783 iov->iov_base += io_size;
2784 iov->iov_len -= io_size;
2785 uio->uio_resid -= io_size;
2786 uio->uio_offset += io_size;
2787
2788 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2789 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
2790
2791 } /* end while */
2792
2793 wait_for_reads:
2794 /*
2795 * make sure all async reads that are part of this stream
2796 * have completed before we return
2797 */
2798 while (iostate.io_issued != iostate.io_completed) {
2799 iostate.io_wanted = 1;
2800 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2801 }
2802 if (iostate.io_error)
2803 retval = iostate.io_error;
2804
2805 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2806 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2807
2808 return (retval);
2809 }
2810
2811
2812 static int
2813 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2814 struct vnode *vp;
2815 struct uio *uio;
2816 off_t filesize;
2817 int devblocksize;
2818 int flags;
2819 {
2820 upl_page_info_t *pl;
2821 upl_t upl;
2822 vm_offset_t upl_offset;
2823 addr64_t dst_paddr;
2824 off_t max_size;
2825 int io_size;
2826 int tail_size;
2827 int upl_size;
2828 int upl_needed_size;
2829 int pages_in_pl;
2830 int upl_flags;
2831 kern_return_t kret;
2832 struct iovec *iov;
2833 struct clios iostate;
2834 int error;
2835
2836 /*
2837 * When we enter this routine, we know
2838 * -- the resid will not exceed iov_len
2839 * -- the target address is physically contiguous
2840 */
2841
2842 iov = uio->uio_iov;
2843
2844 max_size = filesize - uio->uio_offset;
2845
2846 if (max_size > (off_t)((unsigned int)iov->iov_len))
2847 io_size = iov->iov_len;
2848 else
2849 io_size = max_size;
2850
2851 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2852 upl_needed_size = upl_offset + io_size;
2853
2854 error = 0;
2855 pages_in_pl = 0;
2856 upl_size = upl_needed_size;
2857 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2858
2859 kret = vm_map_get_upl(current_map(),
2860 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2861 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2862
2863 if (kret != KERN_SUCCESS) {
2864 /*
2865 * cluster_phys_read: failed to get pagelist
2866 */
2867 return(EINVAL);
2868 }
2869 if (upl_size < upl_needed_size) {
2870 /*
2871 * The upl_size wasn't satisfied.
2872 */
2873 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2874
2875 return(EINVAL);
2876 }
2877 pl = ubc_upl_pageinfo(upl);
2878
2879 dst_paddr = (((addr64_t)(int)upl_phys_page(pl, 0)) << 12) + ((addr64_t)iov->iov_base & PAGE_MASK);
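/*
 * upl_phys_page returns a physical page number... shifting it left by
 * 12 converts it to a byte address (4K pages), and adding the
 * sub-page offset of iov_base yields the physical address of the
 * start of the user's buffer
 */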
2880
2881 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2882 int head_size;
2883
2884 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2885
2886 if (head_size > io_size)
2887 head_size = io_size;
2888
2889 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2890
2891 if (error) {
2892 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2893
2894 return(EINVAL);
2895 }
2896 upl_offset += head_size;
2897 dst_paddr += head_size;
2898 io_size -= head_size;
2899 }
2900 tail_size = io_size & (devblocksize - 1);
2901 io_size -= tail_size;
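/*
 * at this point any partial device block at the front of the request
 * has been handled by cluster_align_phys_io above, and the partial
 * block at the end (tail_size) has been set aside to be handled the
 * same way after the main loop... what remains in io_size is a whole
 * number of device blocks
 */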
2902
2903 iostate.io_completed = 0;
2904 iostate.io_issued = 0;
2905 iostate.io_error = 0;
2906 iostate.io_wanted = 0;
2907
2908 while (io_size && error == 0) {
2909 int xsize;
2910
2911 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2912 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2913 else
2914 xsize = io_size;
2915 /*
2916 * request asynchronously so that we can overlap
2917 * the preparation of the next I/O... we'll do
2918 * the commit after all the I/O has completed
2919 * since it's all issued against the same UPL
2920 * if there are already too many outstanding reads
2921 * wait until some have completed before issuing the next
2922 */
2923 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2924 iostate.io_wanted = 1;
2925 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2926 }
2927
2928 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2929 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
2930 (struct buf *)0, &iostate);
2931 /*
2932 * if the cluster_io read was issued successfully,
2933 * update the uio structure
2934 */
2935 if (error == 0) {
2936 uio->uio_resid -= xsize;
2937 iov->iov_len -= xsize;
2938 iov->iov_base += xsize;
2939 uio->uio_offset += xsize;
2940 dst_paddr += xsize;
2941 upl_offset += xsize;
2942 io_size -= xsize;
2943 }
2944 }
2945 /*
2946 * make sure all async reads that are part of this stream
2947 * have completed before we proceed
2948 */
2949 while (iostate.io_issued != iostate.io_completed) {
2950 iostate.io_wanted = 1;
2951 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2952 }
2953 if (iostate.io_error) {
2954 error = iostate.io_error;
2955 }
2956 if (error == 0 && tail_size)
2957 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
2958
2959 /*
2960 * just release our hold on the physically contiguous
2961 * region without changing any state
2962 */
2963 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2964
2965 return (error);
2966 }
2967
2968
2969 /*
2970 * generate advisory I/O's in the largest chunks possible
2971 * the completed pages will be released into the VM cache
2972 */
2973 int
2974 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2975 struct vnode *vp;
2976 off_t filesize;
2977 off_t f_offset;
2978 int resid;
2979 int devblocksize;
2980 {
2981 upl_page_info_t *pl;
2982 upl_t upl;
2983 vm_offset_t upl_offset;
2984 int upl_size;
2985 off_t upl_f_offset;
2986 int start_offset;
2987 int start_pg;
2988 int last_pg;
2989 int pages_in_upl;
2990 off_t max_size;
2991 int io_size;
2992 kern_return_t kret;
2993 int retval = 0;
2994 int issued_io;
2995
2996 if (!UBCINFOEXISTS(vp))
2997 return(EINVAL);
2998
2999 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3000 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3001
3002 while (resid && f_offset < filesize && retval == 0) {
3003 /*
3004 * compute the size of the upl needed to encompass
3005 * the requested read... limit each call to cluster_io
3006 * to the maximum UPL size... cluster_io will clip if
3007 * this exceeds the maximum io_size for the device...
3008 * make sure to account for a starting offset
3009 * that's not page aligned
3010 */
3011 start_offset = (int)(f_offset & PAGE_MASK_64);
3012 upl_f_offset = f_offset - (off_t)start_offset;
3013 max_size = filesize - f_offset;
3014
3015 if (resid < max_size)
3016 io_size = resid;
3017 else
3018 io_size = max_size;
3019
3020 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3021 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3022 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3023 pages_in_upl = upl_size / PAGE_SIZE;
3024
3025 kret = ubc_create_upl(vp,
3026 upl_f_offset,
3027 upl_size,
3028 &upl,
3029 &pl,
3030 UPL_RET_ONLY_ABSENT);
3031 if (kret != KERN_SUCCESS)
3032 return(retval);
3033 issued_io = 0;
3034
3035 /*
3036 * before we start marching forward, we must make sure we end on
3037 * a present page, otherwise we will be working with a freed
3038 * upl
3039 */
3040 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3041 if (upl_page_present(pl, last_pg))
3042 break;
3043 }
3044 pages_in_upl = last_pg + 1;
3045
3046
3047 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3048 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3049
3050
3051 for (last_pg = 0; last_pg < pages_in_upl; ) {
3052 /*
3053 * scan from the beginning of the upl looking for the first
3054 * page that is present.... this will become the first page in
3055 * the request we're going to make to 'cluster_io'... if all
3056 * of the pages are absent, we won't call through to 'cluster_io'
3057 */
3058 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3059 if (upl_page_present(pl, start_pg))
3060 break;
3061 }
3062
3063 /*
3064 * scan from the starting present page looking for an absent
3065 * page before the end of the upl is reached, if we
3066 * find one, then it will terminate the range of pages being
3067 * presented to 'cluster_io'
3068 */
3069 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3070 if (!upl_page_present(pl, last_pg))
3071 break;
3072 }
3073
3074 if (last_pg > start_pg) {
3075 /*
3076 * we found a range of pages that must be filled
3077 * if the last page in this range is the last page of the file
3078 * we may have to clip the size of it to keep from reading past
3079 * the end of the last physical block associated with the file
3080 */
3081 upl_offset = start_pg * PAGE_SIZE;
3082 io_size = (last_pg - start_pg) * PAGE_SIZE;
3083
3084 if ((upl_f_offset + upl_offset + io_size) > filesize)
3085 io_size = filesize - (upl_f_offset + upl_offset);
3086
3087 /*
3088 * issue an asynchronous read to cluster_io
3089 */
3090 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3091 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3092
3093 issued_io = 1;
3094 }
3095 }
3096 if (issued_io == 0)
3097 ubc_upl_abort(upl, 0);
3098
3099 io_size = upl_size - start_offset;
3100
3101 if (io_size > resid)
3102 io_size = resid;
3103 f_offset += io_size;
3104 resid -= io_size;
3105 }
3106
3107 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3108 (int)f_offset, resid, retval, 0, 0);
3109
3110 return(retval);
3111 }
3112
3113
3114 int
3115 cluster_push(vp)
3116 struct vnode *vp;
3117 {
3118 int retval;
3119
3120 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3121 vp->v_flag &= ~VHASDIRTY;
3122 return(0);
3123 }
3124
3125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3126 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3127
3128 if (vp->v_flag & VHASDIRTY) {
3129 daddr_t start_pg;
3130 daddr_t last_pg;
3131 daddr_t end_pg;
3132
3133 start_pg = vp->v_cstart;
3134 end_pg = vp->v_lastw;
3135
3136 vp->v_flag &= ~VHASDIRTY;
3137 vp->v_clen = 0;
3138
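/*
 * in the VHASDIRTY case the individual cluster boundaries have been
 * lost, so sweep the entire dirty range recorded in v_cstart..v_lastw,
 * pushing it in MAX_UPL_TRANSFER page chunks
 */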
3139 while (start_pg < end_pg) {
3140 last_pg = start_pg + MAX_UPL_TRANSFER;
3141
3142 if (last_pg > end_pg)
3143 last_pg = end_pg;
3144
3145 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3146
3147 start_pg = last_pg;
3148 }
3149 return (1);
3150 }
3151 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3152
3153 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3154 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3155
3156 return (retval);
3157 }
3158
3159
3160 static int
3161 cluster_try_push(vp, EOF, can_delay, push_all)
3162 struct vnode *vp;
3163 off_t EOF;
3164 int can_delay;
3165 int push_all;
3166 {
3167 int cl_index;
3168 int cl_index1;
3169 int min_index;
3170 int cl_len;
3171 int cl_total;
3172 int cl_pushed;
3173 struct v_cluster l_clusters[MAX_CLUSTERS];
3174
3175 /*
3176 * make a local 'sorted' copy of the clusters
3177 * and clear vp->v_clen so that new clusters can
3178 * be developed
3179 */
3180 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3181 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3182 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3183 continue;
3184 if (min_index == -1)
3185 min_index = cl_index1;
3186 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3187 min_index = cl_index1;
3188 }
3189 if (min_index == -1)
3190 break;
3191 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3192 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3193
3194 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3195 }
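/*
 * the loop above is a small selection sort: on each pass it picks the
 * remaining cluster with the lowest start_pg, copies it into
 * l_clusters, and marks the original as consumed by setting its
 * start_pg equal to its last_pg (the same sentinel used elsewhere to
 * mean an empty cluster)
 */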
3196 cl_len = cl_index;
3197 vp->v_clen = 0;
3198
3199 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3200 /*
3201 * try to push each cluster in turn... cluster_push_x may not
3202 * push the cluster if can_delay is TRUE and the cluster doesn't
3203 * meet the criteria for an immediate push
3204 */
3205 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3206 l_clusters[cl_index].start_pg = 0;
3207 l_clusters[cl_index].last_pg = 0;
3208
3209 cl_pushed++;
3210
3211 if (push_all == 0)
3212 break;
3213 }
3214 }
3215 if (cl_len > cl_pushed) {
3216 /*
3217 * we didn't push all of the clusters, so
3218 * let's try to merge them back into the vnode
3219 */
3220 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3221 /*
3222 * we picked up some new clusters while we were trying to
3223 * push the old ones (I don't think this can happen because
3224 * I'm holding the lock, but just in case)... the sum of the
3225 * leftovers plus the new cluster count exceeds our ability
3226 * to represent them, so fall back to the VHASDIRTY mechanism
3227 */
3228 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3229 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3230 continue;
3231
3232 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3233 vp->v_cstart = l_clusters[cl_index].start_pg;
3234 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3235 vp->v_lastw = l_clusters[cl_index].last_pg;
3236 }
3237 vp->v_flag |= VHASDIRTY;
3238 } else {
3239 /*
3240 * we've got room to merge the leftovers back in
3241 * just append them starting at the next 'hole'
3242 * represented by vp->v_clen
3243 */
3244 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3245 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3246 continue;
3247
3248 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3249 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3250
3251 if (cl_index1 == 0) {
3252 vp->v_cstart = l_clusters[cl_index].start_pg;
3253 vp->v_lastw = l_clusters[cl_index].last_pg;
3254 } else {
3255 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3256 vp->v_cstart = l_clusters[cl_index].start_pg;
3257 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3258 vp->v_lastw = l_clusters[cl_index].last_pg;
3259 }
3260 cl_index1++;
3261 }
3262 /*
3263 * update the cluster count
3264 */
3265 vp->v_clen = cl_index1;
3266 }
3267 }
3268 return(MAX_CLUSTERS - vp->v_clen);
3269 }
3270
3271
3272
3273 static int
3274 cluster_push_x(vp, EOF, first, last, can_delay)
3275 struct vnode *vp;
3276 off_t EOF;
3277 daddr_t first;
3278 daddr_t last;
3279 int can_delay;
3280 {
3281 upl_page_info_t *pl;
3282 upl_t upl;
3283 vm_offset_t upl_offset;
3284 int upl_size;
3285 off_t upl_f_offset;
3286 int pages_in_upl;
3287 int start_pg;
3288 int last_pg;
3289 int io_size;
3290 int io_flags;
3291 int size;
3292 kern_return_t kret;
3293
3294
3295 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3296 vp->v_clen, first, last, EOF, 0);
3297
3298 if ((pages_in_upl = last - first) == 0) {
3299 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3300
3301 return (1);
3302 }
3303 upl_size = pages_in_upl * PAGE_SIZE;
3304 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3305
3306 if (upl_f_offset + upl_size >= EOF) {
3307
3308 if (upl_f_offset >= EOF) {
3309 /*
3310 * must have truncated the file and missed
3311 * clearing a dangling cluster (i.e. it's completely
3312 * beyond the new EOF)
3313 */
3314 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3315
3316 return(1);
3317 }
3318 size = EOF - upl_f_offset;
3319
3320 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3321 pages_in_upl = upl_size / PAGE_SIZE;
3322 } else {
3323 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3324 return(0);
3325 size = upl_size;
3326 }
3327 kret = ubc_create_upl(vp,
3328 upl_f_offset,
3329 upl_size,
3330 &upl,
3331 &pl,
3332 UPL_RET_ONLY_DIRTY);
3333 if (kret != KERN_SUCCESS)
3334 panic("cluster_push: failed to get pagelist");
3335
3336 if (can_delay) {
3337 int num_of_dirty;
3338
3339 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3340 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3341 num_of_dirty++;
3342 }
3343 if (num_of_dirty < pages_in_upl / 2) {
3344 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3345
3346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3347
3348 return(0);
3349 }
3350 }
3351 last_pg = 0;
3352
3353 while (size) {
3354
3355 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3356 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3357 break;
3358 }
3359 if (start_pg > last_pg) {
3360 io_size = (start_pg - last_pg) * PAGE_SIZE;
3361
3362 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3363 UPL_ABORT_FREE_ON_EMPTY);
3364
3365 if (io_size < size)
3366 size -= io_size;
3367 else
3368 break;
3369 }
3370 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3371 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3372 break;
3373 }
3374 upl_offset = start_pg * PAGE_SIZE;
3375
3376 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3377
3378 if (vp->v_flag & VNOCACHE_DATA)
3379 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3380 else
3381 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3382
3383 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3384 vp->v_flag |= VTHROTTLED;
3385 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3386 }
3387 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3388
3389 size -= io_size;
3390 }
3391 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3392
3393 return(1);
3394 }
3395
3396
3397
3398 static int
3399 cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
3400 {
3401 struct iovec *iov;
3402 upl_page_info_t *pl;
3403 upl_t upl;
3404 addr64_t ubc_paddr;
3405 kern_return_t kret;
3406 int error = 0;
3407
3408 iov = uio->uio_iov;
3409
3410 kret = ubc_create_upl(vp,
3411 uio->uio_offset & ~PAGE_MASK_64,
3412 PAGE_SIZE,
3413 &upl,
3414 &pl,
3415 UPL_FLAGS_NONE);
3416
3417 if (kret != KERN_SUCCESS)
3418 return(EINVAL);
3419
3420 if (!upl_valid_page(pl, 0)) {
3421 /*
3422 * issue a synchronous read to cluster_io
3423 */
3424 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3425 CL_READ, (struct buf *)0, (struct clios *)0);
3426 if (error) {
3427 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3428
3429 return(error);
3430 }
3431 }
3432 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
3433
3434 /*
3435 * NOTE: There is no prototype for the following in BSD. It, and the definitions
3436 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
3437 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
3438 * way to do so without exporting them to kexts as well.
3439 */
3440 if (flags & CL_READ)
3441 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
3442 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
3443 else
3444 // copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
3445 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
3446
3447 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3448 /*
3449 * issue a synchronous write to cluster_io
3450 */
3451 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3452 0, (struct buf *)0, (struct clios *)0);
3453 }
3454 if (error == 0) {
3455 uio->uio_offset += xsize;
3456 iov->iov_base += xsize;
3457 iov->iov_len -= xsize;
3458 uio->uio_resid -= xsize;
3459 }
3460 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3461
3462 return (error);
3463 }