1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_NOMAP 0x08
77 #define CL_PAGEOUT 0x10
78 #define CL_AGE 0x20
79 #define CL_DUMP 0x40
80 #define CL_NOZERO 0x80
81 #define CL_PAGEIN 0x100
82 #define CL_DEV_MEMORY 0x200
83
84 /*
85 * throttle the number of async writes that
86 * can be outstanding on a single vnode
87 * before we issue a synchronous write
88 */
89 #define ASYNC_THROTTLE 6
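/*
 * a rough sketch of how this throttle plays out below (assuming the
 * value of 6 above): cluster_pageout (and, with a different wmesg,
 * cluster_write_x) waits with
 *
 *	while (vp->v_numoutput >= ASYNC_THROTTLE) {
 *	        vp->v_flag |= VTHROTTLED;
 *	        tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
 *	}
 *
 * and cluster_iodone clears VTHROTTLED and wakes the sleepers once
 * v_numoutput has drained to ASYNC_THROTTLE / 3 (i.e. 2) or fewer
 */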
90
91 static int
92 cluster_iodone(bp)
93 struct buf *bp;
94 {
95 int b_flags;
96 int error;
97 int total_size;
98 int total_resid;
99 int upl_offset;
100 upl_t upl;
101 struct buf *cbp;
102 struct buf *cbp_head;
103 struct buf *cbp_next;
104 struct buf *real_bp;
105 struct vnode *vp;
106 int commit_size;
107 int pg_offset;
108
109
110 cbp_head = (struct buf *)(bp->b_trans_head);
111
112 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
113 cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
114
115 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
116 /*
117 * all I/O requests that are part of this transaction
118 * have to complete before we can process it
119 */
120 if ( !(cbp->b_flags & B_DONE)) {
121
122 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
123 cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0);
124
125 return 0;
126 }
127 }
128 error = 0;
129 total_size = 0;
130 total_resid = 0;
131
132 cbp = cbp_head;
133 upl_offset = cbp->b_uploffset;
134 upl = cbp->b_pagelist;
135 b_flags = cbp->b_flags;
136 real_bp = cbp->b_real_bp;
137 vp = cbp->b_vp;
138
139 while (cbp) {
140 if (cbp->b_vectorcount > 1)
141 _FREE(cbp->b_vectorlist, M_SEGMENT);
142
143 if ((cbp->b_flags & B_ERROR) && error == 0)
144 error = cbp->b_error;
145
146 total_resid += cbp->b_resid;
147 total_size += cbp->b_bcount;
148
149 cbp_next = cbp->b_trans_next;
150
151 free_io_buf(cbp);
152
153 cbp = cbp_next;
154 }
155 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
156 vp->v_flag &= ~VTHROTTLED;
157 wakeup((caddr_t)&vp->v_numoutput);
158 }
159 if ((b_flags & B_NEED_IODONE) && real_bp) {
160 if (error) {
161 real_bp->b_flags |= B_ERROR;
162 real_bp->b_error = error;
163 }
164 real_bp->b_resid = total_resid;
165
166 biodone(real_bp);
167 }
168 if (error == 0 && total_resid)
169 error = EIO;
170
171 if (b_flags & B_COMMIT_UPL) {
172 pg_offset = upl_offset & PAGE_MASK;
173 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
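		/*
		 * worked example, assuming 4K pages (PAGE_SIZE 0x1000): with an
		 * upl_offset of 0x1200 and a total_size of 0x2200, pg_offset is
		 * 0x200 and commit_size rounds 0x2400 up to 0x3000... the
		 * commit/abort below therefore starts at upl_offset - pg_offset
		 * (0x1000) and spans the 3 pages the transaction actually touched
		 */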
174
175 if (error || (b_flags & B_NOCACHE)) {
176 int upl_abort_code;
177
178 if (b_flags & B_PAGEOUT)
179 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
180 else if (b_flags & B_PGIN)
181 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
182 else
183 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
184
185 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
186 upl_abort_code);
187
188 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
189 upl, upl_offset - pg_offset, commit_size,
190 0x80000000|upl_abort_code, 0);
191
192 } else {
193 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
194
195 if ( !(b_flags & B_PAGEOUT))
196 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
197 if (b_flags & B_AGE)
198 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
199
200 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
201 upl_commit_flags);
202
203 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
204 upl, upl_offset - pg_offset, commit_size,
205 upl_commit_flags, 0);
206 }
207 } else
208 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
209 upl, upl_offset, 0, error, 0);
210
211 return (error);
212 }
213
214
215 static void
216 cluster_zero(upl, upl_offset, size, flags, bp)
217 upl_t upl;
218 vm_offset_t upl_offset;
219 int size;
220 int flags;
221 struct buf *bp;
222 {
223 vm_offset_t io_addr = 0;
224 kern_return_t kret;
225
226 if ( !(flags & CL_NOMAP)) {
227 kret = ubc_upl_map(upl, &io_addr);
228
229 if (kret != KERN_SUCCESS)
230 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
231 if (io_addr == 0)
232 panic("cluster_zero: ubc_upl_map() mapped 0");
233 } else
234 io_addr = (vm_offset_t)bp->b_data;
235 bzero((caddr_t)(io_addr + upl_offset), size);
236
237 if ( !(flags & CL_NOMAP)) {
238 kret = ubc_upl_unmap(upl);
239
240 if (kret != KERN_SUCCESS)
241 panic("cluster_zero: kernel_upl_unmap failed");
242 }
243 }
244
245 static int
246 cluster_io(vp, upl, upl_offset, f_offset, size, flags, real_bp)
247 struct vnode *vp;
248 upl_t upl;
249 vm_offset_t upl_offset;
250 off_t f_offset;
251 int size;
252 int flags;
253 struct buf *real_bp;
254 {
255 struct buf *cbp;
256 struct iovec *iovp;
257 int io_flags;
258 int error = 0;
259 int retval = 0;
260 struct buf *cbp_head = 0;
261 struct buf *cbp_tail = 0;
262 upl_page_info_t *pl;
263 int pg_count;
264 int pg_offset;
265 int max_iosize;
266 int max_vectors;
267 int priv;
268
269 if (flags & CL_READ) {
270 io_flags = (B_VECTORLIST | B_READ);
271
272 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
273 } else {
274 io_flags = (B_VECTORLIST | B_WRITEINPROG);
275
276 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
277 }
278 pl = ubc_upl_pageinfo(upl);
279
280 if (flags & CL_ASYNC)
281 io_flags |= (B_CALL | B_ASYNC);
282 if (flags & CL_AGE)
283 io_flags |= B_AGE;
284 if (flags & CL_DUMP)
285 io_flags |= B_NOCACHE;
286 if (flags & CL_PAGEIN)
287 io_flags |= B_PGIN;
288
289
290 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
291 (int)f_offset, size, upl_offset, flags, 0);
292
293 if ((flags & CL_READ) && ((upl_offset + size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
294 /*
 295          * the read doesn't end on a page boundary, so we're going to
 296          * end up with a page that we can't complete (the file size wasn't
 297          * a multiple of PAGE_SIZE and we're trying to read to the end of
 298          * the file), so we'll go ahead and zero out the portion of the
 299          * page we can't read in from the file
300 */
301 cluster_zero(upl, upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK), flags, real_bp);
302
303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
304 upl_offset + size, PAGE_SIZE - ((upl_offset + size) & PAGE_MASK),
305 flags, real_bp, 0);
306 }
307 while (size) {
308 size_t io_size;
309 int vsize;
310 int i;
311 int pl_index;
312 int pg_resid;
313 int num_contig;
314 daddr_t lblkno;
315 daddr_t blkno;
316
317 if (size > max_iosize)
318 io_size = max_iosize;
319 else
320 io_size = size;
321
322 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
323 if (error == EOPNOTSUPP)
324 panic("VOP_CMAP Unimplemented");
325 break;
326 }
327
328 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
329 (int)f_offset, (int)blkno, io_size, 0, 0);
330
331 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
332 if (flags & CL_PAGEOUT) {
333 error = EINVAL;
334 break;
335 };
336
337 /* Try paging out the page individually before
338 giving up entirely and dumping it (it could
339 be mapped in a "hole" and require allocation
 340                            before the I/O).
341 */
342 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
343 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
344 error = EINVAL;
345 break;
346 };
347
348 upl_offset += PAGE_SIZE_64;
349 f_offset += PAGE_SIZE_64;
350 size -= PAGE_SIZE_64;
351 continue;
352 }
353 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
354 /*
355 * we have now figured out how much I/O we can do - this is in 'io_size'
356 * pl_index represents the first page in the 'upl' that the I/O will occur for
357 * pg_offset is the starting point in the first page for the I/O
358 * pg_count is the number of full and partial pages that 'io_size' encompasses
359 */
360 pl_index = upl_offset / PAGE_SIZE;
361 pg_offset = upl_offset & PAGE_MASK;
362 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
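		/*
		 * for example, assuming 4K pages: with an upl_offset of 0x1800 and
		 * an io_size of 0x2400, pl_index is 1, pg_offset is 0x800 and
		 * pg_count works out to (0x2400 + 0x800 + 0xfff) / 0x1000 == 3...
		 * i.e. the transfer touches upl pages 1, 2 and 3
		 */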
363
364 if (flags & CL_DEV_MEMORY) {
365 /*
366 * currently, can't deal with reading 'holes' in file
367 */
368 if ((long)blkno == -1) {
369 error = EINVAL;
370 break;
371 }
372 /*
373 * treat physical requests as one 'giant' page
374 */
375 pg_count = 1;
376 }
377 if ((flags & CL_READ) && (long)blkno == -1) {
378 /*
379 * if we're reading and blkno == -1, then we've got a
380 * 'hole' in the file that we need to deal with by zeroing
381 * out the affected area in the upl
382 */
383 cluster_zero(upl, upl_offset, io_size, flags, real_bp);
384
385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
386 upl_offset, io_size, flags, real_bp, 0);
387
388 pg_count = (io_size - pg_offset) / PAGE_SIZE;
389
390 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
391 pg_count++;
392
393 if (pg_count) {
394 if (pg_offset)
395 pg_resid = PAGE_SIZE - pg_offset;
396 else
397 pg_resid = 0;
398 if (flags & CL_COMMIT)
399 ubc_upl_commit_range(upl,
400 upl_offset + pg_resid,
401 pg_count * PAGE_SIZE,
402 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
403 }
404 upl_offset += io_size;
405 f_offset += io_size;
406 size -= io_size;
407
408 if (cbp_head && pg_count)
409 goto start_io;
410 continue;
411 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
412 real_bp->b_blkno = blkno;
413 }
414
415 if (pg_count > 1) {
416 if (pg_count > max_vectors) {
417 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
418
419 if (io_size < 0) {
420 io_size = PAGE_SIZE - pg_offset;
421 pg_count = 1;
422 } else
423 pg_count = max_vectors;
424 }
425 /*
426 * we need to allocate space for the vector list
427 */
428 if (pg_count > 1) {
429 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
430 M_SEGMENT, M_NOWAIT);
431
432 if (iovp == (struct iovec *) 0) {
433 /*
434 * if the allocation fails, then throttle down to a single page
435 */
436 io_size = PAGE_SIZE - pg_offset;
437 pg_count = 1;
438 }
439 }
440 }
441
442 /* Throttle the speculative IO */
443 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
444 priv = 0;
445 else
446 priv = 1;
447
448 cbp = alloc_io_buf(vp, priv);
449
450 if (pg_count == 1)
451 /*
452 * we use the io vector that's reserved in the buffer header
 453            * this ensures we can always issue an I/O even in a low memory
454 * condition that prevents the _MALLOC from succeeding... this
455 * is necessary to prevent deadlocks with the pager
456 */
457 iovp = (struct iovec *)(&cbp->b_vects[0]);
458
459 cbp->b_vectorlist = (void *)iovp;
460 cbp->b_vectorcount = pg_count;
461
462 if (flags & CL_DEV_MEMORY) {
463
464 iovp->iov_len = io_size;
465 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
466
467 if (iovp->iov_base == (caddr_t) 0) {
468 free_io_buf(cbp);
469 error = EINVAL;
470 } else
471 iovp->iov_base += upl_offset;
472 } else {
473
474 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
475 int psize;
476
477 psize = PAGE_SIZE - pg_offset;
478
479 if (psize > vsize)
480 psize = vsize;
481
482 iovp->iov_len = psize;
483 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
484
485 if (iovp->iov_base == (caddr_t) 0) {
486 if (pg_count > 1)
487 _FREE(cbp->b_vectorlist, M_SEGMENT);
488 free_io_buf(cbp);
489
490 error = EINVAL;
491 break;
492 }
493 iovp->iov_base += pg_offset;
494 pg_offset = 0;
495
496 if (flags & CL_PAGEOUT) {
497 int s;
498 struct buf *bp;
499
500 s = splbio();
501 if (bp = incore(vp, lblkno + i)) {
502 if (!ISSET(bp->b_flags, B_BUSY)) {
503 bremfree(bp);
504 SET(bp->b_flags, (B_BUSY | B_INVAL));
505 splx(s);
506 brelse(bp);
507 } else
508 panic("BUSY bp found in cluster_io");
509 }
510 splx(s);
511 }
512 vsize -= psize;
513 }
514 }
515 if (error)
516 break;
517
518 if (flags & CL_ASYNC)
519 cbp->b_iodone = (void *)cluster_iodone;
520 cbp->b_flags |= io_flags;
521
522 cbp->b_lblkno = lblkno;
523 cbp->b_blkno = blkno;
524 cbp->b_bcount = io_size;
525 cbp->b_pagelist = upl;
526 cbp->b_uploffset = upl_offset;
527 cbp->b_trans_next = (struct buf *)0;
528
529 if (flags & CL_READ)
530 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
531 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
532 else
533 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
534 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
535
536 if (cbp_head) {
537 cbp_tail->b_trans_next = cbp;
538 cbp_tail = cbp;
539 } else {
540 cbp_head = cbp;
541 cbp_tail = cbp;
542 }
543 (struct buf *)(cbp->b_trans_head) = cbp_head;
544
545 upl_offset += io_size;
546 f_offset += io_size;
547 size -= io_size;
548
549 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY)) || size == 0) {
550 /*
551 * if we have no more I/O to issue or
552 * the current I/O we've prepared fully
553 * completes the last page in this request
554 * or it's been completed via a zero-fill
555 * due to a 'hole' in the file
556 * then go ahead and issue the I/O
557 */
558 start_io:
559 if (flags & CL_COMMIT)
560 cbp_head->b_flags |= B_COMMIT_UPL;
561 if (flags & CL_PAGEOUT)
562 cbp_head->b_flags |= B_PAGEOUT;
563 if (flags & CL_PAGEIN)
564 cbp_head->b_flags |= B_PGIN;
565
566 if (real_bp) {
567 cbp_head->b_flags |= B_NEED_IODONE;
568 cbp_head->b_real_bp = real_bp;
569 }
570
571 for (cbp = cbp_head; cbp;) {
572 struct buf * cbp_next;
573
574 if (io_flags & B_WRITEINPROG)
575 cbp->b_vp->v_numoutput++;
576
577 cbp_next = cbp->b_trans_next;
578
579 (void) VOP_STRATEGY(cbp);
580 cbp = cbp_next;
581 }
582 if ( !(flags & CL_ASYNC)) {
583 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
584 biowait(cbp);
585
586 if (error = cluster_iodone(cbp_head)) {
587 retval = error;
588 error = 0;
589 }
590 }
591 cbp_head = (struct buf *)0;
592 cbp_tail = (struct buf *)0;
593 }
594 }
595 if (error) {
596 int abort_size;
597
598 for (cbp = cbp_head; cbp;) {
599 struct buf * cbp_next;
600
601 if (cbp->b_vectorcount > 1)
602 _FREE(cbp->b_vectorlist, M_SEGMENT);
603 upl_offset -= cbp->b_bcount;
604 size += cbp->b_bcount;
605
606 cbp_next = cbp->b_trans_next;
607 free_io_buf(cbp);
608 cbp = cbp_next;
609 }
610 pg_offset = upl_offset & PAGE_MASK;
611 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
612
613 if (flags & CL_COMMIT) {
614 int upl_abort_code;
615
616 if (flags & CL_PAGEOUT)
617 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
618 else if (flags & CL_PAGEIN)
619 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
620 else
621 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
622
623 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
624 upl_abort_code);
625
626 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
627 upl, upl_offset - pg_offset, abort_size, error, 0);
628 }
629 if (real_bp) {
630 real_bp->b_flags |= B_ERROR;
631 real_bp->b_error = error;
632
633 biodone(real_bp);
634 }
635 if (retval == 0)
636 retval = error;
637 }
638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
639 (int)f_offset, size, upl_offset, retval, 0);
640
641 return (retval);
642 }
643
644
645 static int
646 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
647 struct vnode *vp;
648 off_t f_offset;
649 u_int size;
650 off_t filesize;
651 int devblocksize;
652 {
653 upl_t upl;
654 upl_page_info_t *pl;
655 int pages_in_upl;
656 int start_pg;
657 int last_pg;
658 int last_valid;
659 int io_size;
660
661
662 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
663 (int)f_offset, size, (int)filesize, 0, 0);
664
665 if (f_offset >= filesize) {
666 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
667 (int)f_offset, 0, 0, 0, 0);
668 return(0);
669 }
670 if (ubc_page_op(vp, f_offset, 0, 0, 0) == KERN_SUCCESS) {
671 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
672 (int)f_offset, 0, 0, 0, 0);
673 return(0);
674 }
675 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
676 size = MAX_UPL_TRANSFER * PAGE_SIZE;
677 else
678 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
679
680 if ((off_t)size > (filesize - f_offset))
681 size = ((filesize - f_offset) + (devblocksize - 1)) & ~(devblocksize - 1);
682
683 pages_in_upl = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
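	/*
	 * for example, with 4K pages and a 512 byte devblocksize: if only
	 * 10000 bytes remain before filesize, 'size' gets rounded up to 10240
	 * and pages_in_upl becomes 3
	 */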
684
685 ubc_create_upl(vp,
686 f_offset,
687 pages_in_upl * PAGE_SIZE,
688 &upl,
689 &pl,
690 UPL_FLAGS_NONE);
691
692 if (upl == (upl_t) 0)
693 return(0);
694
695 /*
696 * scan from the beginning of the upl looking for the first
697 * non-valid page.... this will become the first page in
698 * the request we're going to make to 'cluster_io'... if all
699 * of the pages are valid, we won't call through to 'cluster_io'
700 */
701 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
702 if (!upl_valid_page(pl, start_pg))
703 break;
704 }
705
706 /*
707 * scan from the starting invalid page looking for a valid
708 * page before the end of the upl is reached, if we
709 * find one, then it will be the last page of the request to
710 * 'cluster_io'
711 */
712 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
713 if (upl_valid_page(pl, last_pg))
714 break;
715 }
716
717 /*
 718  * if we find any more valid pages at the tail of the upl
 719  * then update maxra accordingly....
720 */
721 for (last_valid = last_pg; last_valid < pages_in_upl; last_valid++) {
722 if (!upl_valid_page(pl, last_valid))
723 break;
724 }
725 if (start_pg < last_pg) {
726 vm_offset_t upl_offset;
727
728 /*
729 * we found a range of 'invalid' pages that must be filled
730 * 'size' has already been clipped to the LEOF
731 * make sure it's at least a multiple of the device block size
732 */
733 upl_offset = start_pg * PAGE_SIZE;
734 io_size = (last_pg - start_pg) * PAGE_SIZE;
735
736 if ((upl_offset + io_size) > size) {
737 io_size = size - upl_offset;
738
739 KERNEL_DEBUG(0xd001000, upl_offset, size, io_size, 0, 0);
740 }
741 cluster_io(vp, upl, upl_offset, f_offset + upl_offset, io_size,
742 CL_READ | CL_COMMIT | CL_ASYNC | CL_AGE, (struct buf *)0);
743 }
744 if (start_pg) {
745 /*
 746  * a non-zero start_pg indicates that we found some already valid pages
 747  * at the beginning of the upl.... we need to release these without
 748  * modifying their state
749 */
750 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
751
752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
753 upl, 0, start_pg * PAGE_SIZE, 0, 0);
754 }
755 if (last_pg < pages_in_upl) {
756 /*
757 * the set of pages that we issued an I/O for did not extend all the
758 * way to the end of the upl... so just release them without modifying
 759  * their state
760 */
761 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
762 UPL_ABORT_FREE_ON_EMPTY);
763
764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 50)) | DBG_FUNC_NONE,
765 upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
766 }
767
768 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
769 (int)f_offset + (last_valid * PAGE_SIZE), 0, 0, 0, 0);
770
771 return(last_valid);
772 }
773
774
775
776 static void
777 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
778 struct vnode *vp;
779 daddr_t b_lblkno;
780 daddr_t e_lblkno;
781 off_t filesize;
782 int devblocksize;
783 {
784 daddr_t r_lblkno;
785 off_t f_offset;
786 int size_of_prefetch;
787 int max_iosize;
788 int max_pages;
789
790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
791 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
792
793 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
794 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
795 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
796 return;
797 }
798
799 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) && b_lblkno != (vp->v_maxra + 1))) {
800 vp->v_ralen = 0;
801 vp->v_maxra = 0;
802
803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
804 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
805
806 return;
807 }
808 vfs_io_attributes(vp, B_READ, &max_iosize, &max_pages);
809
810 if ((max_iosize / PAGE_SIZE) < max_pages)
811 max_pages = max_iosize / PAGE_SIZE;
812 if (max_pages > MAX_UPL_TRANSFER)
813 max_pages = MAX_UPL_TRANSFER;
814
815 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
816
817 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
818 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
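	/*
	 * the net effect of the two adjustments above: the read-ahead window
	 * roughly doubles on each sequential pass (1, 2, 4, 8, ... pages) and
	 * is widened to at least the span of the current request, with both
	 * adjustments capped at max_pages
	 */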
819
820 if (e_lblkno < vp->v_maxra) {
821 if ((vp->v_maxra - e_lblkno) > (max_pages / 4)) {
822
823 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
824 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
825 return;
826 }
827 }
828 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
829 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
830
831 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
832
833 if (size_of_prefetch)
834 vp->v_maxra = r_lblkno + (size_of_prefetch - 1);
835
836 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
837 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
838 }
839
840
841 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
842 struct vnode *vp;
843 upl_t upl;
844 vm_offset_t upl_offset;
845 off_t f_offset;
846 int size;
847 off_t filesize;
848 int devblocksize;
849 int flags;
850 {
851 int io_size;
852 int pg_size;
853 off_t max_size;
854 int local_flags = CL_PAGEOUT;
855
856 if ((flags & UPL_IOSYNC) == 0)
857 local_flags |= CL_ASYNC;
858 if ((flags & UPL_NOCOMMIT) == 0)
859 local_flags |= CL_COMMIT;
860
861 if (upl == (upl_t) 0)
862 panic("cluster_pageout: can't handle NULL upl yet\n");
863
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
866 (int)f_offset, size, (int)filesize, local_flags, 0);
867
868 /*
869 * If they didn't specify any I/O, then we are done...
870 * we can't issue an abort because we don't know how
871 * big the upl really is
872 */
873 if (size <= 0)
874 return (EINVAL);
875
876 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
877 if (local_flags & CL_COMMIT)
878 ubc_upl_abort_range(upl, upl_offset, size,
879 UPL_ABORT_FREE_ON_EMPTY);
880 return (EROFS);
881 }
882 /*
 883  * can't page-out to a negative offset
884 * or if we're starting beyond the EOF
885 * or if the file offset isn't page aligned
886 * or the size requested isn't a multiple of PAGE_SIZE
887 */
888 if (f_offset < 0 || f_offset >= filesize ||
889 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
890 if (local_flags & CL_COMMIT)
891 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
892 return (EINVAL);
893 }
894 max_size = filesize - f_offset;
895
896 if (size < max_size)
897 io_size = size;
898 else
899 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
900
901 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
902
903 if (size > pg_size) {
904 if (local_flags & CL_COMMIT)
905 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
906 UPL_ABORT_FREE_ON_EMPTY);
907 }
908 while (vp->v_numoutput >= ASYNC_THROTTLE) {
909 vp->v_flag |= VTHROTTLED;
910 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
911 }
912
913 return (cluster_io(vp, upl, upl_offset, f_offset, io_size,
914 local_flags, (struct buf *)0));
915 }
916
917
918 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
919 struct vnode *vp;
920 upl_t upl;
921 vm_offset_t upl_offset;
922 off_t f_offset;
923 int size;
924 off_t filesize;
925 int devblocksize;
926 int flags;
927 {
928 u_int io_size;
929 int pg_size;
930 off_t max_size;
931 int retval;
932 int local_flags = 0;
933
934
935 /*
936 * If they didn't ask for any data, then we are done...
937 * we can't issue an abort because we don't know how
938 * big the upl really is
939 */
940 if (size <= 0)
941 return (EINVAL);
942
943 if ((flags & UPL_NOCOMMIT) == 0)
944 local_flags = CL_COMMIT;
945
946 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
947 (int)f_offset, size, (int)filesize, local_flags, 0);
948
949 /*
950 * can't page-in from a negative offset
951 * or if we're starting beyond the EOF
952 * or if the file offset isn't page aligned
953 * or the size requested isn't a multiple of PAGE_SIZE
954 */
955 if (f_offset < 0 || f_offset >= filesize ||
956 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
957 if (local_flags & CL_COMMIT)
958 ubc_upl_abort_range(upl, upl_offset, size,
959 UPL_ABORT_ERROR | UPL_ABORT_FREE_ON_EMPTY);
960 return (EINVAL);
961 }
962 max_size = filesize - f_offset;
963
964 if (size < max_size)
965 io_size = size;
966 else
967 io_size = (max_size + (devblocksize - 1)) & ~(devblocksize - 1);
968
969 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
970
971 if (upl == (upl_t) 0) {
972 ubc_create_upl( vp,
973 f_offset,
974 pg_size,
975 &upl,
976 NULL,
977 UPL_FLAGS_NONE);
978
979 if (upl == (upl_t) 0)
980 return (EINVAL);
981
982 upl_offset = (vm_offset_t)0;
983 size = pg_size;
984 }
985 if (size > pg_size) {
986 if (local_flags & CL_COMMIT)
987 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
988 UPL_ABORT_FREE_ON_EMPTY);
989 }
990
991 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size,
992 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);
993
994 if (retval == 0) {
995 int b_lblkno;
996 int e_lblkno;
997
998 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
999 e_lblkno = (int)
1000 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
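		/*
		 * e.g. with 4K pages, an f_offset of 0x3000 and an io_size of
		 * 0x2800 give b_lblkno == 3 and e_lblkno == 5... these page-sized
		 * block numbers drive the sequential-access detection done by
		 * cluster_rd_ahead
		 */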
1001
1002 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF)) {
1003 /*
1004          * we haven't read the last page of the file in yet
1005 * so let's try to read ahead if we're in
1006 * a sequential access pattern
1007 */
1008 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1009 }
1010 vp->v_lastr = e_lblkno;
1011 }
1012 return (retval);
1013 }
1014
1015
1016 cluster_bp(bp)
1017 struct buf *bp;
1018 {
1019 off_t f_offset;
1020 int flags;
1021
1022 if (bp->b_pagelist == (upl_t) 0)
1023 panic("cluster_bp: can't handle NULL upl yet\n");
1024 if (bp->b_flags & B_READ)
1025 flags = CL_ASYNC | CL_NOMAP | CL_READ;
1026 else
1027 flags = CL_ASYNC | CL_NOMAP;
1028
1029 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1030
1031 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, flags, bp));
1032 }
1033
1034
1035 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1036 struct vnode *vp;
1037 struct uio *uio;
1038 off_t oldEOF;
1039 off_t newEOF;
1040 off_t headOff;
1041 off_t tailOff;
1042 int devblocksize;
1043 int flags;
1044 {
1045 int prev_resid;
1046 int clip_size;
1047 off_t max_io_size;
1048 struct iovec *iov;
1049 vm_offset_t upl_offset;
1050 int upl_size;
1051 int pages_in_pl;
1052 upl_page_info_t *pl;
1053 int upl_flags;
1054 upl_t upl;
1055 int retval = 0;
1056
1057
1058 if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
1059 {
1060 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1061 return(retval);
1062 }
1063
1064 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1065 {
1066 /* we know we have a resid, so this is safe */
1067 iov = uio->uio_iov;
1068 while (iov->iov_len == 0) {
1069 uio->uio_iov++;
1070 uio->uio_iovcnt--;
1071 iov = uio->uio_iov;
1072 }
1073
1074 /*
1075 * We check every vector target and if it is physically
1076 * contiguous space, we skip the sanity checks.
1077 */
1078
1079 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1080 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1081 pages_in_pl = 0;
1082 upl_flags = UPL_QUERY_OBJECT_TYPE;
1083 if ((vm_map_get_upl(current_map(),
1084 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1085 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1086 {
1087 /*
1088 * the user app must have passed in an invalid address
1089 */
1090 return (EFAULT);
1091 }
1092
1093 if (upl_flags & UPL_PHYS_CONTIG)
1094 {
1095 /*
1096 * since the interface to the IOKit below us uses physical block #'s and
1097 * block counts to specify the I/O, we can't handle anything that isn't
1098 * devblocksize aligned
1099 */
1100 if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
1101 return(EINVAL);
1102
1103 if (flags & IO_HEADZEROFILL)
1104 {
1105 flags &= ~IO_HEADZEROFILL;
1106
1107 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1108 return(retval);
1109 }
1110
1111 retval = cluster_phys_write(vp, uio);
1112
1113 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1114 {
1115 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1116 return(retval);
1117 }
1118 }
1119 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1120 {
1121 /*
1122            * We set a threshold of 4 pages to decide if the nocopy
1123 * write loop is worth the trouble...
1124 * we also come here if we're trying to zero the head and/or tail
1125 * of a partially written page, and the user source is not a physically contiguous region
1126 */
1127 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1128 return(retval);
1129 }
1130 else if (uio->uio_offset & PAGE_MASK_64)
1131 {
1132 /* Bring the file offset write up to a pagesize boundary */
1133 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1134 if (uio->uio_resid < clip_size)
1135 clip_size = uio->uio_resid;
1136 /*
1137 * Fake the resid going into the cluster_write_x call
1138 * and restore it on the way out.
1139 */
1140 prev_resid = uio->uio_resid;
1141 uio->uio_resid = clip_size;
1142 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1143 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
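			/*
			 * e.g. if the caller's resid was 10000 and clip_size is 2048,
			 * and cluster_write_x consumes 1000 of those bytes, it leaves
			 * uio_resid at 1048 and the line above restores the resid to
			 * 10000 - (2048 - 1048) == 9000... only the bytes actually
			 * moved are charged against the original request
			 */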
1144 }
1145 else if ((int)iov->iov_base & PAGE_MASK_64)
1146 {
1147 clip_size = iov->iov_len;
1148 prev_resid = uio->uio_resid;
1149 uio->uio_resid = clip_size;
1150 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1151 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1152 }
1153 else
1154 {
1155 /*
1156 * If we come in here, we know the offset into
1157 * the file is on a pagesize boundary
1158 */
1159
1160 max_io_size = newEOF - uio->uio_offset;
1161 clip_size = uio->uio_resid;
1162 if (iov->iov_len < clip_size)
1163 clip_size = iov->iov_len;
1164 if (max_io_size < clip_size)
1165 clip_size = max_io_size;
1166
1167 if (clip_size < PAGE_SIZE)
1168 {
1169 /*
1170 * Take care of tail end of write in this vector
1171 */
1172 prev_resid = uio->uio_resid;
1173 uio->uio_resid = clip_size;
1174 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1175 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1176 }
1177 else
1178 {
1179 /* round clip_size down to a multiple of pagesize */
1180 clip_size = clip_size & ~(PAGE_MASK);
1181 prev_resid = uio->uio_resid;
1182 uio->uio_resid = clip_size;
1183 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1184 if ((retval == 0) && uio->uio_resid)
1185 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1186 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1187 }
1188 } /* end else */
1189 } /* end while */
1190 return(retval);
1191 }
1192
1193 static
1194 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1195 struct vnode *vp;
1196 struct uio *uio;
1197 off_t newEOF;
1198 int devblocksize;
1199 int flags;
1200 {
1201 upl_t upl;
1202 upl_page_info_t *pl;
1203 off_t upl_f_offset;
1204 vm_offset_t upl_offset;
1205 off_t max_io_size;
1206 int io_size;
1207 int upl_size;
1208 int upl_needed_size;
1209 int pages_in_pl;
1210 int upl_flags;
1211 kern_return_t kret;
1212 struct iovec *iov;
1213 int i;
1214 int force_data_sync;
1215 int error = 0;
1216
1217 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1218 (int)uio->uio_offset, (int)uio->uio_resid,
1219 (int)newEOF, devblocksize, 0);
1220
1221 /*
1222 * When we enter this routine, we know
1223 * -- the offset into the file is on a pagesize boundary
1224 * -- the resid is a page multiple
1225 * -- the resid will not exceed iov_len
1226 */
1227
1228 iov = uio->uio_iov;
1229
1230 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1231 io_size = uio->uio_resid;
1232
1233 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1234 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1235
1236 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1237 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1238
1239 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1240 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
1241
1242 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1243 {
1244 pages_in_pl = 0;
1245 upl_size = upl_needed_size;
1246 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1247
1248 kret = vm_map_get_upl(current_map(),
1249 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1250 &upl_size,
1251 &upl,
1252 NULL,
1253 &pages_in_pl,
1254 &upl_flags,
1255 force_data_sync);
1256
1257 if (kret != KERN_SUCCESS)
1258 {
1259 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1260 0, 0, 0, kret, 0);
1261
1262 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1263 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1264
1265 /* cluster_nocopy_write: failed to get pagelist */
1266 /* do not return kret here */
1267 return(0);
1268 }
1269
1270 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1271 pages_in_pl = upl_size / PAGE_SIZE;
1272
1273 for(i=0; i < pages_in_pl; i++)
1274 {
1275 if (!upl_valid_page(pl, i))
1276 break;
1277 }
1278
1279 if (i == pages_in_pl)
1280 break;
1281
1282 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1283 UPL_ABORT_FREE_ON_EMPTY);
1284 }
1285
1286 if (force_data_sync >= 3)
1287 {
1288 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1289 i, pages_in_pl, upl_size, kret, 0);
1290
1291 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1292 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1293 return(0);
1294 }
1295
1296 /*
1297 * Consider the possibility that upl_size wasn't satisfied.
1298 */
1299 if (upl_size != upl_needed_size)
1300 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1301
1302 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1303 (int)upl_offset, upl_size, iov->iov_base, io_size, 0);
1304
1305 if (io_size == 0)
1306 {
1307 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1308 UPL_ABORT_FREE_ON_EMPTY);
1309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1310 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1311
1312 return(0);
1313 }
1314
1315 /*
1316 * Now look for pages already in the cache
1317 * and throw them away.
1318 */
1319
1320 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1321 max_io_size = io_size;
1322
1323 while (max_io_size) {
1324
1325 /*
1326 * Flag UPL_POP_DUMP says if the page is found
1327 * in the page cache it must be thrown away.
1328 */
1329 ubc_page_op(vp,
1330 upl_f_offset,
1331 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1332 0, 0);
1333 max_io_size -= PAGE_SIZE;
1334 upl_f_offset += PAGE_SIZE;
1335 }
1336
1337 /*
1338 * issue a synchronous write to cluster_io
1339 */
1340
1341 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1342 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1343
1344 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1345 io_size, 0, (struct buf *)0);
1346
1347 if (error == 0) {
1348 /*
1349 * The cluster_io write completed successfully,
1350 * update the uio structure.
1351 */
1352 iov->iov_base += io_size;
1353 iov->iov_len -= io_size;
1354 uio->uio_resid -= io_size;
1355 uio->uio_offset += io_size;
1356 }
1357 /*
1358 * always 'commit' the I/O via the abort primitive whether the I/O
1359          * succeeded cleanly or not... this is necessary to ensure that
1360 * we preserve the state of the DIRTY flag on the pages used to
1361 * provide the data for the I/O... the state of this flag SHOULD
1362 * NOT be changed by a write
1363 */
1364 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1365 UPL_ABORT_FREE_ON_EMPTY);
1366
1367
1368 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1369 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1370
1371 } /* end while */
1372
1373
1374 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1375 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1376
1377 return (error);
1378 }
1379
1380 static
1381 cluster_phys_write(vp, uio)
1382 struct vnode *vp;
1383 struct uio *uio;
1384 {
1385 upl_t upl;
1386 vm_offset_t upl_offset;
1387 int io_size;
1388 int upl_size;
1389 int upl_needed_size;
1390 int pages_in_pl;
1391 int upl_flags;
1392 kern_return_t kret;
1393 struct iovec *iov;
1394 int error = 0;
1395
1396 /*
1397 * When we enter this routine, we know
1398 * -- the resid will not exceed iov_len
1399  * -- the vector target address is physically contiguous
1400 */
1401
1402 iov = uio->uio_iov;
1403 io_size = iov->iov_len;
1404 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1405 upl_needed_size = upl_offset + io_size;
1406
1407 pages_in_pl = 0;
1408 upl_size = upl_needed_size;
1409 upl_flags = UPL_COPYOUT_FROM | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1410
1411 kret = vm_map_get_upl(current_map(),
1412 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1413 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1414
1415 if (kret != KERN_SUCCESS)
1416 {
1417 /* cluster_phys_write: failed to get pagelist */
1418 /* note: return kret here */
1419 return(EINVAL);
1420 }
1421
1422 /*
1423 * Consider the possibility that upl_size wasn't satisfied.
1424 * This is a failure in the physical memory case.
1425 */
1426 if (upl_size < upl_needed_size)
1427 {
1428 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1429 return(EINVAL);
1430 }
1431
1432 /*
1433 * issue a synchronous write to cluster_io
1434 */
1435
1436 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1437 io_size, CL_DEV_MEMORY, (struct buf *)0);
1438
1439 if (error == 0) {
1440 /*
1441 * The cluster_io write completed successfully,
1442 * update the uio structure and commit.
1443 */
1444
1445 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
1446
1447 iov->iov_base += io_size;
1448 iov->iov_len -= io_size;
1449 uio->uio_resid -= io_size;
1450 uio->uio_offset += io_size;
1451 }
1452 else
1453 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1454
1455 return (error);
1456 }
1457
1458 static
1459 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1460 struct vnode *vp;
1461 struct uio *uio;
1462 off_t oldEOF;
1463 off_t newEOF;
1464 off_t headOff;
1465 off_t tailOff;
1466 int devblocksize;
1467 int flags;
1468 {
1469 upl_page_info_t *pl;
1470 upl_t upl;
1471 vm_offset_t upl_offset;
1472 int upl_size;
1473 off_t upl_f_offset;
1474 int pages_in_upl;
1475 int start_offset;
1476 int xfer_resid;
1477 int io_size;
1478 int io_size_before_rounding;
1479 int io_flags;
1480 vm_offset_t io_address;
1481 int io_offset;
1482 int bytes_to_zero;
1483 int bytes_to_move;
1484 kern_return_t kret;
1485 int retval = 0;
1486 int uio_resid;
1487 long long total_size;
1488 long long zero_cnt;
1489 off_t zero_off;
1490 long long zero_cnt1;
1491 off_t zero_off1;
1492 daddr_t start_blkno;
1493 daddr_t last_blkno;
1494
1495 if (uio) {
1496 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1497 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1498
1499 uio_resid = uio->uio_resid;
1500 } else {
1501 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1502 0, 0, (int)oldEOF, (int)newEOF, 0);
1503
1504 uio_resid = 0;
1505 }
1506 zero_cnt = 0;
1507 zero_cnt1 = 0;
1508
1509 if (flags & IO_HEADZEROFILL) {
1510 /*
1511 * some filesystems (HFS is one) don't support unallocated holes within a file...
1512 * so we zero fill the intervening space between the old EOF and the offset
1513 * where the next chunk of real data begins.... ftruncate will also use this
1514 * routine to zero fill to the new EOF when growing a file... in this case, the
1515 * uio structure will not be provided
1516 */
1517 if (uio) {
1518 if (headOff < uio->uio_offset) {
1519 zero_cnt = uio->uio_offset - headOff;
1520 zero_off = headOff;
1521 }
1522 } else if (headOff < newEOF) {
1523 zero_cnt = newEOF - headOff;
1524 zero_off = headOff;
1525 }
1526 }
1527 if (flags & IO_TAILZEROFILL) {
1528 if (uio) {
1529 zero_off1 = uio->uio_offset + uio->uio_resid;
1530
1531 if (zero_off1 < tailOff)
1532 zero_cnt1 = tailOff - zero_off1;
1533 }
1534 }
1535 if (zero_cnt == 0 && uio == (struct uio *) 0)
1536 {
1537 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1538 retval, 0, 0, 0, 0);
1539 return (0);
1540 }
1541
1542 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1543 /*
1544 * for this iteration of the loop, figure out where our starting point is
1545 */
1546 if (zero_cnt) {
1547 start_offset = (int)(zero_off & PAGE_MASK_64);
1548 upl_f_offset = zero_off - start_offset;
1549 } else if (uio_resid) {
1550 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1551 upl_f_offset = uio->uio_offset - start_offset;
1552 } else {
1553 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1554 upl_f_offset = zero_off1 - start_offset;
1555 }
1556 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1557 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1558
1559 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1560 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1561
1562 /*
1563 * compute the size of the upl needed to encompass
1564 * the requested write... limit each call to cluster_io
1565 * to the maximum UPL size... cluster_io will clip if
1566           * this exceeds the maximum io_size for the device...
1567 * make sure to account for
1568 * a starting offset that's not page aligned
1569 */
1570 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1571
1572 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1573 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1574
1575 pages_in_upl = upl_size / PAGE_SIZE;
1576 io_size = upl_size - start_offset;
1577
1578 if ((long long)io_size > total_size)
1579 io_size = total_size;
1580
1581 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1582 last_blkno = start_blkno + pages_in_upl;
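		/*
		 * e.g. with 4K pages, an upl_f_offset of 0x10000 and 8 pages in the
		 * upl give start_blkno 16 and last_blkno 24... the delayed-write
		 * cluster state below (v_cstart, v_lastw, v_clen) is kept in these
		 * same page-sized block units
		 */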
1583
1584 kret = ubc_create_upl(vp,
1585 upl_f_offset,
1586 upl_size,
1587 &upl,
1588 &pl,
1589 UPL_FLAGS_NONE);
1590 if (kret != KERN_SUCCESS)
1591 panic("cluster_write: failed to get pagelist");
1592
1593 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1594 upl, (int)upl_f_offset, upl_size, start_offset, 0);
1595
1596 if (start_offset && !upl_valid_page(pl, 0)) {
1597 int read_size;
1598
1599 /*
1600 * we're starting in the middle of the first page of the upl
1601 * and the page isn't currently valid, so we're going to have
1602 * to read it in first... this is a synchronous operation
1603 */
1604 read_size = PAGE_SIZE;
1605
1606 if ((upl_f_offset + read_size) > newEOF) {
1607 read_size = newEOF - upl_f_offset;
1608 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1609 }
1610 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size,
1611 CL_READ, (struct buf *)0);
1612 if (retval) {
1613 /*
1614 * we had an error during the read which causes us to abort
1615 * the current cluster_write request... before we do, we need
1616 * to release the rest of the pages in the upl without modifying
1617 * there state and mark the failed page in error
1618          * their state and mark the failed page in error
1619 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1620 ubc_upl_abort(upl, 0);
1621
1622 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1623 upl, 0, 0, retval, 0);
1624 break;
1625 }
1626 }
1627 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1628 /*
1629 * the last offset we're writing to in this upl does not end on a page
1630 * boundary... if it's not beyond the old EOF, then we'll also need to
1631 * pre-read this page in if it isn't already valid
1632 */
1633 upl_offset = upl_size - PAGE_SIZE;
1634
1635 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1636 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1637 int read_size;
1638
1639 read_size = PAGE_SIZE;
1640
1641 if ((upl_f_offset + upl_offset + read_size) > newEOF) {
1642 read_size = newEOF - (upl_f_offset + upl_offset);
1643 read_size = (read_size + (devblocksize - 1)) & ~(devblocksize - 1);
1644 }
1645 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size,
1646 CL_READ, (struct buf *)0);
1647 if (retval) {
1648 /*
1649 * we had an error during the read which causes us to abort
1650 * the current cluster_write request... before we do, we
1651 * need to release the rest of the pages in the upl without
1652          * modifying their state and mark the failed page in error
1653 */
1654 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE,
1655 UPL_ABORT_DUMP_PAGES);
1656 ubc_upl_abort(upl, 0);
1657
1658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1659 upl, 0, 0, retval, 0);
1660 break;
1661 }
1662 }
1663 }
1664 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1665 panic("cluster_write: ubc_upl_map failed\n");
1666 xfer_resid = io_size;
1667 io_offset = start_offset;
1668
1669 while (zero_cnt && xfer_resid) {
1670
1671 if (zero_cnt < (long long)xfer_resid)
1672 bytes_to_zero = zero_cnt;
1673 else
1674 bytes_to_zero = xfer_resid;
1675
1676 if ( !(flags & IO_NOZEROVALID)) {
1677 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1678
1679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1680 (int)upl_f_offset + io_offset, bytes_to_zero,
1681 (int)zero_cnt, xfer_resid, 0);
1682 } else {
1683 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1684
1685 if ( !upl_valid_page(pl, (int)(zero_off / PAGE_SIZE_64))) {
1686 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1687
1688 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1689 (int)upl_f_offset + io_offset, bytes_to_zero,
1690 (int)zero_cnt, xfer_resid, 0);
1691 }
1692 }
1693 xfer_resid -= bytes_to_zero;
1694 zero_cnt -= bytes_to_zero;
1695 zero_off += bytes_to_zero;
1696 io_offset += bytes_to_zero;
1697 }
1698 if (xfer_resid && uio_resid) {
1699 bytes_to_move = min(uio_resid, xfer_resid);
1700
1701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1702 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1703
1704 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1705
1706 if (retval) {
1707 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1708 panic("cluster_write: kernel_upl_unmap failed\n");
1709 ubc_upl_abort(upl, UPL_ABORT_DUMP_PAGES);
1710
1711 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1712 upl, 0, 0, retval, 0);
1713 } else {
1714 uio_resid -= bytes_to_move;
1715 xfer_resid -= bytes_to_move;
1716 io_offset += bytes_to_move;
1717 }
1718 }
1719 while (xfer_resid && zero_cnt1 && retval == 0) {
1720
1721 if (zero_cnt1 < (long long)xfer_resid)
1722 bytes_to_zero = zero_cnt1;
1723 else
1724 bytes_to_zero = xfer_resid;
1725
1726 if ( !(flags & IO_NOZEROVALID)) {
1727 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1728
1729 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1730 (int)upl_f_offset + io_offset,
1731 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1732 } else {
1733 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1734 if ( !upl_valid_page(pl, (int)(zero_off1 / PAGE_SIZE_64))) {
1735 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1736
1737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1738 (int)upl_f_offset + io_offset,
1739 bytes_to_zero, (int)zero_cnt1, xfer_resid, 0);
1740 }
1741 }
1742 xfer_resid -= bytes_to_zero;
1743 zero_cnt1 -= bytes_to_zero;
1744 zero_off1 += bytes_to_zero;
1745 io_offset += bytes_to_zero;
1746 }
1747
1748 if (retval == 0) {
1749 int must_push;
1750 int can_delay;
1751
1752 io_size += start_offset;
1753
1754 if ((upl_f_offset + io_size) == newEOF && io_size < upl_size) {
1755 /*
1756 * if we're extending the file with this write
1757 * we'll zero fill the rest of the page so that
1758 * if the file gets extended again in such a way as to leave a
1759          * hole starting at this EOF, we'll have zeros in the correct spot
1760 */
1761 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1762
1763 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1764 (int)upl_f_offset + io_size,
1765 upl_size - io_size, 0, 0, 0);
1766 }
1767 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1768 panic("cluster_write: kernel_upl_unmap failed\n");
1769
1770 io_size_before_rounding = io_size;
1771
1772 if (io_size & (devblocksize - 1))
1773 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1774
1775 must_push = 0;
1776 can_delay = 0;
1777
1778 if (vp->v_clen) {
1779 int newsize;
1780
1781 /*
1782 * we have an existing cluster... see if this write will extend it nicely
1783 */
1784 if (start_blkno >= vp->v_cstart) {
1785 if (last_blkno <= (vp->v_cstart + vp->v_clen)) {
1786 /*
1787 * we have a write that fits entirely
1788 * within the existing cluster limits
1789 */
1790 if (last_blkno >= vp->v_lastw) {
1791 /*
1792 * if we're extending the dirty region within the cluster
1793 * we need to update the cluster info... we check for blkno
1794 * equality because we may be extending the file with a
1795 * partial write.... this in turn changes our idea of how
1796 * much data to write out (v_ciosiz) for the last page
1797 */
1798 vp->v_lastw = last_blkno;
1799 newsize = io_size + ((start_blkno - vp->v_cstart) * PAGE_SIZE);
1800
1801 if (newsize > vp->v_ciosiz)
1802 vp->v_ciosiz = newsize;
1803 }
1804 can_delay = 1;
1805 goto finish_io;
1806 }
1807 if (start_blkno < (vp->v_cstart + vp->v_clen)) {
1808 /*
1809 * we have a write that starts in the middle of the current cluster
1810 * but extends beyond the cluster's limit
1811 * we'll clip the current cluster if we actually
1812 * overlap with the new write and then push it out
1813 * and start a new cluster with the current write
1814 */
1815 if (vp->v_lastw > start_blkno) {
1816 vp->v_lastw = start_blkno;
1817 vp->v_ciosiz = (vp->v_lastw - vp->v_cstart) * PAGE_SIZE;
1818 }
1819 }
1820 /*
1821 * we also get here for the case where the current write starts
1822 * beyond the limit of the existing cluster
1823 */
1824 must_push = 1;
1825 goto check_delay;
1826 }
1827 /*
1828 * the current write starts in front of the current cluster
1829 */
1830 if (last_blkno > vp->v_cstart) {
1831 /*
1832 * the current write extends into the existing cluster
1833 */
1834 if ((vp->v_lastw - start_blkno) > vp->v_clen) {
1835 /*
1836 * if we were to combine this write with the current cluster
1837 * we would exceed the cluster size limit....
1838 * clip the current cluster by moving the start position
1839 * to where the current write ends, and then push it
1840 */
1841 vp->v_ciosiz -= (last_blkno - vp->v_cstart) * PAGE_SIZE;
1842 vp->v_cstart = last_blkno;
1843
1844 /*
1845 * round up the io_size to the nearest page size
1846 * since we've coalesced with at least 1 pre-existing
1847 * page in the current cluster... this write may have ended in the
1848 * middle of the page which would cause io_size to give us an
1849 * inaccurate view of how much I/O we actually need to do
1850 */
1851 io_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1852
1853 must_push = 1;
1854 goto check_delay;
1855 }
1856 /*
1857 * we can coalesce the current write with the existing cluster
1858 * adjust the cluster info to reflect this
1859 */
1860 if (last_blkno > vp->v_lastw) {
1861 /*
1862          * the current write completely overlaps
1863 * the existing cluster
1864 */
1865 vp->v_lastw = last_blkno;
1866 vp->v_ciosiz = io_size;
1867 } else {
1868 vp->v_ciosiz += (vp->v_cstart - start_blkno) * PAGE_SIZE;
1869
1870 if (io_size > vp->v_ciosiz)
1871 vp->v_ciosiz = io_size;
1872 }
1873 vp->v_cstart = start_blkno;
1874 can_delay = 1;
1875 goto finish_io;
1876 }
1877 /*
1878 * this I/O range is entirely in front of the current cluster
1879 * so we need to push the current cluster out before beginning
1880 * a new one
1881 */
1882 must_push = 1;
1883 }
1884 check_delay:
1885 if (must_push)
1886 cluster_push(vp);
1887
1888 if (io_size_before_rounding < (MAX_UPL_TRANSFER * PAGE_SIZE) && !(flags & IO_SYNC)) {
1889 vp->v_clen = MAX_UPL_TRANSFER;
1890 vp->v_cstart = start_blkno;
1891 vp->v_lastw = last_blkno;
1892 vp->v_ciosiz = io_size;
1893
1894 can_delay = 1;
1895 }
1896 finish_io:
1897 if (can_delay) {
1898 ubc_upl_commit_range(upl, 0, upl_size,
1899 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1900 continue;
1901 }
1902 if (flags & IO_SYNC)
1903 io_flags = CL_COMMIT | CL_AGE;
1904 else
1905 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1906
1907 if (vp->v_flag & VNOCACHE_DATA)
1908 io_flags |= CL_DUMP;
1909
1910 while (vp->v_numoutput >= ASYNC_THROTTLE) {
1911 vp->v_flag |= VTHROTTLED;
1912 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1913 }
1914 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size,
1915 io_flags, (struct buf *)0);
1916 }
1917 }
1918 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1919 retval, 0, 0, 0, 0);
1920
1921 return (retval);
1922 }
1923
1924 cluster_read(vp, uio, filesize, devblocksize, flags)
1925 struct vnode *vp;
1926 struct uio *uio;
1927 off_t filesize;
1928 int devblocksize;
1929 int flags;
1930 {
1931 int prev_resid;
1932 int clip_size;
1933 off_t max_io_size;
1934 struct iovec *iov;
1935 vm_offset_t upl_offset;
1936 int upl_size;
1937 int pages_in_pl;
1938 upl_page_info_t *pl;
1939 int upl_flags;
1940 upl_t upl;
1941 int retval = 0;
1942
1943 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1944 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1945
1946 /*
1947  * We set a threshold of 4 pages to decide if the nocopy
1948 * read loop is worth the trouble...
1949 */
1950
1951 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1952 {
1953 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1954 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
1955 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1956 return(retval);
1957 }
1958
1959 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
1960 {
1961 /* we know we have a resid, so this is safe */
1962 iov = uio->uio_iov;
1963 while (iov->iov_len == 0) {
1964 uio->uio_iov++;
1965 uio->uio_iovcnt--;
1966 iov = uio->uio_iov;
1967 }
1968
1969 /*
1970 * We check every vector target and if it is physically
1971 * contiguous space, we skip the sanity checks.
1972 */
1973
1974 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1975 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1976 pages_in_pl = 0;
1977 upl_flags = UPL_QUERY_OBJECT_TYPE;
1978 if((vm_map_get_upl(current_map(),
1979 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1980 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1981 {
1982 /*
1983 * the user app must have passed in an invalid address
1984 */
1985 return (EFAULT);
1986 }
1987
1988 if (upl_flags & UPL_PHYS_CONTIG)
1989 {
1990 retval = cluster_phys_read(vp, uio, filesize);
1991 }
1992 else if (uio->uio_resid < 4 * PAGE_SIZE)
1993 {
1994 /*
1995        * We set a threshold of 4 pages to decide if the nocopy
1996 * read loop is worth the trouble...
1997 */
1998 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1999 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2000 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2001 return(retval);
2002 }
2003 else if (uio->uio_offset & PAGE_MASK_64)
2004 {
2005        /* Bring the file offset up to a pagesize boundary with this read */
2006 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2007 if (uio->uio_resid < clip_size)
2008 clip_size = uio->uio_resid;
2009 /*
2010 * Fake the resid going into the cluster_read_x call
2011 * and restore it on the way out.
2012 */
2013 prev_resid = uio->uio_resid;
2014 uio->uio_resid = clip_size;
2015 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2016 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2017 }
2018 else if ((int)iov->iov_base & PAGE_MASK_64)
2019 {
2020 clip_size = iov->iov_len;
2021 prev_resid = uio->uio_resid;
2022 uio->uio_resid = clip_size;
2023 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2024 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2025 }
2026 else
2027 {
2028 /*
2029 * If we come in here, we know the offset into
2030 * the file is on a pagesize boundary
2031 */
2032
2033 max_io_size = filesize - uio->uio_offset;
2034 clip_size = uio->uio_resid;
2035 if (iov->iov_len < clip_size)
2036 clip_size = iov->iov_len;
2037 if (max_io_size < clip_size)
2038 clip_size = (int)max_io_size;
2039
2040 if (clip_size < PAGE_SIZE)
2041 {
2042 /*
2043 * Take care of the tail end of the read in this vector.
2044 */
2045 prev_resid = uio->uio_resid;
2046 uio->uio_resid = clip_size;
2047 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2048 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2049 }
2050 else
2051 {
2052 /* round clip_size down to a multiple of pagesize */
2053 clip_size = clip_size & ~(PAGE_MASK);
2054 prev_resid = uio->uio_resid;
2055 uio->uio_resid = clip_size;
2056 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
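                  /*
                   * if the nocopy read left part of the request unsatisfied
                   * (e.g. it ran into pages that are already resident in
                   * the cache), finish the remainder through the cached path
                   */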
2057 if ((retval==0) && uio->uio_resid)
2058 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2059 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2060 }
2061 } /* end else */
2062 } /* end while */
2063
2064 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2065 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2066
2067 return(retval);
2068 }
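
/*
 * Hypothetical usage sketch (not part of this file): a filesystem's
 * VOP_READ entry point would typically hand the request off to
 * cluster_read once it knows the current file size and the device
 * block size, roughly
 *
 *	error = cluster_read(vp, uio, (off_t)file_size,
 *	                     dev_block_size, ap->a_ioflag);
 *
 * where file_size, dev_block_size and ap->a_ioflag stand in for the
 * calling filesystem's own state.
 */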
2069
2070 static
2071 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2072 struct vnode *vp;
2073 struct uio *uio;
2074 off_t filesize;
2075 int devblocksize;
2076 int flags;
2077 {
2078 upl_page_info_t *pl;
2079 upl_t upl;
2080 vm_offset_t upl_offset;
2081 int upl_size;
2082 off_t upl_f_offset;
2083 int start_offset;
2084 int start_pg;
2085 int last_pg;
2086 int uio_last;
2087 int pages_in_upl;
2088 off_t max_size;
2089 int io_size;
2090 vm_offset_t io_address;
2091 kern_return_t kret;
2092 int segflg;
2093 int error = 0;
2094 int retval = 0;
2095 int b_lblkno;
2096 int e_lblkno;
2097
2098 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2099
2100 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2101 /*
2102 * compute the size of the upl needed to encompass
2103 * the requested read... limit each call to cluster_io
2104 * to the maximum UPL size... cluster_io will clip if
2105 * this exceeds the maximum io_size for the device,
2106          * this exceeds the maximum io_size for the device...
2107          * make sure to account for
2108 */
2109 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2110 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2111 max_size = filesize - uio->uio_offset;
2112
2113 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2114 io_size = uio->uio_resid;
2115 else
2116 io_size = max_size;
2117 #ifdef ppc
2118 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2119 segflg = uio->uio_segflg;
2120
2121 uio->uio_segflg = UIO_PHYS_USERSPACE;
2122
2123 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2124 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2125
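                  /*
                   * try to satisfy the read directly from pages that are
                   * already resident in the cache... ubc_page_op hands back
                   * the physical address of a resident page (and marks it
                   * busy), the data is copied out with uiomove and the page
                   * is then released
                   */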
2126 while (io_size && retval == 0) {
2127 int xsize;
2128 vm_offset_t paddr;
2129
2130 if (ubc_page_op(vp,
2131 upl_f_offset,
2132 UPL_POP_SET | UPL_POP_BUSY,
2133 &paddr, 0) != KERN_SUCCESS)
2134 break;
2135
2136 xsize = PAGE_SIZE - start_offset;
2137
2138 if (xsize > io_size)
2139 xsize = io_size;
2140
2141 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2142
2143 ubc_page_op(vp, upl_f_offset,
2144 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2145
2146 io_size -= xsize;
2147 start_offset = (int)
2148 (uio->uio_offset & PAGE_MASK_64);
2149 upl_f_offset = uio->uio_offset - start_offset;
2150 }
2151 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2152 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2153
2154 uio->uio_segflg = segflg;
2155
2156 if (retval)
2157 break;
2158
2159 if (io_size == 0) {
2160 /*
2161                          * we're already finished with this read request...
2162 * let's see if we should do a read-ahead
2163 */
2164 e_lblkno = (int)
2165 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2166
2167 if (!(vp->v_flag & VRAOFF))
2168 /*
2169 * let's try to read ahead if we're in
2170 * a sequential access pattern
2171 */
2172 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2173 vp->v_lastr = e_lblkno;
2174
2175 break;
2176 }
2177 max_size = filesize - uio->uio_offset;
2178 }
2179 #endif
2180 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2181 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2182 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2183 pages_in_upl = upl_size / PAGE_SIZE;
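          /*
           * e.g. with 4K pages, a 0x200 byte start_offset combined with a
           * 0x1000 byte io_size rounds up to a 2 page (0x2000 byte) upl,
           * since the transfer straddles a page boundary
           */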
2184
2185 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2186 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2187
2188 kret = ubc_create_upl(vp,
2189 upl_f_offset,
2190 upl_size,
2191 &upl,
2192 &pl,
2193 UPL_FLAGS_NONE);
2194 if (kret != KERN_SUCCESS)
2195 panic("cluster_read: failed to get pagelist");
2196
2197 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2198 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2199
2200 /*
2201 * scan from the beginning of the upl looking for the first
2202 * non-valid page.... this will become the first page in
2203 * the request we're going to make to 'cluster_io'... if all
2204 * of the pages are valid, we won't call through to 'cluster_io'
2205 */
2206 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2207 if (!upl_valid_page(pl, start_pg))
2208 break;
2209 }
2210
2211 /*
2212 * scan from the starting invalid page looking for a valid
2213          * page before the end of the upl is reached... if we
2214 * find one, then it will be the last page of the request to
2215 * 'cluster_io'
2216 */
2217 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2218 if (upl_valid_page(pl, last_pg))
2219 break;
2220 }
2221
2222 if (start_pg < last_pg) {
2223 /*
2224                  * we found a range of 'invalid' pages that must be filled...
2225                  * if the last page in this range is the last page of the file,
2226 * we may have to clip the size of it to keep from reading past
2227 * the end of the last physical block associated with the file
2228 */
2229 upl_offset = start_pg * PAGE_SIZE;
2230 io_size = (last_pg - start_pg) * PAGE_SIZE;
2231
2232 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2233 io_size = filesize - (upl_f_offset + upl_offset);
2234 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2235 }
2236 /*
2237 * issue a synchronous read to cluster_io
2238 */
2239
2240 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2241 io_size, CL_READ, (struct buf *)0);
2242 }
2243 if (error == 0) {
2244 /*
2245 * if the read completed successfully, or there was no I/O request
2246                  * issued, then map the upl into kernel address space and
2247 * move the data into user land.... we'll first add on any 'valid'
2248 * pages that were present in the upl when we acquired it.
2249 */
2250 u_int val_size;
2251 u_int size_of_prefetch;
2252
2253 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2254 if (!upl_valid_page(pl, uio_last))
2255 break;
2256 }
2257 /*
2258 * compute size to transfer this round, if uio->uio_resid is
2259 * still non-zero after this uiomove, we'll loop around and
2260 * set up for another I/O.
2261 */
2262 val_size = (uio_last * PAGE_SIZE) - start_offset;
2263
2264 if (max_size < val_size)
2265 val_size = max_size;
2266
2267 if (uio->uio_resid < val_size)
2268 val_size = uio->uio_resid;
2269
2270 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2271
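                  /*
                   * note that the assignment below is intended...
                   * size_of_prefetch doubles as the test for whether any of
                   * this request remains beyond what we're about to copy out
                   */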
2272 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2273 /*
2274 * if there's still I/O left to do for this request, then issue a
2275 * pre-fetch I/O... the I/O wait time will overlap
2276 * with the copying of the data
2277 */
2278 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2279 } else {
2280 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2281 /*
2282 * let's try to read ahead if we're in
2283 * a sequential access pattern
2284 */
2285 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2286 vp->v_lastr = e_lblkno;
2287 }
2288 #ifdef ppc
2289 if (uio->uio_segflg == UIO_USERSPACE) {
2290 int offset;
2291
2292 segflg = uio->uio_segflg;
2293
2294 uio->uio_segflg = UIO_PHYS_USERSPACE;
2295
2296
2297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2298 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2299
2300 offset = start_offset;
2301
2302 while (val_size && retval == 0) {
2303 int csize;
2304 int i;
2305 caddr_t paddr;
2306
2307 i = offset / PAGE_SIZE;
2308 csize = min(PAGE_SIZE - start_offset, val_size);
2309
2310 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2311
2312 retval = uiomove(paddr, csize, uio);
2313
2314 val_size -= csize;
2315 offset += csize;
2316 start_offset = offset & PAGE_MASK;
2317 }
2318 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2319 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2320
2321 uio->uio_segflg = segflg;
2322 } else
2323 #endif
2324 {
2325 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2326 panic("cluster_read: ubc_upl_map() failed\n");
2327
2328 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2329
2330 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2331 panic("cluster_read: ubc_upl_unmap() failed\n");
2332 }
2333 }
2334 if (start_pg < last_pg) {
2335 /*
2336 * compute the range of pages that we actually issued an I/O for
2337 * and either commit them as valid if the I/O succeeded
2338 * or abort them if the I/O failed
2339 */
2340 io_size = (last_pg - start_pg) * PAGE_SIZE;
2341
2342 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2343 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2344
2345 if (error || (vp->v_flag & VNOCACHE_DATA))
2346 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2347 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2348 else
2349 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2350 UPL_COMMIT_CLEAR_DIRTY
2351 | UPL_COMMIT_FREE_ON_EMPTY
2352 | UPL_COMMIT_INACTIVATE);
2353
2354 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2355 upl, start_pg * PAGE_SIZE, io_size, error, 0);
2356 }
2357 if ((last_pg - start_pg) < pages_in_upl) {
2358 int cur_pg;
2359 int commit_flags;
2360
2361 /*
2362 * the set of pages that we issued an I/O for did not encompass
2363 * the entire upl... so just release these without modifying
2364                  * their state
2365 */
2366 if (error)
2367 ubc_upl_abort(upl, 0);
2368 else {
2369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2370 upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2371
2372 if (start_pg) {
2373 /*
2374 * we found some already valid pages at the beginning of
2375                          * the upl... commit these back to the inactive list with
2376 * reference cleared
2377 */
2378 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2379 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2380 | UPL_COMMIT_INACTIVATE;
2381
2382 if (upl_dirty_page(pl, cur_pg))
2383 commit_flags |= UPL_COMMIT_SET_DIRTY;
2384
2385 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2386 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2387 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2388 else
2389 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2390 PAGE_SIZE, commit_flags);
2391 }
2392 }
2393 if (last_pg < uio_last) {
2394 /*
2395 * we found some already valid pages immediately after the
2396                          * pages we issued I/O for... commit these back to the
2397 * inactive list with reference cleared
2398 */
2399 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2400 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2401 | UPL_COMMIT_INACTIVATE;
2402
2403 if (upl_dirty_page(pl, cur_pg))
2404 commit_flags |= UPL_COMMIT_SET_DIRTY;
2405
2406 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2407 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2408 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2409 else
2410 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2411 PAGE_SIZE, commit_flags);
2412 }
2413 }
2414 if (uio_last < pages_in_upl) {
2415 /*
2416 * there were some invalid pages beyond the valid pages
2417                          * that we didn't issue an I/O for... just release them
2418 * unchanged
2419 */
2420 ubc_upl_abort(upl, 0);
2421 }
2422
2423 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2424 upl, -1, -1, 0, 0);
2425 }
2426 }
2427 if (retval == 0)
2428 retval = error;
2429 }
2430
2431 return (retval);
2432 }
2433
2434 static
2435 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2436 struct vnode *vp;
2437 struct uio *uio;
2438 off_t filesize;
2439 int devblocksize;
2440 int flags;
2441 {
2442 upl_t upl;
2443 upl_page_info_t *pl;
2444 off_t upl_f_offset;
2445 vm_offset_t upl_offset;
2446 off_t start_upl_f_offset;
2447 off_t max_io_size;
2448 int io_size;
2449 int upl_size;
2450 int upl_needed_size;
2451 int pages_in_pl;
2452 vm_offset_t paddr;
2453 int upl_flags;
2454 kern_return_t kret;
2455 int segflg;
2456 struct iovec *iov;
2457 int i;
2458 int force_data_sync;
2459 int error = 0;
2460 int retval = 0;
2461
2462 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2463 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2464
2465 /*
2466 * When we enter this routine, we know
2467 * -- the offset into the file is on a pagesize boundary
2468 * -- the resid is a page multiple
2469 * -- the resid will not exceed iov_len
2470 */
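  /*
   * (these invariants are established by cluster_read, which only
   * dispatches here after checking the file offset and user buffer
   * alignment and rounding the clipped resid down to a page multiple)
   */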
2471
2472 iov = uio->uio_iov;
2473 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2474
2475 max_io_size = filesize - uio->uio_offset;
2476
2477 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2478 io_size = max_io_size;
2479 else
2480 io_size = uio->uio_resid;
2481
2482 /*
2483 * We don't come into this routine unless
2484 * UIO_USERSPACE is set.
2485 */
2486 segflg = uio->uio_segflg;
2487
2488 uio->uio_segflg = UIO_PHYS_USERSPACE;
2489
2490 /*
2491 * First look for pages already in the cache
2492 * and move them to user space.
2493 */
2494 while (io_size && (retval == 0)) {
2495 upl_f_offset = uio->uio_offset;
2496
2497 /*
2498 * If this call fails, it means the page is not
2499 * in the page cache.
2500 */
2501 if (ubc_page_op(vp, upl_f_offset,
2502 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2503 break;
2504
2505 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2506
2507 ubc_page_op(vp, upl_f_offset,
2508 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2509
2510 io_size -= PAGE_SIZE;
2511 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2512 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2513 }
2514
2515 uio->uio_segflg = segflg;
2516
2517 if (retval)
2518 {
2519 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2520 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2521 return(retval);
2522 }
2523
2524 /* If we are already finished with this read, then return */
2525 if (io_size == 0)
2526 {
2527
2528 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2529 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2530 return(0);
2531 }
2532
2533 max_io_size = io_size;
2534 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2535 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2536
2537 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2538 upl_f_offset = start_upl_f_offset;
2539 io_size = 0;
2540
2541 while(io_size < max_io_size)
2542 {
2543
2544 if(ubc_page_op(vp, upl_f_offset,
2545 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2546 {
2547 ubc_page_op(vp, upl_f_offset,
2548 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2549 break;
2550 }
2551
2552 /*
2553 * Build up the io request parameters.
2554 */
2555
2556 io_size += PAGE_SIZE;
2557 upl_f_offset += PAGE_SIZE;
2558 }
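        /*
         * at this point the range starting at start_upl_f_offset and
         * covering io_size bytes contains only pages that are not already
         * resident in the cache... these are the pages we'll try to read
         * directly into the user's buffer
         */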
2559
2560 if (io_size == 0)
2561 return(retval);
2562
2563 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2564 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2565
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2567 (int)upl_offset, upl_needed_size, iov->iov_base, io_size, 0);
2568
2569 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2570 {
2571 pages_in_pl = 0;
2572 upl_size = upl_needed_size;
2573 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2574
2575 kret = vm_map_get_upl(current_map(),
2576 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2577 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2578
2579 if (kret != KERN_SUCCESS)
2580 {
2581 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2582 (int)upl_offset, upl_size, io_size, kret, 0);
2583
2584 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2585 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2586
2587 /* cluster_nocopy_read: failed to get pagelist */
2588 /* do not return kret here */
2589 return(retval);
2590 }
2591
2592 pages_in_pl = upl_size / PAGE_SIZE;
2593 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2594
2595 for(i=0; i < pages_in_pl; i++)
2596 {
2597 if (!upl_valid_page(pl, i))
2598 break;
2599 }
2600 if (i == pages_in_pl)
2601 break;
2602
2603 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2604 UPL_ABORT_FREE_ON_EMPTY);
2605 }
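        /*
         * the loop above retries vm_map_get_upl with increasing
         * force_data_sync values until every page backing the user's
         * buffer shows up valid in the returned page list... if three
         * attempts aren't enough, we bail out below and let the caller
         * satisfy the remainder through the cached path
         */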
2606
2607 if (force_data_sync >= 3)
2608 {
2609 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2610 (int)upl_offset, upl_size, io_size, kret, 0);
2611
2612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2613 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2614 return(retval);
2615 }
2616 /*
2617 * Consider the possibility that upl_size wasn't satisfied.
2618 */
2619 if (upl_size != upl_needed_size)
2620 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2621
2622 if (io_size == 0)
2623 {
2624 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2625 UPL_ABORT_FREE_ON_EMPTY);
2626 return(retval);
2627 }
2628
2629 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2630 (int)upl_offset, upl_size, io_size, kret, 0);
2631
2632 /*
2633 * issue a synchronous read to cluster_io
2634 */
2635
2636 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2637 upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2638
2639 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2640 io_size, CL_READ| CL_NOZERO, (struct buf *)0);
2641
2642 if (error == 0) {
2643 /*
2644 * The cluster_io read completed successfully,
2645 * update the uio structure and commit.
2646 */
2647
2648 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2649 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2650
2651 iov->iov_base += io_size;
2652 iov->iov_len -= io_size;
2653 uio->uio_resid -= io_size;
2654 uio->uio_offset += io_size;
2655 }
2656 else {
2657 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2658 UPL_ABORT_FREE_ON_EMPTY);
2659 }
2660
2661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2662 upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2663
2664 if (retval == 0)
2665 retval = error;
2666
2667 } /* end while */
2668
2669
2670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2671 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2672
2673 return (retval);
2674 }
2675
2676
2677 static
2678 cluster_phys_read(vp, uio, filesize)
2679 struct vnode *vp;
2680 struct uio *uio;
2681 off_t filesize;
2682 {
2683 upl_t upl;
2684 vm_offset_t upl_offset;
2685 off_t max_size;
2686 int io_size;
2687 int upl_size;
2688 int upl_needed_size;
2689 int pages_in_pl;
2690 int upl_flags;
2691 kern_return_t kret;
2692 struct iovec *iov;
2693 int error;
2694
2695 /*
2696 * When we enter this routine, we know
2697 * -- the resid will not exceed iov_len
2698 * -- the target address is physically contiguous
2699 */
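  /*
   * (cluster_read only dispatches here after vm_map_get_upl has reported
   * UPL_PHYS_CONTIG for the user's buffer)
   */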
2700
2701 iov = uio->uio_iov;
2702
2703 max_size = filesize - uio->uio_offset;
2704
2705 if (max_size < (off_t)((unsigned int)iov->iov_len))
2706 io_size = max_size;
2707 else
2708 io_size = iov->iov_len;
2709
2710 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2711 upl_needed_size = upl_offset + io_size;
2712
2713 pages_in_pl = 0;
2714 upl_size = upl_needed_size;
2715 upl_flags = UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2716
2717 kret = vm_map_get_upl(current_map(),
2718 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2719 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2720
2721 if (kret != KERN_SUCCESS)
2722 {
2723 /* cluster_phys_read: failed to get pagelist */
2724 return(EINVAL);
2725 }
2726
2727 /*
2728 * Consider the possibility that upl_size wasn't satisfied.
2729 */
2730 if (upl_size < upl_needed_size)
2731 {
2732 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2733 return(EINVAL);
2734 }
2735
2736 /*
2737 * issue a synchronous read to cluster_io
2738 */
2739
2740 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
2741 io_size, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
2742
2743 if (error == 0)
2744 {
2745 /*
2746 * The cluster_io read completed successfully,
2747 * update the uio structure and commit.
2748 */
2749
2750 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2751
2752 iov->iov_base += io_size;
2753 iov->iov_len -= io_size;
2754 uio->uio_resid -= io_size;
2755 uio->uio_offset += io_size;
2756 }
2757 else
2758 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2759
2760 return (error);
2761 }
2762
2763 /*
2764  * generate advisory I/O's in the largest chunks possible...
2765 * the completed pages will be released into the VM cache
2766 */
2767 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2768 struct vnode *vp;
2769 off_t filesize;
2770 off_t f_offset;
2771 int resid;
2772 int devblocksize;
2773 {
2774 upl_page_info_t *pl;
2775 upl_t upl;
2776 vm_offset_t upl_offset;
2777 int upl_size;
2778 off_t upl_f_offset;
2779 int start_offset;
2780 int start_pg;
2781 int last_pg;
2782 int pages_in_upl;
2783 off_t max_size;
2784 int io_size;
2785 kern_return_t kret;
2786 int retval = 0;
2787
2788
2789 if (!UBCINFOEXISTS(vp))
2790 return(EINVAL);
2791
2792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2793 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2794
2795 while (resid && f_offset < filesize && retval == 0) {
2796 /*
2797 * compute the size of the upl needed to encompass
2798 * the requested read... limit each call to cluster_io
2799 * to the maximum UPL size... cluster_io will clip if
2800          * this exceeds the maximum io_size for the device...
2801          * make sure to account for
2802 * a starting offset that's not page aligned
2803 */
2804 start_offset = (int)(f_offset & PAGE_MASK_64);
2805 upl_f_offset = f_offset - (off_t)start_offset;
2806 max_size = filesize - f_offset;
2807
2808 if (resid < max_size)
2809 io_size = resid;
2810 else
2811 io_size = max_size;
2812
2813 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2814 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2815 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2816 pages_in_upl = upl_size / PAGE_SIZE;
2817
2818 kret = ubc_create_upl(vp,
2819 upl_f_offset,
2820 upl_size,
2821 &upl,
2822 &pl,
2823 UPL_FLAGS_NONE);
2824 if (kret != KERN_SUCCESS)
2825 panic("advisory_read: failed to get pagelist");
2826
2827
2828 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2829 upl, (int)upl_f_offset, upl_size, start_offset, 0);
2830
2831 /*
2832 * scan from the beginning of the upl looking for the first
2833 * non-valid page.... this will become the first page in
2834 * the request we're going to make to 'cluster_io'... if all
2835 * of the pages are valid, we won't call through to 'cluster_io'
2836 */
2837 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2838 if (!upl_valid_page(pl, start_pg))
2839 break;
2840 }
2841
2842 /*
2843 * scan from the starting invalid page looking for a valid
2844          * page before the end of the upl is reached... if we
2845 * find one, then it will be the last page of the request to
2846 * 'cluster_io'
2847 */
2848 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2849 if (upl_valid_page(pl, last_pg))
2850 break;
2851 }
2852
2853 if (start_pg < last_pg) {
2854 /*
2855                  * we found a range of 'invalid' pages that must be filled...
2856                  * if the last page in this range is the last page of the file,
2857 * we may have to clip the size of it to keep from reading past
2858 * the end of the last physical block associated with the file
2859 */
2860 upl_offset = start_pg * PAGE_SIZE;
2861 io_size = (last_pg - start_pg) * PAGE_SIZE;
2862
2863 if ((upl_f_offset + upl_offset + io_size) > filesize) {
2864 io_size = filesize - (upl_f_offset + upl_offset);
2865 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2866 }
2867 /*
2868 * issue an asynchronous read to cluster_io
2869 */
2870 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size,
2871 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
2872 }
2873 if (start_pg) {
2874 /*
2875 * start_pg of non-zero indicates we found some already valid pages
2876 * at the beginning of the upl.... we need to release these without
2877                  * modifying their state
2878 */
2879 ubc_upl_abort_range(upl, 0, start_pg * PAGE_SIZE,
2880 UPL_ABORT_FREE_ON_EMPTY);
2881
2882 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 62)) | DBG_FUNC_NONE,
2883 upl, 0, start_pg * PAGE_SIZE, 0, 0);
2884 }
2885 if (last_pg < pages_in_upl) {
2886 /*
2887 * the set of pages that we issued an I/O for did not extend all the
2888                  * way to the end of the upl... so just release them without modifying
2889                  * their state
2890 */
2891 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, (pages_in_upl - last_pg) * PAGE_SIZE,
2892 UPL_ABORT_FREE_ON_EMPTY);
2893
2894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 63)) | DBG_FUNC_NONE,
2895 upl, last_pg * PAGE_SIZE,
2896 (pages_in_upl - last_pg) * PAGE_SIZE, 0, 0);
2897 }
2898 io_size = (last_pg * PAGE_SIZE) - start_offset;
2899
2900 if (io_size > resid)
2901 io_size = resid;
2902 f_offset += io_size;
2903 resid -= io_size;
2904 }
2905 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2906 (int)f_offset, resid, retval, 0, 0);
2907
2908 return(retval);
2909 }
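
/*
 * Hypothetical usage sketch (not part of this file): advisory_read is
 * meant for callers that want to pull file data into the VM cache
 * without copying it anywhere, e.g. a filesystem servicing a read-ahead
 * hint might do something like
 *
 *	advisory_read(vp, file_size, hint_offset, hint_length,
 *	              dev_block_size);
 *
 * where the arguments come from the caller's own bookkeeping.
 */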
2910
2911
2912 cluster_push(vp)
2913 struct vnode *vp;
2914 {
2915 upl_page_info_t *pl;
2916 upl_t upl;
2917 vm_offset_t upl_offset;
2918 int upl_size;
2919 off_t upl_f_offset;
2920 int pages_in_upl;
2921 int start_pg;
2922 int last_pg;
2923 int io_size;
2924 int io_flags;
2925 int size;
2926 kern_return_t kret;
2927
2928
2929 if (!UBCINFOEXISTS(vp))
2930 return(0);
2931
2932 if (vp->v_clen == 0 || (pages_in_upl = vp->v_lastw - vp->v_cstart) == 0)
2933 return (0);
2934 upl_size = pages_in_upl * PAGE_SIZE;
2935 upl_f_offset = ((off_t)vp->v_cstart) * PAGE_SIZE_64;
2936 size = vp->v_ciosiz;
2937 vp->v_clen = 0;
2938
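  /*
   * sanity check... v_ciosiz must account for all but at most one
   * partial page of the cluster we're about to push
   */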
2939 if (size > upl_size || (upl_size - size) > PAGE_SIZE)
2940 panic("cluster_push: v_ciosiz doesn't match size of cluster\n");
2941
2942 kret = ubc_create_upl(vp,
2943 upl_f_offset,
2944 upl_size,
2945 &upl,
2946 &pl,
2947 UPL_FLAGS_NONE);
2948 if (kret != KERN_SUCCESS)
2949 panic("cluster_push: failed to get pagelist");
2950
2951 last_pg = 0;
2952
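  /*
   * walk the upl looking for runs of valid dirty pages... runs of clean
   * (or invalid) pages are simply released, while each dirty run is
   * written out asynchronously through cluster_io
   */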
2953 while (size) {
2954
2955 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2956 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
2957 break;
2958 }
2959 if (start_pg > last_pg) {
2960 io_size = (start_pg - last_pg) * PAGE_SIZE;
2961
2962 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
2963 UPL_ABORT_FREE_ON_EMPTY);
2964
2965 if (io_size < size)
2966 size -= io_size;
2967 else
2968 break;
2969 }
2970 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2971 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
2972 break;
2973 }
2974 upl_offset = start_pg * PAGE_SIZE;
2975
2976 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
2977
2978 if (vp->v_flag & VNOCACHE_DATA)
2979 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
2980 else
2981 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2982
2983 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2984 vp->v_flag |= VTHROTTLED;
2985 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
2986 }
2987 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (struct buf *)0);
2988
2989 size -= io_size;
2990 }
2991 return(1);
2992 }