1 /*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23 /*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/buf.h>
61 #include <sys/vnode.h>
62 #include <sys/mount.h>
63 #include <sys/trace.h>
64 #include <sys/malloc.h>
65 #include <sys/resourcevar.h>
66 #include <libkern/libkern.h>
67
68 #include <sys/ubc.h>
69 #include <vm/vm_pageout.h>
70
71 #include <sys/kdebug.h>
72
73 #define CL_READ 0x01
74 #define CL_ASYNC 0x02
75 #define CL_COMMIT 0x04
76 #define CL_PAGEOUT 0x10
77 #define CL_AGE 0x20
78 #define CL_DUMP 0x40
79 #define CL_NOZERO 0x80
80 #define CL_PAGEIN 0x100
81 #define CL_DEV_MEMORY 0x200
82 #define CL_PRESERVE 0x400
83
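/*
 * per-stream accounting shared between an issuer and cluster_iodone():
 * io_issued / io_completed track bytes handed to and finished by the
 * device, io_error / io_offset record the first failure, and io_wanted
 * lets an issuer sleeping on &io_wanted be woken as completions catch
 * up (see the wakeup in cluster_iodone()).
 */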
84 struct clios {
85 u_int io_completed;
86 u_int io_issued;
87 off_t io_offset;
88 int io_error;
89 int io_wanted;
90 };
91
92
93 static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
94 int size, struct buf *bp);
95 static int cluster_read_x(struct vnode *vp, struct uio *uio,
96 off_t filesize, int devblocksize, int flags);
97 static int cluster_write_x(struct vnode *vp, struct uio *uio,
98 off_t oldEOF, off_t newEOF, off_t headOff,
99 off_t tailOff, int devblocksize, int flags);
100 static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
101 off_t filesize, int devblocksize, int flags);
102 static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
103 off_t newEOF, int devblocksize, int flags);
104 static int cluster_phys_read(struct vnode *vp, struct uio *uio,
105 off_t filesize, int devblocksize, int flags);
106 static int cluster_phys_write(struct vnode *vp, struct uio *uio,
107 off_t newEOF, int devblocksize, int flags);
108 static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
109 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
110 static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
111 static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
112
113
114 /*
115 * throttle the number of async writes that
116 * can be outstanding on a single vnode
117 * before we issue a synchronous write
118 */
119 #define ASYNC_THROTTLE 9
120
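/*
 * writers that would exceed this limit set VTHROTTLED and sleep on
 * &vp->v_numoutput (see the tsleep() loop in cluster_pageout() below);
 * cluster_iodone() clears the flag and wakes them once v_numoutput
 * drains to ASYNC_THROTTLE / 3.
 */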
121 static int
122 cluster_iodone(bp)
123 struct buf *bp;
124 {
125 int b_flags;
126 int error;
127 int total_size;
128 int total_resid;
129 int upl_offset;
130 int zero_offset;
131 int l_blkno;
132 upl_t upl;
133 struct buf *cbp;
134 struct buf *cbp_head;
135 struct buf *cbp_next;
136 struct buf *real_bp;
137 struct vnode *vp;
138 struct clios *iostate;
139 int commit_size;
140 int pg_offset;
141
142
143 cbp_head = (struct buf *)(bp->b_trans_head);
144
145 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
146 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
147
148 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
149 /*
150 * all I/O requests that are part of this transaction
151 * have to complete before we can process it
152 */
153 if ( !(cbp->b_flags & B_DONE)) {
154
155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
156 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
157
158 return 0;
159 }
160 }
161 error = 0;
162 total_size = 0;
163 total_resid = 0;
164
165 cbp = cbp_head;
166 upl_offset = cbp->b_uploffset;
167 upl = cbp->b_pagelist;
168 b_flags = cbp->b_flags;
169 real_bp = cbp->b_real_bp;
170 vp = cbp->b_vp;
171 zero_offset= cbp->b_validend;
172 l_blkno = cbp->b_lblkno;
173 iostate = (struct clios *)cbp->b_iostate;
174
175 while (cbp) {
176 if (cbp->b_vectorcount > 1)
177 _FREE(cbp->b_vectorlist, M_SEGMENT);
178
179 if ((cbp->b_flags & B_ERROR) && error == 0)
180 error = cbp->b_error;
181
182 total_resid += cbp->b_resid;
183 total_size += cbp->b_bcount;
184
185 cbp_next = cbp->b_trans_next;
186
187 free_io_buf(cbp);
188
189 cbp = cbp_next;
190 }
191 if (zero_offset)
192 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
193
194 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
195 vp->v_flag &= ~VTHROTTLED;
196 wakeup((caddr_t)&vp->v_numoutput);
197 }
198 if (iostate) {
199 if (error) {
200 off_t error_offset;
201
202 error_offset = (off_t)l_blkno * PAGE_SIZE_64;
203
204 if (iostate->io_error == 0) {
205 iostate->io_error = error;
206 iostate->io_offset = error_offset;
207 } else {
208 if (error_offset < iostate->io_offset)
209 iostate->io_offset = error_offset;
210 }
211 }
212 iostate->io_completed += total_size;
213
214 if (iostate->io_wanted) {
215 iostate->io_wanted = 0;
216 wakeup((caddr_t)&iostate->io_wanted);
217 }
218 }
219 if ((b_flags & B_NEED_IODONE) && real_bp) {
220 if (error) {
221 real_bp->b_flags |= B_ERROR;
222 real_bp->b_error = error;
223 }
224 real_bp->b_resid = total_resid;
225
226 biodone(real_bp);
227 }
228 if (error == 0 && total_resid)
229 error = EIO;
230
231 if (b_flags & B_COMMIT_UPL) {
232 pg_offset = upl_offset & PAGE_MASK;
233 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
234
235 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
236 int upl_abort_code;
237
238 if (b_flags & B_PHYS)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
240 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
242 else if (b_flags & B_PGIN)
243 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
244 else
245 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
246
247 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
248 upl_abort_code);
249
250 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
251 (int)upl, upl_offset - pg_offset, commit_size,
252 0x80000000|upl_abort_code, 0);
253
254 } else {
255 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
256
257 if (b_flags & B_PHYS)
258 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
259 else if ( !(b_flags & B_PAGEOUT))
260 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
261 if (b_flags & B_AGE)
262 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
263
264 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags);
266
267 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
268 (int)upl, upl_offset - pg_offset, commit_size,
269 upl_commit_flags, 0);
270 }
271 } else
272 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
273 (int)upl, upl_offset, 0, error, 0);
274
275 return (error);
276 }
277
278
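/*
 * cluster_zero: zero 'size' bytes starting at 'upl_offset' within 'upl'.
 * If the caller supplied a buffer header that already has a kernel
 * mapping (bp->b_data), that mapping is used; otherwise the upl is
 * mapped with ubc_upl_map() for the duration of the bzero() and then
 * unmapped again.
 */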
279 static void
280 cluster_zero(upl, upl_offset, size, bp)
281 upl_t upl;
282 vm_offset_t upl_offset;
283 int size;
284 struct buf *bp;
285 {
286 vm_offset_t io_addr = 0;
287 int must_unmap = 0;
288 kern_return_t kret;
289
290 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
291 upl_offset, size, (int)bp, 0, 0);
292
293 if (bp == NULL || bp->b_data == NULL) {
294 kret = ubc_upl_map(upl, &io_addr);
295
296 if (kret != KERN_SUCCESS)
297 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
298 if (io_addr == 0)
299 panic("cluster_zero: ubc_upl_map() mapped 0");
300
301 must_unmap = 1;
302 } else
303 io_addr = (vm_offset_t)bp->b_data;
304 bzero((caddr_t)(io_addr + upl_offset), size);
305
306 if (must_unmap) {
307 kret = ubc_upl_unmap(upl);
308
309 if (kret != KERN_SUCCESS)
310 panic("cluster_zero: kernel_upl_unmap failed");
311 }
312 }
313
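/*
 * cluster_io: common I/O engine for the cluster layer.  The request is
 * carved into chunks no larger than the device's max_iosize, each chunk
 * is mapped to a physical block with VOP_CMAP(), wrapped in an io buf
 * (with a scatter/gather iovec list when it spans multiple pages) and
 * chained through b_trans_next with b_trans_head pointing at the first
 * buf, then handed to VOP_STRATEGY().  Completion funnels through
 * cluster_iodone(); synchronous callers biowait() on each buf.  The CL_*
 * flags select read vs. write, async vs. sync, and whether this routine
 * commits/aborts the upl itself (CL_COMMIT).  A typical caller looks
 * like the cluster_pagein() call further down, e.g.
 *
 *   cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
 *              local_flags | CL_READ | CL_PAGEIN,
 *              (struct buf *)0, (struct clios *)0);
 */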
314 static int
315 cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
316 struct vnode *vp;
317 upl_t upl;
318 vm_offset_t upl_offset;
319 off_t f_offset;
320 int non_rounded_size;
321 int devblocksize;
322 int flags;
323 struct buf *real_bp;
324 struct clios *iostate;
325 {
326 struct buf *cbp;
327 struct iovec *iovp;
328 u_int size;
329 u_int io_size;
330 int io_flags;
331 int error = 0;
332 int retval = 0;
333 struct buf *cbp_head = 0;
334 struct buf *cbp_tail = 0;
335 upl_page_info_t *pl;
336 int buf_count = 0;
337 int pg_count;
338 int pg_offset;
339 u_int max_iosize;
340 u_int max_vectors;
341 int priv;
342 int zero_offset = 0;
343 u_int first_lblkno;
344
345 if (flags & CL_READ) {
346 io_flags = (B_VECTORLIST | B_READ);
347
348 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
349 } else {
350 io_flags = (B_VECTORLIST | B_WRITEINPROG);
351
352 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
353 }
354 pl = ubc_upl_pageinfo(upl);
355
356 if (flags & CL_AGE)
357 io_flags |= B_AGE;
358 if (flags & CL_DUMP)
359 io_flags |= B_NOCACHE;
360 if (flags & CL_PAGEIN)
361 io_flags |= B_PGIN;
362 if (flags & CL_PAGEOUT)
363 io_flags |= B_PAGEOUT;
364 if (flags & CL_COMMIT)
365 io_flags |= B_COMMIT_UPL;
366 if (flags & CL_PRESERVE)
367 io_flags |= B_PHYS;
368
369 if (devblocksize)
370 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
371 else
372 size = non_rounded_size;
373
374
375 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
376 (int)f_offset, size, upl_offset, flags, 0);
377
378 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
379 /*
380                  * the read ends partway through a page, so we are going to end up
381                  * with a page that we can't complete (the file size wasn't a multiple
382                  * of PAGE_SIZE and we're trying to read to the end of the file),
383                  * so we'll go ahead and zero out the portion of the page we can't
384                  * read in from the file
385 */
386 zero_offset = upl_offset + non_rounded_size;
387 }
388 while (size) {
389 int vsize;
390 int i;
391 int pl_index;
392 int pg_resid;
393 int num_contig;
394 daddr_t lblkno;
395 daddr_t blkno;
396
397 if (size > max_iosize)
398 io_size = max_iosize;
399 else
400 io_size = size;
401
402 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
403 if (error == EOPNOTSUPP)
404 panic("VOP_CMAP Unimplemented");
405 break;
406 }
407
408 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
409 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
410
411 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
412 if (flags & CL_PAGEOUT) {
413 error = EINVAL;
414 break;
415 };
416
417 /* Try paging out the page individually before
418 giving up entirely and dumping it (it could
419 be mapped in a "hole" and require allocation
420                            before the I/O)
421 */
422 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
423 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
424 error = EINVAL;
425 break;
426 };
427
428 upl_offset += PAGE_SIZE_64;
429 f_offset += PAGE_SIZE_64;
430 size -= PAGE_SIZE_64;
431 continue;
432 }
433 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
434 /*
435 * we have now figured out how much I/O we can do - this is in 'io_size'
436 * pl_index represents the first page in the 'upl' that the I/O will occur for
437 * pg_offset is the starting point in the first page for the I/O
438 * pg_count is the number of full and partial pages that 'io_size' encompasses
439 */
440 pl_index = upl_offset / PAGE_SIZE;
441 pg_offset = upl_offset & PAGE_MASK;
442 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
443
444 if (flags & CL_DEV_MEMORY) {
445 /*
446 * currently, can't deal with reading 'holes' in file
447 */
448 if ((long)blkno == -1) {
449 error = EINVAL;
450 break;
451 }
452 /*
453 * treat physical requests as one 'giant' page
454 */
455 pg_count = 1;
456 }
457 if ((flags & CL_READ) && (long)blkno == -1) {
458 int bytes_to_zero;
459
460 /*
461 * if we're reading and blkno == -1, then we've got a
462 * 'hole' in the file that we need to deal with by zeroing
463 * out the affected area in the upl
464 */
465 if (zero_offset && io_size == size) {
466 /*
467 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
468                          * then 'zero_offset' will be non-zero
469 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
470 * (indicated by the io_size finishing off the I/O request for this UPL)
471                          * then we're not going to issue an I/O for the
472 * last page in this upl... we need to zero both the hole and the tail
473 * of the page beyond the EOF, since the delayed zero-fill won't kick in
474 */
475 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
476
477 zero_offset = 0;
478 } else
479 bytes_to_zero = io_size;
480
481 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
482
483 if (cbp_head)
484 /*
485 * if there is a current I/O chain pending
486 * then the first page of the group we just zero'd
487 * will be handled by the I/O completion if the zero
488 * fill started in the middle of the page
489 */
490 pg_count = (io_size - pg_offset) / PAGE_SIZE;
491 else {
492 /*
493 * no pending I/O to pick up that first page
494 * so, we have to make sure it gets committed
495 * here.
496 * set the pg_offset to 0 so that the upl_commit_range
497 * starts with this page
498 */
499 pg_count = (io_size + pg_offset) / PAGE_SIZE;
500 pg_offset = 0;
501 }
502 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
503 /*
504 * if we're done with the request for this UPL
505 * then we have to make sure to commit the last page
506 * even if we only partially zero-filled it
507 */
508 pg_count++;
509
510 if (pg_count) {
511 if (pg_offset)
512 pg_resid = PAGE_SIZE - pg_offset;
513 else
514 pg_resid = 0;
515
516 if (flags & CL_COMMIT)
517 ubc_upl_commit_range(upl,
518 (upl_offset + pg_resid) & ~PAGE_MASK,
519 pg_count * PAGE_SIZE,
520 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
521 }
522 upl_offset += io_size;
523 f_offset += io_size;
524 size -= io_size;
525
526 if (cbp_head && pg_count)
527 goto start_io;
528 continue;
529
530 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
531 real_bp->b_blkno = blkno;
532 }
533
534 if (pg_count > 1) {
535 if (pg_count > max_vectors) {
536 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
537
538 if (io_size < 0) {
539 io_size = PAGE_SIZE - pg_offset;
540 pg_count = 1;
541 } else
542 pg_count = max_vectors;
543 }
544 /*
545 * we need to allocate space for the vector list
546 */
547 if (pg_count > 1) {
548 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
549 M_SEGMENT, M_NOWAIT);
550
551 if (iovp == (struct iovec *) 0) {
552 /*
553 * if the allocation fails, then throttle down to a single page
554 */
555 io_size = PAGE_SIZE - pg_offset;
556 pg_count = 1;
557 }
558 }
559 }
560
561 /* Throttle the speculative IO */
562 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
563 priv = 0;
564 else
565 priv = 1;
566
567 cbp = alloc_io_buf(vp, priv);
568
569 if (pg_count == 1)
570 /*
571 * we use the io vector that's reserved in the buffer header
572                  * this ensures we can always issue an I/O even in a low memory
573 * condition that prevents the _MALLOC from succeeding... this
574 * is necessary to prevent deadlocks with the pager
575 */
576 iovp = (struct iovec *)(&cbp->b_vects[0]);
577
578 cbp->b_vectorlist = (void *)iovp;
579 cbp->b_vectorcount = pg_count;
580
581 if (flags & CL_DEV_MEMORY) {
582
583 iovp->iov_len = io_size;
584 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
585
586 if (iovp->iov_base == (caddr_t) 0) {
587 free_io_buf(cbp);
588 error = EINVAL;
589 } else
590 iovp->iov_base += upl_offset;
591 } else {
592
593 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
594 int psize;
595
596 psize = PAGE_SIZE - pg_offset;
597
598 if (psize > vsize)
599 psize = vsize;
600
601 iovp->iov_len = psize;
602 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
603
604 if (iovp->iov_base == (caddr_t) 0) {
605 if (pg_count > 1)
606 _FREE(cbp->b_vectorlist, M_SEGMENT);
607 free_io_buf(cbp);
608
609 error = EINVAL;
610 break;
611 }
612 iovp->iov_base += pg_offset;
613 pg_offset = 0;
614
615 if (flags & CL_PAGEOUT) {
616 int s;
617 struct buf *bp;
618
619 s = splbio();
620 if (bp = incore(vp, lblkno + i)) {
621 if (!ISSET(bp->b_flags, B_BUSY)) {
622 bremfree(bp);
623 SET(bp->b_flags, (B_BUSY | B_INVAL));
624 splx(s);
625 brelse(bp);
626 } else
627 panic("BUSY bp found in cluster_io");
628 }
629 splx(s);
630 }
631 vsize -= psize;
632 }
633 }
634 if (error)
635 break;
636
637 if (flags & CL_ASYNC) {
638 cbp->b_flags |= (B_CALL | B_ASYNC);
639 cbp->b_iodone = (void *)cluster_iodone;
640 }
641 cbp->b_flags |= io_flags;
642
643 cbp->b_lblkno = lblkno;
644 cbp->b_blkno = blkno;
645 cbp->b_bcount = io_size;
646 cbp->b_pagelist = upl;
647 cbp->b_uploffset = upl_offset;
648 cbp->b_trans_next = (struct buf *)0;
649
650 if (cbp->b_iostate = (void *)iostate)
651 iostate->io_issued += io_size;
652
653 if (flags & CL_READ)
654 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
655 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
656 else
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659
660 if (cbp_head) {
661 cbp_tail->b_trans_next = cbp;
662 cbp_tail = cbp;
663 } else {
664 cbp_head = cbp;
665 cbp_tail = cbp;
666 }
667 (struct buf *)(cbp->b_trans_head) = cbp_head;
668 buf_count++;
669
670 upl_offset += io_size;
671 f_offset += io_size;
672 size -= io_size;
673
674 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
675 /*
676 * if we have no more I/O to issue or
677 * the current I/O we've prepared fully
678 * completes the last page in this request
679 * and it's either an ASYNC request or
680 * we've already accumulated more than 8 I/O's into
681 * this transaction and it's not an I/O directed to
682 * special DEVICE memory
683 * then go ahead and issue the I/O
684 */
685 start_io:
686 if (real_bp) {
687 cbp_head->b_flags |= B_NEED_IODONE;
688 cbp_head->b_real_bp = real_bp;
689 } else
690 cbp_head->b_real_bp = (struct buf *)NULL;
691
692 if (size == 0) {
693 /*
694 * we're about to issue the last I/O for this upl
695 * if this was a read to the eof and the eof doesn't
696                          * finish on a page boundary, then we need to zero-fill
697 * the rest of the page....
698 */
699 cbp_head->b_validend = zero_offset;
700 } else
701 cbp_head->b_validend = 0;
702
703 for (cbp = cbp_head; cbp;) {
704 struct buf * cbp_next;
705
706 if (io_flags & B_WRITEINPROG)
707 cbp->b_vp->v_numoutput++;
708
709 cbp_next = cbp->b_trans_next;
710
711 (void) VOP_STRATEGY(cbp);
712 cbp = cbp_next;
713 }
714 if ( !(flags & CL_ASYNC)) {
715 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
716 biowait(cbp);
717
718 if (error = cluster_iodone(cbp_head)) {
719 if ((flags & CL_PAGEOUT) && (error == ENXIO))
720 retval = 0; /* drop the error */
721 else
722 retval = error;
723 error = 0;
724 }
725 }
726 cbp_head = (struct buf *)0;
727 cbp_tail = (struct buf *)0;
728
729 buf_count = 0;
730 }
731 }
732 if (error) {
733 int abort_size;
734
735 io_size = 0;
736
737 for (cbp = cbp_head; cbp;) {
738 struct buf * cbp_next;
739
740 if (cbp->b_vectorcount > 1)
741 _FREE(cbp->b_vectorlist, M_SEGMENT);
742 upl_offset -= cbp->b_bcount;
743 size += cbp->b_bcount;
744 io_size += cbp->b_bcount;
745
746 cbp_next = cbp->b_trans_next;
747 free_io_buf(cbp);
748 cbp = cbp_next;
749 }
750 if (iostate) {
751 if (iostate->io_error == 0) {
752 iostate->io_error = error;
753 iostate->io_offset = f_offset - (off_t)io_size;
754 }
755 iostate->io_issued -= io_size;
756
757 if (iostate->io_wanted) {
758 iostate->io_wanted = 0;
759 wakeup((caddr_t)&iostate->io_wanted);
760 }
761 }
762 pg_offset = upl_offset & PAGE_MASK;
763 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
764
765 if (flags & CL_COMMIT) {
766 int upl_abort_code;
767
768 if (flags & CL_PRESERVE)
769 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
770 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
771 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
772 else if (flags & CL_PAGEIN)
773 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
774 else
775 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
776
777 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
778 upl_abort_code);
779
780 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
781 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
782 }
783 if (real_bp) {
784 real_bp->b_flags |= B_ERROR;
785 real_bp->b_error = error;
786
787 biodone(real_bp);
788 }
789 if (retval == 0)
790 retval = error;
791 }
792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
793 (int)f_offset, size, upl_offset, retval, 0);
794
795 return (retval);
796 }
797
798
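/*
 * cluster_rd_prefetch: issue an advisory read of up to MAX_UPL_TRANSFER
 * pages starting at f_offset, clipped to the end of the file.  Pages
 * already resident (probed with ubc_page_op()) are skipped before the
 * remainder is handed to advisory_read().  Returns the number of pages
 * spanned so cluster_rd_ahead() can advance v_maxra.
 */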
799 static int
800 cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
801 struct vnode *vp;
802 off_t f_offset;
803 u_int size;
804 off_t filesize;
805 int devblocksize;
806 {
807 int pages_to_fetch;
808 int skipped_pages;
809
810 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
811 (int)f_offset, size, (int)filesize, 0, 0);
812
813 if (f_offset >= filesize) {
814 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
815 (int)f_offset, 0, 0, 0, 0);
816 return(0);
817 }
818 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
819 size = MAX_UPL_TRANSFER * PAGE_SIZE;
820 else
821 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
822
823 if ((off_t)size > (filesize - f_offset))
824 size = filesize - f_offset;
825
826 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
827
828 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
829 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
830 break;
831 f_offset += PAGE_SIZE;
832 size -= PAGE_SIZE;
833 }
834 if (skipped_pages < pages_to_fetch)
835 advisory_read(vp, filesize, f_offset, size, devblocksize);
836
837 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
838 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
839
840 return (pages_to_fetch);
841 }
842
843
844
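/*
 * cluster_rd_ahead: sequential read-ahead heuristic.  If the current
 * request doesn't follow on from v_lastr / v_maxra, the window (v_ralen)
 * is reset; otherwise it is doubled, up to MAX_UPL_TRANSFER pages, and a
 * prefetch is issued starting just past max(e_lblkno, v_maxra), unless
 * enough previously prefetched pages are still outstanding.
 */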
845 static void
846 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
847 struct vnode *vp;
848 daddr_t b_lblkno;
849 daddr_t e_lblkno;
850 off_t filesize;
851 int devblocksize;
852 {
853 daddr_t r_lblkno;
854 off_t f_offset;
855 int size_of_prefetch;
856 int max_pages;
857
858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
859 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
860
861 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
862 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
863 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
864 return;
865 }
866
867 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
868 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
869 vp->v_ralen = 0;
870 vp->v_maxra = 0;
871
872 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
873 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
874
875 return;
876 }
877 max_pages = MAX_UPL_TRANSFER;
878
879 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
880
881 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
882 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
883
884 if (e_lblkno < vp->v_maxra) {
885 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
886
887 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
888 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
889 return;
890 }
891 }
892 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
893 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
894
895 if (f_offset < filesize) {
896 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
897
898 if (size_of_prefetch)
899 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
900 }
901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
902 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
903 }
904
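/*
 * cluster_pageout: VM/UBC pageout entry point.  The request is validated
 * and clipped to the end of the file, any pages of the upl beyond that
 * point are aborted, the caller is throttled against ASYNC_THROTTLE
 * outstanding writes, and the remainder is issued through cluster_io()
 * with CL_PAGEOUT.  A filesystem's VOP_PAGEOUT typically just forwards
 * its arguments here, roughly (illustrative only):
 *
 *   return (cluster_pageout(vp, pl, pl_offset, f_offset, size,
 *                           filesize, devblocksize, flags));
 */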
905 int
906 cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
907 struct vnode *vp;
908 upl_t upl;
909 vm_offset_t upl_offset;
910 off_t f_offset;
911 int size;
912 off_t filesize;
913 int devblocksize;
914 int flags;
915 {
916 int io_size;
917 int pg_size;
918 off_t max_size;
919 int local_flags = CL_PAGEOUT;
920
921 if ((flags & UPL_IOSYNC) == 0)
922 local_flags |= CL_ASYNC;
923 if ((flags & UPL_NOCOMMIT) == 0)
924 local_flags |= CL_COMMIT;
925
926
927 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
928 (int)f_offset, size, (int)filesize, local_flags, 0);
929
930 /*
931 * If they didn't specify any I/O, then we are done...
932 * we can't issue an abort because we don't know how
933 * big the upl really is
934 */
935 if (size <= 0)
936 return (EINVAL);
937
938 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
939 if (local_flags & CL_COMMIT)
940 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
941 return (EROFS);
942 }
943 /*
944          * can't page-out from a negative offset
945 * or if we're starting beyond the EOF
946 * or if the file offset isn't page aligned
947 * or the size requested isn't a multiple of PAGE_SIZE
948 */
949 if (f_offset < 0 || f_offset >= filesize ||
950 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
951 if (local_flags & CL_COMMIT)
952 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
953 return (EINVAL);
954 }
955 max_size = filesize - f_offset;
956
957 if (size < max_size)
958 io_size = size;
959 else
960 io_size = max_size;
961
962 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
963
964 if (size > pg_size) {
965 if (local_flags & CL_COMMIT)
966 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
967 UPL_ABORT_FREE_ON_EMPTY);
968 }
969 while (vp->v_numoutput >= ASYNC_THROTTLE) {
970 vp->v_flag |= VTHROTTLED;
971 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
972 }
973
974 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
975 local_flags, (struct buf *)0, (struct clios *)0));
976 }
977
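/*
 * cluster_pagein: VM/UBC pagein entry point.  After validating alignment
 * and clipping the request to the end of the file (aborting any excess
 * upl pages with UPL_ABORT_ERROR), the read is issued through
 * cluster_io() with CL_READ | CL_PAGEIN.  On success the block range is
 * fed into the sequential read-ahead state via cluster_rd_ahead() and
 * v_lastr.
 */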
978 int
979 cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
980 struct vnode *vp;
981 upl_t upl;
982 vm_offset_t upl_offset;
983 off_t f_offset;
984 int size;
985 off_t filesize;
986 int devblocksize;
987 int flags;
988 {
989 u_int io_size;
990 int rounded_size;
991 off_t max_size;
992 int retval;
993 int local_flags = 0;
994
995 if (upl == NULL || size < 0)
996 panic("cluster_pagein: NULL upl passed in");
997
998 if ((flags & UPL_IOSYNC) == 0)
999 local_flags |= CL_ASYNC;
1000 if ((flags & UPL_NOCOMMIT) == 0)
1001 local_flags |= CL_COMMIT;
1002
1003
1004 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1005 (int)f_offset, size, (int)filesize, local_flags, 0);
1006
1007 /*
1008 * can't page-in from a negative offset
1009 * or if we're starting beyond the EOF
1010 * or if the file offset isn't page aligned
1011 * or the size requested isn't a multiple of PAGE_SIZE
1012 */
1013 if (f_offset < 0 || f_offset >= filesize ||
1014 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1015 if (local_flags & CL_COMMIT)
1016 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1017 return (EINVAL);
1018 }
1019 max_size = filesize - f_offset;
1020
1021 if (size < max_size)
1022 io_size = size;
1023 else
1024 io_size = max_size;
1025
1026 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1027
1028 if (size > rounded_size && (local_flags & CL_COMMIT))
1029 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1030 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1031
1032 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1033 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1034
1035 if (retval == 0) {
1036 int b_lblkno;
1037 int e_lblkno;
1038
1039 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1040 e_lblkno = (int)
1041 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1042
1043 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1044 /*
1045                          * we haven't read in the last page of the file yet
1046 * so let's try to read ahead if we're in
1047 * a sequential access pattern
1048 */
1049 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1050 }
1051 vp->v_lastr = e_lblkno;
1052 }
1053 return (retval);
1054 }
1055
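/*
 * cluster_bp: adapter that lets a conventional struct buf which already
 * carries a upl in b_pagelist be issued through cluster_io() as an async
 * transfer; the file offset is recovered from the buf's logical block
 * number with ubc_blktooff().
 */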
1056 int
1057 cluster_bp(bp)
1058 struct buf *bp;
1059 {
1060 off_t f_offset;
1061 int flags;
1062
1063 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1064 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1065
1066 if (bp->b_pagelist == (upl_t) 0)
1067 panic("cluster_bp: can't handle NULL upl yet\n");
1068 if (bp->b_flags & B_READ)
1069 flags = CL_ASYNC | CL_READ;
1070 else
1071 flags = CL_ASYNC;
1072
1073 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1074
1075 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1076 }
1077
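/*
 * cluster_write: top-level write entry point.  Writes on vnodes that are
 * caching data (no VNOCACHE_DATA), or whose uio isn't from user space,
 * go straight to the buffered path, cluster_write_x().  Otherwise each
 * iovec is probed with vm_map_get_upl(UPL_QUERY_OBJECT_TYPE) and routed
 * to cluster_phys_write() for physically contiguous sources,
 * cluster_write_x() for small, unaligned or head/tail zero-fill cases,
 * or cluster_nocopy_write() for large page-aligned transfers, clipping
 * uio_resid around each sub-call so only the intended portion is
 * consumed.
 */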
1078 int
1079 cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1080 struct vnode *vp;
1081 struct uio *uio;
1082 off_t oldEOF;
1083 off_t newEOF;
1084 off_t headOff;
1085 off_t tailOff;
1086 int devblocksize;
1087 int flags;
1088 {
1089 int prev_resid;
1090 int clip_size;
1091 off_t max_io_size;
1092 struct iovec *iov;
1093 vm_offset_t upl_offset;
1094 int upl_size;
1095 int pages_in_pl;
1096 upl_page_info_t *pl;
1097 int upl_flags;
1098 upl_t upl;
1099 int retval = 0;
1100
1101
1102 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1103 {
1104 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1105 return(retval);
1106 }
1107
1108 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1109 {
1110 /* we know we have a resid, so this is safe */
1111 iov = uio->uio_iov;
1112 while (iov->iov_len == 0) {
1113 uio->uio_iov++;
1114 uio->uio_iovcnt--;
1115 iov = uio->uio_iov;
1116 }
1117
1118 /*
1119 * We check every vector target and if it is physically
1120 * contiguous space, we skip the sanity checks.
1121 */
1122
1123 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1124 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1125 pages_in_pl = 0;
1126 upl_flags = UPL_QUERY_OBJECT_TYPE;
1127 if ((vm_map_get_upl(current_map(),
1128 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1129 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1130 {
1131 /*
1132 * the user app must have passed in an invalid address
1133 */
1134 return (EFAULT);
1135 }
1136
1137 if (upl_flags & UPL_PHYS_CONTIG)
1138 {
1139 if (flags & IO_HEADZEROFILL)
1140 {
1141 flags &= ~IO_HEADZEROFILL;
1142
1143 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1144 return(retval);
1145 }
1146
1147 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1148
1149 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1150 {
1151 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1152 return(retval);
1153 }
1154 }
1155 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1156 {
1157 /*
1158                  * We set a threshold of 4 pages to decide if the nocopy
1159 * write loop is worth the trouble...
1160 * we also come here if we're trying to zero the head and/or tail
1161 * of a partially written page, and the user source is not a physically contiguous region
1162 */
1163 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1164 return(retval);
1165 }
1166 else if (uio->uio_offset & PAGE_MASK_64)
1167 {
1168                         /* Bring the file write offset up to a pagesize boundary */
1169 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1170 if (uio->uio_resid < clip_size)
1171 clip_size = uio->uio_resid;
1172 /*
1173 * Fake the resid going into the cluster_write_x call
1174 * and restore it on the way out.
1175 */
1176 prev_resid = uio->uio_resid;
1177 uio->uio_resid = clip_size;
1178 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1179 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1180 }
1181 else if ((int)iov->iov_base & PAGE_MASK_64)
1182 {
1183 clip_size = iov->iov_len;
1184 prev_resid = uio->uio_resid;
1185 uio->uio_resid = clip_size;
1186 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1187 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1188 }
1189 else
1190 {
1191 /*
1192 * If we come in here, we know the offset into
1193 * the file is on a pagesize boundary
1194 */
1195
1196 max_io_size = newEOF - uio->uio_offset;
1197 clip_size = uio->uio_resid;
1198 if (iov->iov_len < clip_size)
1199 clip_size = iov->iov_len;
1200 if (max_io_size < clip_size)
1201 clip_size = max_io_size;
1202
1203 if (clip_size < PAGE_SIZE)
1204 {
1205 /*
1206 * Take care of tail end of write in this vector
1207 */
1208 prev_resid = uio->uio_resid;
1209 uio->uio_resid = clip_size;
1210 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1211 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1212 }
1213 else
1214 {
1215 /* round clip_size down to a multiple of pagesize */
1216 clip_size = clip_size & ~(PAGE_MASK);
1217 prev_resid = uio->uio_resid;
1218 uio->uio_resid = clip_size;
1219 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1220 if ((retval == 0) && uio->uio_resid)
1221 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1222 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1223 }
1224 } /* end else */
1225 } /* end while */
1226 return(retval);
1227 }
1228
1229
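/*
 * cluster_nocopy_write: write directly from the user's buffer without
 * copying through the cache.  Any delayed clusters are pushed first,
 * then for each chunk the user pages are wired with vm_map_get_upl()
 * (retried with increasing force_data_sync), any overlapping pages in
 * the cache are dumped via ubc_page_op(UPL_POP_DUMP), and a synchronous
 * cluster_io() is issued straight from the user pages.  The upl is
 * released with an abort so the dirty state of those pages is left
 * unchanged.
 */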
1230 static int
1231 cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1232 struct vnode *vp;
1233 struct uio *uio;
1234 off_t newEOF;
1235 int devblocksize;
1236 int flags;
1237 {
1238 upl_t upl;
1239 upl_page_info_t *pl;
1240 off_t upl_f_offset;
1241 vm_offset_t upl_offset;
1242 off_t max_io_size;
1243 int io_size;
1244 int upl_size;
1245 int upl_needed_size;
1246 int pages_in_pl;
1247 int upl_flags;
1248 kern_return_t kret;
1249 struct iovec *iov;
1250 int i;
1251 int force_data_sync;
1252 int error = 0;
1253
1254 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1255 (int)uio->uio_offset, (int)uio->uio_resid,
1256 (int)newEOF, devblocksize, 0);
1257
1258 /*
1259 * When we enter this routine, we know
1260 * -- the offset into the file is on a pagesize boundary
1261 * -- the resid is a page multiple
1262 * -- the resid will not exceed iov_len
1263 */
1264 cluster_try_push(vp, newEOF, 0, 1);
1265
1266 iov = uio->uio_iov;
1267
1268 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1269 io_size = uio->uio_resid;
1270
1271 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1272 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1273
1274 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1275 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1276
1277 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1278 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1279
1280 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
1281 {
1282 pages_in_pl = 0;
1283 upl_size = upl_needed_size;
1284 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1285 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1286
1287 kret = vm_map_get_upl(current_map(),
1288 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1289 &upl_size,
1290 &upl,
1291 NULL,
1292 &pages_in_pl,
1293 &upl_flags,
1294 force_data_sync);
1295
1296 if (kret != KERN_SUCCESS)
1297 {
1298 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1299 0, 0, 0, kret, 0);
1300
1301 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1302 (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);
1303
1304 /* cluster_nocopy_write: failed to get pagelist */
1305 /* do not return kret here */
1306 return(0);
1307 }
1308
1309 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1310 pages_in_pl = upl_size / PAGE_SIZE;
1311
1312 for(i=0; i < pages_in_pl; i++)
1313 {
1314 if (!upl_valid_page(pl, i))
1315 break;
1316 }
1317
1318 if (i == pages_in_pl)
1319 break;
1320
1321 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1322 UPL_ABORT_FREE_ON_EMPTY);
1323 }
1324
1325 if (force_data_sync >= 3)
1326 {
1327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1328 i, pages_in_pl, upl_size, kret, 0);
1329
1330 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1331 (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
1332 return(0);
1333 }
1334
1335 /*
1336 * Consider the possibility that upl_size wasn't satisfied.
1337 */
1338 if (upl_size != upl_needed_size)
1339 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1340
1341 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1342 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1343
1344 if (io_size == 0)
1345 {
1346 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1347 UPL_ABORT_FREE_ON_EMPTY);
1348 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1349 (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);
1350
1351 return(0);
1352 }
1353
1354 /*
1355 * Now look for pages already in the cache
1356 * and throw them away.
1357 */
1358
1359 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1360 max_io_size = io_size;
1361
1362 while (max_io_size) {
1363
1364 /*
1365 * Flag UPL_POP_DUMP says if the page is found
1366 * in the page cache it must be thrown away.
1367 */
1368 ubc_page_op(vp,
1369 upl_f_offset,
1370 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1371 0, 0);
1372 max_io_size -= PAGE_SIZE;
1373 upl_f_offset += PAGE_SIZE;
1374 }
1375
1376 /*
1377 * issue a synchronous write to cluster_io
1378 */
1379
1380 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1381 (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);
1382
1383 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1384 io_size, devblocksize, 0, (struct buf *)0, (struct clios *)0);
1385
1386 if (error == 0) {
1387 /*
1388 * The cluster_io write completed successfully,
1389 * update the uio structure.
1390 */
1391 iov->iov_base += io_size;
1392 iov->iov_len -= io_size;
1393 uio->uio_resid -= io_size;
1394 uio->uio_offset += io_size;
1395 }
1396 /*
1397 * always 'commit' the I/O via the abort primitive whether the I/O
1398          * succeeded cleanly or not... this is necessary to ensure that
1399 * we preserve the state of the DIRTY flag on the pages used to
1400 * provide the data for the I/O... the state of this flag SHOULD
1401 * NOT be changed by a write
1402 */
1403 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1404 UPL_ABORT_FREE_ON_EMPTY);
1405
1406
1407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1408 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1409
1410 } /* end while */
1411
1412
1413 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1414 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1415
1416 return (error);
1417 }
1418
1419
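/*
 * cluster_phys_write: write path for a physically contiguous user
 * buffer.  The region is wired with vm_map_get_upl(), any head or tail
 * that isn't devblocksize-aligned is handled through
 * cluster_align_phys_io(), and the aligned middle is pushed with a
 * single CL_DEV_MEMORY cluster_io().  The upl is released with an abort
 * since no page state should change.
 */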
1420 static int
1421 cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1422 struct vnode *vp;
1423 struct uio *uio;
1424 off_t newEOF;
1425 int devblocksize;
1426 int flags;
1427 {
1428 upl_page_info_t *pl;
1429 vm_offset_t src_paddr;
1430 upl_t upl;
1431 vm_offset_t upl_offset;
1432 int tail_size;
1433 int io_size;
1434 int upl_size;
1435 int upl_needed_size;
1436 int pages_in_pl;
1437 int upl_flags;
1438 kern_return_t kret;
1439 struct iovec *iov;
1440 int error = 0;
1441
1442 /*
1443 * When we enter this routine, we know
1444 * -- the resid will not exceed iov_len
1445      *  -- the vector target address is physically contiguous
1446 */
1447 cluster_try_push(vp, newEOF, 0, 1);
1448
1449 iov = uio->uio_iov;
1450 io_size = iov->iov_len;
1451 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1452 upl_needed_size = upl_offset + io_size;
1453
1454 pages_in_pl = 0;
1455 upl_size = upl_needed_size;
1456 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1457 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1458
1459 kret = vm_map_get_upl(current_map(),
1460 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1461 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1462
1463 if (kret != KERN_SUCCESS) {
1464 /*
1465 * cluster_phys_write: failed to get pagelist
1466           * note: unlike the nocopy path, the failure is returned to the caller
1467 */
1468 return(EINVAL);
1469 }
1470 /*
1471 * Consider the possibility that upl_size wasn't satisfied.
1472 * This is a failure in the physical memory case.
1473 */
1474 if (upl_size < upl_needed_size) {
1475 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1476 return(EINVAL);
1477 }
1478 pl = ubc_upl_pageinfo(upl);
1479
1480 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1481
1482 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1483 int head_size;
1484
1485 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1486
1487 if (head_size > io_size)
1488 head_size = io_size;
1489
1490 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1491
1492 if (error) {
1493 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1494
1495 return(EINVAL);
1496 }
1497 upl_offset += head_size;
1498 src_paddr += head_size;
1499 io_size -= head_size;
1500 }
1501 tail_size = io_size & (devblocksize - 1);
1502 io_size -= tail_size;
1503
1504 if (io_size) {
1505 /*
1506 * issue a synchronous write to cluster_io
1507 */
1508 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1509 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1510 }
1511 if (error == 0) {
1512 /*
1513 * The cluster_io write completed successfully,
1514 * update the uio structure
1515 */
1516 uio->uio_resid -= io_size;
1517 iov->iov_len -= io_size;
1518 iov->iov_base += io_size;
1519 uio->uio_offset += io_size;
1520 src_paddr += io_size;
1521
1522 if (tail_size)
1523 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1524 }
1525 /*
1526 * just release our hold on the physically contiguous
1527 * region without changing any state
1528 */
1529 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1530
1531 return (error);
1532 }
1533
1534
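/*
 * cluster_write_x: the buffered (cached) write path.  Each pass creates
 * a upl covering the affected range, pre-reads any partially valid
 * first/last pages, zero-fills head and tail ranges when
 * IO_HEADZEROFILL / IO_TAILZEROFILL are set, and copies the user data in
 * with uiomove().  The dirtied page run is then either written
 * immediately (IO_SYNC) or recorded in the vnode's cluster list
 * (v_clusters) for a later push, falling back to the VHASDIRTY
 * mechanism when no cluster slot can be made available.
 */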
1535 static int
1536 cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1537 struct vnode *vp;
1538 struct uio *uio;
1539 off_t oldEOF;
1540 off_t newEOF;
1541 off_t headOff;
1542 off_t tailOff;
1543 int devblocksize;
1544 int flags;
1545 {
1546 upl_page_info_t *pl;
1547 upl_t upl;
1548 vm_offset_t upl_offset;
1549 int upl_size;
1550 off_t upl_f_offset;
1551 int pages_in_upl;
1552 int start_offset;
1553 int xfer_resid;
1554 int io_size;
1555 int io_flags;
1556 vm_offset_t io_address;
1557 int io_offset;
1558 int bytes_to_zero;
1559 int bytes_to_move;
1560 kern_return_t kret;
1561 int retval = 0;
1562 int uio_resid;
1563 long long total_size;
1564 long long zero_cnt;
1565 off_t zero_off;
1566 long long zero_cnt1;
1567 off_t zero_off1;
1568 daddr_t start_blkno;
1569 daddr_t last_blkno;
1570
1571 if (uio) {
1572 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1573 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1574
1575 uio_resid = uio->uio_resid;
1576 } else {
1577 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1578 0, 0, (int)oldEOF, (int)newEOF, 0);
1579
1580 uio_resid = 0;
1581 }
1582 zero_cnt = 0;
1583 zero_cnt1 = 0;
1584
1585 if (flags & IO_HEADZEROFILL) {
1586 /*
1587 * some filesystems (HFS is one) don't support unallocated holes within a file...
1588 * so we zero fill the intervening space between the old EOF and the offset
1589 * where the next chunk of real data begins.... ftruncate will also use this
1590 * routine to zero fill to the new EOF when growing a file... in this case, the
1591 * uio structure will not be provided
1592 */
1593 if (uio) {
1594 if (headOff < uio->uio_offset) {
1595 zero_cnt = uio->uio_offset - headOff;
1596 zero_off = headOff;
1597 }
1598 } else if (headOff < newEOF) {
1599 zero_cnt = newEOF - headOff;
1600 zero_off = headOff;
1601 }
1602 }
1603 if (flags & IO_TAILZEROFILL) {
1604 if (uio) {
1605 zero_off1 = uio->uio_offset + uio->uio_resid;
1606
1607 if (zero_off1 < tailOff)
1608 zero_cnt1 = tailOff - zero_off1;
1609 }
1610 }
1611 if (zero_cnt == 0 && uio == (struct uio *) 0)
1612 {
1613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1614 retval, 0, 0, 0, 0);
1615 return (0);
1616 }
1617
1618 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1619 /*
1620 * for this iteration of the loop, figure out where our starting point is
1621 */
1622 if (zero_cnt) {
1623 start_offset = (int)(zero_off & PAGE_MASK_64);
1624 upl_f_offset = zero_off - start_offset;
1625 } else if (uio_resid) {
1626 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1627 upl_f_offset = uio->uio_offset - start_offset;
1628 } else {
1629 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1630 upl_f_offset = zero_off1 - start_offset;
1631 }
1632 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1633 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1634
1635 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1636 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1637
1638 /*
1639 * compute the size of the upl needed to encompass
1640 * the requested write... limit each call to cluster_io
1641 * to the maximum UPL size... cluster_io will clip if
1642                  * this exceeds the maximum io_size for the device...
1643 * make sure to account for
1644 * a starting offset that's not page aligned
1645 */
1646 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1647
1648 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1649 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1650
1651 pages_in_upl = upl_size / PAGE_SIZE;
1652 io_size = upl_size - start_offset;
1653
1654 if ((long long)io_size > total_size)
1655 io_size = total_size;
1656
1657 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1658 last_blkno = start_blkno + pages_in_upl;
1659
1660 kret = ubc_create_upl(vp,
1661 upl_f_offset,
1662 upl_size,
1663 &upl,
1664 &pl,
1665 UPL_FLAGS_NONE);
1666 if (kret != KERN_SUCCESS)
1667 panic("cluster_write: failed to get pagelist");
1668
1669 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1670 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1671
1672 if (start_offset && !upl_valid_page(pl, 0)) {
1673 int read_size;
1674
1675 /*
1676 * we're starting in the middle of the first page of the upl
1677 * and the page isn't currently valid, so we're going to have
1678 * to read it in first... this is a synchronous operation
1679 */
1680 read_size = PAGE_SIZE;
1681
1682 if ((upl_f_offset + read_size) > newEOF)
1683 read_size = newEOF - upl_f_offset;
1684
1685 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1686 CL_READ, (struct buf *)0, (struct clios *)0);
1687 if (retval) {
1688 /*
1689 * we had an error during the read which causes us to abort
1690 * the current cluster_write request... before we do, we need
1691 * to release the rest of the pages in the upl without modifying
1692                          * their state and mark the failed page in error
1693 */
1694 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1695 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1696
1697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1698 (int)upl, 0, 0, retval, 0);
1699 break;
1700 }
1701 }
1702 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1703 /*
1704 * the last offset we're writing to in this upl does not end on a page
1705 * boundary... if it's not beyond the old EOF, then we'll also need to
1706 * pre-read this page in if it isn't already valid
1707 */
1708 upl_offset = upl_size - PAGE_SIZE;
1709
1710 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1711 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1712 int read_size;
1713
1714 read_size = PAGE_SIZE;
1715
1716 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1717 read_size = newEOF - (upl_f_offset + upl_offset);
1718
1719 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
1720 CL_READ, (struct buf *)0, (struct clios *)0);
1721 if (retval) {
1722 /*
1723 * we had an error during the read which causes us to abort
1724 * the current cluster_write request... before we do, we
1725 * need to release the rest of the pages in the upl without
1726                                  * modifying their state and mark the failed page in error
1727 */
1728 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1729 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1730
1731 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1732 (int)upl, 0, 0, retval, 0);
1733 break;
1734 }
1735 }
1736 }
1737 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1738 panic("cluster_write: ubc_upl_map failed\n");
1739 xfer_resid = io_size;
1740 io_offset = start_offset;
1741
1742 while (zero_cnt && xfer_resid) {
1743
1744 if (zero_cnt < (long long)xfer_resid)
1745 bytes_to_zero = zero_cnt;
1746 else
1747 bytes_to_zero = xfer_resid;
1748
1749 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1750 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1751
1752 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1753 (int)upl_f_offset + io_offset, bytes_to_zero,
1754 (int)io_offset, xfer_resid, 0);
1755 } else {
1756 int zero_pg_index;
1757
1758 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
1759 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1760
1761 if ( !upl_valid_page(pl, zero_pg_index)) {
1762 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1763
1764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1765 (int)upl_f_offset + io_offset, bytes_to_zero,
1766 (int)io_offset, xfer_resid, 0);
1767
1768 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1769 !upl_dirty_page(pl, zero_pg_index)) {
1770 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1771
1772 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1773 (int)upl_f_offset + io_offset, bytes_to_zero,
1774 (int)io_offset, xfer_resid, 0);
1775 }
1776 }
1777 xfer_resid -= bytes_to_zero;
1778 zero_cnt -= bytes_to_zero;
1779 zero_off += bytes_to_zero;
1780 io_offset += bytes_to_zero;
1781 }
1782 if (xfer_resid && uio_resid) {
1783 bytes_to_move = min(uio_resid, xfer_resid);
1784
1785 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1786 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1787
1788 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1789
1790
1791 if (retval) {
1792 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1793 panic("cluster_write: kernel_upl_unmap failed\n");
1794
1795 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1796
1797 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1798 (int)upl, 0, 0, retval, 0);
1799 } else {
1800 uio_resid -= bytes_to_move;
1801 xfer_resid -= bytes_to_move;
1802 io_offset += bytes_to_move;
1803 }
1804 }
1805 while (xfer_resid && zero_cnt1 && retval == 0) {
1806
1807 if (zero_cnt1 < (long long)xfer_resid)
1808 bytes_to_zero = zero_cnt1;
1809 else
1810 bytes_to_zero = xfer_resid;
1811
1812 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1813 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1814
1815 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1816 (int)upl_f_offset + io_offset,
1817 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1818 } else {
1819 int zero_pg_index;
1820
1821 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
1822 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1823
1824 if ( !upl_valid_page(pl, zero_pg_index)) {
1825 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1826
1827 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1828 (int)upl_f_offset + io_offset,
1829 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1830
1831 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1832 !upl_dirty_page(pl, zero_pg_index)) {
1833 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1834
1835 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1836 (int)upl_f_offset + io_offset,
1837 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1838 }
1839 }
1840 xfer_resid -= bytes_to_zero;
1841 zero_cnt1 -= bytes_to_zero;
1842 zero_off1 += bytes_to_zero;
1843 io_offset += bytes_to_zero;
1844 }
1845
1846 if (retval == 0) {
1847 int cl_index;
1848 int can_delay;
1849
1850 io_size += start_offset;
1851
1852 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1853 /*
1854 * if we're extending the file with this write
1855 * we'll zero fill the rest of the page so that
1856 * if the file gets extended again in such a way as to leave a
1857                          * hole starting at this EOF, we'll have zeros in the correct spot
1858 */
1859 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1860
1861 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1862 (int)upl_f_offset + io_size,
1863 upl_size - io_size, 0, 0, 0);
1864 }
1865 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1866 panic("cluster_write: kernel_upl_unmap failed\n");
1867
1868 if (flags & IO_SYNC)
1869 /*
1870                          * if the IO_SYNC flag is set then we need to
1871 * bypass any clusters and immediately issue
1872 * the I/O
1873 */
1874 goto issue_io;
1875
1876 if (vp->v_clen == 0)
1877 /*
1878 * no clusters currently present
1879 */
1880 goto start_new_cluster;
1881
1882 /*
1883 * keep track of the overall dirty page
1884 * range we've developed
1885 * in case we have to fall back to the
1886 * VHASDIRTY method of flushing
1887 */
1888 if (vp->v_flag & VHASDIRTY)
1889 goto delay_io;
1890
1891 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1892 /*
1893 * we have an existing cluster... see if this write will extend it nicely
1894 */
1895 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1896 /*
1897 * the current write starts at or after the current cluster
1898 */
1899 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1900 /*
1901 * we have a write that fits entirely
1902 * within the existing cluster limits
1903 */
1904 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1905 /*
1906 * update our idea of where the cluster ends
1907 */
1908 vp->v_clusters[cl_index].last_pg = last_blkno;
1909 break;
1910 }
1911 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1912 /*
1913 * we have a write that starts in the middle of the current cluster
1914 * but extends beyond the cluster's limit
1915 * we'll clip the current cluster if we actually
1916 * overlap with the new write
1917 * and start a new cluster with the current write
1918 */
1919 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1920 vp->v_clusters[cl_index].last_pg = start_blkno;
1921 }
1922 /*
1923 * we also get here for the case where the current write starts
1924 * beyond the limit of the existing cluster
1925 *
1926 * in either case, we'll check the remaining clusters before
1927 * starting a new one
1928 */
1929 } else {
1930 /*
1931 * the current write starts in front of the current cluster
1932 */
1933 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1934 /*
1935 * we can just merge the old cluster
1936 * with the new request and leave it
1937 * in the cache
1938 */
1939 vp->v_clusters[cl_index].start_pg = start_blkno;
1940
1941 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1942 /*
1943 * the current write completely
1944 * envelops the existing cluster
1945 */
1946 vp->v_clusters[cl_index].last_pg = last_blkno;
1947 }
1948 break;
1949 }
1950
1951 /*
1952 * if we were to combine this write with the current cluster
1953 * we would exceed the cluster size limit.... so,
1954 * let's see if there's any overlap of the new I/O with
1955 * the existing cluster...
1956 *
1957 */
1958 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1959 /*
1960 * the current write extends into the existing cluster
1961 * clip the current cluster by moving the start position
1962 * to where the current write ends
1963 */
1964 vp->v_clusters[cl_index].start_pg = last_blkno;
1965 /*
1966 * if we get here, there was no way to merge
1967 * the new I/O with this cluster and
1968 * keep it under our maximum cluster length
1969 * we'll check the remaining clusters before starting a new one
1970 */
1971 }
1972 }
1973 if (cl_index < vp->v_clen)
1974 /*
1975 * we found an existing cluster that we
1976 	                 * could merge this I/O into
1977 */
1978 goto delay_io;
1979
1980 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1981 /*
1982 * we didn't find an existing cluster to
1983 * merge into, but there's room to start
1984 * a new one
1985 */
1986 goto start_new_cluster;
1987
1988 /*
1989 	         * no existing cluster to merge with and no
1990 * room to start a new one... we'll try
1991 * pushing the existing ones... if none of
1992 * them are able to be pushed, we'll have
1993 * to fall back on the VHASDIRTY mechanism
1994 * cluster_try_push will set v_clen to the
1995 * number of remaining clusters if it is
1996 * unable to push all of them
1997 */
1998 if (vp->v_flag & VNOCACHE_DATA)
1999 can_delay = 0;
2000 else
2001 can_delay = 1;
2002
2003 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
2004 vp->v_flag |= VHASDIRTY;
2005 goto delay_io;
2006 }
2007 start_new_cluster:
2008 if (vp->v_clen == 0) {
2009 vp->v_ciosiz = devblocksize;
2010 vp->v_cstart = start_blkno;
2011 vp->v_lastw = last_blkno;
2012 }
2013 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2014 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2015 vp->v_clen++;
2016 delay_io:
2017 /*
2018 * make sure we keep v_cstart and v_lastw up to
2019 * date in case we have to fall back on the
2020 	         * VHASDIRTY mechanism (or we've already entered it)
2021 */
2022 if (start_blkno < vp->v_cstart)
2023 vp->v_cstart = start_blkno;
2024 if (last_blkno > vp->v_lastw)
2025 vp->v_lastw = last_blkno;
2026
2027 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2028 continue;
2029 issue_io:
2030 /*
2031 * in order to maintain some semblance of coherency with mapped writes
2032 * we need to write the cluster back out as a multiple of the PAGESIZE
2033 * unless the cluster encompasses the last page of the file... in this
2034 * case we'll round out to the nearest device block boundary
2035 */
2036 io_size = upl_size;
2037
2038 if ((upl_f_offset + io_size) > newEOF) {
2039 io_size = newEOF - upl_f_offset;
2040 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
2041 }
2042
2043 if (flags & IO_SYNC)
2044 io_flags = CL_COMMIT | CL_AGE;
2045 else
2046 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2047
2048 if (vp->v_flag & VNOCACHE_DATA)
2049 io_flags |= CL_DUMP;
2050
2051 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2052 vp->v_flag |= VTHROTTLED;
2053 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2054 }
2055 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
2056 io_flags, (struct buf *)0, (struct clios *)0);
2057 }
2058 }
2059 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2060 retval, 0, 0, 0, 0);
2061
2062 return (retval);
2063 }
2064
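/*
 * cluster_read is the top-level read entry point... it either hands the
 * request straight to the cached path (cluster_read_x), or, for un-cached
 * (VNOCACHE_DATA) user-space requests, carves the uio up so that page
 * aligned, page multiple pieces go through cluster_nocopy_read and any
 * physically contiguous targets go through cluster_phys_read
 */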
2065 int
2066 cluster_read(vp, uio, filesize, devblocksize, flags)
2067 struct vnode *vp;
2068 struct uio *uio;
2069 off_t filesize;
2070 int devblocksize;
2071 int flags;
2072 {
2073 int prev_resid;
2074 int clip_size;
2075 off_t max_io_size;
2076 struct iovec *iov;
2077 vm_offset_t upl_offset;
2078 int upl_size;
2079 int pages_in_pl;
2080 upl_page_info_t *pl;
2081 int upl_flags;
2082 upl_t upl;
2083 int retval = 0;
2084
2085 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2086 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2087
2088 	  /*
2089 	   * take the normal cached read path unless this is an
2090 	   * un-cached (VNOCACHE_DATA) request coming from user space
2091 	   */
2092
2093 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
2094 {
2095 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2096 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2097 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2098 return(retval);
2099 }
2100
2101 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2102 {
2103 /* we know we have a resid, so this is safe */
2104 iov = uio->uio_iov;
2105 while (iov->iov_len == 0) {
2106 uio->uio_iov++;
2107 uio->uio_iovcnt--;
2108 iov = uio->uio_iov;
2109 }
2110
2111 /*
2112 * We check every vector target and if it is physically
2113 * contiguous space, we skip the sanity checks.
2114 */
2115
2116 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2117 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2118 pages_in_pl = 0;
2119 upl_flags = UPL_QUERY_OBJECT_TYPE;
2120 if((vm_map_get_upl(current_map(),
2121 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2122 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2123 {
2124 /*
2125 * the user app must have passed in an invalid address
2126 */
2127 return (EFAULT);
2128 }
2129
2130 if (upl_flags & UPL_PHYS_CONTIG)
2131 {
2132 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
2133 }
2134 else if (uio->uio_resid < 4 * PAGE_SIZE)
2135 {
2136 /*
2137 	       * We set a threshold of 4 pages to decide if the nocopy
2138 * read loop is worth the trouble...
2139 */
2140 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2141 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2142 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2143 return(retval);
2144 }
2145 else if (uio->uio_offset & PAGE_MASK_64)
2146 {
2147 /* Bring the file offset read up to a pagesize boundary */
2148 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2149 if (uio->uio_resid < clip_size)
2150 clip_size = uio->uio_resid;
2151 /*
2152 * Fake the resid going into the cluster_read_x call
2153 * and restore it on the way out.
2154 */
2155 prev_resid = uio->uio_resid;
2156 uio->uio_resid = clip_size;
2157 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2158 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2159 }
2160 else if ((int)iov->iov_base & PAGE_MASK_64)
2161 {
2162 clip_size = iov->iov_len;
2163 prev_resid = uio->uio_resid;
2164 uio->uio_resid = clip_size;
2165 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2166 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2167 }
2168 else
2169 {
2170 /*
2171 * If we come in here, we know the offset into
2172 * the file is on a pagesize boundary
2173 */
2174
2175 max_io_size = filesize - uio->uio_offset;
2176 clip_size = uio->uio_resid;
2177 if (iov->iov_len < clip_size)
2178 clip_size = iov->iov_len;
2179 if (max_io_size < clip_size)
2180 clip_size = (int)max_io_size;
2181
2182 if (clip_size < PAGE_SIZE)
2183 {
2184 /*
2185 * Take care of the tail end of the read in this vector.
2186 */
2187 prev_resid = uio->uio_resid;
2188 uio->uio_resid = clip_size;
2189 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2190 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2191 }
2192 else
2193 {
2194 /* round clip_size down to a multiple of pagesize */
2195 clip_size = clip_size & ~(PAGE_MASK);
2196 prev_resid = uio->uio_resid;
2197 uio->uio_resid = clip_size;
2198 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
2199 if ((retval==0) && uio->uio_resid)
2200 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2201 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2202 }
2203 } /* end else */
2204 } /* end while */
2205
2206 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2207 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2208
2209 return(retval);
2210 }
2211
2212
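/*
 * cluster_read_x handles the normal cached read path... it builds a upl
 * over the requested range, issues a single synchronous cluster_io for any
 * invalid pages, copies the data out to the caller via uiomove, kicks off
 * read-ahead / pre-fetch when appropriate, and then commits or aborts the
 * upl pages depending on the outcome and the VNOCACHE_DATA setting
 */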
2213 static int
2214 cluster_read_x(vp, uio, filesize, devblocksize, flags)
2215 struct vnode *vp;
2216 struct uio *uio;
2217 off_t filesize;
2218 int devblocksize;
2219 int flags;
2220 {
2221 upl_page_info_t *pl;
2222 upl_t upl;
2223 vm_offset_t upl_offset;
2224 int upl_size;
2225 off_t upl_f_offset;
2226 int start_offset;
2227 int start_pg;
2228 int last_pg;
2229 int uio_last;
2230 int pages_in_upl;
2231 off_t max_size;
2232 int io_size;
2233 vm_offset_t io_address;
2234 kern_return_t kret;
2235 int segflg;
2236 int error = 0;
2237 int retval = 0;
2238 int b_lblkno;
2239 int e_lblkno;
2240
2241 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2242
2243 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2244 /*
2245 * compute the size of the upl needed to encompass
2246 * the requested read... limit each call to cluster_io
2247 * to the maximum UPL size... cluster_io will clip if
2248 	         * this exceeds the maximum io_size for the device...
2249 	         * also make sure to account for a starting offset
2250 	         * that's not page aligned
2251 */
2252 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2253 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2254 max_size = filesize - uio->uio_offset;
2255
2256 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
2257 io_size = uio->uio_resid;
2258 else
2259 io_size = max_size;
2260
2261 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2262 segflg = uio->uio_segflg;
2263
2264 uio->uio_segflg = UIO_PHYS_USERSPACE;
2265
2266 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2267 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2268
2269 while (io_size && retval == 0) {
2270 int xsize;
2271 vm_offset_t paddr;
2272
2273 if (ubc_page_op(vp,
2274 upl_f_offset,
2275 UPL_POP_SET | UPL_POP_BUSY,
2276 &paddr, 0) != KERN_SUCCESS)
2277 break;
2278
2279 xsize = PAGE_SIZE - start_offset;
2280
2281 if (xsize > io_size)
2282 xsize = io_size;
2283
2284 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2285
2286 ubc_page_op(vp, upl_f_offset,
2287 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2288
2289 io_size -= xsize;
2290 start_offset = (int)
2291 (uio->uio_offset & PAGE_MASK_64);
2292 upl_f_offset = uio->uio_offset - start_offset;
2293 }
2294 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2295 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2296
2297 uio->uio_segflg = segflg;
2298
2299 if (retval)
2300 break;
2301
2302 if (io_size == 0) {
2303 /*
2304 * we're already finished with this read request
2305 * let's see if we should do a read-ahead
2306 */
2307 e_lblkno = (int)
2308 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2309
2310 if (!(vp->v_flag & VRAOFF))
2311 /*
2312 * let's try to read ahead if we're in
2313 * a sequential access pattern
2314 */
2315 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2316 vp->v_lastr = e_lblkno;
2317
2318 break;
2319 }
2320 max_size = filesize - uio->uio_offset;
2321 }
2322 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
2323 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2324 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2325 pages_in_upl = upl_size / PAGE_SIZE;
2326
2327 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
2328 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2329
2330 kret = ubc_create_upl(vp,
2331 upl_f_offset,
2332 upl_size,
2333 &upl,
2334 &pl,
2335 UPL_FLAGS_NONE);
2336 if (kret != KERN_SUCCESS)
2337 panic("cluster_read: failed to get pagelist");
2338
2339 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
2340 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2341
2342 /*
2343 * scan from the beginning of the upl looking for the first
2344 * non-valid page.... this will become the first page in
2345 * the request we're going to make to 'cluster_io'... if all
2346 * of the pages are valid, we won't call through to 'cluster_io'
2347 */
2348 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2349 if (!upl_valid_page(pl, start_pg))
2350 break;
2351 }
2352
2353 /*
2354 * scan from the starting invalid page looking for a valid
2355 * page before the end of the upl is reached, if we
2356 * find one, then it will be the last page of the request to
2357 * 'cluster_io'
2358 */
2359 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2360 if (upl_valid_page(pl, last_pg))
2361 break;
2362 }
2363
2364 if (start_pg < last_pg) {
2365 /*
2366 * we found a range of 'invalid' pages that must be filled
2367 * if the last page in this range is the last page of the file
2368 * we may have to clip the size of it to keep from reading past
2369 * the end of the last physical block associated with the file
2370 */
2371 upl_offset = start_pg * PAGE_SIZE;
2372 io_size = (last_pg - start_pg) * PAGE_SIZE;
2373
2374 if ((upl_f_offset + upl_offset + io_size) > filesize)
2375 io_size = filesize - (upl_f_offset + upl_offset);
2376
2377 /*
2378 * issue a synchronous read to cluster_io
2379 */
2380
2381 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
2382 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
2383 }
2384 if (error == 0) {
2385 /*
2386 * if the read completed successfully, or there was no I/O request
2387 	         * issued, then map the upl into kernel address space and
2388 * move the data into user land.... we'll first add on any 'valid'
2389 * pages that were present in the upl when we acquired it.
2390 */
2391 u_int val_size;
2392 u_int size_of_prefetch;
2393
2394 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2395 if (!upl_valid_page(pl, uio_last))
2396 break;
2397 }
2398 /*
2399 * compute size to transfer this round, if uio->uio_resid is
2400 * still non-zero after this uiomove, we'll loop around and
2401 * set up for another I/O.
2402 */
2403 val_size = (uio_last * PAGE_SIZE) - start_offset;
2404
2405 if (max_size < val_size)
2406 val_size = max_size;
2407
2408 if (uio->uio_resid < val_size)
2409 val_size = uio->uio_resid;
2410
2411 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2412
2413 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2414 /*
2415 * if there's still I/O left to do for this request, then issue a
2416 * pre-fetch I/O... the I/O wait time will overlap
2417 * with the copying of the data
2418 */
2419 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
2420 } else {
2421 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2422 /*
2423 * let's try to read ahead if we're in
2424 * a sequential access pattern
2425 */
2426 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2427 vp->v_lastr = e_lblkno;
2428 }
2429 if (uio->uio_segflg == UIO_USERSPACE) {
2430 int offset;
2431
2432 segflg = uio->uio_segflg;
2433
2434 uio->uio_segflg = UIO_PHYS_USERSPACE;
2435
2436
2437 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2438 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2439
2440 offset = start_offset;
2441
2442 while (val_size && retval == 0) {
2443 int csize;
2444 int i;
2445 caddr_t paddr;
2446
2447 i = offset / PAGE_SIZE;
2448 csize = min(PAGE_SIZE - start_offset, val_size);
2449
2450 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2451
2452 retval = uiomove(paddr, csize, uio);
2453
2454 val_size -= csize;
2455 offset += csize;
2456 start_offset = offset & PAGE_MASK;
2457 }
2458 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2459 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2460
2461 uio->uio_segflg = segflg;
2462 }
2463 else
2464 {
2465 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2466 panic("cluster_read: ubc_upl_map() failed\n");
2467
2468 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2469
2470 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2471 panic("cluster_read: ubc_upl_unmap() failed\n");
2472 }
2473 }
2474 if (start_pg < last_pg) {
2475 /*
2476 * compute the range of pages that we actually issued an I/O for
2477 * and either commit them as valid if the I/O succeeded
2478 * or abort them if the I/O failed
2479 */
2480 io_size = (last_pg - start_pg) * PAGE_SIZE;
2481
2482 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2483 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2484
2485 if (error || (vp->v_flag & VNOCACHE_DATA))
2486 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
2487 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2488 else
2489 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2490 UPL_COMMIT_CLEAR_DIRTY
2491 | UPL_COMMIT_FREE_ON_EMPTY
2492 | UPL_COMMIT_INACTIVATE);
2493
2494 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2495 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
2496 }
2497 if ((last_pg - start_pg) < pages_in_upl) {
2498 int cur_pg;
2499 int commit_flags;
2500
2501 /*
2502 * the set of pages that we issued an I/O for did not encompass
2503 * the entire upl... so just release these without modifying
2504 	         * their state
2505 */
2506 if (error)
2507 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2508 else {
2509 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
2510 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
2511
2512 if (start_pg) {
2513 /*
2514 * we found some already valid pages at the beginning of
2515 	                         * the upl... commit these back to the inactive list with
2516 * reference cleared
2517 */
2518 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2519 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2520 | UPL_COMMIT_INACTIVATE;
2521
2522 if (upl_dirty_page(pl, cur_pg))
2523 commit_flags |= UPL_COMMIT_SET_DIRTY;
2524
2525 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2526 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2527 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2528 else
2529 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2530 PAGE_SIZE, commit_flags);
2531 }
2532 }
2533 if (last_pg < uio_last) {
2534 /*
2535 * we found some already valid pages immediately after the
2536 * pages we issued I/O for, commit these back to the
2537 * inactive list with reference cleared
2538 */
2539 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2540 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2541 | UPL_COMMIT_INACTIVATE;
2542
2543 if (upl_dirty_page(pl, cur_pg))
2544 commit_flags |= UPL_COMMIT_SET_DIRTY;
2545
2546 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
2547 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2548 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2549 else
2550 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2551 PAGE_SIZE, commit_flags);
2552 }
2553 }
2554 if (uio_last < pages_in_upl) {
2555 /*
2556 * there were some invalid pages beyond the valid pages
2557 * that we didn't issue an I/O for, just release them
2558 * unchanged
2559 */
2560 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2561 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
2562 }
2563
2564 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
2565 (int)upl, -1, -1, 0, 0);
2566 }
2567 }
2568 if (retval == 0)
2569 retval = error;
2570 }
2571
2572 return (retval);
2573 }
2574
2575
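/*
 * cluster_nocopy_read reads directly into the user's buffer without
 * copying through the cache... any pages already resident in the cache
 * are moved out via uiomove first, then the user pages are wired down
 * with vm_map_get_upl and a synchronous cluster_io (CL_READ | CL_NOZERO)
 * is issued against them
 */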
2576 static int
2577 cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
2578 struct vnode *vp;
2579 struct uio *uio;
2580 off_t filesize;
2581 int devblocksize;
2582 int flags;
2583 {
2584 upl_t upl;
2585 upl_page_info_t *pl;
2586 off_t upl_f_offset;
2587 vm_offset_t upl_offset;
2588 off_t start_upl_f_offset;
2589 off_t max_io_size;
2590 int io_size;
2591 int upl_size;
2592 int upl_needed_size;
2593 int pages_in_pl;
2594 vm_offset_t paddr;
2595 int upl_flags;
2596 kern_return_t kret;
2597 int segflg;
2598 struct iovec *iov;
2599 int i;
2600 int force_data_sync;
2601 int error = 0;
2602 int retval = 0;
2603
2604 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2605 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2606
2607 /*
2608 * When we enter this routine, we know
2609 * -- the offset into the file is on a pagesize boundary
2610 * -- the resid is a page multiple
2611 * -- the resid will not exceed iov_len
2612 */
2613
2614 iov = uio->uio_iov;
2615 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2616
2617 max_io_size = filesize - uio->uio_offset;
2618
2619 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2620 io_size = max_io_size;
2621 else
2622 io_size = uio->uio_resid;
2623
2624 /*
2625 * We don't come into this routine unless
2626 * UIO_USERSPACE is set.
2627 */
2628 segflg = uio->uio_segflg;
2629
2630 uio->uio_segflg = UIO_PHYS_USERSPACE;
2631
2632 /*
2633 * First look for pages already in the cache
2634 * and move them to user space.
2635 */
2636 while (io_size && (retval == 0)) {
2637 upl_f_offset = uio->uio_offset;
2638
2639 /*
2640 * If this call fails, it means the page is not
2641 * in the page cache.
2642 */
2643 if (ubc_page_op(vp, upl_f_offset,
2644 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2645 break;
2646
2647 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2648
2649 ubc_page_op(vp, upl_f_offset,
2650 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2651
2652 io_size -= PAGE_SIZE;
2653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2654 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2655 }
2656
2657 uio->uio_segflg = segflg;
2658
2659 if (retval)
2660 {
2661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2662 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2663 return(retval);
2664 }
2665
2666 /* If we are already finished with this read, then return */
2667 if (io_size == 0)
2668 {
2669
2670 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2671 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2672 return(0);
2673 }
2674
2675 max_io_size = io_size;
2676 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2677 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2678
2679 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2680 upl_f_offset = start_upl_f_offset;
2681 io_size = 0;
2682
2683 while(io_size < max_io_size)
2684 {
2685
2686 if(ubc_page_op(vp, upl_f_offset,
2687 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
2688 {
2689 ubc_page_op(vp, upl_f_offset,
2690 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2691 break;
2692 }
2693
2694 /*
2695 * Build up the io request parameters.
2696 */
2697
2698 io_size += PAGE_SIZE;
2699 upl_f_offset += PAGE_SIZE;
2700 }
2701
2702 if (io_size == 0)
2703 return(retval);
2704
2705 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2706 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2707
2708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2709 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
2710
2711 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2712 {
2713 pages_in_pl = 0;
2714 upl_size = upl_needed_size;
2715 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2716
2717 kret = vm_map_get_upl(current_map(),
2718 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2719 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
2720
2721 if (kret != KERN_SUCCESS)
2722 {
2723 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2724 (int)upl_offset, upl_size, io_size, kret, 0);
2725
2726 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2727 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2728
2729 /* cluster_nocopy_read: failed to get pagelist */
2730 /* do not return kret here */
2731 return(retval);
2732 }
2733
2734 pages_in_pl = upl_size / PAGE_SIZE;
2735 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2736
2737 for(i=0; i < pages_in_pl; i++)
2738 {
2739 if (!upl_valid_page(pl, i))
2740 break;
2741 }
2742 if (i == pages_in_pl)
2743 break;
2744
2745 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2746 UPL_ABORT_FREE_ON_EMPTY);
2747 }
2748
2749 if (force_data_sync >= 3)
2750 {
2751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2752 (int)upl_offset, upl_size, io_size, kret, 0);
2753
2754 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2755 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2756 return(retval);
2757 }
2758 /*
2759 * Consider the possibility that upl_size wasn't satisfied.
2760 */
2761 if (upl_size != upl_needed_size)
2762 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2763
2764 if (io_size == 0)
2765 {
2766 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2767 UPL_ABORT_FREE_ON_EMPTY);
2768 return(retval);
2769 }
2770
2771 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2772 (int)upl_offset, upl_size, io_size, kret, 0);
2773
2774 /*
2775 * issue a synchronous read to cluster_io
2776 */
2777
2778 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2779 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
2780
2781 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2782 io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0, (struct clios *)0);
2783
2784 if (error == 0) {
2785 /*
2786 * The cluster_io read completed successfully,
2787 * update the uio structure and commit.
2788 */
2789
2790 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2791 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
2792
2793 iov->iov_base += io_size;
2794 iov->iov_len -= io_size;
2795 uio->uio_resid -= io_size;
2796 uio->uio_offset += io_size;
2797 }
2798 else {
2799 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2800 UPL_ABORT_FREE_ON_EMPTY);
2801 }
2802
2803 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2804 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
2805
2806 if (retval == 0)
2807 retval = error;
2808
2809 } /* end while */
2810
2811
2812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2813 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2814
2815 return (retval);
2816 }
2817
2818
2819
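/*
 * cluster_phys_read handles a read into a physically contiguous user
 * buffer... unaligned head and tail fragments are handled through
 * cluster_align_phys_io, while the device block aligned middle is issued
 * as throttled asynchronous I/O (CL_DEV_MEMORY) tracked by a clios state
 * block
 */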
2820 static int
2821 cluster_phys_read(vp, uio, filesize, devblocksize, flags)
2822 struct vnode *vp;
2823 struct uio *uio;
2824 off_t filesize;
2825 int devblocksize;
2826 int flags;
2827 {
2828 upl_page_info_t *pl;
2829 upl_t upl;
2830 vm_offset_t upl_offset;
2831 vm_offset_t dst_paddr;
2832 off_t max_size;
2833 int io_size;
2834 int tail_size;
2835 int upl_size;
2836 int upl_needed_size;
2837 int pages_in_pl;
2838 int upl_flags;
2839 kern_return_t kret;
2840 struct iovec *iov;
2841 struct clios iostate;
2842 int error;
2843
2844 /*
2845 * When we enter this routine, we know
2846 * -- the resid will not exceed iov_len
2847 * -- the target address is physically contiguous
2848 */
2849
2850 iov = uio->uio_iov;
2851
2852 max_size = filesize - uio->uio_offset;
2853
2854 if (max_size > (off_t)((unsigned int)iov->iov_len))
2855 io_size = iov->iov_len;
2856 else
2857 io_size = max_size;
2858
2859 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2860 upl_needed_size = upl_offset + io_size;
2861
2862 error = 0;
2863 pages_in_pl = 0;
2864 upl_size = upl_needed_size;
2865 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
2866
2867 kret = vm_map_get_upl(current_map(),
2868 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2869 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2870
2871 if (kret != KERN_SUCCESS) {
2872 /*
2873 * cluster_phys_read: failed to get pagelist
2874 */
2875 return(EINVAL);
2876 }
2877 if (upl_size < upl_needed_size) {
2878 /*
2879 * The upl_size wasn't satisfied.
2880 */
2881 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2882
2883 return(EINVAL);
2884 }
2885 pl = ubc_upl_pageinfo(upl);
2886
2887 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
2888
2889 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2890 int head_size;
2891
2892 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2893
2894 if (head_size > io_size)
2895 head_size = io_size;
2896
2897 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2898
2899 if (error) {
2900 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2901
2902 return(EINVAL);
2903 }
2904 upl_offset += head_size;
2905 dst_paddr += head_size;
2906 io_size -= head_size;
2907 }
2908 tail_size = io_size & (devblocksize - 1);
2909 io_size -= tail_size;
2910
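	/*
	 * set up the clios state used to throttle the async reads issued
	 * below and to wait for them all to complete before returning
	 */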
2911 iostate.io_completed = 0;
2912 iostate.io_issued = 0;
2913 iostate.io_error = 0;
2914 iostate.io_wanted = 0;
2915
2916 while (io_size && error == 0) {
2917 int xsize;
2918
2919 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2920 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2921 else
2922 xsize = io_size;
2923 /*
2924 * request asynchronously so that we can overlap
2925 	         * the preparation of the next I/O... we'll release the
2926 	         * upl once all the I/O has completed, since it's all
2927 	         * issued against the same UPL...
2928 * if there are already too many outstanding reads
2929 * throttle back until we reach a more reasonable level
2930 */
2931 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2932 iostate.io_wanted = 1;
2933 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2934 }
2935
2936 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2937 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
2938 (struct buf *)0, &iostate);
2939 /*
2940 * The cluster_io read was issued successfully,
2941 * update the uio structure
2942 */
2943 if (error == 0) {
2944 uio->uio_resid -= xsize;
2945 iov->iov_len -= xsize;
2946 iov->iov_base += xsize;
2947 uio->uio_offset += xsize;
2948 dst_paddr += xsize;
2949 upl_offset += xsize;
2950 io_size -= xsize;
2951 }
2952 }
2953 /*
2954 * make sure any async reads have completed before
2955 * we proceed
2956 */
2957 while (iostate.io_issued != iostate.io_completed) {
2958 iostate.io_wanted = 1;
2959 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2960 }
2961 if (iostate.io_error) {
2962 error = iostate.io_error;
2963 }
2964 if (error == 0 && tail_size)
2965 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
2966
2967 /*
2968 * just release our hold on the physically contiguous
2969 * region without changing any state
2970 */
2971 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2972
2973 return (error);
2974 }
2975
2976
2977 /*
2978 * generate advisory I/O's in the largest chunks possible
2979 * the completed pages will be released into the VM cache
2980 */
2981 int
2982 advisory_read(vp, filesize, f_offset, resid, devblocksize)
2983 struct vnode *vp;
2984 off_t filesize;
2985 off_t f_offset;
2986 int resid;
2987 int devblocksize;
2988 {
2989 upl_page_info_t *pl;
2990 upl_t upl;
2991 vm_offset_t upl_offset;
2992 int upl_size;
2993 off_t upl_f_offset;
2994 int start_offset;
2995 int start_pg;
2996 int last_pg;
2997 int pages_in_upl;
2998 off_t max_size;
2999 int io_size;
3000 kern_return_t kret;
3001 int retval = 0;
3002 int issued_io;
3003
3004 if (!UBCINFOEXISTS(vp))
3005 return(EINVAL);
3006
3007 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3008 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3009
3010 while (resid && f_offset < filesize && retval == 0) {
3011 /*
3012 * compute the size of the upl needed to encompass
3013 * the requested read... limit each call to cluster_io
3014 * to the maximum UPL size... cluster_io will clip if
3015 	         * this exceeds the maximum io_size for the device...
3016 	         * also make sure to account for a starting offset
3017 	         * that's not page aligned
3018 */
3019 start_offset = (int)(f_offset & PAGE_MASK_64);
3020 upl_f_offset = f_offset - (off_t)start_offset;
3021 max_size = filesize - f_offset;
3022
3023 if (resid < max_size)
3024 io_size = resid;
3025 else
3026 io_size = max_size;
3027
3028 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
3029 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3030 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
3031 pages_in_upl = upl_size / PAGE_SIZE;
3032
3033 kret = ubc_create_upl(vp,
3034 upl_f_offset,
3035 upl_size,
3036 &upl,
3037 &pl,
3038 UPL_RET_ONLY_ABSENT);
3039 if (kret != KERN_SUCCESS)
3040 return(retval);
3041 issued_io = 0;
3042
3043 /*
3044 * before we start marching forward, we must make sure we end on
3045 * a present page, otherwise we will be working with a freed
3046 * upl
3047 */
3048 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3049 if (upl_page_present(pl, last_pg))
3050 break;
3051 }
3052 pages_in_upl = last_pg + 1;
3053
3054
3055 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3056 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3057
3058
3059 for (last_pg = 0; last_pg < pages_in_upl; ) {
3060 /*
3061 * scan from the beginning of the upl looking for the first
3062 * page that is present.... this will become the first page in
3063 * the request we're going to make to 'cluster_io'... if all
3064 * of the pages are absent, we won't call through to 'cluster_io'
3065 */
3066 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3067 if (upl_page_present(pl, start_pg))
3068 break;
3069 }
3070
3071 /*
3072 * scan from the starting present page looking for an absent
3073 * page before the end of the upl is reached, if we
3074 * find one, then it will terminate the range of pages being
3075 * presented to 'cluster_io'
3076 */
3077 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3078 if (!upl_page_present(pl, last_pg))
3079 break;
3080 }
3081
3082 if (last_pg > start_pg) {
3083 /*
3084 * we found a range of pages that must be filled
3085 * if the last page in this range is the last page of the file
3086 * we may have to clip the size of it to keep from reading past
3087 * the end of the last physical block associated with the file
3088 */
3089 upl_offset = start_pg * PAGE_SIZE;
3090 io_size = (last_pg - start_pg) * PAGE_SIZE;
3091
3092 if ((upl_f_offset + upl_offset + io_size) > filesize)
3093 io_size = filesize - (upl_f_offset + upl_offset);
3094
3095 /*
3096 * issue an asynchronous read to cluster_io
3097 */
3098 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
3099 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
3100
3101 issued_io = 1;
3102 }
3103 }
3104 if (issued_io == 0)
3105 ubc_upl_abort(upl, 0);
3106
3107 io_size = upl_size - start_offset;
3108
3109 if (io_size > resid)
3110 io_size = resid;
3111 f_offset += io_size;
3112 resid -= io_size;
3113 }
3114
3115 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3116 (int)f_offset, resid, retval, 0, 0);
3117
3118 return(retval);
3119 }
3120
3121
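/*
 * cluster_push flushes the delayed-write state for a vnode... if the
 * vnode has fallen into VHASDIRTY mode, the entire v_cstart..v_lastw
 * range is pushed in MAX_UPL_TRANSFER sized chunks, otherwise the
 * individual clusters are pushed via cluster_try_push
 */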
3122 int
3123 cluster_push(vp)
3124 struct vnode *vp;
3125 {
3126 int retval;
3127
3128 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3129 vp->v_flag &= ~VHASDIRTY;
3130 return(0);
3131 }
3132
3133 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3134 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3135
3136 if (vp->v_flag & VHASDIRTY) {
3137 daddr_t start_pg;
3138 daddr_t last_pg;
3139 daddr_t end_pg;
3140
3141 start_pg = vp->v_cstart;
3142 end_pg = vp->v_lastw;
3143
3144 vp->v_flag &= ~VHASDIRTY;
3145 vp->v_clen = 0;
3146
3147 while (start_pg < end_pg) {
3148 last_pg = start_pg + MAX_UPL_TRANSFER;
3149
3150 if (last_pg > end_pg)
3151 last_pg = end_pg;
3152
3153 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3154
3155 start_pg = last_pg;
3156 }
3157 return (1);
3158 }
3159 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3160
3161 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3162 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3163
3164 return (retval);
3165 }
3166
3167
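/*
 * cluster_try_push takes a sorted snapshot of the vnode's clusters,
 * attempts to push each one through cluster_push_x, and merges any
 * clusters it couldn't push back into the vnode (falling back to the
 * VHASDIRTY mechanism if they no longer fit)... the return value is the
 * number of free cluster slots left on the vnode
 */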
3168 static int
3169 cluster_try_push(vp, EOF, can_delay, push_all)
3170 struct vnode *vp;
3171 off_t EOF;
3172 int can_delay;
3173 int push_all;
3174 {
3175 int cl_index;
3176 int cl_index1;
3177 int min_index;
3178 int cl_len;
3179 int cl_total;
3180 int cl_pushed;
3181 struct v_cluster l_clusters[MAX_CLUSTERS];
3182
3183 /*
3184 * make a local 'sorted' copy of the clusters
3185 * and clear vp->v_clen so that new clusters can
3186 * be developed
3187 */
3188 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3189 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3190 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3191 continue;
3192 if (min_index == -1)
3193 min_index = cl_index1;
3194 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3195 min_index = cl_index1;
3196 }
3197 if (min_index == -1)
3198 break;
3199 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3200 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3201
3202 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3203 }
3204 cl_len = cl_index;
3205 vp->v_clen = 0;
3206
3207 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3208 /*
3209 * try to push each cluster in turn... cluster_push_x may not
3210 * push the cluster if can_delay is TRUE and the cluster doesn't
3211 	         * meet the criteria for an immediate push
3212 */
3213 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3214 l_clusters[cl_index].start_pg = 0;
3215 l_clusters[cl_index].last_pg = 0;
3216
3217 cl_pushed++;
3218
3219 if (push_all == 0)
3220 break;
3221 }
3222 }
3223 if (cl_len > cl_pushed) {
3224 /*
3225 * we didn't push all of the clusters, so
3226 	         * let's try to merge them back into the vnode
3227 */
3228 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3229 /*
3230 * we picked up some new clusters while we were trying to
3231 * push the old ones (I don't think this can happen because
3232 * I'm holding the lock, but just in case)... the sum of the
3233 * leftovers plus the new cluster count exceeds our ability
3234 * to represent them, so fall back to the VHASDIRTY mechanism
3235 */
3236 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3237 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3238 continue;
3239
3240 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3241 vp->v_cstart = l_clusters[cl_index].start_pg;
3242 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3243 vp->v_lastw = l_clusters[cl_index].last_pg;
3244 }
3245 vp->v_flag |= VHASDIRTY;
3246 } else {
3247 /*
3248 * we've got room to merge the leftovers back in
3249 * just append them starting at the next 'hole'
3250 * represented by vp->v_clen
3251 */
3252 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3253 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3254 continue;
3255
3256 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3257 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3258
3259 if (cl_index1 == 0) {
3260 vp->v_cstart = l_clusters[cl_index].start_pg;
3261 vp->v_lastw = l_clusters[cl_index].last_pg;
3262 } else {
3263 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3264 vp->v_cstart = l_clusters[cl_index].start_pg;
3265 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3266 vp->v_lastw = l_clusters[cl_index].last_pg;
3267 }
3268 cl_index1++;
3269 }
3270 /*
3271 * update the cluster count
3272 */
3273 vp->v_clen = cl_index1;
3274 }
3275 }
3276 return(MAX_CLUSTERS - vp->v_clen);
3277 }
3278
3279
3280
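/*
 * cluster_push_x writes out the dirty pages covering the page range
 * [first, last)... if can_delay is set and the cluster is small or
 * mostly clean, the push is declined (return 0), otherwise the dirty
 * runs are issued to cluster_io and 1 is returned
 */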
3281 static int
3282 cluster_push_x(vp, EOF, first, last, can_delay)
3283 struct vnode *vp;
3284 off_t EOF;
3285 daddr_t first;
3286 daddr_t last;
3287 int can_delay;
3288 {
3289 upl_page_info_t *pl;
3290 upl_t upl;
3291 vm_offset_t upl_offset;
3292 int upl_size;
3293 off_t upl_f_offset;
3294 int pages_in_upl;
3295 int start_pg;
3296 int last_pg;
3297 int io_size;
3298 int io_flags;
3299 int size;
3300 kern_return_t kret;
3301
3302
3303 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3304 vp->v_clen, first, last, EOF, 0);
3305
3306 if ((pages_in_upl = last - first) == 0) {
3307 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
3308
3309 return (1);
3310 }
3311 upl_size = pages_in_upl * PAGE_SIZE;
3312 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
3313
3314 if (upl_f_offset + upl_size >= EOF) {
3315
3316 if (upl_f_offset >= EOF) {
3317 /*
3318 * must have truncated the file and missed
3319 * clearing a dangling cluster (i.e. it's completely
3320 	                 * beyond the new EOF)
3321 */
3322 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3323
3324 return(1);
3325 }
3326 size = EOF - upl_f_offset;
3327
3328 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3329 pages_in_upl = upl_size / PAGE_SIZE;
3330 } else {
3331 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3332 return(0);
3333 size = upl_size;
3334 }
3335 kret = ubc_create_upl(vp,
3336 upl_f_offset,
3337 upl_size,
3338 &upl,
3339 &pl,
3340 UPL_RET_ONLY_DIRTY);
3341 if (kret != KERN_SUCCESS)
3342 panic("cluster_push: failed to get pagelist");
3343
3344 if (can_delay) {
3345 int num_of_dirty;
3346
3347 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3348 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3349 num_of_dirty++;
3350 }
3351 if (num_of_dirty < pages_in_upl / 2) {
3352 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3353
3354 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3355
3356 return(0);
3357 }
3358 }
3359 last_pg = 0;
3360
3361 while (size) {
3362
3363 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3364 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3365 break;
3366 }
3367 if (start_pg > last_pg) {
3368 io_size = (start_pg - last_pg) * PAGE_SIZE;
3369
3370 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3371 UPL_ABORT_FREE_ON_EMPTY);
3372
3373 if (io_size < size)
3374 size -= io_size;
3375 else
3376 break;
3377 }
3378 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3379 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3380 break;
3381 }
3382 upl_offset = start_pg * PAGE_SIZE;
3383
3384 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3385
3386 if (vp->v_flag & VNOCACHE_DATA)
3387 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
3388 else
3389 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3390
3391 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3392 vp->v_flag |= VTHROTTLED;
3393 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3394 }
3395 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
3396
3397 size -= io_size;
3398 }
3399 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3400
3401 return(1);
3402 }
3403
3404
3405
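/*
 * cluster_align_phys_io handles a head or tail fragment of a physically
 * contiguous transfer that isn't device block aligned... the covering
 * page is read into the cache if necessary, the fragment is copied
 * between the user's physical buffer and the cached page with copyp2p,
 * and the page is written back out when this is a write or the page was
 * already dirty
 */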
3406 static int
3407 cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
3408 {
3409 struct iovec *iov;
3410 upl_page_info_t *pl;
3411 upl_t upl;
3412 vm_offset_t ubc_paddr;
3413 kern_return_t kret;
3414 int error = 0;
3415
3416 iov = uio->uio_iov;
3417
3418 kret = ubc_create_upl(vp,
3419 uio->uio_offset & ~PAGE_MASK_64,
3420 PAGE_SIZE,
3421 &upl,
3422 &pl,
3423 UPL_FLAGS_NONE);
3424
3425 if (kret != KERN_SUCCESS)
3426 return(EINVAL);
3427
3428 if (!upl_valid_page(pl, 0)) {
3429 /*
3430 * issue a synchronous read to cluster_io
3431 */
3432 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3433 CL_READ, (struct buf *)0, (struct clios *)0);
3434 if (error) {
3435 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3436
3437 return(error);
3438 }
3439 }
3440 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
3441
3442 if (flags & CL_READ)
3443 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3444 else
3445 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3446
3447 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3448 /*
3449 * issue a synchronous write to cluster_io
3450 */
3451 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3452 0, (struct buf *)0, (struct clios *)0);
3453 }
3454 if (error == 0) {
3455 uio->uio_offset += xsize;
3456 iov->iov_base += xsize;
3457 iov->iov_len -= xsize;
3458 uio->uio_resid -= xsize;
3459 }
3460 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3461
3462 return (error);
3463 }