
/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

#include <sys/ubc.h>
#include <vm/vm_pageout.h>

#include <sys/kdebug.h>

#define CL_READ		0x01
#define CL_ASYNC	0x02
#define CL_COMMIT	0x04
#define CL_PAGEOUT	0x10
#define CL_AGE		0x20
#define CL_DUMP		0x40
#define CL_NOZERO	0x80
#define CL_PAGEIN	0x100
#define CL_DEV_MEMORY	0x200

static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
		int size, struct buf *bp);
static int cluster_read_x(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_write_x(struct vnode *vp, struct uio *uio,
		off_t oldEOF, off_t newEOF, off_t headOff,
		off_t tailOff, int devblocksize, int flags);
static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
		off_t filesize);
static int cluster_phys_write(struct vnode *vp, struct uio *uio);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);


/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE	9

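/*
 * I/O completion handler for a cluster transaction: once every buf in the
 * transaction chain has completed, gather the error and resid totals,
 * release the component bufs, and commit or abort the associated UPL.
 */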
static int
cluster_iodone(bp)
	struct buf *bp;
{
	int b_flags;
	int error;
	int total_size;
	int total_resid;
	int upl_offset;
	int zero_offset;
	upl_t upl;
	struct buf *cbp;
	struct buf *cbp_head;
	struct buf *cbp_next;
	struct buf *real_bp;
	struct vnode *vp;
	int commit_size;
	int pg_offset;


	cbp_head = (struct buf *)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_pagelist;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	vp          = cbp->b_vp;
	zero_offset = cbp->b_validend;

	while (cbp) {
		if (cbp->b_vectorcount > 1)
			_FREE(cbp->b_vectorlist, M_SEGMENT);

		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		biodone(real_bp);
	}
	if (error == 0 && total_resid)
		error = EIO;

	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;

		if (error || (b_flags & B_NOCACHE)) {
			int upl_abort_code;

			if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (b_flags & B_PGIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);

		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if ( !(b_flags & B_PAGEOUT))
				upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
			if (b_flags & B_AGE)
				upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);

	return (error);
}

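/*
 * zero 'size' bytes starting at 'upl_offset' within the upl, mapping the
 * upl into the kernel if the caller didn't supply a buffer with b_data
 */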
static void
cluster_zero(upl, upl_offset, size, bp)
	upl_t upl;
	vm_offset_t upl_offset;
	int size;
	struct buf *bp;
{
	vm_offset_t io_addr = 0;
	int must_unmap = 0;
	kern_return_t kret;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_data == NULL) {
		kret = ubc_upl_map(upl, &io_addr);

		if (kret != KERN_SUCCESS)
			panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
		if (io_addr == 0)
			panic("cluster_zero: ubc_upl_map() mapped 0");

		must_unmap = 1;
	} else
		io_addr = (vm_offset_t)bp->b_data;
	bzero((caddr_t)(io_addr + upl_offset), size);

	if (must_unmap) {
		kret = ubc_upl_unmap(upl);

		if (kret != KERN_SUCCESS)
			panic("cluster_zero: kernel_upl_unmap failed");
	}
}

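/*
 * core clustered-I/O engine: carve the byte range described by the upl and
 * file offset into device-sized chunks (via VOP_CMAP), build a chain of
 * buf structures for them, and hand the chain to the device's strategy
 * routine, either asynchronously or waiting for completion
 */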
static int
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp)
	struct vnode *vp;
	upl_t upl;
	vm_offset_t upl_offset;
	off_t f_offset;
	int non_rounded_size;
	int devblocksize;
	int flags;
	struct buf *real_bp;
{
	struct buf *cbp;
	struct iovec *iovp;
	u_int size;
	int io_flags;
	int error = 0;
	int retval = 0;
	struct buf *cbp_head = 0;
	struct buf *cbp_tail = 0;
	upl_page_info_t *pl;
	int buf_count = 0;
	int pg_count;
	int pg_offset;
	u_int max_iosize;
	u_int max_vectors;
	int priv;
	int zero_offset = 0;

	if (flags & CL_READ) {
		io_flags = (B_VECTORLIST | B_READ);

		vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
	} else {
		io_flags = (B_VECTORLIST | B_WRITEINPROG);

		vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
	}
	pl = ubc_upl_pageinfo(upl);

	if (flags & CL_ASYNC)
		io_flags |= (B_CALL | B_ASYNC);
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & CL_DUMP)
		io_flags |= B_NOCACHE;
	if (flags & CL_PAGEIN)
		io_flags |= B_PGIN;

	if (devblocksize)
		size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
	else
		size = non_rounded_size;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * then we are going to end up
		 * with a page that we can't complete (the file size wasn't a multiple
		 * of PAGE_SIZE and we're trying to read to the end of the file
		 * so we'll go ahead and zero out the portion of the page we can't
		 * read in from the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
	while (size) {
		size_t io_size;
		int vsize;
		int i;
		int pl_index;
		int pg_resid;
		int num_contig;
		daddr_t lblkno;
		daddr_t blkno;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, &io_size, NULL)) {
			if (error == EOPNOTSUPP)
				panic("VOP_CMAP Unimplemented");
			break;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

		if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
			if (flags & CL_PAGEOUT) {
				error = EINVAL;
				break;
			};

			/* Try paging out the page individually before
			   giving up entirely and dumping it (it could
			   be mapped in a "hole" and require allocation
			   before the I/O:
			 */
			ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
			if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
				error = EINVAL;
				break;
			};

			upl_offset += PAGE_SIZE_64;
			f_offset   += PAGE_SIZE_64;
			size       -= PAGE_SIZE_64;
			continue;
		}
		lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pl_index represents the first page in the 'upl' that the I/O will occur for
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pl_index  = upl_offset / PAGE_SIZE;
		pg_offset = upl_offset & PAGE_MASK;
		pg_count  = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * currently, can't deal with reading 'holes' in file
			 */
			if ((long)blkno == -1) {
				error = EINVAL;
				break;
			}
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		}
		if ((flags & CL_READ) && (long)blkno == -1) {
			int bytes_to_zero;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (zero_offset && io_size == size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head)
				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				pg_count = (io_size - pg_offset) / PAGE_SIZE;
			else {
				/*
				 * no pending I/O to pick up that first page
				 * so, we have to make sure it gets committed
				 * here.
				 * set the pg_offset to 0 so that the upl_commit_range
				 * starts with this page
				 */
				pg_count = (io_size + pg_offset) / PAGE_SIZE;
				pg_offset = 0;
			}
			if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
				/*
				 * if we're done with the request for this UPL
				 * then we have to make sure to commit the last page
				 * even if we only partially zero-filled it
				 */
				pg_count++;

			if (pg_count) {
				if (pg_offset)
					pg_resid = PAGE_SIZE - pg_offset;
				else
					pg_resid = 0;

				if (flags & CL_COMMIT)
					ubc_upl_commit_range(upl,
							(upl_offset + pg_resid) & ~PAGE_MASK,
							pg_count * PAGE_SIZE,
							UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			if (cbp_head && pg_count)
				goto start_io;
			continue;

		} else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
			real_bp->b_blkno = blkno;
		}

		if (pg_count > 1) {
			if (pg_count > max_vectors) {
				io_size -= (pg_count - max_vectors) * PAGE_SIZE;

				if (io_size < 0) {
					io_size = PAGE_SIZE - pg_offset;
					pg_count = 1;
				} else
					pg_count = max_vectors;
			}
			/*
			 * we need to allocate space for the vector list
			 */
			if (pg_count > 1) {
				iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
						M_SEGMENT, M_NOWAIT);

				if (iovp == (struct iovec *) 0) {
					/*
					 * if the allocation fails, then throttle down to a single page
					 */
					io_size = PAGE_SIZE - pg_offset;
					pg_count = 1;
				}
			}
		}

		/* Throttle the speculative IO */
		if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);

		if (pg_count == 1)
			/*
			 * we use the io vector that's reserved in the buffer header
			 * this ensures we can always issue an I/O even in a low memory
			 * condition that prevents the _MALLOC from succeeding... this
			 * is necessary to prevent deadlocks with the pager
			 */
			iovp = (struct iovec *)(&cbp->b_vects[0]);

		cbp->b_vectorlist  = (void *)iovp;
		cbp->b_vectorcount = pg_count;

		if (flags & CL_DEV_MEMORY) {

			iovp->iov_len  = io_size;
			iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);

			if (iovp->iov_base == (caddr_t) 0) {
				free_io_buf(cbp);
				error = EINVAL;
			} else
				iovp->iov_base += upl_offset;
		} else {

			for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
				int psize;

				psize = PAGE_SIZE - pg_offset;

				if (psize > vsize)
					psize = vsize;

				iovp->iov_len  = psize;
				iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);

				if (iovp->iov_base == (caddr_t) 0) {
					if (pg_count > 1)
						_FREE(cbp->b_vectorlist, M_SEGMENT);
					free_io_buf(cbp);

					error = EINVAL;
					break;
				}
				iovp->iov_base += pg_offset;
				pg_offset = 0;

				if (flags & CL_PAGEOUT) {
					int s;
					struct buf *bp;

					s = splbio();
					if (bp = incore(vp, lblkno + i)) {
						if (!ISSET(bp->b_flags, B_BUSY)) {
							bremfree(bp);
							SET(bp->b_flags, (B_BUSY | B_INVAL));
							splx(s);
							brelse(bp);
						} else
							panic("BUSY bp found in cluster_io");
					}
					splx(s);
				}
				vsize -= psize;
			}
		}
		if (error)
			break;

		if (flags & CL_ASYNC)
			cbp->b_iodone = (void *)cluster_iodone;
		cbp->b_flags |= io_flags;

		cbp->b_lblkno     = lblkno;
		cbp->b_blkno      = blkno;
		cbp->b_bcount     = io_size;
		cbp->b_pagelist   = upl;
		cbp->b_uploffset  = upl_offset;
		cbp->b_trans_next = (struct buf *)0;

		if (flags & CL_READ)
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
		else
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;
		}
		(struct buf *)(cbp->b_trans_head) = cbp_head;
		buf_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;

		if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
			/*
			 * if we have no more I/O to issue or
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction and it's not an I/O directed to
			 * special DEVICE memory
			 * then go ahead and issue the I/O
			 */
start_io:
			if (flags & CL_COMMIT)
				cbp_head->b_flags |= B_COMMIT_UPL;
			if (flags & CL_PAGEOUT)
				cbp_head->b_flags |= B_PAGEOUT;
			if (flags & CL_PAGEIN)
				cbp_head->b_flags |= B_PGIN;

			if (real_bp) {
				cbp_head->b_flags |= B_NEED_IODONE;
				cbp_head->b_real_bp = real_bp;
			} else
				cbp_head->b_real_bp = (struct buf *)NULL;

			if (size == 0) {
				/*
				 * we're about to issue the last I/O for this upl
				 * if this was a read to the eof and the eof doesn't
				 * finish on a page boundary, then we need to zero-fill
				 * the rest of the page....
				 */
				cbp_head->b_validend = zero_offset;
			} else
				cbp_head->b_validend = 0;

			for (cbp = cbp_head; cbp;) {
				struct buf * cbp_next;

				if (io_flags & B_WRITEINPROG)
					cbp->b_vp->v_numoutput++;

				cbp_next = cbp->b_trans_next;

				(void) VOP_STRATEGY(cbp);
				cbp = cbp_next;
			}
			if ( !(flags & CL_ASYNC)) {
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					biowait(cbp);

				if (error = cluster_iodone(cbp_head)) {
					if ((flags & CL_PAGEOUT) && (error == ENXIO))
						retval = 0;	/* drop the error */
					else
						retval = error;
					error = 0;
				}
			}
			cbp_head = (struct buf *)0;
			cbp_tail = (struct buf *)0;

			buf_count = 0;
		}
	}
	if (error) {
		int abort_size;

		for (cbp = cbp_head; cbp;) {
			struct buf * cbp_next;

			if (cbp->b_vectorcount > 1)
				_FREE(cbp->b_vectorlist, M_SEGMENT);
			upl_offset -= cbp->b_bcount;
			size       += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}
		pg_offset  = upl_offset & PAGE_MASK;
		abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;

		if (flags & CL_COMMIT) {
			int upl_abort_code;

			if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (flags & CL_PAGEIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
		}
		if (real_bp) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;

			biodone(real_bp);
		}
		if (retval == 0)
			retval = error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}


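/*
 * issue an advisory read for up to 'size' bytes starting at 'f_offset',
 * skipping the leading pages that are already resident; returns the number
 * of pages the prefetch request covered
 */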
static int
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
	struct vnode *vp;
	off_t f_offset;
	u_int size;
	off_t filesize;
	int devblocksize;
{
	int pages_to_fetch;
	int skipped_pages;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = MAX_UPL_TRANSFER * PAGE_SIZE;
	else
		size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;

	pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
		if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
			break;
		f_offset += PAGE_SIZE;
		size     -= PAGE_SIZE;
	}
	if (skipped_pages < pages_to_fetch)
		advisory_read(vp, filesize, f_offset, size, devblocksize);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);

	return (pages_to_fetch);
}



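/*
 * sequential read-ahead heuristic: grow or reset the per-vnode read-ahead
 * window (v_ralen/v_maxra) based on the current request, and prefetch the
 * next window when the access pattern looks sequential
 */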
static void
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
	struct vnode *vp;
	daddr_t b_lblkno;
	daddr_t e_lblkno;
	off_t filesize;
	int devblocksize;
{
	daddr_t r_lblkno;
	off_t f_offset;
	int size_of_prefetch;
	int max_pages;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

	if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
		return;
	}

	if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
				 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
		vp->v_ralen = 0;
		vp->v_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);

		return;
	}
	max_pages = MAX_UPL_TRANSFER;

	vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;

	if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
		vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);

	if (e_lblkno < vp->v_maxra) {
		if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
			return;
		}
	}
	r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
	f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

	if (f_offset < filesize) {
		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

		if (size_of_prefetch)
			vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
}

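/*
 * VM pageout entry point: validate the request, throttle against the number
 * of outstanding writes on the vnode, and push the pages out through
 * cluster_io
 */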
int
cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	struct vnode *vp;
	upl_t upl;
	vm_offset_t upl_offset;
	off_t f_offset;
	int size;
	off_t filesize;
	int devblocksize;
	int flags;
{
	int io_size;
	int pg_size;
	off_t max_size;
	int local_flags = CL_PAGEOUT;

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * If they didn't specify any I/O, then we are done...
	 * we can't issue an abort because we don't know how
	 * big the upl really is
	 */
	if (size <= 0)
		return (EINVAL);

	if (vp->v_mount->mnt_flag & MNT_RDONLY) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EROFS);
	}
	/*
	 * can't page-out from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > pg_size) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
					    UPL_ABORT_FREE_ON_EMPTY);
	}
	while (vp->v_numoutput >= ASYNC_THROTTLE) {
		vp->v_flag |= VTHROTTLED;
		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
	}

	return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			   local_flags, (struct buf *)0));
}

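/*
 * VM pagein entry point: validate the request, read the pages in through
 * cluster_io, and kick off read-ahead when the access looks sequential
 */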
int
cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
	struct vnode *vp;
	upl_t upl;
	vm_offset_t upl_offset;
	off_t f_offset;
	int size;
	off_t filesize;
	int devblocksize;
	int flags;
{
	u_int io_size;
	int rounded_size;
	off_t max_size;
	int retval;
	int local_flags = 0;

	if (upl == NULL || size < 0)
		panic("cluster_pagein: NULL upl passed in");

	if ((flags & UPL_IOSYNC) == 0)
		local_flags |= CL_ASYNC;
	if ((flags & UPL_NOCOMMIT) == 0)
		local_flags |= CL_COMMIT;


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
		     (int)f_offset, size, (int)filesize, local_flags, 0);

	/*
	 * can't page-in from a negative offset
	 * or if we're starting beyond the EOF
	 * or if the file offset isn't page aligned
	 * or the size requested isn't a multiple of PAGE_SIZE
	 */
	if (f_offset < 0 || f_offset >= filesize ||
	   (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
		if (local_flags & CL_COMMIT)
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		return (EINVAL);
	}
	max_size = filesize - f_offset;

	if (size < max_size)
		io_size = size;
	else
		io_size = max_size;

	rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (size > rounded_size && (local_flags & CL_COMMIT))
		ubc_upl_abort_range(upl, upl_offset + rounded_size,
				    size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);

	retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
			    local_flags | CL_READ | CL_PAGEIN, (struct buf *)0);

	if (retval == 0) {
		int b_lblkno;
		int e_lblkno;

		b_lblkno = (int)(f_offset / PAGE_SIZE_64);
		e_lblkno = (int)
			((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);

		if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
			/*
			 * we haven't read the last page of the file yet
			 * so let's try to read ahead if we're in
			 * a sequential access pattern
			 */
			cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
		}
		vp->v_lastr = e_lblkno;
	}
	return (retval);
}

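/*
 * issue an asynchronous cluster I/O for a buf that already carries a upl
 */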
int
cluster_bp(bp)
	struct buf *bp;
{
	off_t f_offset;
	int flags;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
		     (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	if (bp->b_pagelist == (upl_t) 0)
		panic("cluster_bp: can't handle NULL upl yet\n");
	if (bp->b_flags & B_READ)
		flags = CL_ASYNC | CL_READ;
	else
		flags = CL_ASYNC;

	f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);

	return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp));
}

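/*
 * top-level write entry point: for cached vnodes (or non-user I/O) fall
 * through to cluster_write_x; for VNOCACHE_DATA vnodes carve the request
 * into pieces that can be handled by the physically-contiguous, nocopy,
 * or buffered write paths
 */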
int
cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
	struct vnode *vp;
	struct uio *uio;
	off_t oldEOF;
	off_t newEOF;
	off_t headOff;
	off_t tailOff;
	int devblocksize;
	int flags;
{
	int prev_resid;
	int clip_size;
	off_t max_io_size;
	struct iovec *iov;
	vm_offset_t upl_offset;
	int upl_size;
	int pages_in_pl;
	upl_page_info_t *pl;
	int upl_flags;
	upl_t upl;
	int retval = 0;


	if ((!uio) || (uio->uio_segflg != UIO_USERSPACE) || (!(vp->v_flag & VNOCACHE_DATA)))
	{
		retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
		return(retval);
	}

	while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
	{
		/* we know we have a resid, so this is safe */
		iov = uio->uio_iov;
		while (iov->iov_len == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			iov = uio->uio_iov;
		}

		/*
		 * We check every vector target and if it is physically
		 * contiguous space, we skip the sanity checks.
		 */

		upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
		upl_size = (upl_offset + PAGE_SIZE + (PAGE_SIZE - 1)) & ~PAGE_MASK;
		pages_in_pl = 0;
		upl_flags = UPL_QUERY_OBJECT_TYPE;
		if ((vm_map_get_upl(current_map(),
				    (vm_offset_t)iov->iov_base & ~PAGE_MASK,
				    &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
		{
			/*
			 * the user app must have passed in an invalid address
			 */
			return (EFAULT);
		}

		if (upl_flags & UPL_PHYS_CONTIG)
		{
			/*
			 * since the interface to the IOKit below us uses physical block #'s and
			 * block counts to specify the I/O, we can't handle anything that isn't
			 * devblocksize aligned
			 */
			if ((uio->uio_offset & (devblocksize - 1)) || (uio->uio_resid & (devblocksize - 1)))
				return(EINVAL);

			if (flags & IO_HEADZEROFILL)
			{
				flags &= ~IO_HEADZEROFILL;

				if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
					return(retval);
			}

			retval = cluster_phys_write(vp, uio);

			if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
			{
				retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
				return(retval);
			}
		}
		else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
		{
			/*
			 * We set a threshold of 4 pages to decide if the nocopy
			 * write loop is worth the trouble...
			 * we also come here if we're trying to zero the head and/or tail
			 * of a partially written page, and the user source is not a physically contiguous region
			 */
			retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
			return(retval);
		}
		else if (uio->uio_offset & PAGE_MASK_64)
		{
			/* Bring the file offset write up to a pagesize boundary */
			clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
			if (uio->uio_resid < clip_size)
				clip_size = uio->uio_resid;
			/*
			 * Fake the resid going into the cluster_write_x call
			 * and restore it on the way out.
			 */
			prev_resid = uio->uio_resid;
			uio->uio_resid = clip_size;
			retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
			uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
		}
		else if ((int)iov->iov_base & PAGE_MASK_64)
		{
			clip_size = iov->iov_len;
			prev_resid = uio->uio_resid;
			uio->uio_resid = clip_size;
			retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
			uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
		}
		else
		{
			/*
			 * If we come in here, we know the offset into
			 * the file is on a pagesize boundary
			 */

			max_io_size = newEOF - uio->uio_offset;
			clip_size = uio->uio_resid;
			if (iov->iov_len < clip_size)
				clip_size = iov->iov_len;
			if (max_io_size < clip_size)
				clip_size = max_io_size;

			if (clip_size < PAGE_SIZE)
			{
				/*
				 * Take care of tail end of write in this vector
				 */
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
			else
			{
				/* round clip_size down to a multiple of pagesize */
				clip_size = clip_size & ~(PAGE_MASK);
				prev_resid = uio->uio_resid;
				uio->uio_resid = clip_size;
				retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
				if ((retval == 0) && uio->uio_resid)
					retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
				uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
			}
		} /* end else */
	} /* end while */
	return(retval);
}

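/*
 * direct (nocopy) write path: wire the user buffer with vm_map_get_upl,
 * evict any cached pages covering the range, and issue a synchronous write
 * through cluster_io directly from the user's pages
 */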
static int
cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
	struct vnode *vp;
	struct uio *uio;
	off_t newEOF;
	int devblocksize;
	int flags;
{
	upl_t upl;
	upl_page_info_t *pl;
	off_t upl_f_offset;
	vm_offset_t upl_offset;
	off_t max_io_size;
	int io_size;
	int upl_size;
	int upl_needed_size;
	int pages_in_pl;
	int upl_flags;
	kern_return_t kret;
	struct iovec *iov;
	int i;
	int force_data_sync;
	int error = 0;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
		     (int)uio->uio_offset, (int)uio->uio_resid,
		     (int)newEOF, devblocksize, 0);

	/*
	 * When we enter this routine, we know
	 *  -- the offset into the file is on a pagesize boundary
	 *  -- the resid is a page multiple
	 *  -- the resid will not exceed iov_len
	 */

	iov = uio->uio_iov;

	while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
		io_size = uio->uio_resid;

		if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			io_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
		upl_needed_size = (upl_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
			     (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);

		for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
		{
			pages_in_pl = 0;
			upl_size = upl_needed_size;
			upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
				    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

			kret = vm_map_get_upl(current_map(),
					      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
					      &upl_size,
					      &upl,
					      NULL,
					      &pages_in_pl,
					      &upl_flags,
					      force_data_sync);

			if (kret != KERN_SUCCESS)
			{
				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
					     0, 0, 0, kret, 0);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
					     (int)uio->uio_offset, (int)uio->uio_resid, kret, 1, 0);

				/* cluster_nocopy_write: failed to get pagelist */
				/* do not return kret here */
				return(0);
			}

			pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
			pages_in_pl = upl_size / PAGE_SIZE;

			for (i = 0; i < pages_in_pl; i++)
			{
				if (!upl_valid_page(pl, i))
					break;
			}

			if (i == pages_in_pl)
				break;

			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
		}

		if (force_data_sync >= 3)
		{
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
				     i, pages_in_pl, upl_size, kret, 0);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
				     (int)uio->uio_offset, (int)uio->uio_resid, kret, 2, 0);
			return(0);
		}

		/*
		 * Consider the possibility that upl_size wasn't satisfied.
		 */
		if (upl_size != upl_needed_size)
			io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
			     (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);

		if (io_size == 0)
		{
			ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
					    UPL_ABORT_FREE_ON_EMPTY);
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
				     (int)uio->uio_offset, uio->uio_resid, 0, 3, 0);

			return(0);
		}

		/*
		 * Now look for pages already in the cache
		 * and throw them away.
		 */

		upl_f_offset = uio->uio_offset;   /* this is page aligned in the file */
		max_io_size = io_size;

		while (max_io_size) {

			/*
			 * Flag UPL_POP_DUMP says if the page is found
			 * in the page cache it must be thrown away.
			 */
			ubc_page_op(vp,
				    upl_f_offset,
				    UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
				    0, 0);
			max_io_size  -= PAGE_SIZE;
			upl_f_offset += PAGE_SIZE;
		}

		/*
		 * issue a synchronous write to cluster_io
		 */

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
			     (int)upl_offset, (int)uio->uio_offset, io_size, 0, 0);

		error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
				   io_size, devblocksize, 0, (struct buf *)0);

		if (error == 0) {
			/*
			 * The cluster_io write completed successfully,
			 * update the uio structure.
			 */
			iov->iov_base   += io_size;
			iov->iov_len    -= io_size;
			uio->uio_resid  -= io_size;
			uio->uio_offset += io_size;
		}
		/*
		 * always 'commit' the I/O via the abort primitive whether the I/O
		 * succeeded cleanly or not... this is necessary to ensure that
		 * we preserve the state of the DIRTY flag on the pages used to
		 * provide the data for the I/O... the state of this flag SHOULD
		 * NOT be changed by a write
		 */
		ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
				    UPL_ABORT_FREE_ON_EMPTY);


		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
			     (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);

	} /* end while */


	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
		     (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);

	return (error);
}

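/*
 * write path for a physically contiguous user buffer: wire it, issue a
 * single CL_DEV_MEMORY write through cluster_io, then commit or abort the upl
 */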
static int
cluster_phys_write(vp, uio)
	struct vnode *vp;
	struct uio *uio;
{
	upl_t upl;
	vm_offset_t upl_offset;
	int io_size;
	int upl_size;
	int upl_needed_size;
	int pages_in_pl;
	int upl_flags;
	kern_return_t kret;
	struct iovec *iov;
	int error = 0;

	/*
	 * When we enter this routine, we know
	 *  -- the resid will not exceed iov_len
	 *  -- the vector target address is physically contiguous
	 */

	iov = uio->uio_iov;
	io_size = iov->iov_len;
	upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
	upl_needed_size = upl_offset + io_size;

	pages_in_pl = 0;
	upl_size = upl_needed_size;
	upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
		    UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;

	kret = vm_map_get_upl(current_map(),
			      (vm_offset_t)iov->iov_base & ~PAGE_MASK,
			      &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);

	if (kret != KERN_SUCCESS)
	{
		/* cluster_phys_write: failed to get pagelist */
		/* note: return kret here */
		return(EINVAL);
	}

	/*
	 * Consider the possibility that upl_size wasn't satisfied.
	 * This is a failure in the physical memory case.
	 */
	if (upl_size < upl_needed_size)
	{
		kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
		return(EINVAL);
	}

	/*
	 * issue a synchronous write to cluster_io
	 */

	error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
			   io_size, 0, CL_DEV_MEMORY, (struct buf *)0);

	if (error == 0) {
		/*
		 * The cluster_io write completed successfully,
		 * update the uio structure and commit.
		 */

		ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);

		iov->iov_base   += io_size;
		iov->iov_len    -= io_size;
		uio->uio_resid  -= io_size;
		uio->uio_offset += io_size;
	}
	else
		ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

	return (error);
}

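/*
 * buffered write path: build a upl over the affected file range, pre-read
 * any partially valid edge pages, copy/zero the caller's data into the upl,
 * and either push the pages immediately (IO_SYNC) or fold them into the
 * vnode's delayed-write clusters
 */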
static int
cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
	struct vnode *vp;
	struct uio *uio;
	off_t oldEOF;
	off_t newEOF;
	off_t headOff;
	off_t tailOff;
	int devblocksize;
	int flags;
{
	upl_page_info_t *pl;
	upl_t upl;
	vm_offset_t upl_offset;
	int upl_size;
	off_t upl_f_offset;
	int pages_in_upl;
	int start_offset;
	int xfer_resid;
	int io_size;
	int io_flags;
	vm_offset_t io_address;
	int io_offset;
	int bytes_to_zero;
	int bytes_to_move;
	kern_return_t kret;
	int retval = 0;
	int uio_resid;
	long long total_size;
	long long zero_cnt;
	off_t zero_off;
	long long zero_cnt1;
	off_t zero_off1;
	daddr_t start_blkno;
	daddr_t last_blkno;

	if (uio) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
			     (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);

		uio_resid = uio->uio_resid;
	} else {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
			     0, 0, (int)oldEOF, (int)newEOF, 0);

		uio_resid = 0;
	}
	zero_cnt  = 0;
	zero_cnt1 = 0;

	if (flags & IO_HEADZEROFILL) {
		/*
		 * some filesystems (HFS is one) don't support unallocated holes within a file...
		 * so we zero fill the intervening space between the old EOF and the offset
		 * where the next chunk of real data begins.... ftruncate will also use this
		 * routine to zero fill to the new EOF when growing a file... in this case, the
		 * uio structure will not be provided
		 */
		if (uio) {
			if (headOff < uio->uio_offset) {
				zero_cnt = uio->uio_offset - headOff;
				zero_off = headOff;
			}
		} else if (headOff < newEOF) {
			zero_cnt = newEOF - headOff;
			zero_off = headOff;
		}
	}
	if (flags & IO_TAILZEROFILL) {
		if (uio) {
			zero_off1 = uio->uio_offset + uio->uio_resid;

			if (zero_off1 < tailOff)
				zero_cnt1 = tailOff - zero_off1;
		}
	}
	if (zero_cnt == 0 && uio == (struct uio *) 0)
	{
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
			     retval, 0, 0, 0, 0);
		return (0);
	}

	while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
		/*
		 * for this iteration of the loop, figure out where our starting point is
		 */
		if (zero_cnt) {
			start_offset = (int)(zero_off & PAGE_MASK_64);
			upl_f_offset = zero_off - start_offset;
		} else if (uio_resid) {
			start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
			upl_f_offset = uio->uio_offset - start_offset;
		} else {
			start_offset = (int)(zero_off1 & PAGE_MASK_64);
			upl_f_offset = zero_off1 - start_offset;
		}
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
			     (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);

		if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			total_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		/*
		 * compute the size of the upl needed to encompass
		 * the requested write... limit each call to cluster_io
		 * to the maximum UPL size... cluster_io will clip if
		 * this exceeds the maximum io_size for the device,
		 * make sure to account for
		 * a starting offset that's not page aligned
		 */
		upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
			upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;

		pages_in_upl = upl_size / PAGE_SIZE;
		io_size      = upl_size - start_offset;

		if ((long long)io_size > total_size)
			io_size = total_size;

		start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
		last_blkno  = start_blkno + pages_in_upl;

		kret = ubc_create_upl(vp,
				      upl_f_offset,
				      upl_size,
				      &upl,
				      &pl,
				      UPL_FLAGS_NONE);
		if (kret != KERN_SUCCESS)
			panic("cluster_write: failed to get pagelist");

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
			     (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);

		if (start_offset && !upl_valid_page(pl, 0)) {
			int read_size;

			/*
			 * we're starting in the middle of the first page of the upl
			 * and the page isn't currently valid, so we're going to have
			 * to read it in first... this is a synchronous operation
			 */
			read_size = PAGE_SIZE;

			if ((upl_f_offset + read_size) > newEOF)
				read_size = newEOF - upl_f_offset;

			retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
					    CL_READ, (struct buf *)0);
			if (retval) {
				/*
				 * we had an error during the read which causes us to abort
				 * the current cluster_write request... before we do, we need
				 * to release the rest of the pages in the upl without modifying
				 * their state and mark the failed page in error
				 */
				ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
				ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
					     (int)upl, 0, 0, retval, 0);
				break;
			}
		}
		if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
			/*
			 * the last offset we're writing to in this upl does not end on a page
			 * boundary... if it's not beyond the old EOF, then we'll also need to
			 * pre-read this page in if it isn't already valid
			 */
			upl_offset = upl_size - PAGE_SIZE;

			if ((upl_f_offset + start_offset + io_size) < oldEOF &&
			    !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
				int read_size;

				read_size = PAGE_SIZE;

				if ((upl_f_offset + upl_offset + read_size) > newEOF)
					read_size = newEOF - (upl_f_offset + upl_offset);

				retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
						    CL_READ, (struct buf *)0);
				if (retval) {
					/*
					 * we had an error during the read which causes us to abort
					 * the current cluster_write request... before we do, we
					 * need to release the rest of the pages in the upl without
					 * modifying their state and mark the failed page in error
					 */
					ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
					ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);

					KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
						     (int)upl, 0, 0, retval, 0);
					break;
				}
			}
		}
0b4e3aa0
A
1644 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1645 panic("cluster_write: ubc_upl_map failed\n");
1c79356b
A
1646 xfer_resid = io_size;
1647 io_offset = start_offset;
1648
1649 while (zero_cnt && xfer_resid) {
1650
1651 if (zero_cnt < (long long)xfer_resid)
1652 bytes_to_zero = zero_cnt;
1653 else
1654 bytes_to_zero = xfer_resid;
1655
9bccf70c 1656 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1c79356b
A
1657 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1658
1659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1660 (int)upl_f_offset + io_offset, bytes_to_zero,
9bccf70c 1661 (int)io_offset, xfer_resid, 0);
1c79356b 1662 } else {
9bccf70c
A
1663 int zero_pg_index;
1664
1c79356b 1665 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
9bccf70c
A
1666 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1667
1668 if ( !upl_valid_page(pl, zero_pg_index)) {
1669 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1670
1671 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1672 (int)upl_f_offset + io_offset, bytes_to_zero,
1673 (int)io_offset, xfer_resid, 0);
1c79356b 1674
9bccf70c
A
1675 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1676 !upl_dirty_page(pl, zero_pg_index)) {
1c79356b
A
1677 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1678
1679 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1680 (int)upl_f_offset + io_offset, bytes_to_zero,
9bccf70c 1681 (int)io_offset, xfer_resid, 0);
1c79356b
A
1682 }
1683 }
1684 xfer_resid -= bytes_to_zero;
1685 zero_cnt -= bytes_to_zero;
1686 zero_off += bytes_to_zero;
1687 io_offset += bytes_to_zero;
1688 }
1689 if (xfer_resid && uio_resid) {
1690 bytes_to_move = min(uio_resid, xfer_resid);
1691
1692 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1693 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1694
1695 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1696
9bccf70c 1697
1c79356b 1698 if (retval) {
0b4e3aa0 1699 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1c79356b 1700 panic("cluster_write: kernel_upl_unmap failed\n");
9bccf70c
A
1701
1702 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1703
1704 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1705 (int)upl, 0, 0, retval, 0);
1c79356b
A
1706 } else {
1707 uio_resid -= bytes_to_move;
1708 xfer_resid -= bytes_to_move;
1709 io_offset += bytes_to_move;
1710 }
1711 }
1712 while (xfer_resid && zero_cnt1 && retval == 0) {
1713
1714 if (zero_cnt1 < (long long)xfer_resid)
1715 bytes_to_zero = zero_cnt1;
1716 else
1717 bytes_to_zero = xfer_resid;
1718
9bccf70c 1719 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1c79356b
A
1720 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1721
1722 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1723 (int)upl_f_offset + io_offset,
9bccf70c 1724 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1c79356b 1725 } else {
9bccf70c
A
1726 int zero_pg_index;
1727
1c79356b 1728 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
9bccf70c
A
1729 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1730
1731 if ( !upl_valid_page(pl, zero_pg_index)) {
1c79356b
A
1732 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1733
1734 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1735 (int)upl_f_offset + io_offset,
9bccf70c
A
1736 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1737
1738 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1739 !upl_dirty_page(pl, zero_pg_index)) {
1740 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1741
1742 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1743 (int)upl_f_offset + io_offset,
1744 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1c79356b
A
1745 }
1746 }
1747 xfer_resid -= bytes_to_zero;
1748 zero_cnt1 -= bytes_to_zero;
1749 zero_off1 += bytes_to_zero;
1750 io_offset += bytes_to_zero;
1751 }
1752
1753 if (retval == 0) {
9bccf70c 1754 int cl_index;
1c79356b
A
1755 int can_delay;
1756
1757 io_size += start_offset;
1758
9bccf70c 1759 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1c79356b
A
1760 /*
1761 * if we're extending the file with this write
1762 * we'll zero fill the rest of the page so that
1763 * if the file gets extended again in such a way as to leave a
1764 * hole starting at this EOF, we'll have zero's in the correct spot
1765 */
1766 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1767
1768 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1769 (int)upl_f_offset + io_size,
1770 upl_size - io_size, 0, 0, 0);
1771 }
0b4e3aa0 1772 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1c79356b
A
1773 panic("cluster_write: kernel_upl_unmap failed\n");
1774
9bccf70c
A
1775 if (flags & IO_SYNC)
1776 /*
 1777 * if the IO_SYNC flag is set then we need to
1778 * bypass any clusters and immediately issue
1779 * the I/O
1780 */
1781 goto issue_io;
1c79356b 1782
9bccf70c
A
1783 if (vp->v_clen == 0)
1784 /*
1785 * no clusters currently present
1786 */
1787 goto start_new_cluster;
1c79356b 1788
9bccf70c
A
1789 /*
1790 * keep track of the overall dirty page
1791 * range we've developed
1792 * in case we have to fall back to the
1793 * VHASDIRTY method of flushing
1794 */
1795 if (vp->v_flag & VHASDIRTY)
1796 goto delay_io;
1c79356b 1797
9bccf70c 1798 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1c79356b
A
1799 /*
1800 * we have an existing cluster... see if this write will extend it nicely
1801 */
9bccf70c
A
1802 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1803 /*
1804 * the current write starts at or after the current cluster
1805 */
1806 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
1807 /*
1808 * we have a write that fits entirely
1809 * within the existing cluster limits
1810 */
9bccf70c 1811 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1c79356b 1812 /*
9bccf70c 1813 * update our idea of where the cluster ends
1c79356b 1814 */
9bccf70c
A
1815 vp->v_clusters[cl_index].last_pg = last_blkno;
1816 break;
1c79356b 1817 }
9bccf70c 1818 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
1819 /*
1820 * we have a write that starts in the middle of the current cluster
1821 * but extends beyond the cluster's limit
1822 * we'll clip the current cluster if we actually
9bccf70c 1823 * overlap with the new write
1c79356b
A
1824 * and start a new cluster with the current write
1825 */
9bccf70c
A
1826 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1827 vp->v_clusters[cl_index].last_pg = start_blkno;
1c79356b
A
1828 }
1829 /*
1830 * we also get here for the case where the current write starts
1831 * beyond the limit of the existing cluster
9bccf70c
A
1832 *
1833 * in either case, we'll check the remaining clusters before
1834 * starting a new one
1c79356b 1835 */
9bccf70c 1836 } else {
1c79356b 1837 /*
9bccf70c 1838 * the current write starts in front of the current cluster
1c79356b 1839 */
9bccf70c 1840 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1c79356b 1841 /*
9bccf70c
A
1842 * we can just merge the old cluster
1843 * with the new request and leave it
1844 * in the cache
1c79356b 1845 */
9bccf70c 1846 vp->v_clusters[cl_index].start_pg = start_blkno;
1c79356b 1847
9bccf70c
A
1848 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1849 /*
1850 * the current write completely
1851 * envelops the existing cluster
1852 */
1853 vp->v_clusters[cl_index].last_pg = last_blkno;
1854 }
1855 break;
1c79356b 1856 }
9bccf70c 1857
1c79356b 1858 /*
9bccf70c
A
1859 * if we were to combine this write with the current cluster
1860 * we would exceed the cluster size limit.... so,
1861 * let's see if there's any overlap of the new I/O with
1862 * the existing cluster...
1863 *
1c79356b 1864 */
9bccf70c 1865 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1c79356b 1866 /*
9bccf70c
A
1867 * the current write extends into the existing cluster
1868 * clip the current cluster by moving the start position
1869 * to where the current write ends
1c79356b 1870 */
9bccf70c
A
1871 vp->v_clusters[cl_index].start_pg = last_blkno;
1872 /*
1873 * if we get here, there was no way to merge
1874 * the new I/O with this cluster and
1875 * keep it under our maximum cluster length
1876 * we'll check the remaining clusters before starting a new one
1877 */
1c79356b 1878 }
9bccf70c
A
1879 }
1880 if (cl_index < vp->v_clen)
1881 /*
1882 * we found an existing cluster that we
 1883 * could merge this I/O into
1884 */
1885 goto delay_io;
1886
1887 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
1888 /*
1889 * we didn't find an existing cluster to
1890 * merge into, but there's room to start
1c79356b
A
1891 * a new one
1892 */
9bccf70c 1893 goto start_new_cluster;
1c79356b 1894
9bccf70c
A
1895 /*
 1896 * no existing cluster to merge with and no
1897 * room to start a new one... we'll try
1898 * pushing the existing ones... if none of
1899 * them are able to be pushed, we'll have
1900 * to fall back on the VHASDIRTY mechanism
1901 * cluster_try_push will set v_clen to the
1902 * number of remaining clusters if it is
1903 * unable to push all of them
1904 */
1905 if (vp->v_flag & VNOCACHE_DATA)
1906 can_delay = 0;
1907 else
1908 can_delay = 1;
1909
1910 if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
1911 vp->v_flag |= VHASDIRTY;
1912 goto delay_io;
1913 }
1914start_new_cluster:
1915 if (vp->v_clen == 0) {
1916 vp->v_ciosiz = devblocksize;
1c79356b
A
1917 vp->v_cstart = start_blkno;
1918 vp->v_lastw = last_blkno;
1c79356b 1919 }
9bccf70c
A
1920 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
1921 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
1922 vp->v_clen++;
1923delay_io:
1924 /*
1925 * make sure we keep v_cstart and v_lastw up to
1926 * date in case we have to fall back on the
 1927 * VHASDIRTY mechanism (or we've already entered it)
1928 */
1929 if (start_blkno < vp->v_cstart)
1930 vp->v_cstart = start_blkno;
1931 if (last_blkno > vp->v_lastw)
1932 vp->v_lastw = last_blkno;
1933
1934 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1935 continue;
1936issue_io:
1937 /*
1938 * in order to maintain some semblance of coherency with mapped writes
1939 * we need to write the cluster back out as a multiple of the PAGESIZE
1940 * unless the cluster encompasses the last page of the file... in this
1941 * case we'll round out to the nearest device block boundary
1942 */
1943 io_size = upl_size;
1944
1945 if ((upl_f_offset + io_size) > newEOF) {
1946 io_size = newEOF - upl_f_offset;
1947 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1c79356b 1948 }
9bccf70c 1949
0b4e3aa0 1950 if (flags & IO_SYNC)
1c79356b
A
1951 io_flags = CL_COMMIT | CL_AGE;
1952 else
1953 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
1954
1955 if (vp->v_flag & VNOCACHE_DATA)
1956 io_flags |= CL_DUMP;
1957
0b4e3aa0
A
1958 while (vp->v_numoutput >= ASYNC_THROTTLE) {
1959 vp->v_flag |= VTHROTTLED;
1960 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
1961 }
9bccf70c 1962 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
1c79356b
A
1963 io_flags, (struct buf *)0);
1964 }
1965 }
1966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1967 retval, 0, 0, 0, 0);
1968
1969 return (retval);
1970}
1971
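/*
 * Illustrative sketch, not part of the original source: the merge test
 * the delayed-write path above applies to each entry of vp->v_clusters[],
 * restated as a stand-alone predicate.  'start_blkno' and 'last_blkno'
 * are the page indices spanned by the new write; the helper name and its
 * daddr_t parameter types are assumptions made for this sketch.
 */
static int
cluster_can_absorb(struct v_cluster *cl, daddr_t start_blkno, daddr_t last_blkno)
{
	if (start_blkno >= cl->start_pg)
		/*
		 * write starts at or after the cluster... it fits only if
		 * it ends within MAX_UPL_TRANSFER pages of the cluster start
		 */
		return (last_blkno <= (cl->start_pg + MAX_UPL_TRANSFER));
	/*
	 * write starts in front of the cluster... it fits only if the
	 * combined span stays within MAX_UPL_TRANSFER pages
	 */
	return ((cl->last_pg - start_blkno) <= MAX_UPL_TRANSFER);
}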
9bccf70c 1972int
1c79356b
A
1973cluster_read(vp, uio, filesize, devblocksize, flags)
1974 struct vnode *vp;
1975 struct uio *uio;
1976 off_t filesize;
1977 int devblocksize;
1978 int flags;
1979{
1c79356b
A
1980 int prev_resid;
1981 int clip_size;
1982 off_t max_io_size;
1983 struct iovec *iov;
0b4e3aa0
A
1984 vm_offset_t upl_offset;
1985 int upl_size;
1986 int pages_in_pl;
1987 upl_page_info_t *pl;
1988 int upl_flags;
1989 upl_t upl;
1c79356b
A
1990 int retval = 0;
1991
1c79356b
A
1992 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
1993 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
1994
1995 /*
 1996 * We set a threshold of 4 pages to decide if the nocopy
1997 * read loop is worth the trouble...
1998 */
1999
0b4e3aa0 2000 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1c79356b 2001 {
0b4e3aa0 2002 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2003 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2004 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2005 return(retval);
1c79356b
A
2006 }
2007
2008 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2009 {
2010 /* we know we have a resid, so this is safe */
2011 iov = uio->uio_iov;
2012 while (iov->iov_len == 0) {
2013 uio->uio_iov++;
2014 uio->uio_iovcnt--;
2015 iov = uio->uio_iov;
2016 }
2017
0b4e3aa0
A
2018 /*
2019 * We check every vector target and if it is physically
2020 * contiguous space, we skip the sanity checks.
2021 */
2022
2023 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2024 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2025 pages_in_pl = 0;
2026 upl_flags = UPL_QUERY_OBJECT_TYPE;
2027 if((vm_map_get_upl(current_map(),
2028 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2029 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2030 {
2031 /*
2032 * the user app must have passed in an invalid address
2033 */
2034 return (EFAULT);
2035 }
2036
2037 if (upl_flags & UPL_PHYS_CONTIG)
2038 {
2039 retval = cluster_phys_read(vp, uio, filesize);
2040 }
2041 else if (uio->uio_resid < 4 * PAGE_SIZE)
2042 {
2043 /*
 2044 * We set a threshold of 4 pages to decide if the nocopy
2045 * read loop is worth the trouble...
2046 */
2047 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2048 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2049 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2050 return(retval);
2051 }
2052 else if (uio->uio_offset & PAGE_MASK_64)
1c79356b
A
2053 {
2054 /* Bring the file offset read up to a pagesize boundary */
2055 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2056 if (uio->uio_resid < clip_size)
2057 clip_size = uio->uio_resid;
2058 /*
2059 * Fake the resid going into the cluster_read_x call
2060 * and restore it on the way out.
2061 */
2062 prev_resid = uio->uio_resid;
2063 uio->uio_resid = clip_size;
0b4e3aa0 2064 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2065 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2066 }
2067 else if ((int)iov->iov_base & PAGE_MASK_64)
2068 {
2069 clip_size = iov->iov_len;
2070 prev_resid = uio->uio_resid;
2071 uio->uio_resid = clip_size;
0b4e3aa0 2072 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2073 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2074 }
2075 else
2076 {
2077 /*
2078 * If we come in here, we know the offset into
2079 * the file is on a pagesize boundary
2080 */
2081
2082 max_io_size = filesize - uio->uio_offset;
2083 clip_size = uio->uio_resid;
2084 if (iov->iov_len < clip_size)
2085 clip_size = iov->iov_len;
2086 if (max_io_size < clip_size)
2087 clip_size = (int)max_io_size;
2088
2089 if (clip_size < PAGE_SIZE)
2090 {
2091 /*
2092 * Take care of the tail end of the read in this vector.
2093 */
2094 prev_resid = uio->uio_resid;
2095 uio->uio_resid = clip_size;
0b4e3aa0 2096 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2097 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2098 }
2099 else
2100 {
2101 /* round clip_size down to a multiple of pagesize */
2102 clip_size = clip_size & ~(PAGE_MASK);
2103 prev_resid = uio->uio_resid;
2104 uio->uio_resid = clip_size;
0b4e3aa0 2105 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
1c79356b 2106 if ((retval==0) && uio->uio_resid)
0b4e3aa0 2107 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2108 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2109 }
2110 } /* end else */
2111 } /* end while */
2112
2113 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2114 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2115
2116 return(retval);
2117}
2118
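/*
 * Illustrative sketch, not part of the original source: a rough predicate
 * for when the dispatch loop above reaches the cluster_nocopy_read branch
 * for a VNOCACHE_DATA, UIO_USERSPACE request that is not physically
 * contiguous.  The small-tail and end-of-file clipping cases are still
 * handled by cluster_read_x; the helper name is an assumption.
 */
static int
cluster_read_wants_nocopy(struct uio *uio, struct iovec *iov)
{
	if (uio->uio_resid < 4 * PAGE_SIZE)
		return (0);	/* under the 4 page threshold... not worth the setup */
	if (uio->uio_offset & PAGE_MASK_64)
		return (0);	/* file offset not yet on a pagesize boundary */
	if ((int)iov->iov_base & PAGE_MASK_64)
		return (0);	/* user buffer not page aligned */
	return (1);		/* aligned on both ends and big enough */
}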
9bccf70c 2119static int
0b4e3aa0 2120cluster_read_x(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2121 struct vnode *vp;
2122 struct uio *uio;
2123 off_t filesize;
2124 int devblocksize;
2125 int flags;
2126{
2127 upl_page_info_t *pl;
2128 upl_t upl;
2129 vm_offset_t upl_offset;
2130 int upl_size;
2131 off_t upl_f_offset;
2132 int start_offset;
2133 int start_pg;
2134 int last_pg;
2135 int uio_last;
2136 int pages_in_upl;
2137 off_t max_size;
2138 int io_size;
2139 vm_offset_t io_address;
2140 kern_return_t kret;
2141 int segflg;
2142 int error = 0;
2143 int retval = 0;
2144 int b_lblkno;
2145 int e_lblkno;
2146
2147 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2148
2149 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2150 /*
2151 * compute the size of the upl needed to encompass
2152 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2153 * to the maximum UPL size... cluster_io will clip if
2154 * this exceeds the maximum io_size for the device,
2155 * make sure to account for
1c79356b
A
2156 * a starting offset that's not page aligned
2157 */
2158 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2159 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2160 max_size = filesize - uio->uio_offset;
2161
0b4e3aa0 2162 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
1c79356b
A
2163 io_size = uio->uio_resid;
2164 else
2165 io_size = max_size;
9bccf70c 2166
1c79356b
A
2167 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2168 segflg = uio->uio_segflg;
2169
2170 uio->uio_segflg = UIO_PHYS_USERSPACE;
2171
2172 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2173 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2174
2175 while (io_size && retval == 0) {
2176 int xsize;
2177 vm_offset_t paddr;
2178
0b4e3aa0
A
2179 if (ubc_page_op(vp,
2180 upl_f_offset,
2181 UPL_POP_SET | UPL_POP_BUSY,
2182 &paddr, 0) != KERN_SUCCESS)
1c79356b
A
2183 break;
2184
2185 xsize = PAGE_SIZE - start_offset;
2186
2187 if (xsize > io_size)
2188 xsize = io_size;
2189
2190 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
2191
0b4e3aa0
A
2192 ubc_page_op(vp, upl_f_offset,
2193 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
1c79356b
A
2194
2195 io_size -= xsize;
2196 start_offset = (int)
2197 (uio->uio_offset & PAGE_MASK_64);
2198 upl_f_offset = uio->uio_offset - start_offset;
2199 }
2200 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2201 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2202
2203 uio->uio_segflg = segflg;
2204
2205 if (retval)
2206 break;
2207
2208 if (io_size == 0) {
2209 /*
2210 * we're already finished with this read request
2211 * let's see if we should do a read-ahead
2212 */
2213 e_lblkno = (int)
2214 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2215
2216 if (!(vp->v_flag & VRAOFF))
2217 /*
2218 * let's try to read ahead if we're in
2219 * a sequential access pattern
2220 */
0b4e3aa0 2221 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b
A
2222 vp->v_lastr = e_lblkno;
2223
2224 break;
2225 }
2226 max_size = filesize - uio->uio_offset;
2227 }
1c79356b 2228 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
2229 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2230 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
2231 pages_in_upl = upl_size / PAGE_SIZE;
2232
2233 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
9bccf70c 2234 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 2235
0b4e3aa0
A
2236 kret = ubc_create_upl(vp,
2237 upl_f_offset,
2238 upl_size,
2239 &upl,
2240 &pl,
2241 UPL_FLAGS_NONE);
1c79356b
A
2242 if (kret != KERN_SUCCESS)
2243 panic("cluster_read: failed to get pagelist");
2244
1c79356b 2245 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
9bccf70c 2246 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
2247
2248 /*
2249 * scan from the beginning of the upl looking for the first
2250 * non-valid page.... this will become the first page in
2251 * the request we're going to make to 'cluster_io'... if all
2252 * of the pages are valid, we won't call through to 'cluster_io'
2253 */
2254 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2255 if (!upl_valid_page(pl, start_pg))
2256 break;
2257 }
2258
2259 /*
2260 * scan from the starting invalid page looking for a valid
2261 * page before the end of the upl is reached, if we
2262 * find one, then it will be the last page of the request to
2263 * 'cluster_io'
2264 */
2265 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2266 if (upl_valid_page(pl, last_pg))
2267 break;
2268 }
2269
2270 if (start_pg < last_pg) {
2271 /*
2272 * we found a range of 'invalid' pages that must be filled
2273 * if the last page in this range is the last page of the file
2274 * we may have to clip the size of it to keep from reading past
2275 * the end of the last physical block associated with the file
2276 */
2277 upl_offset = start_pg * PAGE_SIZE;
2278 io_size = (last_pg - start_pg) * PAGE_SIZE;
2279
9bccf70c 2280 if ((upl_f_offset + upl_offset + io_size) > filesize)
1c79356b 2281 io_size = filesize - (upl_f_offset + upl_offset);
9bccf70c 2282
1c79356b
A
2283 /*
2284 * issue a synchronous read to cluster_io
2285 */
2286
2287 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
9bccf70c 2288 io_size, devblocksize, CL_READ, (struct buf *)0);
1c79356b
A
2289 }
2290 if (error == 0) {
2291 /*
2292 * if the read completed successfully, or there was no I/O request
 2293 * issued, then map the upl into kernel address space and
2294 * move the data into user land.... we'll first add on any 'valid'
2295 * pages that were present in the upl when we acquired it.
2296 */
2297 u_int val_size;
2298 u_int size_of_prefetch;
2299
2300 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2301 if (!upl_valid_page(pl, uio_last))
2302 break;
2303 }
2304 /*
2305 * compute size to transfer this round, if uio->uio_resid is
2306 * still non-zero after this uiomove, we'll loop around and
2307 * set up for another I/O.
2308 */
2309 val_size = (uio_last * PAGE_SIZE) - start_offset;
2310
2311 if (max_size < val_size)
2312 val_size = max_size;
2313
2314 if (uio->uio_resid < val_size)
2315 val_size = uio->uio_resid;
2316
2317 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2318
2319 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2320 /*
2321 * if there's still I/O left to do for this request, then issue a
2322 * pre-fetch I/O... the I/O wait time will overlap
2323 * with the copying of the data
2324 */
0b4e3aa0 2325 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
1c79356b
A
2326 } else {
2327 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2328 /*
2329 * let's try to read ahead if we're in
2330 * a sequential access pattern
2331 */
0b4e3aa0 2332 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b
A
2333 vp->v_lastr = e_lblkno;
2334 }
1c79356b
A
2335 if (uio->uio_segflg == UIO_USERSPACE) {
2336 int offset;
2337
2338 segflg = uio->uio_segflg;
2339
2340 uio->uio_segflg = UIO_PHYS_USERSPACE;
2341
2342
2343 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2344 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2345
2346 offset = start_offset;
2347
2348 while (val_size && retval == 0) {
2349 int csize;
2350 int i;
2351 caddr_t paddr;
2352
2353 i = offset / PAGE_SIZE;
2354 csize = min(PAGE_SIZE - start_offset, val_size);
2355
2356 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
2357
2358 retval = uiomove(paddr, csize, uio);
2359
2360 val_size -= csize;
2361 offset += csize;
2362 start_offset = offset & PAGE_MASK;
2363 }
2364 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2365 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2366
2367 uio->uio_segflg = segflg;
9bccf70c
A
2368 }
2369 else
1c79356b 2370 {
0b4e3aa0
A
2371 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2372 panic("cluster_read: ubc_upl_map() failed\n");
1c79356b
A
2373
2374 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2375
0b4e3aa0
A
2376 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2377 panic("cluster_read: ubc_upl_unmap() failed\n");
1c79356b
A
2378 }
2379 }
2380 if (start_pg < last_pg) {
2381 /*
2382 * compute the range of pages that we actually issued an I/O for
2383 * and either commit them as valid if the I/O succeeded
2384 * or abort them if the I/O failed
2385 */
2386 io_size = (last_pg - start_pg) * PAGE_SIZE;
2387
2388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2389 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2390
2391 if (error || (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0 2392 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
1c79356b
A
2393 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2394 else
0b4e3aa0
A
2395 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2396 UPL_COMMIT_CLEAR_DIRTY
1c79356b 2397 | UPL_COMMIT_FREE_ON_EMPTY
0b4e3aa0 2398 | UPL_COMMIT_INACTIVATE);
1c79356b
A
2399
2400 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2401 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2402 }
2403 if ((last_pg - start_pg) < pages_in_upl) {
2404 int cur_pg;
2405 int commit_flags;
2406
2407 /*
2408 * the set of pages that we issued an I/O for did not encompass
2409 * the entire upl... so just release these without modifying
 2410 * their state
2411 */
2412 if (error)
9bccf70c 2413 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2414 else {
0b4e3aa0 2415 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2416 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
1c79356b 2417
0b4e3aa0
A
2418 if (start_pg) {
2419 /*
2420 * we found some already valid pages at the beginning of
2421 * the upl commit these back to the inactive list with
2422 * reference cleared
2423 */
2424 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2425 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2426 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2427
2428 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2429 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2430
2431 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2432 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2433 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2434 else
0b4e3aa0
A
2435 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2436 PAGE_SIZE, commit_flags);
1c79356b
A
2437 }
2438 }
2439 if (last_pg < uio_last) {
0b4e3aa0
A
2440 /*
2441 * we found some already valid pages immediately after the
2442 * pages we issued I/O for, commit these back to the
2443 * inactive list with reference cleared
2444 */
2445 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2446 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2447 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2448
2449 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2450 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2451
2452 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2453 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2454 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2455 else
0b4e3aa0
A
2456 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2457 PAGE_SIZE, commit_flags);
1c79356b
A
2458 }
2459 }
2460 if (uio_last < pages_in_upl) {
0b4e3aa0
A
2461 /*
2462 * there were some invalid pages beyond the valid pages
2463 * that we didn't issue an I/O for, just release them
2464 * unchanged
1c79356b 2465 */
9bccf70c
A
2466 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2467 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2468 }
2469
0b4e3aa0 2470 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2471 (int)upl, -1, -1, 0, 0);
1c79356b
A
2472 }
2473 }
2474 if (retval == 0)
2475 retval = error;
2476 }
2477
2478 return (retval);
2479}
2480
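/*
 * Illustrative sketch, not part of the original source: the UPL scan
 * cluster_read_x performs to find the run of not-yet-valid pages that
 * actually needs an I/O.  start_pg/last_pg have the same meaning as in
 * the routine above; the helper name is an assumption.
 */
static void
cluster_find_invalid_run(upl_page_info_t *pl, int pages_in_upl, int *start_pg, int *last_pg)
{
	int pg;

	for (pg = 0; pg < pages_in_upl; pg++)
		/* the first page missing from the cache starts the request */
		if (!upl_valid_page(pl, pg))
			break;
	*start_pg = pg;

	for ( ; pg < pages_in_upl; pg++)
		/* the run ends at the next valid page (or the end of the upl) */
		if (upl_valid_page(pl, pg))
			break;
	*last_pg = pg;
}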
9bccf70c 2481static int
0b4e3aa0 2482cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2483 struct vnode *vp;
2484 struct uio *uio;
2485 off_t filesize;
2486 int devblocksize;
2487 int flags;
2488{
2489 upl_t upl;
2490 upl_page_info_t *pl;
2491 off_t upl_f_offset;
2492 vm_offset_t upl_offset;
2493 off_t start_upl_f_offset;
2494 off_t max_io_size;
2495 int io_size;
2496 int upl_size;
2497 int upl_needed_size;
2498 int pages_in_pl;
2499 vm_offset_t paddr;
2500 int upl_flags;
2501 kern_return_t kret;
2502 int segflg;
2503 struct iovec *iov;
2504 int i;
2505 int force_data_sync;
2506 int error = 0;
2507 int retval = 0;
2508
2509 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2510 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2511
2512 /*
2513 * When we enter this routine, we know
2514 * -- the offset into the file is on a pagesize boundary
2515 * -- the resid is a page multiple
2516 * -- the resid will not exceed iov_len
2517 */
2518
2519 iov = uio->uio_iov;
2520 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2521
0b4e3aa0
A
2522 max_io_size = filesize - uio->uio_offset;
2523
2524 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2525 io_size = max_io_size;
2526 else
2527 io_size = uio->uio_resid;
1c79356b
A
2528
2529 /*
2530 * We don't come into this routine unless
2531 * UIO_USERSPACE is set.
2532 */
2533 segflg = uio->uio_segflg;
2534
2535 uio->uio_segflg = UIO_PHYS_USERSPACE;
2536
2537 /*
2538 * First look for pages already in the cache
2539 * and move them to user space.
2540 */
0b4e3aa0 2541 while (io_size && (retval == 0)) {
1c79356b
A
2542 upl_f_offset = uio->uio_offset;
2543
2544 /*
2545 * If this call fails, it means the page is not
2546 * in the page cache.
2547 */
0b4e3aa0
A
2548 if (ubc_page_op(vp, upl_f_offset,
2549 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
1c79356b
A
2550 break;
2551
2552 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
2553
0b4e3aa0
A
2554 ubc_page_op(vp, upl_f_offset,
2555 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
1c79356b
A
2556
2557 io_size -= PAGE_SIZE;
2558 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
0b4e3aa0 2559 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
1c79356b
A
2560 }
2561
2562 uio->uio_segflg = segflg;
2563
2564 if (retval)
2565 {
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2567 (int)uio->uio_offset, uio->uio_resid, 2, retval, 0);
2568 return(retval);
2569 }
2570
2571 /* If we are already finished with this read, then return */
2572 if (io_size == 0)
2573 {
2574
2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2576 (int)uio->uio_offset, uio->uio_resid, 3, io_size, 0);
2577 return(0);
2578 }
2579
2580 max_io_size = io_size;
0b4e3aa0
A
2581 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2582 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
2583
2584 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2585 upl_f_offset = start_upl_f_offset;
2586 io_size = 0;
2587
2588 while(io_size < max_io_size)
2589 {
2590
0b4e3aa0
A
2591 if(ubc_page_op(vp, upl_f_offset,
2592 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS)
1c79356b 2593 {
0b4e3aa0
A
2594 ubc_page_op(vp, upl_f_offset,
2595 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2596 break;
1c79356b
A
2597 }
2598
0b4e3aa0
A
2599 /*
2600 * Build up the io request parameters.
2601 */
1c79356b 2602
0b4e3aa0
A
2603 io_size += PAGE_SIZE;
2604 upl_f_offset += PAGE_SIZE;
2605 }
1c79356b 2606
0b4e3aa0
A
2607 if (io_size == 0)
2608 return(retval);
1c79356b
A
2609
2610 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2611 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
2612
2613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
9bccf70c 2614 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1c79356b
A
2615
2616 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++)
2617 {
2618 pages_in_pl = 0;
2619 upl_size = upl_needed_size;
9bccf70c 2620 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1c79356b
A
2621
2622 kret = vm_map_get_upl(current_map(),
2623 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
0b4e3aa0 2624 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
1c79356b
A
2625
2626 if (kret != KERN_SUCCESS)
2627 {
2628 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2629 (int)upl_offset, upl_size, io_size, kret, 0);
2630
2631 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2632 (int)uio->uio_offset, uio->uio_resid, 4, retval, 0);
2633
2634 /* cluster_nocopy_read: failed to get pagelist */
2635 /* do not return kret here */
2636 return(retval);
2637 }
2638
0b4e3aa0
A
2639 pages_in_pl = upl_size / PAGE_SIZE;
2640 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
2641
1c79356b
A
2642 for(i=0; i < pages_in_pl; i++)
2643 {
2644 if (!upl_valid_page(pl, i))
2645 break;
2646 }
2647 if (i == pages_in_pl)
2648 break;
2649
0b4e3aa0
A
2650 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2651 UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2652 }
2653
2654 if (force_data_sync >= 3)
2655 {
2656 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2657 (int)upl_offset, upl_size, io_size, kret, 0);
2658
2659 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2660 (int)uio->uio_offset, uio->uio_resid, 5, retval, 0);
2661 return(retval);
2662 }
2663 /*
2664 * Consider the possibility that upl_size wasn't satisfied.
2665 */
2666 if (upl_size != upl_needed_size)
2667 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
2668
2669 if (io_size == 0)
2670 {
0b4e3aa0 2671 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1c79356b
A
2672 UPL_ABORT_FREE_ON_EMPTY);
2673 return(retval);
2674 }
2675
2676 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2677 (int)upl_offset, upl_size, io_size, kret, 0);
2678
2679 /*
2680 * issue a synchronous read to cluster_io
2681 */
2682
2683 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
9bccf70c 2684 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
1c79356b
A
2685
2686 error = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
9bccf70c 2687 io_size, devblocksize, CL_READ| CL_NOZERO, (struct buf *)0);
1c79356b
A
2688
2689 if (error == 0) {
2690 /*
2691 * The cluster_io read completed successfully,
2692 * update the uio structure and commit.
2693 */
2694
0b4e3aa0
A
2695 ubc_upl_commit_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2696 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b
A
2697
2698 iov->iov_base += io_size;
2699 iov->iov_len -= io_size;
2700 uio->uio_resid -= io_size;
2701 uio->uio_offset += io_size;
2702 }
2703 else {
0b4e3aa0 2704 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1c79356b
A
2705 UPL_ABORT_FREE_ON_EMPTY);
2706 }
2707
2708 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
9bccf70c 2709 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1c79356b
A
2710
2711 if (retval == 0)
2712 retval = error;
2713
2714 } /* end while */
2715
2716
2717 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2718 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2719
2720 return (retval);
2721}
2722
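/*
 * Illustrative sketch, not part of the original source: the validity
 * check that drives the force_data_sync retry loop in cluster_nocopy_read...
 * the UPL returned by vm_map_get_upl is only usable for a direct read if
 * every page it covers came back valid.  The helper name is an assumption.
 */
static int
cluster_upl_all_valid(upl_page_info_t *pl, int pages_in_pl)
{
	int i;

	for (i = 0; i < pages_in_pl; i++)
		if (!upl_valid_page(pl, i))
			return (0);
	return (1);
}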
2723
9bccf70c 2724static int
0b4e3aa0
A
2725cluster_phys_read(vp, uio, filesize)
2726 struct vnode *vp;
2727 struct uio *uio;
2728 off_t filesize;
2729{
2730 upl_t upl;
2731 vm_offset_t upl_offset;
2732 off_t max_size;
2733 int io_size;
2734 int upl_size;
2735 int upl_needed_size;
2736 int pages_in_pl;
2737 int upl_flags;
2738 kern_return_t kret;
2739 struct iovec *iov;
2740 int error;
2741
2742 /*
2743 * When we enter this routine, we know
2744 * -- the resid will not exceed iov_len
2745 * -- the target address is physically contiguous
2746 */
2747
2748 iov = uio->uio_iov;
2749
2750 max_size = filesize - uio->uio_offset;
2751
2752 if (max_size < (off_t)((unsigned int)iov->iov_len))
2753 io_size = max_size;
2754 else
2755 io_size = iov->iov_len;
2756
2757 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2758 upl_needed_size = upl_offset + io_size;
2759
2760 pages_in_pl = 0;
2761 upl_size = upl_needed_size;
9bccf70c 2762 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
0b4e3aa0
A
2763
2764 kret = vm_map_get_upl(current_map(),
2765 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2766 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2767
2768 if (kret != KERN_SUCCESS)
2769 {
2770 /* cluster_phys_read: failed to get pagelist */
2771 return(EINVAL);
2772 }
2773
2774 /*
2775 * Consider the possibility that upl_size wasn't satisfied.
2776 */
2777 if (upl_size < upl_needed_size)
2778 {
2779 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2780 return(EINVAL);
2781 }
2782
2783 /*
2784 * issue a synchronous read to cluster_io
2785 */
2786
2787 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
9bccf70c 2788 io_size, 0, CL_READ| CL_NOZERO | CL_DEV_MEMORY, (struct buf *)0);
0b4e3aa0
A
2789
2790 if (error == 0)
2791 {
2792 /*
2793 * The cluster_io read completed successfully,
2794 * update the uio structure and commit.
2795 */
2796
2797 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_FREE_ON_EMPTY);
2798
2799 iov->iov_base += io_size;
2800 iov->iov_len -= io_size;
2801 uio->uio_resid -= io_size;
2802 uio->uio_offset += io_size;
2803 }
2804 else
2805 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2806
2807 return (error);
2808}
1c79356b
A
2809
2810/*
2811 * generate advisory I/O's in the largest chunks possible
2812 * the completed pages will be released into the VM cache
2813 */
9bccf70c 2814int
1c79356b
A
2815advisory_read(vp, filesize, f_offset, resid, devblocksize)
2816 struct vnode *vp;
2817 off_t filesize;
2818 off_t f_offset;
2819 int resid;
2820 int devblocksize;
2821{
1c79356b
A
2822 upl_page_info_t *pl;
2823 upl_t upl;
2824 vm_offset_t upl_offset;
2825 int upl_size;
2826 off_t upl_f_offset;
2827 int start_offset;
2828 int start_pg;
2829 int last_pg;
2830 int pages_in_upl;
2831 off_t max_size;
2832 int io_size;
2833 kern_return_t kret;
2834 int retval = 0;
9bccf70c 2835 int issued_io;
1c79356b
A
2836
2837 if (!UBCINFOEXISTS(vp))
2838 return(EINVAL);
2839
1c79356b
A
2840 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
2841 (int)f_offset, resid, (int)filesize, devblocksize, 0);
2842
2843 while (resid && f_offset < filesize && retval == 0) {
2844 /*
2845 * compute the size of the upl needed to encompass
2846 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2847 * to the maximum UPL size... cluster_io will clip if
2848 * this exceeds the maximum io_size for the device,
2849 * make sure to account for
1c79356b
A
2850 * a starting offset that's not page aligned
2851 */
2852 start_offset = (int)(f_offset & PAGE_MASK_64);
2853 upl_f_offset = f_offset - (off_t)start_offset;
2854 max_size = filesize - f_offset;
2855
2856 if (resid < max_size)
2857 io_size = resid;
2858 else
2859 io_size = max_size;
2860
2861 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
2862 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2863 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
2864 pages_in_upl = upl_size / PAGE_SIZE;
2865
0b4e3aa0
A
2866 kret = ubc_create_upl(vp,
2867 upl_f_offset,
2868 upl_size,
2869 &upl,
2870 &pl,
9bccf70c 2871 UPL_RET_ONLY_ABSENT);
1c79356b 2872 if (kret != KERN_SUCCESS)
9bccf70c
A
2873 return(retval);
2874 issued_io = 0;
1c79356b
A
2875
2876 /*
9bccf70c
A
2877 * before we start marching forward, we must make sure we end on
2878 * a present page, otherwise we will be working with a freed
2879 * upl
1c79356b 2880 */
9bccf70c
A
2881 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
2882 if (upl_page_present(pl, last_pg))
2883 break;
1c79356b 2884 }
9bccf70c 2885 pages_in_upl = last_pg + 1;
1c79356b 2886
1c79356b 2887
9bccf70c
A
2888 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
2889 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
2890
2891
2892 for (last_pg = 0; last_pg < pages_in_upl; ) {
1c79356b 2893 /*
9bccf70c
A
2894 * scan from the beginning of the upl looking for the first
2895 * page that is present.... this will become the first page in
2896 * the request we're going to make to 'cluster_io'... if all
2897 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 2898 */
9bccf70c
A
2899 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
2900 if (upl_page_present(pl, start_pg))
2901 break;
1c79356b 2902 }
1c79356b 2903
1c79356b 2904 /*
9bccf70c
A
2905 * scan from the starting present page looking for an absent
2906 * page before the end of the upl is reached, if we
2907 * find one, then it will terminate the range of pages being
2908 * presented to 'cluster_io'
1c79356b 2909 */
9bccf70c
A
2910 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2911 if (!upl_page_present(pl, last_pg))
2912 break;
2913 }
2914
2915 if (last_pg > start_pg) {
2916 /*
2917 * we found a range of pages that must be filled
2918 * if the last page in this range is the last page of the file
2919 * we may have to clip the size of it to keep from reading past
2920 * the end of the last physical block associated with the file
2921 */
2922 upl_offset = start_pg * PAGE_SIZE;
2923 io_size = (last_pg - start_pg) * PAGE_SIZE;
2924
2925 if ((upl_f_offset + upl_offset + io_size) > filesize)
2926 io_size = filesize - (upl_f_offset + upl_offset);
2927
2928 /*
2929 * issue an asynchronous read to cluster_io
2930 */
2931 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
2932 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0);
1c79356b 2933
9bccf70c
A
2934 issued_io = 1;
2935 }
1c79356b 2936 }
9bccf70c
A
2937 if (issued_io == 0)
2938 ubc_upl_abort(upl, 0);
2939
2940 io_size = upl_size - start_offset;
1c79356b
A
2941
2942 if (io_size > resid)
2943 io_size = resid;
2944 f_offset += io_size;
2945 resid -= io_size;
2946 }
9bccf70c 2947
1c79356b
A
2948 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
2949 (int)f_offset, resid, retval, 0, 0);
2950
2951 return(retval);
2952}
2953
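/*
 * Illustrative sketch, not part of the original source: the per-pass UPL
 * sizing arithmetic used by advisory_read above... each pass covers the
 * page-aligned span of what remains of the request, capped at
 * MAX_UPL_TRANSFER pages.  The helper name is an assumption.
 */
static int
advisory_upl_size(off_t f_offset, off_t filesize, int resid)
{
	int start_offset = (int)(f_offset & PAGE_MASK_64);
	off_t max_size = filesize - f_offset;
	int io_size = (resid < max_size) ? resid : (int)max_size;
	int upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
	return (upl_size);
}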
2954
9bccf70c 2955int
1c79356b
A
2956cluster_push(vp)
2957 struct vnode *vp;
9bccf70c
A
2958{
2959 int retval;
2960
2961 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
2962 vp->v_flag &= ~VHASDIRTY;
2963 return(0);
2964 }
2965
2966 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
2967 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
2968
2969 if (vp->v_flag & VHASDIRTY) {
2970 daddr_t start_pg;
2971 daddr_t last_pg;
2972 daddr_t end_pg;
2973
2974 start_pg = vp->v_cstart;
2975 end_pg = vp->v_lastw;
2976
2977 vp->v_flag &= ~VHASDIRTY;
2978 vp->v_clen = 0;
2979
2980 while (start_pg < end_pg) {
2981 last_pg = start_pg + MAX_UPL_TRANSFER;
2982
2983 if (last_pg > end_pg)
2984 last_pg = end_pg;
2985
2986 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
2987
2988 start_pg = last_pg;
2989 }
2990 return (1);
2991 }
2992 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
2993
2994 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
2995 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
2996
2997 return (retval);
2998}
2999
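/*
 * Illustrative usage sketch, not part of the original source: a caller
 * (for example a filesystem sync path) that wants any delayed cluster
 * writes pushed can simply call cluster_push() on the vnode; the routine
 * is effectively a no-op when no clusters are pending.  The function
 * below is hypothetical and exists only to show the call.
 */
static void
example_flush_delayed_writes(struct vnode *vp)
{
	if (UBCINFOEXISTS(vp))
		(void) cluster_push(vp);	/* push any delayed clusters */
}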
3000
3001static int
3002cluster_try_push(vp, EOF, can_delay, push_all)
3003 struct vnode *vp;
3004 off_t EOF;
3005 int can_delay;
3006 int push_all;
3007{
3008 int cl_index;
3009 int cl_index1;
3010 int min_index;
3011 int cl_len;
3012 int cl_total;
3013 int cl_pushed;
3014 struct v_cluster l_clusters[MAX_CLUSTERS];
3015
3016 /*
3017 * make a local 'sorted' copy of the clusters
3018 * and clear vp->v_clen so that new clusters can
3019 * be developed
3020 */
3021 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3022 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3023 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3024 continue;
3025 if (min_index == -1)
3026 min_index = cl_index1;
3027 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3028 min_index = cl_index1;
3029 }
3030 if (min_index == -1)
3031 break;
3032 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3033 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3034
3035 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3036 }
3037 cl_len = cl_index;
3038 vp->v_clen = 0;
3039
3040 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3041 /*
3042 * try to push each cluster in turn... cluster_push_x may not
3043 * push the cluster if can_delay is TRUE and the cluster doesn't
 3044 * meet the criteria for an immediate push
3045 */
3046 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3047 l_clusters[cl_index].start_pg = 0;
3048 l_clusters[cl_index].last_pg = 0;
3049
3050 cl_pushed++;
3051
3052 if (push_all == 0)
3053 break;
3054 }
3055 }
3056 if (cl_len > cl_pushed) {
3057 /*
3058 * we didn't push all of the clusters, so
3059 * lets try to merge them back in to the vnode
3060 */
3061 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3062 /*
3063 * we picked up some new clusters while we were trying to
3064 * push the old ones (I don't think this can happen because
3065 * I'm holding the lock, but just in case)... the sum of the
3066 * leftovers plus the new cluster count exceeds our ability
3067 * to represent them, so fall back to the VHASDIRTY mechanism
3068 */
3069 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3070 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3071 continue;
3072
3073 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3074 vp->v_cstart = l_clusters[cl_index].start_pg;
3075 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3076 vp->v_lastw = l_clusters[cl_index].last_pg;
3077 }
3078 vp->v_flag |= VHASDIRTY;
3079 } else {
3080 /*
3081 * we've got room to merge the leftovers back in
3082 * just append them starting at the next 'hole'
3083 * represented by vp->v_clen
3084 */
3085 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3086 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3087 continue;
3088
3089 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3090 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3091
3092 if (cl_index1 == 0) {
3093 vp->v_cstart = l_clusters[cl_index].start_pg;
3094 vp->v_lastw = l_clusters[cl_index].last_pg;
3095 } else {
3096 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3097 vp->v_cstart = l_clusters[cl_index].start_pg;
3098 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3099 vp->v_lastw = l_clusters[cl_index].last_pg;
3100 }
3101 cl_index1++;
3102 }
3103 /*
3104 * update the cluster count
3105 */
3106 vp->v_clen = cl_index1;
3107 }
3108 }
3109 return(MAX_CLUSTERS - vp->v_clen);
3110}
3111
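/*
 * Illustrative sketch, not part of the original source: the selection
 * step cluster_try_push uses to build its locally sorted copy... pick the
 * non-empty cluster with the lowest starting page, or -1 if none remain.
 * The helper name is an assumption.
 */
static int
cluster_pick_lowest(struct vnode *vp)
{
	int cl_index1;
	int min_index = -1;

	for (cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
		if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
			continue;	/* an empty (already consumed) slot */
		if (min_index == -1 ||
		    vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
			min_index = cl_index1;
	}
	return (min_index);
}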
3112
3113
3114static int
3115cluster_push_x(vp, EOF, first, last, can_delay)
3116 struct vnode *vp;
3117 off_t EOF;
3118 daddr_t first;
3119 daddr_t last;
3120 int can_delay;
1c79356b 3121{
1c79356b
A
3122 upl_page_info_t *pl;
3123 upl_t upl;
3124 vm_offset_t upl_offset;
3125 int upl_size;
3126 off_t upl_f_offset;
3127 int pages_in_upl;
3128 int start_pg;
3129 int last_pg;
3130 int io_size;
3131 int io_flags;
3132 int size;
3133 kern_return_t kret;
3134
3135
9bccf70c
A
3136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3137 vp->v_clen, first, last, EOF, 0);
3138
3139 if ((pages_in_upl = last - first) == 0) {
3140 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 3141
9bccf70c
A
3142 return (1);
3143 }
1c79356b 3144 upl_size = pages_in_upl * PAGE_SIZE;
9bccf70c 3145 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
1c79356b 3146
9bccf70c
A
3147 if (upl_f_offset + upl_size >= EOF) {
3148
3149 if (upl_f_offset >= EOF) {
3150 /*
3151 * must have truncated the file and missed
3152 * clearing a dangling cluster (i.e. it's completely
 3153 * beyond the new EOF)
3154 */
3155 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3156
3157 return(1);
3158 }
3159 size = EOF - upl_f_offset;
1c79356b 3160
9bccf70c
A
3161 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3162 pages_in_upl = upl_size / PAGE_SIZE;
3163 } else {
3164 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3165 return(0);
3166 size = upl_size;
3167 }
0b4e3aa0
A
3168 kret = ubc_create_upl(vp,
3169 upl_f_offset,
3170 upl_size,
3171 &upl,
9bccf70c
A
3172 &pl,
3173 UPL_RET_ONLY_DIRTY);
1c79356b
A
3174 if (kret != KERN_SUCCESS)
3175 panic("cluster_push: failed to get pagelist");
3176
9bccf70c
A
3177 if (can_delay) {
3178 int num_of_dirty;
3179
3180 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3181 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3182 num_of_dirty++;
3183 }
3184 if (num_of_dirty < pages_in_upl / 2) {
3185 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3186
3187 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3188
3189 return(0);
3190 }
3191 }
1c79356b
A
3192 last_pg = 0;
3193
3194 while (size) {
3195
3196 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3197 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3198 break;
3199 }
3200 if (start_pg > last_pg) {
0b4e3aa0 3201 io_size = (start_pg - last_pg) * PAGE_SIZE;
1c79356b 3202
0b4e3aa0
A
3203 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3204 UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
3205
3206 if (io_size < size)
3207 size -= io_size;
3208 else
3209 break;
3210 }
3211 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3212 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3213 break;
3214 }
3215 upl_offset = start_pg * PAGE_SIZE;
3216
3217 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3218
0b4e3aa0
A
3219 if (vp->v_flag & VNOCACHE_DATA)
3220 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
1c79356b
A
3221 else
3222 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3223
0b4e3aa0
A
3224 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3225 vp->v_flag |= VTHROTTLED;
3226 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3227 }
9bccf70c 3228 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0);
1c79356b
A
3229
3230 size -= io_size;
3231 }
9bccf70c
A
3232 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3233
1c79356b
A
3234 return(1);
3235}
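/*
 * Illustrative sketch, not part of the original source: the can_delay
 * heuristic in cluster_push_x restated... a delayed push only proceeds
 * once at least half of the pages in the UPL are both valid and dirty.
 * The helper name is an assumption.
 */
static int
cluster_worth_pushing(upl_page_info_t *pl, int pages_in_upl)
{
	int start_pg;
	int num_of_dirty = 0;

	for (start_pg = 0; start_pg < pages_in_upl; start_pg++)
		if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
			num_of_dirty++;

	return (num_of_dirty >= pages_in_upl / 2);
}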