bsd/vfs/vfs_cluster.c (apple/xnu, xnu-344.23)
1/*
2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
23/*
24 * Copyright (c) 1993
25 * The Regents of the University of California. All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 * 3. All advertising materials mentioning features or use of this software
36 * must display the following acknowledgement:
37 * This product includes software developed by the University of
38 * California, Berkeley and its contributors.
39 * 4. Neither the name of the University nor the names of its contributors
40 * may be used to endorse or promote products derived from this software
41 * without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
56 */
57
58#include <sys/param.h>
59#include <sys/proc.h>
60#include <sys/buf.h>
61#include <sys/vnode.h>
62#include <sys/mount.h>
63#include <sys/trace.h>
64#include <sys/malloc.h>
65#include <sys/resourcevar.h>
66#include <libkern/libkern.h>
67
68#include <sys/ubc.h>
69#include <vm/vm_pageout.h>
70
71#include <sys/kdebug.h>
72
73#define CL_READ 0x01
74#define CL_ASYNC 0x02
75#define CL_COMMIT 0x04
76#define CL_PAGEOUT 0x10
77#define CL_AGE 0x20
78#define CL_DUMP 0x40
79#define CL_NOZERO 0x80
80#define CL_PAGEIN 0x100
81#define CL_DEV_MEMORY 0x200
82#define CL_PRESERVE 0x400
83
84
85struct clios {
86 u_int io_completed; /* amount of io that has currently completed */
87 u_int io_issued; /* amount of io that was successfully issued */
88 int io_error; /* error code of first error encountered */
89 int io_wanted; /* someone is sleeping waiting for a change in state */
90};
91
92
93static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
94 int size, struct buf *bp);
95static int cluster_read_x(struct vnode *vp, struct uio *uio,
96 off_t filesize, int devblocksize, int flags);
97static int cluster_write_x(struct vnode *vp, struct uio *uio,
98 off_t oldEOF, off_t newEOF, off_t headOff,
99 off_t tailOff, int devblocksize, int flags);
100static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
101 off_t filesize, int devblocksize, int flags);
102static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
103 off_t newEOF, int devblocksize, int flags);
104static int cluster_phys_read(struct vnode *vp, struct uio *uio,
105 off_t filesize, int devblocksize, int flags);
106static int cluster_phys_write(struct vnode *vp, struct uio *uio,
107 off_t newEOF, int devblocksize, int flags);
108static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
109 vm_offset_t usr_paddr, int xsize, int devblocksize, int flags);
110static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
111static int cluster_try_push(struct vnode *vp, off_t newEOF, int can_delay, int push_all);
112
113
114/*
115 * throttle the number of async writes that
116 * can be outstanding on a single vnode
117 * before we issue a synchronous write
118 */
119#define ASYNC_THROTTLE 9
120
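/*
 * Completion handler for a chain of cluster I/O buffers.  Nothing is done
 * until every buf in the transaction has completed; the chain is then torn
 * down, errors and residuals are accumulated, any EOF tail is zero-filled,
 * throttled writers and streaming (clios) waiters are woken, and the UPL
 * range covered by the transaction is committed or aborted as appropriate.
 */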
121static int
122cluster_iodone(bp)
123 struct buf *bp;
124{
125 int b_flags;
126 int error;
127 int total_size;
128 int total_resid;
129 int upl_offset;
130 int zero_offset;
131 upl_t upl;
132 struct buf *cbp;
133 struct buf *cbp_head;
134 struct buf *cbp_next;
135 struct buf *real_bp;
136 struct vnode *vp;
137 struct clios *iostate;
138 int commit_size;
139 int pg_offset;
140
141
142 cbp_head = (struct buf *)(bp->b_trans_head);
143
144 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
145 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
146
147 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
148 /*
149 * all I/O requests that are part of this transaction
150 * have to complete before we can process it
151 */
152 if ( !(cbp->b_flags & B_DONE)) {
153
154 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
155 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
156
157 return 0;
158 }
159 }
160 error = 0;
161 total_size = 0;
162 total_resid = 0;
163
164 cbp = cbp_head;
165 upl_offset = cbp->b_uploffset;
166 upl = cbp->b_pagelist;
167 b_flags = cbp->b_flags;
168 real_bp = cbp->b_real_bp;
169 vp = cbp->b_vp;
170 zero_offset= cbp->b_validend;
171 iostate = (struct clios *)cbp->b_iostate;
172
173 while (cbp) {
174 if (cbp->b_vectorcount > 1)
175 _FREE(cbp->b_vectorlist, M_SEGMENT);
176
177 if ((cbp->b_flags & B_ERROR) && error == 0)
178 error = cbp->b_error;
179
180 total_resid += cbp->b_resid;
181 total_size += cbp->b_bcount;
182
183 cbp_next = cbp->b_trans_next;
184
185 free_io_buf(cbp);
186
187 cbp = cbp_next;
188 }
189 if (zero_offset)
190 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
191
192 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
193 vp->v_flag &= ~VTHROTTLED;
194 wakeup((caddr_t)&vp->v_numoutput);
195 }
196 if (iostate) {
197 /*
198 * someone has issued multiple I/Os asynchronously
199 * and is waiting for them to complete (streaming)
200 */
201 if (error && iostate->io_error == 0)
202 iostate->io_error = error;
203
204 iostate->io_completed += total_size;
205
206 if (iostate->io_wanted) {
207 /*
208 * someone is waiting for the state of
209 * this io stream to change
210 */
211 iostate->io_wanted = 0;
212 wakeup((caddr_t)&iostate->io_wanted);
213 }
214 }
215 if ((b_flags & B_NEED_IODONE) && real_bp) {
216 if (error) {
217 real_bp->b_flags |= B_ERROR;
218 real_bp->b_error = error;
219 }
220 real_bp->b_resid = total_resid;
221
222 biodone(real_bp);
223 }
224 if (error == 0 && total_resid)
225 error = EIO;
226
227 if (b_flags & B_COMMIT_UPL) {
228 pg_offset = upl_offset & PAGE_MASK;
229 commit_size = (((pg_offset + total_size) + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
230
231 if (error || (b_flags & B_NOCACHE) || ((b_flags & B_PHYS) && !(b_flags & B_READ))) {
232 int upl_abort_code;
233
234 if (b_flags & B_PHYS)
235 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
236 else if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
237 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
238 else if (b_flags & B_PGIN)
239 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
240 else
241 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
242
243 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
244 upl_abort_code);
245
246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
247 (int)upl, upl_offset - pg_offset, commit_size,
248 0x80000000|upl_abort_code, 0);
249
250 } else {
251 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
252
253 if (b_flags & B_PHYS)
254 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
255 else if ( !(b_flags & B_PAGEOUT))
256 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
257 if (b_flags & B_AGE)
258 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
259
260 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
261 upl_commit_flags);
262
263 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
264 (int)upl, upl_offset - pg_offset, commit_size,
265 upl_commit_flags, 0);
266 }
267 } else
268 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
269 (int)upl, upl_offset, 0, error, 0);
270
271 return (error);
272}
273
274
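/*
 * Zero 'size' bytes in the upl starting at 'upl_offset'.  If the caller's
 * buf does not supply a mapped data address, the upl is mapped (and later
 * unmapped) to obtain one.
 */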
275static void
276cluster_zero(upl, upl_offset, size, bp)
277 upl_t upl;
278 vm_offset_t upl_offset;
279 int size;
280 struct buf *bp;
281{
282 vm_offset_t io_addr = 0;
283 int must_unmap = 0;
284 kern_return_t kret;
285
286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_NONE,
287 upl_offset, size, (int)bp, 0, 0);
288
289 if (bp == NULL || bp->b_data == NULL) {
290 kret = ubc_upl_map(upl, &io_addr);
291
292 if (kret != KERN_SUCCESS)
293 panic("cluster_zero: ubc_upl_map() failed with (%d)", kret);
294 if (io_addr == 0)
295 panic("cluster_zero: ubc_upl_map() mapped 0");
296
297 must_unmap = 1;
298 } else
299 io_addr = (vm_offset_t)bp->b_data;
300 bzero((caddr_t)(io_addr + upl_offset), size);
301
302 if (must_unmap) {
303 kret = ubc_upl_unmap(upl);
304
305 if (kret != KERN_SUCCESS)
306 panic("cluster_zero: kernel_upl_unmap failed");
307 }
308}
309
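/*
 * Core transfer routine.  The request is carved into runs that VOP_CMAP can
 * map to contiguous device blocks; each run is described by a chain of I/O
 * bufs (with a per-page io vector when needed) and issued via VOP_STRATEGY.
 * Holes encountered on reads are zero-filled, holes on writes are pushed
 * individually, and on error any unissued portion of the UPL is aborted.
 */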
310static int
311cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
312 struct vnode *vp;
313 upl_t upl;
314 vm_offset_t upl_offset;
315 off_t f_offset;
316 int non_rounded_size;
317 int devblocksize;
318 int flags;
319 struct buf *real_bp;
320 struct clios *iostate;
321{
322 struct buf *cbp;
323 struct iovec *iovp;
324 u_int size;
325 u_int io_size;
326 int io_flags;
327 int error = 0;
328 int retval = 0;
329 struct buf *cbp_head = 0;
330 struct buf *cbp_tail = 0;
331 upl_page_info_t *pl;
332 int buf_count = 0;
333 int pg_count;
334 int pg_offset;
335 u_int max_iosize;
336 u_int max_vectors;
337 int priv;
338 int zero_offset = 0;
339 u_int first_lblkno;
340
341 if (flags & CL_READ) {
342 io_flags = (B_VECTORLIST | B_READ);
343
344 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
345 } else {
346 io_flags = (B_VECTORLIST | B_WRITEINPROG);
347
348 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
349 }
350 pl = ubc_upl_pageinfo(upl);
351
352 if (flags & CL_AGE)
353 io_flags |= B_AGE;
354 if (flags & CL_DUMP)
355 io_flags |= B_NOCACHE;
356 if (flags & CL_PAGEIN)
357 io_flags |= B_PGIN;
358 if (flags & CL_PAGEOUT)
359 io_flags |= B_PAGEOUT;
360 if (flags & CL_COMMIT)
361 io_flags |= B_COMMIT_UPL;
362 if (flags & CL_PRESERVE)
363 io_flags |= B_PHYS;
364
365 if (devblocksize)
366 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
367 else
368 size = non_rounded_size;
369
370
371 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
372 (int)f_offset, size, upl_offset, flags, 0);
373
374 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
375 /*
376 * then we are going to end up
377 * with a page that we can't complete (the file size wasn't a multiple
378 * of PAGE_SIZE and we're trying to read to the end of the file
379 * so we'll go ahead and zero out the portion of the page we can't
380 * read in from the file
381 */
382 zero_offset = upl_offset + non_rounded_size;
383 }
384 while (size) {
385 int vsize;
386 int i;
387 int pl_index;
388 int pg_resid;
389 int num_contig;
390 daddr_t lblkno;
391 daddr_t blkno;
392
393 if (size > max_iosize)
394 io_size = max_iosize;
395 else
396 io_size = size;
397
398 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
399 if (error == EOPNOTSUPP)
400 panic("VOP_CMAP Unimplemented");
401 break;
402 }
403
404 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
405 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
406
407 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
408 if (flags & CL_PAGEOUT) {
409 error = EINVAL;
410 break;
411 };
412
413 /* Try paging out the page individually before
414 giving up entirely and dumping it (it could
415 be mapped in a "hole" and require allocation
416 before the I/O:
417 */
418 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE_64, UPL_ABORT_FREE_ON_EMPTY);
419 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
420 error = EINVAL;
421 break;
422 };
423
424 upl_offset += PAGE_SIZE_64;
425 f_offset += PAGE_SIZE_64;
426 size -= PAGE_SIZE_64;
427 continue;
428 }
429 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
430 /*
431 * we have now figured out how much I/O we can do - this is in 'io_size'
432 * pl_index represents the first page in the 'upl' that the I/O will occur for
433 * pg_offset is the starting point in the first page for the I/O
434 * pg_count is the number of full and partial pages that 'io_size' encompasses
435 */
436 pl_index = upl_offset / PAGE_SIZE;
437 pg_offset = upl_offset & PAGE_MASK;
438 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
439
440 if (flags & CL_DEV_MEMORY) {
441 /*
442 * currently, can't deal with reading 'holes' in file
443 */
444 if ((long)blkno == -1) {
445 error = EINVAL;
446 break;
447 }
448 /*
449 * treat physical requests as one 'giant' page
450 */
451 pg_count = 1;
452 }
453 if ((flags & CL_READ) && (long)blkno == -1) {
454 int bytes_to_zero;
455
456 /*
457 * if we're reading and blkno == -1, then we've got a
458 * 'hole' in the file that we need to deal with by zeroing
459 * out the affected area in the upl
460 */
461 if (zero_offset && io_size == size) {
462 /*
463 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
464 * then 'zero_offset' will be non-zero
465 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
466 * (indicated by the io_size finishing off the I/O request for this UPL)
467 * then we're not going to issue an I/O for the
468 * last page in this upl... we need to zero both the hole and the tail
469 * of the page beyond the EOF, since the delayed zero-fill won't kick in
470 */
471 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
472
473 zero_offset = 0;
474 } else
475 bytes_to_zero = io_size;
476
477 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
478
479 if (cbp_head)
480 /*
481 * if there is a current I/O chain pending
482 * then the first page of the group we just zero'd
483 * will be handled by the I/O completion if the zero
484 * fill started in the middle of the page
485 */
486 pg_count = (io_size - pg_offset) / PAGE_SIZE;
487 else {
488 /*
489 * no pending I/O to pick up that first page
490 * so, we have to make sure it gets committed
491 * here.
492 * set the pg_offset to 0 so that the upl_commit_range
493 * starts with this page
494 */
495 pg_count = (io_size + pg_offset) / PAGE_SIZE;
496 pg_offset = 0;
497 }
498 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
499 /*
500 * if we're done with the request for this UPL
501 * then we have to make sure to commit the last page
502 * even if we only partially zero-filled it
503 */
504 pg_count++;
505
506 if (pg_count) {
507 if (pg_offset)
508 pg_resid = PAGE_SIZE - pg_offset;
509 else
510 pg_resid = 0;
511
512 if (flags & CL_COMMIT)
513 ubc_upl_commit_range(upl,
514 (upl_offset + pg_resid) & ~PAGE_MASK,
515 pg_count * PAGE_SIZE,
516 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
517 }
518 upl_offset += io_size;
519 f_offset += io_size;
520 size -= io_size;
521
522 if (cbp_head && pg_count)
523 goto start_io;
524 continue;
525
526 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
527 real_bp->b_blkno = blkno;
528 }
529
530 if (pg_count > 1) {
531 if (pg_count > max_vectors) {
532 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
533
534 if (io_size < 0) {
535 io_size = PAGE_SIZE - pg_offset;
536 pg_count = 1;
537 } else
538 pg_count = max_vectors;
539 }
540 /*
541 * we need to allocate space for the vector list
542 */
543 if (pg_count > 1) {
544 iovp = (struct iovec *)_MALLOC(sizeof(struct iovec) * pg_count,
545 M_SEGMENT, M_NOWAIT);
546
547 if (iovp == (struct iovec *) 0) {
548 /*
549 * if the allocation fails, then throttle down to a single page
550 */
551 io_size = PAGE_SIZE - pg_offset;
552 pg_count = 1;
553 }
554 }
555 }
556
557 /* Throttle the speculative IO */
558 if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
559 priv = 0;
560 else
561 priv = 1;
562
563 cbp = alloc_io_buf(vp, priv);
564
565 if (pg_count == 1)
566 /*
567 * we use the io vector that's reserved in the buffer header
568 * this ensures we can always issue an I/O even in a low memory
569 * condition that prevents the _MALLOC from succeeding... this
570 * is necessary to prevent deadlocks with the pager
571 */
572 iovp = (struct iovec *)(&cbp->b_vects[0]);
573
574 cbp->b_vectorlist = (void *)iovp;
575 cbp->b_vectorcount = pg_count;
576
577 if (flags & CL_DEV_MEMORY) {
578
579 iovp->iov_len = io_size;
580 iovp->iov_base = (caddr_t)upl_phys_page(pl, 0);
581
582 if (iovp->iov_base == (caddr_t) 0) {
583 free_io_buf(cbp);
584 error = EINVAL;
585 } else
586 iovp->iov_base += upl_offset;
587 } else {
588
589 for (i = 0, vsize = io_size; i < pg_count; i++, iovp++) {
590 int psize;
591
592 psize = PAGE_SIZE - pg_offset;
593
594 if (psize > vsize)
595 psize = vsize;
596
597 iovp->iov_len = psize;
598 iovp->iov_base = (caddr_t)upl_phys_page(pl, pl_index + i);
599
600 if (iovp->iov_base == (caddr_t) 0) {
601 if (pg_count > 1)
602 _FREE(cbp->b_vectorlist, M_SEGMENT);
603 free_io_buf(cbp);
604
605 error = EINVAL;
606 break;
607 }
608 iovp->iov_base += pg_offset;
609 pg_offset = 0;
610
611 if (flags & CL_PAGEOUT) {
612 int s;
613 struct buf *bp;
614
615 s = splbio();
616 if (bp = incore(vp, lblkno + i)) {
617 if (!ISSET(bp->b_flags, B_BUSY)) {
618 bremfree(bp);
619 SET(bp->b_flags, (B_BUSY | B_INVAL));
620 splx(s);
621 brelse(bp);
622 } else
623 panic("BUSY bp found in cluster_io");
624 }
625 splx(s);
626 }
627 vsize -= psize;
628 }
629 }
630 if (error)
631 break;
632
633 if (flags & CL_ASYNC) {
634 cbp->b_flags |= (B_CALL | B_ASYNC);
635 cbp->b_iodone = (void *)cluster_iodone;
636 }
637 cbp->b_flags |= io_flags;
638
639 cbp->b_lblkno = lblkno;
640 cbp->b_blkno = blkno;
641 cbp->b_bcount = io_size;
642 cbp->b_pagelist = upl;
643 cbp->b_uploffset = upl_offset;
644 cbp->b_trans_next = (struct buf *)0;
645
646 if (cbp->b_iostate = (void *)iostate)
647 /*
648 * caller wants to track the state of this
649 * io... bump the amount issued against this stream
650 */
651 iostate->io_issued += io_size;
652
653 if (flags & CL_READ)
654 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
655 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
656 else
657 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
658 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
659
660 if (cbp_head) {
661 cbp_tail->b_trans_next = cbp;
662 cbp_tail = cbp;
663 } else {
664 cbp_head = cbp;
665 cbp_tail = cbp;
666 }
667 (struct buf *)(cbp->b_trans_head) = cbp_head;
668 buf_count++;
669
670 upl_offset += io_size;
671 f_offset += io_size;
672 size -= io_size;
673
674 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
675 /*
676 * if we have no more I/O to issue or
677 * the current I/O we've prepared fully
678 * completes the last page in this request
679 * and it's either an ASYNC request or
680 * we've already accumulated more than 8 I/O's into
681 * this transaction and it's not an I/O directed to
682 * special DEVICE memory
683 * then go ahead and issue the I/O
684 */
685start_io:
686 if (real_bp) {
687 cbp_head->b_flags |= B_NEED_IODONE;
688 cbp_head->b_real_bp = real_bp;
689 } else
690 cbp_head->b_real_bp = (struct buf *)NULL;
691
692 if (size == 0) {
693 /*
694 * we're about to issue the last I/O for this upl
695 * if this was a read to the eof and the eof doesn't
696 * finish on a page boundary, then we need to zero-fill
697 * the rest of the page....
698 */
699 cbp_head->b_validend = zero_offset;
700 } else
701 cbp_head->b_validend = 0;
702
703 for (cbp = cbp_head; cbp;) {
704 struct buf * cbp_next;
705
706 if (io_flags & B_WRITEINPROG)
707 cbp->b_vp->v_numoutput++;
708
709 cbp_next = cbp->b_trans_next;
710
711 (void) VOP_STRATEGY(cbp);
712 cbp = cbp_next;
713 }
714 if ( !(flags & CL_ASYNC)) {
715 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
716 biowait(cbp);
717
718 if (error = cluster_iodone(cbp_head)) {
719 if ((flags & CL_PAGEOUT) && (error == ENXIO))
720 retval = 0; /* drop the error */
721 else
722 retval = error;
723 error = 0;
724 }
725 }
726 cbp_head = (struct buf *)0;
727 cbp_tail = (struct buf *)0;
728
729 buf_count = 0;
730 }
731 }
732 if (error) {
733 int abort_size;
734
735 io_size = 0;
736
737 for (cbp = cbp_head; cbp;) {
738 struct buf * cbp_next;
739
740 if (cbp->b_vectorcount > 1)
741 _FREE(cbp->b_vectorlist, M_SEGMENT);
742 upl_offset -= cbp->b_bcount;
743 size += cbp->b_bcount;
744 io_size += cbp->b_bcount;
745
746 cbp_next = cbp->b_trans_next;
747 free_io_buf(cbp);
748 cbp = cbp_next;
749 }
750 if (iostate) {
751 /*
752 * update the error condition for this stream
753 * since we never really issued the io
754 * just go ahead and adjust it back
755 */
756 if (iostate->io_error == 0)
757 iostate->io_error = error;
758 iostate->io_issued -= io_size;
759
760 if (iostate->io_wanted) {
761 /*
762 * someone is waiting for the state of
763 * this io stream to change
764 */
765 iostate->io_wanted = 0;
766 wakeup((caddr_t)&iostate->io_wanted);
767 }
768 }
769 pg_offset = upl_offset & PAGE_MASK;
770 abort_size = ((size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE) * PAGE_SIZE;
771
772 if (flags & CL_COMMIT) {
773 int upl_abort_code;
774
775 if (flags & CL_PRESERVE)
776 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
777 else if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
778 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
779 else if (flags & CL_PAGEIN)
780 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
781 else
782 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
783
784 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
785 upl_abort_code);
786
787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
788 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
789 }
790 if (real_bp) {
791 real_bp->b_flags |= B_ERROR;
792 real_bp->b_error = error;
793
794 biodone(real_bp);
795 }
796 if (retval == 0)
797 retval = error;
798 }
799 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
800 (int)f_offset, size, upl_offset, retval, 0);
801
802 return (retval);
803}
804
805
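/*
 * Start an advisory read of up to MAX_UPL_TRANSFER pages at f_offset,
 * skipping any leading pages that are already resident; returns the number
 * of pages the prefetch was asked to cover.
 */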
806static int
807cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
808 struct vnode *vp;
809 off_t f_offset;
810 u_int size;
811 off_t filesize;
812 int devblocksize;
813{
814 int pages_to_fetch;
815 int skipped_pages;
816
817 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
818 (int)f_offset, size, (int)filesize, 0, 0);
819
820 if (f_offset >= filesize) {
821 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
822 (int)f_offset, 0, 0, 0, 0);
823 return(0);
824 }
825 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
826 size = MAX_UPL_TRANSFER * PAGE_SIZE;
827 else
828 size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
829
830 if ((off_t)size > (filesize - f_offset))
831 size = filesize - f_offset;
832
833 pages_to_fetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
834
835 for (skipped_pages = 0; skipped_pages < pages_to_fetch; skipped_pages++) {
836 if (ubc_page_op(vp, f_offset, 0, 0, 0) != KERN_SUCCESS)
837 break;
838 f_offset += PAGE_SIZE;
839 size -= PAGE_SIZE;
840 }
841 if (skipped_pages < pages_to_fetch)
842 advisory_read(vp, filesize, f_offset, size, devblocksize);
843
844 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
845 (int)f_offset + (pages_to_fetch * PAGE_SIZE), skipped_pages, 0, 1, 0);
846
847 return (pages_to_fetch);
848}
849
850
851
852static void
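/*
 * Sequential read-ahead heuristic: when the current request follows the
 * previous one, the read-ahead window (v_ralen) is grown, bounded by
 * MAX_UPL_TRANSFER, and the pages beyond the last block read are prefetched
 * through cluster_rd_prefetch.
 */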
853cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
854 struct vnode *vp;
855 daddr_t b_lblkno;
856 daddr_t e_lblkno;
857 off_t filesize;
858 int devblocksize;
859{
860 daddr_t r_lblkno;
861 off_t f_offset;
862 int size_of_prefetch;
863 int max_pages;
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
866 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
867
868 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
869 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
870 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
871 return;
872 }
873
874 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
875 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
876 vp->v_ralen = 0;
877 vp->v_maxra = 0;
878
879 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
880 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
881
882 return;
883 }
884 max_pages = MAX_UPL_TRANSFER;
885
886 vp->v_ralen = vp->v_ralen ? min(max_pages, vp->v_ralen << 1) : 1;
887
888 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
889 vp->v_ralen = min(max_pages, (e_lblkno + 1) - b_lblkno);
890
891 if (e_lblkno < vp->v_maxra) {
892 if ((vp->v_maxra - e_lblkno) > max(max_pages / 16, 4)) {
893
894 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
895 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
896 return;
897 }
898 }
899 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
900 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
901
902 if (f_offset < filesize) {
903 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
904
905 if (size_of_prefetch)
906 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
907 }
908 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
909 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
910}
911
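/*
 * Pageout entry point.  The request is validated and clipped to EOF, writes
 * are throttled when too many are already outstanding on the vnode, and the
 * range is handed to cluster_io with CL_PAGEOUT set.
 */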
912int
913cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
914 struct vnode *vp;
915 upl_t upl;
916 vm_offset_t upl_offset;
917 off_t f_offset;
918 int size;
919 off_t filesize;
920 int devblocksize;
921 int flags;
922{
923 int io_size;
924 int pg_size;
925 off_t max_size;
926 int local_flags = CL_PAGEOUT;
927
928 if ((flags & UPL_IOSYNC) == 0)
929 local_flags |= CL_ASYNC;
930 if ((flags & UPL_NOCOMMIT) == 0)
931 local_flags |= CL_COMMIT;
932
933
934 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
935 (int)f_offset, size, (int)filesize, local_flags, 0);
936
937 /*
938 * If they didn't specify any I/O, then we are done...
939 * we can't issue an abort because we don't know how
940 * big the upl really is
941 */
942 if (size <= 0)
943 return (EINVAL);
944
945 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
946 if (local_flags & CL_COMMIT)
947 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
948 return (EROFS);
949 }
950 /*
951 * can't page-in from a negative offset
952 * or if we're starting beyond the EOF
953 * or if the file offset isn't page aligned
954 * or the size requested isn't a multiple of PAGE_SIZE
955 */
956 if (f_offset < 0 || f_offset >= filesize ||
957 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
958 if (local_flags & CL_COMMIT)
959 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
960 return (EINVAL);
961 }
962 max_size = filesize - f_offset;
963
964 if (size < max_size)
965 io_size = size;
966 else
967 io_size = max_size;
968
969 pg_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
970
971 if (size > pg_size) {
972 if (local_flags & CL_COMMIT)
973 ubc_upl_abort_range(upl, upl_offset + pg_size, size - pg_size,
974 UPL_ABORT_FREE_ON_EMPTY);
975 }
976 while (vp->v_numoutput >= ASYNC_THROTTLE) {
977 vp->v_flag |= VTHROTTLED;
978 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_pageout", 0);
979 }
980
981 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
982 local_flags, (struct buf *)0, (struct clios *)0));
983}
984
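/*
 * Pagein entry point.  The request is validated and clipped to EOF, issued
 * through cluster_io with CL_PAGEIN | CL_READ, and, for single-page reads
 * that look sequential, followed by read-ahead.
 */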
985int
986cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
987 struct vnode *vp;
988 upl_t upl;
989 vm_offset_t upl_offset;
990 off_t f_offset;
991 int size;
992 off_t filesize;
993 int devblocksize;
994 int flags;
995{
996 u_int io_size;
997 int rounded_size;
998 off_t max_size;
999 int retval;
1000 int local_flags = 0;
1001
1002 if (upl == NULL || size < 0)
1003 panic("cluster_pagein: NULL upl passed in");
1004
1005 if ((flags & UPL_IOSYNC) == 0)
1006 local_flags |= CL_ASYNC;
1007 if ((flags & UPL_NOCOMMIT) == 0)
1008 local_flags |= CL_COMMIT;
1009
1010
1011 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1012 (int)f_offset, size, (int)filesize, local_flags, 0);
1013
1014 /*
1015 * can't page-in from a negative offset
1016 * or if we're starting beyond the EOF
1017 * or if the file offset isn't page aligned
1018 * or the size requested isn't a multiple of PAGE_SIZE
1019 */
1020 if (f_offset < 0 || f_offset >= filesize ||
1021 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1022 if (local_flags & CL_COMMIT)
1023 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1024 return (EINVAL);
1025 }
1026 max_size = filesize - f_offset;
1027
1028 if (size < max_size)
1029 io_size = size;
1030 else
1031 io_size = max_size;
1032
1033 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1034
1035 if (size > rounded_size && (local_flags & CL_COMMIT))
1036 ubc_upl_abort_range(upl, upl_offset + rounded_size,
1037 size - (upl_offset + rounded_size), UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1038
1039 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
1040 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1041
1042 if (retval == 0) {
1043 int b_lblkno;
1044 int e_lblkno;
1045
1046 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1047 e_lblkno = (int)
1048 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1049
1050 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1051 /*
1052 * we haven't read the last page in of the file yet
1053 * so let's try to read ahead if we're in
1054 * a sequential access pattern
1055 */
1056 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1057 }
1058 vp->v_lastr = e_lblkno;
1059 }
1060 return (retval);
1061}
1062
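/*
 * Issue the I/O described by a conventional buf (which must carry a upl)
 * through the cluster layer as an async request.
 */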
1063int
1064cluster_bp(bp)
1065 struct buf *bp;
1066{
1067 off_t f_offset;
1068 int flags;
1069
1070 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1071 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1072
1073 if (bp->b_pagelist == (upl_t) 0)
1074 panic("cluster_bp: can't handle NULL upl yet\n");
1075 if (bp->b_flags & B_READ)
1076 flags = CL_ASYNC | CL_READ;
1077 else
1078 flags = CL_ASYNC;
1079
1080 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1081
1082 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1083}
1084
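/*
 * Top-level write entry point.  Depending on VNOCACHE_DATA, the alignment of
 * the request, and whether the source is physically contiguous, the work is
 * routed to the buffered path (cluster_write_x), the direct no-copy path
 * (cluster_nocopy_write), or the physically contiguous path
 * (cluster_phys_write).
 */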
1085int
1086cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1087 struct vnode *vp;
1088 struct uio *uio;
1089 off_t oldEOF;
1090 off_t newEOF;
1091 off_t headOff;
1092 off_t tailOff;
1093 int devblocksize;
1094 int flags;
1095{
1096 int prev_resid;
1097 int clip_size;
1098 off_t max_io_size;
1099 struct iovec *iov;
1100 vm_offset_t upl_offset;
1101 int upl_size;
1102 int pages_in_pl;
1103 upl_page_info_t *pl;
1104 int upl_flags;
1105 upl_t upl;
1106 int retval = 0;
1107
1108
1109 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1110 {
1111 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1112 return(retval);
1113 }
1114
1115 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1116 {
1117 /* we know we have a resid, so this is safe */
1118 iov = uio->uio_iov;
1119 while (iov->iov_len == 0) {
1120 uio->uio_iov++;
1121 uio->uio_iovcnt--;
1122 iov = uio->uio_iov;
1123 }
1124
1125 /*
1126 * We check every vector target and if it is physically
1127 * contiguous space, we skip the sanity checks.
1128 */
1129
1130 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
1131 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
1132 pages_in_pl = 0;
1133 upl_flags = UPL_QUERY_OBJECT_TYPE;
1134 if ((vm_map_get_upl(current_map(),
1135 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1136 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
1137 {
1138 /*
1139 * the user app must have passed in an invalid address
1140 */
1141 return (EFAULT);
1142 }
1143
1144 if (upl_flags & UPL_PHYS_CONTIG)
1145 {
1146 if (flags & IO_HEADZEROFILL)
1147 {
1148 flags &= ~IO_HEADZEROFILL;
1149
1150 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1151 return(retval);
1152 }
1153
1154 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
1155
1156 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1157 {
1158 retval = cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL);
1159 return(retval);
1160 }
1161 }
1162 else if ((uio->uio_resid < 4 * PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
1163 {
1164 /*
1165 * We set a threshold of 4 pages to decide if the nocopy
1166 * write loop is worth the trouble...
1167 * we also come here if we're trying to zero the head and/or tail
1168 * of a partially written page, and the user source is not a physically contiguous region
1169 */
1170 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1171 return(retval);
1172 }
1173 else if (uio->uio_offset & PAGE_MASK_64)
1174 {
1175 /* Bring the file offset write up to a pagesize boundary */
1176 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1177 if (uio->uio_resid < clip_size)
1178 clip_size = uio->uio_resid;
1179 /*
1180 * Fake the resid going into the cluster_write_x call
1181 * and restore it on the way out.
1182 */
1183 prev_resid = uio->uio_resid;
1184 uio->uio_resid = clip_size;
1185 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1186 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1187 }
1188 else if ((int)iov->iov_base & PAGE_MASK_64)
1189 {
1190 clip_size = iov->iov_len;
1191 prev_resid = uio->uio_resid;
1192 uio->uio_resid = clip_size;
1193 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1194 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1195 }
1196 else
1197 {
1198 /*
1199 * If we come in here, we know the offset into
1200 * the file is on a pagesize boundary
1201 */
1202
1203 max_io_size = newEOF - uio->uio_offset;
1204 clip_size = uio->uio_resid;
1205 if (iov->iov_len < clip_size)
1206 clip_size = iov->iov_len;
1207 if (max_io_size < clip_size)
1208 clip_size = max_io_size;
1209
1210 if (clip_size < PAGE_SIZE)
1211 {
1212 /*
1213 * Take care of tail end of write in this vector
1214 */
1215 prev_resid = uio->uio_resid;
1216 uio->uio_resid = clip_size;
1217 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1218 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1219 }
1220 else
1221 {
1222 /* round clip_size down to a multiple of pagesize */
1223 clip_size = clip_size & ~(PAGE_MASK);
1224 prev_resid = uio->uio_resid;
1225 uio->uio_resid = clip_size;
1226 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1227 if ((retval == 0) && uio->uio_resid)
1228 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1229 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1230 }
1231 } /* end else */
1232 } /* end while */
1233 return(retval);
1234}
1235
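/*
 * Direct (no-copy) write path: the user's pages are wired with
 * vm_map_get_upl, any overlapping pages in the cache are dumped, and the
 * writes are streamed asynchronously through cluster_io with a clios so the
 * routine can wait for all outstanding I/O before returning.
 */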
1236
1237static int
1238cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1239 struct vnode *vp;
1240 struct uio *uio;
1241 off_t newEOF;
1242 int devblocksize;
1243 int flags;
1244{
1245 upl_t upl;
1246 upl_page_info_t *pl;
1247 off_t upl_f_offset;
1248 vm_offset_t upl_offset;
1249 off_t max_io_size;
1250 int io_size;
1251 int io_flag;
1252 int upl_size;
1253 int upl_needed_size;
1254 int pages_in_pl;
1255 int upl_flags;
1256 kern_return_t kret;
1257 struct iovec *iov;
1258 int i;
1259 int first = 1;
1260 int force_data_sync;
1261 int error = 0;
1262 struct clios iostate;
1263
1264 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1265 (int)uio->uio_offset, (int)uio->uio_resid,
1266 (int)newEOF, devblocksize, 0);
1267
1268 /*
1269 * When we enter this routine, we know
1270 * -- the offset into the file is on a pagesize boundary
1271 * -- the resid is a page multiple
1272 * -- the resid will not exceed iov_len
1273 */
1274 cluster_try_push(vp, newEOF, 0, 1);
1275
1276 iostate.io_completed = 0;
1277 iostate.io_issued = 0;
1278 iostate.io_error = 0;
1279 iostate.io_wanted = 0;
1280
1281 iov = uio->uio_iov;
1282
1283 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
1284 io_size = uio->uio_resid;
1285
1286 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1287 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1288
1289 if (first) {
1290 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
1291 io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
1292 first = 0;
1293 }
1294 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1295 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1296
1297 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1298 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1299
1300 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1301 pages_in_pl = 0;
1302 upl_size = upl_needed_size;
1303 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1304 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1305
1306 kret = vm_map_get_upl(current_map(),
1307 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1308 &upl_size,
1309 &upl,
1310 NULL,
1311 &pages_in_pl,
1312 &upl_flags,
1313 force_data_sync);
1314
1315 if (kret != KERN_SUCCESS) {
1316 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1317 0, 0, 0, kret, 0);
1318
1319 /*
1320 * cluster_nocopy_write: failed to get pagelist
1321 *
1322 * we may have already spun some portion of this request
1323 * off as async requests... we need to wait for the I/O
1324 * to complete before returning
1325 */
1326 goto wait_for_writes;
1327 }
1328 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1329 pages_in_pl = upl_size / PAGE_SIZE;
1330
1331 for (i = 0; i < pages_in_pl; i++) {
1332 if (!upl_valid_page(pl, i))
1333 break;
1334 }
1335 if (i == pages_in_pl)
1336 break;
1c79356b 1337
1338 /*
1339 * didn't get all the pages back that we
1340 * needed... release this upl and try again
1341 */
1342 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1343 UPL_ABORT_FREE_ON_EMPTY);
1344 }
1345 if (force_data_sync >= 3) {
1346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1347 i, pages_in_pl, upl_size, kret, 0);
1348
1349 /*
1350 * for some reason, we couldn't acquire a hold on all
1351 * the pages needed in the user's address space
1352 *
1353 * we may have already spun some portion of this request
1354 * off as async requests... we need to wait for the I/O
1355 * to complete before returning
1356 */
1357 goto wait_for_writes;
1358 }
1359
1360 /*
1361 * Consider the possibility that upl_size wasn't satisfied.
1362 */
1363 if (upl_size != upl_needed_size)
1364 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1365
1366 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1367 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1368
1369 if (io_size == 0) {
1370 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1371 UPL_ABORT_FREE_ON_EMPTY);
1372
1373 /*
1374 * we may have already spun some portion of this request
1375 * off as async requests... we need to wait for the I/O
1376 * to complete before returning
1377 */
1378 goto wait_for_writes;
1379 }
1380 /*
1381 * Now look for pages already in the cache
1382 * and throw them away.
1383 */
1384
1385 upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
1386 max_io_size = io_size;
1387
1388 while (max_io_size) {
1389 /*
1390 * Flag UPL_POP_DUMP says if the page is found
1391 * in the page cache it must be thrown away.
1392 */
1393 ubc_page_op(vp,
1394 upl_f_offset,
1395 UPL_POP_SET | UPL_POP_BUSY | UPL_POP_DUMP,
1396 0, 0);
1397 max_io_size -= PAGE_SIZE_64;
1398 upl_f_offset += PAGE_SIZE_64;
1399 }
1400 /*
1401 * we want push out these writes asynchronously so that we can overlap
1402 * the preparation of the next I/O
1403 * if there are already too many outstanding writes
1404 * wait until some complete before issuing the next
1405 */
1406 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1407 iostate.io_wanted = 1;
1408 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1409 }
1410 if (iostate.io_error) {
1411 /*
1412 * one of the earlier writes we issued ran into a hard error
1413 * don't issue any more writes, cleanup the UPL
1414 * that was just created but not used, then
1415 * go wait for all writes that are part of this stream
1416 * to complete before returning the error to the caller
1417 */
1418 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1419 UPL_ABORT_FREE_ON_EMPTY);
1420
1421 goto wait_for_writes;
1422 }
1423 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT;
1424
1425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1426 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1427
1428 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1429 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
1430
1431 iov->iov_len -= io_size;
1432 iov->iov_base += io_size;
1433 uio->uio_resid -= io_size;
1434 uio->uio_offset += io_size;
1435
1436 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1437 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1438
1439 } /* end while */
1440
1441wait_for_writes:
1442 /*
1443 * make sure all async writes issued as part of this stream
1444 * have completed before we return
1445 */
1446 while (iostate.io_issued != iostate.io_completed) {
1447 iostate.io_wanted = 1;
1448 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1449 }
1450 if (iostate.io_error)
1451 error = iostate.io_error;
1452
1453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1454 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1455
1456 return (error);
1457}
1458
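/*
 * Write path for a physically contiguous user buffer: misaligned head and
 * tail fragments are handled through cluster_align_phys_io and the aligned
 * middle is written synchronously with CL_DEV_MEMORY.
 */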
1459
1460static int
1461cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
1462 struct vnode *vp;
1463 struct uio *uio;
1464 off_t newEOF;
1465 int devblocksize;
1466 int flags;
1467{
1468 upl_page_info_t *pl;
1469 vm_offset_t src_paddr;
1470 upl_t upl;
1471 vm_offset_t upl_offset;
1472 int tail_size;
1473 int io_size;
1474 int upl_size;
1475 int upl_needed_size;
1476 int pages_in_pl;
1477 int upl_flags;
1478 kern_return_t kret;
1479 struct iovec *iov;
1480 int error = 0;
1481
1482 /*
1483 * When we enter this routine, we know
1484 * -- the resid will not exceed iov_len
1485 * -- the vector target address is physically contiguous
1486 */
1487 cluster_try_push(vp, newEOF, 0, 1);
1488
1489 iov = uio->uio_iov;
1490 io_size = iov->iov_len;
1491 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
1492 upl_needed_size = upl_offset + io_size;
1493
1494 pages_in_pl = 0;
1495 upl_size = upl_needed_size;
1496 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
1497 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1498
1499 kret = vm_map_get_upl(current_map(),
1500 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1501 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1502
1503 if (kret != KERN_SUCCESS) {
1504 /*
1505 * cluster_phys_write: failed to get pagelist
1506 * note: return kret here
1507 */
1508 return(EINVAL);
1509 }
1510 /*
1511 * Consider the possibility that upl_size wasn't satisfied.
1512 * This is a failure in the physical memory case.
1513 */
1514 if (upl_size < upl_needed_size) {
1515 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1516 return(EINVAL);
1517 }
1518 pl = ubc_upl_pageinfo(upl);
1519
1520 src_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
1521
1522 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1523 int head_size;
0b4e3aa0 1524
1525 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
1526
1527 if (head_size > io_size)
1528 head_size = io_size;
1529
1530 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1531
1532 if (error) {
1533 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1534
1535 return(EINVAL);
1536 }
1537 upl_offset += head_size;
1538 src_paddr += head_size;
1539 io_size -= head_size;
1540 }
1541 tail_size = io_size & (devblocksize - 1);
1542 io_size -= tail_size;
1543
1544 if (io_size) {
1545 /*
1546 * issue a synchronous write to cluster_io
1547 */
1548 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1549 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1550 }
1551 if (error == 0) {
1552 /*
1553 * The cluster_io write completed successfully,
1554 * update the uio structure
1555 */
1556 uio->uio_resid -= io_size;
1557 iov->iov_len -= io_size;
1558 iov->iov_base += io_size;
1559 uio->uio_offset += io_size;
1560 src_paddr += io_size;
1561
1562 if (tail_size)
1563 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1564 }
1565 /*
1566 * just release our hold on the physically contiguous
1567 * region without changing any state
1568 */
1569 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1570
1571 return (error);
1572}
1573
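/*
 * Buffered write path: a UPL is created over the affected file range, edge
 * pages that are only partially overwritten are pre-read when necessary,
 * head/tail ranges are zero-filled for IO_HEADZEROFILL/IO_TAILZEROFILL, and
 * the user data is copied into the mapped pages with uiomove.
 */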
1574
1575static int
1576cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1577 struct vnode *vp;
1578 struct uio *uio;
1579 off_t oldEOF;
1580 off_t newEOF;
1581 off_t headOff;
1582 off_t tailOff;
1583 int devblocksize;
1584 int flags;
1585{
1586 upl_page_info_t *pl;
1587 upl_t upl;
1588 vm_offset_t upl_offset;
1589 int upl_size;
1590 off_t upl_f_offset;
1591 int pages_in_upl;
1592 int start_offset;
1593 int xfer_resid;
1594 int io_size;
1595 int io_flags;
1596 vm_offset_t io_address;
1597 int io_offset;
1598 int bytes_to_zero;
1599 int bytes_to_move;
1600 kern_return_t kret;
1601 int retval = 0;
1602 int uio_resid;
1603 long long total_size;
1604 long long zero_cnt;
1605 off_t zero_off;
1606 long long zero_cnt1;
1607 off_t zero_off1;
1608 daddr_t start_blkno;
1609 daddr_t last_blkno;
1610
1611 if (uio) {
1612 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1613 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1614
1615 uio_resid = uio->uio_resid;
1616 } else {
1617 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1618 0, 0, (int)oldEOF, (int)newEOF, 0);
1619
1620 uio_resid = 0;
1621 }
1622 zero_cnt = 0;
1623 zero_cnt1 = 0;
1624
1625 if (flags & IO_HEADZEROFILL) {
1626 /*
1627 * some filesystems (HFS is one) don't support unallocated holes within a file...
1628 * so we zero fill the intervening space between the old EOF and the offset
1629 * where the next chunk of real data begins.... ftruncate will also use this
1630 * routine to zero fill to the new EOF when growing a file... in this case, the
1631 * uio structure will not be provided
1632 */
1633 if (uio) {
1634 if (headOff < uio->uio_offset) {
1635 zero_cnt = uio->uio_offset - headOff;
1636 zero_off = headOff;
1637 }
1638 } else if (headOff < newEOF) {
1639 zero_cnt = newEOF - headOff;
1640 zero_off = headOff;
1641 }
1642 }
1643 if (flags & IO_TAILZEROFILL) {
1644 if (uio) {
1645 zero_off1 = uio->uio_offset + uio->uio_resid;
1646
1647 if (zero_off1 < tailOff)
1648 zero_cnt1 = tailOff - zero_off1;
1649 }
1650 }
1651 if (zero_cnt == 0 && uio == (struct uio *) 0)
1652 {
1653 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1654 retval, 0, 0, 0, 0);
1655 return (0);
1656 }
1657
1658 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1659 /*
1660 * for this iteration of the loop, figure out where our starting point is
1661 */
1662 if (zero_cnt) {
1663 start_offset = (int)(zero_off & PAGE_MASK_64);
1664 upl_f_offset = zero_off - start_offset;
1665 } else if (uio_resid) {
1666 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1667 upl_f_offset = uio->uio_offset - start_offset;
1668 } else {
1669 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1670 upl_f_offset = zero_off1 - start_offset;
1671 }
1672 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1673 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1674
1675 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1676 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1677
1678 /*
1679 * compute the size of the upl needed to encompass
1680 * the requested write... limit each call to cluster_io
1681 * to the maximum UPL size... cluster_io will clip if
1682 * this exceeds the maximum io_size for the device,
1683 * make sure to account for
1684 * a starting offset that's not page aligned
1685 */
1686 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1687
1688 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1689 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1690
1691 pages_in_upl = upl_size / PAGE_SIZE;
1692 io_size = upl_size - start_offset;
1693
1694 if ((long long)io_size > total_size)
1695 io_size = total_size;
1696
1697 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1698 last_blkno = start_blkno + pages_in_upl;
1699
1700 kret = ubc_create_upl(vp,
1701 upl_f_offset,
1702 upl_size,
1703 &upl,
1704 &pl,
1705 UPL_FLAGS_NONE);
1706 if (kret != KERN_SUCCESS)
1707 panic("cluster_write: failed to get pagelist");
1708
1709 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_NONE,
1710 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1711
1712 if (start_offset && !upl_valid_page(pl, 0)) {
1713 int read_size;
1714
1715 /*
1716 * we're starting in the middle of the first page of the upl
1717 * and the page isn't currently valid, so we're going to have
1718 * to read it in first... this is a synchronous operation
1719 */
1720 read_size = PAGE_SIZE;
1721
1722 if ((upl_f_offset + read_size) > newEOF)
1723 read_size = newEOF - upl_f_offset;
1724
1725 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
1726 CL_READ, (struct buf *)0, (struct clios *)0);
1727 if (retval) {
1728 /*
1729 * we had an error during the read which causes us to abort
1730 * the current cluster_write request... before we do, we need
1731 * to release the rest of the pages in the upl without modifying
1732 * their state and mark the failed page in error
1733 */
1734 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1735 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1736
1737 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
1738 (int)upl, 0, 0, retval, 0);
1739 break;
1740 }
1741 }
1742 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1743 /*
1744 * the last offset we're writing to in this upl does not end on a page
1745 * boundary... if it's not beyond the old EOF, then we'll also need to
1746 * pre-read this page in if it isn't already valid
1747 */
1748 upl_offset = upl_size - PAGE_SIZE;
1749
1750 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1751 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1752 int read_size;
1753
1754 read_size = PAGE_SIZE;
1755
9bccf70c 1756 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1c79356b 1757 read_size = newEOF - (upl_f_offset + upl_offset);
9bccf70c
A
1758
1759 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
b4c24cb9 1760 CL_READ, (struct buf *)0, (struct clios *)0);
1c79356b 1761 if (retval) {
0b4e3aa0 1762 /*
1c79356b 1763 * we had an error during the read which causes us to abort
0b4e3aa0
A
1764 * the current cluster_write request... before we do, we
1765 * need to release the rest of the pages in the upl without
1766 * modifying their state and mark the failed page in error
1c79356b 1767 */
9bccf70c
A
1768 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1769 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1770
1771 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1772 (int)upl, 0, 0, retval, 0);
1c79356b
A
1773 break;
1774 }
1775 }
1776 }
0b4e3aa0
A
1777 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
1778 panic("cluster_write: ubc_upl_map failed\n");
1c79356b
A
1779 xfer_resid = io_size;
1780 io_offset = start_offset;
1781
1782 while (zero_cnt && xfer_resid) {
1783
1784 if (zero_cnt < (long long)xfer_resid)
1785 bytes_to_zero = zero_cnt;
1786 else
1787 bytes_to_zero = xfer_resid;
1788
9bccf70c 1789 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1c79356b
A
1790 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1791
1792 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1793 (int)upl_f_offset + io_offset, bytes_to_zero,
9bccf70c 1794 (int)io_offset, xfer_resid, 0);
1c79356b 1795 } else {
9bccf70c
A
1796 int zero_pg_index;
1797
1c79356b 1798 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
9bccf70c
A
1799 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1800
1801 if ( !upl_valid_page(pl, zero_pg_index)) {
1802 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1803
1804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1805 (int)upl_f_offset + io_offset, bytes_to_zero,
1806 (int)io_offset, xfer_resid, 0);
1c79356b 1807
9bccf70c
A
1808 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1809 !upl_dirty_page(pl, zero_pg_index)) {
1c79356b
A
1810 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1811
1812 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1813 (int)upl_f_offset + io_offset, bytes_to_zero,
9bccf70c 1814 (int)io_offset, xfer_resid, 0);
1c79356b
A
1815 }
1816 }
1817 xfer_resid -= bytes_to_zero;
1818 zero_cnt -= bytes_to_zero;
1819 zero_off += bytes_to_zero;
1820 io_offset += bytes_to_zero;
1821 }
1822 if (xfer_resid && uio_resid) {
1823 bytes_to_move = min(uio_resid, xfer_resid);
1824
1825 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 42)) | DBG_FUNC_NONE,
1826 (int)uio->uio_offset, bytes_to_move, uio_resid, xfer_resid, 0);
1827
1828 retval = uiomove((caddr_t)(io_address + io_offset), bytes_to_move, uio);
1829
9bccf70c 1830
1c79356b 1831 if (retval) {
0b4e3aa0 1832 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1c79356b 1833 panic("cluster_write: kernel_upl_unmap failed\n");
9bccf70c
A
1834
1835 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1836
1837 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1838 (int)upl, 0, 0, retval, 0);
1c79356b
A
1839 } else {
1840 uio_resid -= bytes_to_move;
1841 xfer_resid -= bytes_to_move;
1842 io_offset += bytes_to_move;
1843 }
1844 }
1845 while (xfer_resid && zero_cnt1 && retval == 0) {
1846
1847 if (zero_cnt1 < (long long)xfer_resid)
1848 bytes_to_zero = zero_cnt1;
1849 else
1850 bytes_to_zero = xfer_resid;
1851
9bccf70c 1852 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
1c79356b
A
1853 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1854
1855 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1856 (int)upl_f_offset + io_offset,
9bccf70c 1857 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1c79356b 1858 } else {
9bccf70c
A
1859 int zero_pg_index;
1860
1c79356b 1861 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
9bccf70c
A
1862 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1863
1864 if ( !upl_valid_page(pl, zero_pg_index)) {
1c79356b
A
1865 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1866
1867 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1868 (int)upl_f_offset + io_offset,
9bccf70c
A
1869 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1870
1871 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1872 !upl_dirty_page(pl, zero_pg_index)) {
1873 bzero((caddr_t)(io_address + io_offset), bytes_to_zero);
1874
1875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1876 (int)upl_f_offset + io_offset,
1877 bytes_to_zero, (int)io_offset, xfer_resid, 0);
1c79356b
A
1878 }
1879 }
1880 xfer_resid -= bytes_to_zero;
1881 zero_cnt1 -= bytes_to_zero;
1882 zero_off1 += bytes_to_zero;
1883 io_offset += bytes_to_zero;
1884 }
1885
1886 if (retval == 0) {
9bccf70c 1887 int cl_index;
1c79356b
A
1888 int can_delay;
1889
1890 io_size += start_offset;
1891
9bccf70c 1892 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1c79356b
A
1893 /*
1894 * if we're extending the file with this write
1895 * we'll zero fill the rest of the page so that
1896 * if the file gets extended again in such a way as to leave a
1897 * hole starting at this EOF, we'll have zeros in the correct spot
1898 */
1899 bzero((caddr_t)(io_address + io_size), upl_size - io_size);
1900
1901 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 43)) | DBG_FUNC_NONE,
1902 (int)upl_f_offset + io_size,
1903 upl_size - io_size, 0, 0, 0);
1904 }
0b4e3aa0 1905 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
1c79356b
A
1906 panic("cluster_write: kernel_upl_unmap failed\n");
1907
9bccf70c
A
1908 if (flags & IO_SYNC)
1909 /*
1910 * if the IO_SYNC flag is set then we need to
1911 * bypass any clusters and immediately issue
1912 * the I/O
1913 */
1914 goto issue_io;
1c79356b 1915
9bccf70c
A
1916 if (vp->v_clen == 0)
1917 /*
1918 * no clusters currently present
1919 */
1920 goto start_new_cluster;
1c79356b 1921
9bccf70c
A
1922 /*
1923 * keep track of the overall dirty page
1924 * range we've developed
1925 * in case we have to fall back to the
1926 * VHASDIRTY method of flushing
1927 */
1928 if (vp->v_flag & VHASDIRTY)
1929 goto delay_io;
1c79356b 1930
9bccf70c 1931 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1c79356b
A
1932 /*
1933 * we have an existing cluster... see if this write will extend it nicely
1934 */
9bccf70c
A
1935 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
1936 /*
1937 * the current write starts at or after the current cluster
1938 */
1939 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
1940 /*
1941 * we have a write that fits entirely
1942 * within the existing cluster limits
1943 */
9bccf70c 1944 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1c79356b 1945 /*
9bccf70c 1946 * update our idea of where the cluster ends
1c79356b 1947 */
9bccf70c
A
1948 vp->v_clusters[cl_index].last_pg = last_blkno;
1949 break;
1c79356b 1950 }
9bccf70c 1951 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
1952 /*
1953 * we have a write that starts in the middle of the current cluster
1954 * but extends beyond the cluster's limit...
1955 * we'll clip the current cluster if we actually
9bccf70c 1956 * overlap with the new write
1c79356b
A
1957 * and start a new cluster with the current write
1958 */
9bccf70c
A
1959 if (vp->v_clusters[cl_index].last_pg > start_blkno)
1960 vp->v_clusters[cl_index].last_pg = start_blkno;
1c79356b
A
1961 }
1962 /*
1963 * we also get here for the case where the current write starts
1964 * beyond the limit of the existing cluster
9bccf70c
A
1965 *
1966 * in either case, we'll check the remaining clusters before
1967 * starting a new one
1c79356b 1968 */
9bccf70c 1969 } else {
1c79356b 1970 /*
9bccf70c 1971 * the current write starts in front of the current cluster
1c79356b 1972 */
9bccf70c 1973 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1c79356b 1974 /*
9bccf70c
A
1975 * we can just merge the old cluster
1976 * with the new request and leave it
1977 * in the cache
1c79356b 1978 */
9bccf70c 1979 vp->v_clusters[cl_index].start_pg = start_blkno;
1c79356b 1980
9bccf70c
A
1981 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
1982 /*
1983 * the current write completely
1984 * envelops the existing cluster
1985 */
1986 vp->v_clusters[cl_index].last_pg = last_blkno;
1987 }
1988 break;
1c79356b 1989 }
9bccf70c 1990
1c79356b 1991 /*
9bccf70c
A
1992 * if we were to combine this write with the current cluster
1993 * we would exceed the cluster size limit.... so,
1994 * let's see if there's any overlap of the new I/O with
1995 * the existing cluster...
1996 *
1c79356b 1997 */
9bccf70c 1998 if (last_blkno > vp->v_clusters[cl_index].start_pg)
1c79356b 1999 /*
9bccf70c
A
2000 * the current write extends into the existing cluster
2001 * clip the current cluster by moving the start position
2002 * to where the current write ends
1c79356b 2003 */
9bccf70c
A
2004 vp->v_clusters[cl_index].start_pg = last_blkno;
2005 /*
2006 * if we get here, there was no way to merge
2007 * the new I/O with this cluster and
2008 * keep it under our maximum cluster length...
2009 * we'll check the remaining clusters before starting a new one
2010 */
1c79356b 2011 }
9bccf70c
A
2012 }
2013 if (cl_index < vp->v_clen)
2014 /*
2015 * we found an existing cluster that we
2016 * could merge this I/O into
2017 */
2018 goto delay_io;
2019
2020 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2021 /*
2022 * we didn't find an existing cluster to
2023 * merge into, but there's room to start
1c79356b
A
2024 * a new one
2025 */
9bccf70c 2026 goto start_new_cluster;
1c79356b 2027
9bccf70c
A
2028 /*
2029 * no existing cluster to merge with and no
2030 * room to start a new one... we'll try
2031 * pushing the existing ones... if none of
2032 * them are able to be pushed, we'll have
2033 * to fall back on the VHASDIRTY mechanism
2034 * cluster_try_push will set v_clen to the
2035 * number of remaining clusters if it is
2036 * unable to push all of them
2037 */
2038 if (vp->v_flag & VNOCACHE_DATA)
2039 can_delay = 0;
2040 else
2041 can_delay = 1;
2042
143cc14e 2043 if (cluster_try_push(vp, newEOF, 0, 0) == 0) {
9bccf70c
A
2044 vp->v_flag |= VHASDIRTY;
2045 goto delay_io;
2046 }
2047start_new_cluster:
2048 if (vp->v_clen == 0) {
2049 vp->v_ciosiz = devblocksize;
1c79356b
A
2050 vp->v_cstart = start_blkno;
2051 vp->v_lastw = last_blkno;
1c79356b 2052 }
9bccf70c
A
2053 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2054 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2055 vp->v_clen++;
2056delay_io:
2057 /*
2058 * make sure we keep v_cstart and v_lastw up to
2059 * date in case we have to fall back on the
2060 * VHASDIRTY mechanism (or we've already entered it)
2061 */
2062 if (start_blkno < vp->v_cstart)
2063 vp->v_cstart = start_blkno;
2064 if (last_blkno > vp->v_lastw)
2065 vp->v_lastw = last_blkno;
2066
b4c24cb9 2067 ubc_upl_commit_range(upl, 0, upl_size, UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
9bccf70c
A
2068 continue;
2069issue_io:
2070 /*
2071 * in order to maintain some semblance of coherency with mapped writes
2072 * we need to write the cluster back out as a multiple of the PAGESIZE
2073 * unless the cluster encompasses the last page of the file... in this
2074 * case we'll round out to the nearest device block boundary
2075 */
2076 io_size = upl_size;
2077
2078 if ((upl_f_offset + io_size) > newEOF) {
2079 io_size = newEOF - upl_f_offset;
2080 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1c79356b 2081 }
9bccf70c 2082
0b4e3aa0 2083 if (flags & IO_SYNC)
1c79356b
A
2084 io_flags = CL_COMMIT | CL_AGE;
2085 else
2086 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
2087
2088 if (vp->v_flag & VNOCACHE_DATA)
2089 io_flags |= CL_DUMP;
2090
0b4e3aa0
A
2091 while (vp->v_numoutput >= ASYNC_THROTTLE) {
2092 vp->v_flag |= VTHROTTLED;
2093 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_write", 0);
2094 }
9bccf70c 2095 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
b4c24cb9 2096 io_flags, (struct buf *)0, (struct clios *)0);
1c79356b
A
2097 }
2098 }
2099 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
2100 retval, 0, 0, 0, 0);
2101
2102 return (retval);
2103}
2104
9bccf70c 2105int
1c79356b
A
2106cluster_read(vp, uio, filesize, devblocksize, flags)
2107 struct vnode *vp;
2108 struct uio *uio;
2109 off_t filesize;
2110 int devblocksize;
2111 int flags;
2112{
1c79356b
A
2113 int prev_resid;
2114 int clip_size;
2115 off_t max_io_size;
2116 struct iovec *iov;
0b4e3aa0
A
2117 vm_offset_t upl_offset;
2118 int upl_size;
2119 int pages_in_pl;
2120 upl_page_info_t *pl;
2121 int upl_flags;
2122 upl_t upl;
1c79356b
A
2123 int retval = 0;
2124
1c79356b
A
2125 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2126 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2127
2128 /*
2129 * We set a threshold of 4 pages to decide if the nocopy
2130 * read loop is worth the trouble...
2131 */
2132
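/*
 * Illustrative summary of the dispatch that follows: vnodes without
 * VNOCACHE_DATA (or non user-space uios) always take the buffered
 * cluster_read_x() path.  Otherwise each iovec is probed with
 * vm_map_get_upl(); physically contiguous targets go to cluster_phys_read(),
 * small (< 4 page) or unaligned requests fall back to cluster_read_x() with
 * a clipped resid, and large page-aligned requests use cluster_nocopy_read()
 */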
0b4e3aa0 2133 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1c79356b 2134 {
0b4e3aa0 2135 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2136 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2137 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2138 return(retval);
1c79356b
A
2139 }
2140
2141 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2142 {
2143 /* we know we have a resid, so this is safe */
2144 iov = uio->uio_iov;
2145 while (iov->iov_len == 0) {
2146 uio->uio_iov++;
2147 uio->uio_iovcnt--;
2148 iov = uio->uio_iov;
2149 }
2150
0b4e3aa0
A
2151 /*
2152 * We check every vector target and if it is physically
2153 * contiguous space, we skip the sanity checks.
2154 */
2155
2156 upl_offset = (vm_offset_t)iov->iov_base & ~PAGE_MASK;
2157 upl_size = (upl_offset + PAGE_SIZE +(PAGE_SIZE -1)) & ~PAGE_MASK;
2158 pages_in_pl = 0;
2159 upl_flags = UPL_QUERY_OBJECT_TYPE;
2160 if((vm_map_get_upl(current_map(),
2161 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2162 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0)) != KERN_SUCCESS)
2163 {
2164 /*
2165 * the user app must have passed in an invalid address
2166 */
2167 return (EFAULT);
2168 }
2169
2170 if (upl_flags & UPL_PHYS_CONTIG)
2171 {
b4c24cb9 2172 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
0b4e3aa0
A
2173 }
2174 else if (uio->uio_resid < 4 * PAGE_SIZE)
2175 {
2176 /*
2177 * We set a threshold of 4 pages to decide if the nocopy
2178 * read loop is worth the trouble...
2179 */
2180 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2181 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2182 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2183 return(retval);
2184 }
2185 else if (uio->uio_offset & PAGE_MASK_64)
1c79356b
A
2186 {
2187 /* Bring the file offset read up to a pagesize boundary */
2188 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2189 if (uio->uio_resid < clip_size)
2190 clip_size = uio->uio_resid;
2191 /*
2192 * Fake the resid going into the cluster_read_x call
2193 * and restore it on the way out.
2194 */
2195 prev_resid = uio->uio_resid;
2196 uio->uio_resid = clip_size;
0b4e3aa0 2197 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2198 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2199 }
2200 else if ((int)iov->iov_base & PAGE_MASK_64)
2201 {
2202 clip_size = iov->iov_len;
2203 prev_resid = uio->uio_resid;
2204 uio->uio_resid = clip_size;
0b4e3aa0 2205 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2206 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2207 }
2208 else
2209 {
2210 /*
2211 * If we come in here, we know the offset into
2212 * the file is on a pagesize boundary
2213 */
2214
2215 max_io_size = filesize - uio->uio_offset;
2216 clip_size = uio->uio_resid;
2217 if (iov->iov_len < clip_size)
2218 clip_size = iov->iov_len;
2219 if (max_io_size < clip_size)
2220 clip_size = (int)max_io_size;
2221
2222 if (clip_size < PAGE_SIZE)
2223 {
2224 /*
2225 * Take care of the tail end of the read in this vector.
2226 */
2227 prev_resid = uio->uio_resid;
2228 uio->uio_resid = clip_size;
0b4e3aa0 2229 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2230 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2231 }
2232 else
2233 {
2234 /* round clip_size down to a multiple of pagesize */
2235 clip_size = clip_size & ~(PAGE_MASK);
2236 prev_resid = uio->uio_resid;
2237 uio->uio_resid = clip_size;
0b4e3aa0 2238 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
1c79356b 2239 if ((retval==0) && uio->uio_resid)
0b4e3aa0 2240 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2241 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2242 }
2243 } /* end else */
2244 } /* end while */
2245
2246 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2247 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
2248
2249 return(retval);
2250}
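/*
 * Hedged usage sketch (not authoritative): a filesystem's read vnode-op
 * could funnel into cluster_read() roughly as below.  The function name,
 * the 512-byte devblocksize and the reuse of ioflag as the flags argument
 * are assumptions made purely for illustration.
 */
#if 0
static int
example_fs_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	off_t filesize = ubc_getsize(vp);	/* current EOF as tracked by the UBC */
	int devblocksize = 512;			/* assumed device block size */

	/* cluster_read() picks the cached, nocopy or phys-contig path itself */
	return (cluster_read(vp, uio, filesize, devblocksize, ioflag));
}
#endif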
2251
b4c24cb9 2252
9bccf70c 2253static int
0b4e3aa0 2254cluster_read_x(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2255 struct vnode *vp;
2256 struct uio *uio;
2257 off_t filesize;
2258 int devblocksize;
2259 int flags;
2260{
2261 upl_page_info_t *pl;
2262 upl_t upl;
2263 vm_offset_t upl_offset;
2264 int upl_size;
2265 off_t upl_f_offset;
2266 int start_offset;
2267 int start_pg;
2268 int last_pg;
2269 int uio_last;
2270 int pages_in_upl;
2271 off_t max_size;
2272 int io_size;
2273 vm_offset_t io_address;
2274 kern_return_t kret;
2275 int segflg;
2276 int error = 0;
2277 int retval = 0;
2278 int b_lblkno;
2279 int e_lblkno;
2280
2281 b_lblkno = (int)(uio->uio_offset / PAGE_SIZE_64);
2282
2283 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2284 /*
2285 * compute the size of the upl needed to encompass
2286 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2287 * to the maximum UPL size... cluster_io will clip if
2288 * this exceeds the maximum io_size for the device,
2289 * make sure to account for
1c79356b
A
2290 * a starting offset that's not page aligned
2291 */
2292 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2293 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2294 max_size = filesize - uio->uio_offset;
2295
0b4e3aa0 2296 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
1c79356b
A
2297 io_size = uio->uio_resid;
2298 else
2299 io_size = max_size;
9bccf70c 2300
1c79356b
A
2301 if (uio->uio_segflg == UIO_USERSPACE && !(vp->v_flag & VNOCACHE_DATA)) {
2302 segflg = uio->uio_segflg;
2303
2304 uio->uio_segflg = UIO_PHYS_USERSPACE;
2305
2306 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2307 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2308
2309 while (io_size && retval == 0) {
de355530
A
2310 int xsize;
2311 vm_offset_t paddr;
1c79356b 2312
0b4e3aa0
A
2313 if (ubc_page_op(vp,
2314 upl_f_offset,
2315 UPL_POP_SET | UPL_POP_BUSY,
2316 &paddr, 0) != KERN_SUCCESS)
1c79356b
A
2317 break;
2318
2319 xsize = PAGE_SIZE - start_offset;
2320
2321 if (xsize > io_size)
2322 xsize = io_size;
2323
de355530 2324 retval = uiomove((caddr_t)(paddr + start_offset), xsize, uio);
1c79356b 2325
0b4e3aa0
A
2326 ubc_page_op(vp, upl_f_offset,
2327 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
1c79356b
A
2328
2329 io_size -= xsize;
2330 start_offset = (int)
2331 (uio->uio_offset & PAGE_MASK_64);
2332 upl_f_offset = uio->uio_offset - start_offset;
2333 }
2334 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2335 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2336
2337 uio->uio_segflg = segflg;
2338
2339 if (retval)
2340 break;
2341
2342 if (io_size == 0) {
2343 /*
2344 * we're already finished with this read request
2345 * let's see if we should do a read-ahead
2346 */
2347 e_lblkno = (int)
2348 ((uio->uio_offset - 1) / PAGE_SIZE_64);
2349
2350 if (!(vp->v_flag & VRAOFF))
2351 /*
2352 * let's try to read ahead if we're in
2353 * a sequential access pattern
2354 */
0b4e3aa0 2355 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b
A
2356 vp->v_lastr = e_lblkno;
2357
2358 break;
2359 }
2360 max_size = filesize - uio->uio_offset;
2361 }
1c79356b 2362 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
2363 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2364 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
2365 pages_in_upl = upl_size / PAGE_SIZE;
2366
2367 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
9bccf70c 2368 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 2369
0b4e3aa0
A
2370 kret = ubc_create_upl(vp,
2371 upl_f_offset,
2372 upl_size,
2373 &upl,
2374 &pl,
2375 UPL_FLAGS_NONE);
1c79356b
A
2376 if (kret != KERN_SUCCESS)
2377 panic("cluster_read: failed to get pagelist");
2378
1c79356b 2379 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
9bccf70c 2380 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
2381
2382 /*
2383 * scan from the beginning of the upl looking for the first
2384 * non-valid page.... this will become the first page in
2385 * the request we're going to make to 'cluster_io'... if all
2386 * of the pages are valid, we won't call through to 'cluster_io'
2387 */
2388 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2389 if (!upl_valid_page(pl, start_pg))
2390 break;
2391 }
2392
2393 /*
2394 * scan from the starting invalid page looking for a valid
2395 * page before the end of the upl is reached, if we
2396 * find one, then it will be the last page of the request to
2397 * 'cluster_io'
2398 */
2399 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2400 if (upl_valid_page(pl, last_pg))
2401 break;
2402 }
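/*
 * Illustrative example: in an 8 page upl where pages 0-1 and 6-7 are already
 * valid and 2-5 are not, start_pg ends up as 2 and last_pg as 6, so the
 * synchronous cluster_io() read below covers only pages 2..5; the uio_last
 * scan further down then extends the copy-out range through the trailing
 * valid pages
 */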
2403
2404 if (start_pg < last_pg) {
2405 /*
2406 * we found a range of 'invalid' pages that must be filled
2407 * if the last page in this range is the last page of the file
2408 * we may have to clip the size of it to keep from reading past
2409 * the end of the last physical block associated with the file
2410 */
2411 upl_offset = start_pg * PAGE_SIZE;
2412 io_size = (last_pg - start_pg) * PAGE_SIZE;
2413
9bccf70c 2414 if ((upl_f_offset + upl_offset + io_size) > filesize)
1c79356b 2415 io_size = filesize - (upl_f_offset + upl_offset);
9bccf70c 2416
1c79356b
A
2417 /*
2418 * issue a synchronous read to cluster_io
2419 */
2420
2421 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
b4c24cb9 2422 io_size, devblocksize, CL_READ, (struct buf *)0, (struct clios *)0);
1c79356b
A
2423 }
2424 if (error == 0) {
2425 /*
2426 * if the read completed successfully, or there was no I/O request
2427 * issued, then map the upl into kernel address space and
2428 * move the data into user land.... we'll first add on any 'valid'
2429 * pages that were present in the upl when we acquired it.
2430 */
2431 u_int val_size;
2432 u_int size_of_prefetch;
2433
2434 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2435 if (!upl_valid_page(pl, uio_last))
2436 break;
2437 }
2438 /*
2439 * compute size to transfer this round, if uio->uio_resid is
2440 * still non-zero after this uiomove, we'll loop around and
2441 * set up for another I/O.
2442 */
2443 val_size = (uio_last * PAGE_SIZE) - start_offset;
2444
2445 if (max_size < val_size)
2446 val_size = max_size;
2447
2448 if (uio->uio_resid < val_size)
2449 val_size = uio->uio_resid;
2450
2451 e_lblkno = (int)((uio->uio_offset + ((off_t)val_size - 1)) / PAGE_SIZE_64);
2452
2453 if (size_of_prefetch = (uio->uio_resid - val_size)) {
2454 /*
2455 * if there's still I/O left to do for this request, then issue a
2456 * pre-fetch I/O... the I/O wait time will overlap
2457 * with the copying of the data
2458 */
0b4e3aa0 2459 cluster_rd_prefetch(vp, uio->uio_offset + val_size, size_of_prefetch, filesize, devblocksize);
1c79356b
A
2460 } else {
2461 if (!(vp->v_flag & VRAOFF) && !(vp->v_flag & VNOCACHE_DATA))
2462 /*
2463 * let's try to read ahead if we're in
2464 * a sequential access pattern
2465 */
0b4e3aa0 2466 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b
A
2467 vp->v_lastr = e_lblkno;
2468 }
1c79356b
A
2469 if (uio->uio_segflg == UIO_USERSPACE) {
2470 int offset;
2471
2472 segflg = uio->uio_segflg;
2473
2474 uio->uio_segflg = UIO_PHYS_USERSPACE;
2475
2476
2477 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
2478 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2479
2480 offset = start_offset;
2481
2482 while (val_size && retval == 0) {
2483 int csize;
2484 int i;
de355530 2485 caddr_t paddr;
1c79356b
A
2486
2487 i = offset / PAGE_SIZE;
2488 csize = min(PAGE_SIZE - start_offset, val_size);
2489
de355530 2490 paddr = (caddr_t)upl_phys_page(pl, i) + start_offset;
1c79356b 2491
de355530 2492 retval = uiomove(paddr, csize, uio);
1c79356b
A
2493
2494 val_size -= csize;
2495 offset += csize;
2496 start_offset = offset & PAGE_MASK;
2497 }
2498 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
2499 (int)uio->uio_offset, val_size, uio->uio_resid, 0, 0);
2500
2501 uio->uio_segflg = segflg;
9bccf70c
A
2502 }
2503 else
1c79356b 2504 {
0b4e3aa0
A
2505 if ((kret = ubc_upl_map(upl, &io_address)) != KERN_SUCCESS)
2506 panic("cluster_read: ubc_upl_map() failed\n");
1c79356b
A
2507
2508 retval = uiomove((caddr_t)(io_address + start_offset), val_size, uio);
2509
0b4e3aa0
A
2510 if ((kret = ubc_upl_unmap(upl)) != KERN_SUCCESS)
2511 panic("cluster_read: ubc_upl_unmap() failed\n");
1c79356b
A
2512 }
2513 }
2514 if (start_pg < last_pg) {
2515 /*
2516 * compute the range of pages that we actually issued an I/O for
2517 * and either commit them as valid if the I/O succeeded
2518 * or abort them if the I/O failed
2519 */
2520 io_size = (last_pg - start_pg) * PAGE_SIZE;
2521
2522 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2523 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2524
2525 if (error || (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0 2526 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
1c79356b
A
2527 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2528 else
0b4e3aa0
A
2529 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
2530 UPL_COMMIT_CLEAR_DIRTY
1c79356b 2531 | UPL_COMMIT_FREE_ON_EMPTY
0b4e3aa0 2532 | UPL_COMMIT_INACTIVATE);
1c79356b
A
2533
2534 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2535 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2536 }
2537 if ((last_pg - start_pg) < pages_in_upl) {
2538 int cur_pg;
2539 int commit_flags;
2540
2541 /*
2542 * the set of pages that we issued an I/O for did not encompass
2543 * the entire upl... so just release these without modifying
2544 * their state
2545 */
2546 if (error)
9bccf70c 2547 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2548 else {
0b4e3aa0 2549 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2550 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
1c79356b 2551
0b4e3aa0
A
2552 if (start_pg) {
2553 /*
2554 * we found some already valid pages at the beginning of
2555 * the upl commit these back to the inactive list with
2556 * reference cleared
2557 */
2558 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2559 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2560 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2561
2562 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2563 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2564
2565 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2566 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2567 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2568 else
0b4e3aa0
A
2569 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2570 PAGE_SIZE, commit_flags);
1c79356b
A
2571 }
2572 }
2573 if (last_pg < uio_last) {
0b4e3aa0
A
2574 /*
2575 * we found some already valid pages immediately after the
2576 * pages we issued I/O for, commit these back to the
2577 * inactive list with reference cleared
2578 */
2579 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2580 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2581 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2582
2583 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2584 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2585
2586 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2587 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2588 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2589 else
0b4e3aa0
A
2590 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2591 PAGE_SIZE, commit_flags);
1c79356b
A
2592 }
2593 }
2594 if (uio_last < pages_in_upl) {
0b4e3aa0
A
2595 /*
2596 * there were some invalid pages beyond the valid pages
2597 * that we didn't issue an I/O for, just release them
2598 * unchanged
1c79356b 2599 */
9bccf70c
A
2600 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2601 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2602 }
2603
0b4e3aa0 2604 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2605 (int)upl, -1, -1, 0, 0);
1c79356b
A
2606 }
2607 }
2608 if (retval == 0)
2609 retval = error;
2610 }
2611
2612 return (retval);
2613}
2614
b4c24cb9 2615
9bccf70c 2616static int
0b4e3aa0 2617cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2618 struct vnode *vp;
2619 struct uio *uio;
2620 off_t filesize;
2621 int devblocksize;
2622 int flags;
2623{
2624 upl_t upl;
2625 upl_page_info_t *pl;
2626 off_t upl_f_offset;
2627 vm_offset_t upl_offset;
2628 off_t start_upl_f_offset;
2629 off_t max_io_size;
2630 int io_size;
2631 int upl_size;
2632 int upl_needed_size;
2633 int pages_in_pl;
de355530 2634 vm_offset_t paddr;
1c79356b
A
2635 int upl_flags;
2636 kern_return_t kret;
2637 int segflg;
2638 struct iovec *iov;
2639 int i;
2640 int force_data_sync;
1c79356b 2641 int retval = 0;
d7e50217
A
2642 int first = 1;
2643 struct clios iostate;
1c79356b
A
2644
2645 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2646 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2647
2648 /*
2649 * When we enter this routine, we know
2650 * -- the offset into the file is on a pagesize boundary
2651 * -- the resid is a page multiple
2652 * -- the resid will not exceed iov_len
2653 */
2654
d7e50217
A
2655 iostate.io_completed = 0;
2656 iostate.io_issued = 0;
2657 iostate.io_error = 0;
2658 iostate.io_wanted = 0;
2659
1c79356b 2660 iov = uio->uio_iov;
d7e50217 2661
1c79356b
A
2662 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2663
d7e50217 2664 max_io_size = filesize - uio->uio_offset;
0b4e3aa0 2665
d7e50217
A
2666 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2667 io_size = max_io_size;
2668 else
2669 io_size = uio->uio_resid;
1c79356b 2670
d7e50217
A
2671 /*
2672 * We don't come into this routine unless
2673 * UIO_USERSPACE is set.
2674 */
2675 segflg = uio->uio_segflg;
1c79356b 2676
d7e50217 2677 uio->uio_segflg = UIO_PHYS_USERSPACE;
1c79356b 2678
d7e50217
A
2679 /*
2680 * First look for pages already in the cache
2681 * and move them to user space.
2682 */
2683 while (io_size && (retval == 0)) {
2684 upl_f_offset = uio->uio_offset;
1c79356b 2685
d7e50217
A
2686 /*
2687 * If this call fails, it means the page is not
2688 * in the page cache.
2689 */
2690 if (ubc_page_op(vp, upl_f_offset,
2691 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) != KERN_SUCCESS)
2692 break;
1c79356b 2693
de355530 2694 retval = uiomove((caddr_t)(paddr), PAGE_SIZE, uio);
1c79356b 2695
d7e50217
A
2696 ubc_page_op(vp, upl_f_offset,
2697 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
1c79356b 2698
d7e50217
A
2699 io_size -= PAGE_SIZE;
2700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 71)) | DBG_FUNC_NONE,
2701 (int)uio->uio_offset, io_size, uio->uio_resid, 0, 0);
2702 }
2703 uio->uio_segflg = segflg;
1c79356b 2704
d7e50217
A
2705 if (retval) {
2706 /*
2707 * we may have already spun some portion of this request
2708 * off as async requests... we need to wait for the I/O
2709 * to complete before returning
2710 */
2711 goto wait_for_reads;
0b4e3aa0 2712 }
d7e50217
A
2713 /*
2714 * If we are already finished with this read, then return
2715 */
2716 if (io_size == 0) {
2717 /*
2718 * we may have already spun some portion of this request
2719 * off as async requests... we need to wait for the I/O
2720 * to complete before returning
2721 */
2722 goto wait_for_reads;
2723 }
2724 max_io_size = io_size;
2725
2726 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2727 max_io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2728 if (first) {
2729 if (max_io_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2730 max_io_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 8;
2731 first = 0;
2732 }
2733 start_upl_f_offset = uio->uio_offset; /* this is page aligned in the file */
2734 upl_f_offset = start_upl_f_offset;
2735 io_size = 0;
1c79356b 2736
d7e50217
A
2737 while (io_size < max_io_size) {
2738 if (ubc_page_op(vp, upl_f_offset,
2739 UPL_POP_SET | UPL_POP_BUSY, &paddr, 0) == KERN_SUCCESS) {
2740 ubc_page_op(vp, upl_f_offset,
2741 UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
2742 break;
2743 }
2744 /*
2745 * Build up the io request parameters.
2746 */
2747 io_size += PAGE_SIZE_64;
2748 upl_f_offset += PAGE_SIZE_64;
2749 }
2750 if (io_size == 0)
2751 /*
2752 * we may have already spun some portion of this request
2753 * off as async requests... we need to wait for the I/O
2754 * to complete before returning
2755 */
2756 goto wait_for_reads;
1c79356b 2757
d7e50217
A
2758 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2759 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1c79356b 2760
d7e50217
A
2761 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2762 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1c79356b 2763
d7e50217
A
2764 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2765 pages_in_pl = 0;
2766 upl_size = upl_needed_size;
2767 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
1c79356b 2768
d7e50217
A
2769 kret = vm_map_get_upl(current_map(),
2770 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2771 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
1c79356b 2772
d7e50217
A
2773 if (kret != KERN_SUCCESS) {
2774 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2775 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2776
d7e50217
A
2777 /*
2778 * cluster_nocopy_read: failed to get pagelist
2779 *
2780 * we may have already spun some portion of this request
2781 * off as async requests... we need to wait for the I/O
2782 * to complete before returning
2783 */
2784 goto wait_for_reads;
2785 }
2786 pages_in_pl = upl_size / PAGE_SIZE;
2787 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 2788
d7e50217
A
2789 for (i = 0; i < pages_in_pl; i++) {
2790 if (!upl_valid_page(pl, i))
2791 break;
2792 }
2793 if (i == pages_in_pl)
2794 break;
0b4e3aa0 2795
d7e50217
A
2796 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2797 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2798 }
d7e50217
A
2799 if (force_data_sync >= 3) {
2800 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2801 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2802
d7e50217
A
2803 goto wait_for_reads;
2804 }
2805 /*
2806 * Consider the possibility that upl_size wasn't satisfied.
2807 */
2808 if (upl_size != upl_needed_size)
2809 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 2810
d7e50217
A
2811 if (io_size == 0) {
2812 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2813 UPL_ABORT_FREE_ON_EMPTY);
2814 goto wait_for_reads;
2815 }
2816 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2817 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2818
d7e50217
A
2819 /*
2820 * request asynchronously so that we can overlap
2821 * the preparation of the next I/O
2822 * if there are already too many outstanding reads
2823 * wait until some have completed before issuing the next read
2824 */
2825 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2826 iostate.io_wanted = 1;
2827 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2828 }
2829 if (iostate.io_error) {
2830 /*
2831 * one of the earlier reads we issued ran into a hard error
2832 * don't issue any more reads, cleanup the UPL
2833 * that was just created but not used, then
2834 * go wait for any other reads to complete before
2835 * returning the error to the caller
2836 */
2837 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2838 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2839
d7e50217
A
2840 goto wait_for_reads;
2841 }
2842 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
2843 (int)upl, (int)upl_offset, (int)start_upl_f_offset, io_size, 0);
1c79356b 2844
d7e50217
A
2845 retval = cluster_io(vp, upl, upl_offset, start_upl_f_offset,
2846 io_size, devblocksize,
2847 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2848 (struct buf *)0, &iostate);
1c79356b 2849
d7e50217
A
2850 /*
2851 * update the uio structure
2852 */
2853 iov->iov_base += io_size;
2854 iov->iov_len -= io_size;
2855 uio->uio_resid -= io_size;
2856 uio->uio_offset += io_size;
1c79356b 2857
d7e50217
A
2858 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2859 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
1c79356b
A
2860
2861 } /* end while */
2862
d7e50217
A
2863wait_for_reads:
2864 /*
2865 * make sure all async reads that are part of this stream
2866 * have completed before we return
2867 */
2868 while (iostate.io_issued != iostate.io_completed) {
2869 iostate.io_wanted = 1;
2870 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2871 }
2872 if (iostate.io_error)
2873 retval = iostate.io_error;
1c79356b
A
2874
2875 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
2876 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
2877
2878 return (retval);
2879}
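/*
 * Illustrative summary: the nocopy path pipelines its reads through the
 * 'struct clios' iostate... each chunk is issued CL_ASYNC directly against
 * the user's pages (CL_PRESERVE | CL_NOZERO), the io_issued/io_completed
 * counters throttle the pipeline to roughly two maximal UPLs worth of
 * outstanding I/O, and wait_for_reads: drains everything before an error or
 * the final resid is reported to the caller
 */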
2880
2881
9bccf70c 2882static int
b4c24cb9 2883cluster_phys_read(vp, uio, filesize, devblocksize, flags)
0b4e3aa0
A
2884 struct vnode *vp;
2885 struct uio *uio;
2886 off_t filesize;
b4c24cb9
A
2887 int devblocksize;
2888 int flags;
0b4e3aa0 2889{
b4c24cb9 2890 upl_page_info_t *pl;
0b4e3aa0
A
2891 upl_t upl;
2892 vm_offset_t upl_offset;
de355530 2893 vm_offset_t dst_paddr;
0b4e3aa0
A
2894 off_t max_size;
2895 int io_size;
b4c24cb9 2896 int tail_size;
0b4e3aa0
A
2897 int upl_size;
2898 int upl_needed_size;
2899 int pages_in_pl;
2900 int upl_flags;
2901 kern_return_t kret;
2902 struct iovec *iov;
b4c24cb9 2903 struct clios iostate;
0b4e3aa0
A
2904 int error;
2905
2906 /*
2907 * When we enter this routine, we know
2908 * -- the resid will not exceed iov_len
2909 * -- the target address is physically contiguous
2910 */
2911
2912 iov = uio->uio_iov;
2913
2914 max_size = filesize - uio->uio_offset;
2915
b4c24cb9
A
2916 if (max_size > (off_t)((unsigned int)iov->iov_len))
2917 io_size = iov->iov_len;
0b4e3aa0 2918 else
b4c24cb9 2919 io_size = max_size;
0b4e3aa0
A
2920
2921 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK_64;
2922 upl_needed_size = upl_offset + io_size;
2923
b4c24cb9 2924 error = 0;
0b4e3aa0
A
2925 pages_in_pl = 0;
2926 upl_size = upl_needed_size;
9bccf70c 2927 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL;
0b4e3aa0
A
2928
2929 kret = vm_map_get_upl(current_map(),
2930 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2931 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
2932
b4c24cb9
A
2933 if (kret != KERN_SUCCESS) {
2934 /*
2935 * cluster_phys_read: failed to get pagelist
2936 */
2937 return(EINVAL);
2938 }
2939 if (upl_size < upl_needed_size) {
2940 /*
2941 * The upl_size wasn't satisfied.
2942 */
2943 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2944
2945 return(EINVAL);
2946 }
2947 pl = ubc_upl_pageinfo(upl);
2948
de355530 2949 dst_paddr = (vm_offset_t)upl_phys_page(pl, 0) + ((vm_offset_t)iov->iov_base & PAGE_MASK);
0b4e3aa0 2950
b4c24cb9
A
2951 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
2952 int head_size;
2953
2954 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
2955
2956 if (head_size > io_size)
2957 head_size = io_size;
2958
2959 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
2960
2961 if (error) {
2962 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
2963
2964 return(EINVAL);
2965 }
2966 upl_offset += head_size;
2967 dst_paddr += head_size;
2968 io_size -= head_size;
2969 }
2970 tail_size = io_size & (devblocksize - 1);
2971 io_size -= tail_size;
2972
2973 iostate.io_completed = 0;
2974 iostate.io_issued = 0;
2975 iostate.io_error = 0;
2976 iostate.io_wanted = 0;
2977
2978 while (io_size && error == 0) {
2979 int xsize;
2980
2981 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
2982 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
2983 else
2984 xsize = io_size;
2985 /*
2986 * request asynchronously so that we can overlap
2987 * the preparation of the next I/O... we'll do
2988 * the commit after all the I/O has completed
2989 * since it's all issued against the same UPL
2990 * if there are already too many outstanding reads
d7e50217 2991 * wait until some have completed before issuing the next
b4c24cb9
A
2992 */
2993 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
2994 iostate.io_wanted = 1;
2995 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
2996 }
2997
2998 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
2999 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3000 (struct buf *)0, &iostate);
3001 /*
3002 * The cluster_io read was issued successfully,
3003 * update the uio structure
3004 */
3005 if (error == 0) {
3006 uio->uio_resid -= xsize;
3007 iov->iov_len -= xsize;
3008 iov->iov_base += xsize;
3009 uio->uio_offset += xsize;
3010 dst_paddr += xsize;
3011 upl_offset += xsize;
3012 io_size -= xsize;
3013 }
3014 }
0b4e3aa0 3015 /*
d7e50217
A
3016 * make sure all async reads that are part of this stream
3017 * have completed before we proceed
0b4e3aa0 3018 */
b4c24cb9
A
3019 while (iostate.io_issued != iostate.io_completed) {
3020 iostate.io_wanted = 1;
3021 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3022 }
3023 if (iostate.io_error) {
3024 error = iostate.io_error;
3025 }
3026 if (error == 0 && tail_size)
3027 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
0b4e3aa0
A
3028
3029 /*
b4c24cb9
A
3030 * just release our hold on the physically contiguous
3031 * region without changing any state
0b4e3aa0 3032 */
b4c24cb9 3033 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
3034
3035 return (error);
3036}
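/*
 * Illustrative example (values assumed): with devblocksize = 512 and
 * uio->uio_offset = 0x10234, the head loop above hands the first
 * 512 - 0x34 = 460 bytes to cluster_align_phys_io(), the body of the
 * transfer is issued CL_DEV_MEMORY against the contiguous buffer, and any
 * remainder smaller than a device block is carried in tail_size and aligned
 * the same way once the async reads have drained
 */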
1c79356b 3037
b4c24cb9 3038
1c79356b
A
3039/*
3040 * generate advisory I/O's in the largest chunks possible
3041 * the completed pages will be released into the VM cache
3042 */
9bccf70c 3043int
1c79356b
A
3044advisory_read(vp, filesize, f_offset, resid, devblocksize)
3045 struct vnode *vp;
3046 off_t filesize;
3047 off_t f_offset;
3048 int resid;
3049 int devblocksize;
3050{
1c79356b
A
3051 upl_page_info_t *pl;
3052 upl_t upl;
3053 vm_offset_t upl_offset;
3054 int upl_size;
3055 off_t upl_f_offset;
3056 int start_offset;
3057 int start_pg;
3058 int last_pg;
3059 int pages_in_upl;
3060 off_t max_size;
3061 int io_size;
3062 kern_return_t kret;
3063 int retval = 0;
9bccf70c 3064 int issued_io;
1c79356b
A
3065
3066 if (!UBCINFOEXISTS(vp))
3067 return(EINVAL);
3068
1c79356b
A
3069 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3070 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3071
3072 while (resid && f_offset < filesize && retval == 0) {
3073 /*
3074 * compute the size of the upl needed to encompass
3075 * the requested read... limit each call to cluster_io
0b4e3aa0
A
3076 * to the maximum UPL size... cluster_io will clip if
3077 * this exceeds the maximum io_size for the device,
3078 * make sure to account for
1c79356b
A
3079 * a starting offset that's not page aligned
3080 */
3081 start_offset = (int)(f_offset & PAGE_MASK_64);
3082 upl_f_offset = f_offset - (off_t)start_offset;
3083 max_size = filesize - f_offset;
3084
3085 if (resid < max_size)
3086 io_size = resid;
3087 else
3088 io_size = max_size;
3089
3090 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
3091 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3092 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
3093 pages_in_upl = upl_size / PAGE_SIZE;
3094
0b4e3aa0
A
3095 kret = ubc_create_upl(vp,
3096 upl_f_offset,
3097 upl_size,
3098 &upl,
3099 &pl,
9bccf70c 3100 UPL_RET_ONLY_ABSENT);
1c79356b 3101 if (kret != KERN_SUCCESS)
9bccf70c
A
3102 return(retval);
3103 issued_io = 0;
1c79356b
A
3104
3105 /*
9bccf70c
A
3106 * before we start marching forward, we must make sure we end on
3107 * a present page, otherwise we will be working with a freed
3108 * upl
1c79356b 3109 */
9bccf70c
A
3110 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3111 if (upl_page_present(pl, last_pg))
3112 break;
1c79356b 3113 }
9bccf70c 3114 pages_in_upl = last_pg + 1;
1c79356b 3115
1c79356b 3116
9bccf70c
A
3117 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_NONE,
3118 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3119
3120
3121 for (last_pg = 0; last_pg < pages_in_upl; ) {
1c79356b 3122 /*
9bccf70c
A
3123 * scan from the beginning of the upl looking for the first
3124 * page that is present.... this will become the first page in
3125 * the request we're going to make to 'cluster_io'... if all
3126 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 3127 */
9bccf70c
A
3128 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3129 if (upl_page_present(pl, start_pg))
3130 break;
1c79356b 3131 }
1c79356b 3132
1c79356b 3133 /*
9bccf70c
A
3134 * scan from the starting present page looking for an absent
3135 * page before the end of the upl is reached, if we
3136 * find one, then it will terminate the range of pages being
3137 * presented to 'cluster_io'
1c79356b 3138 */
9bccf70c
A
3139 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3140 if (!upl_page_present(pl, last_pg))
3141 break;
3142 }
3143
3144 if (last_pg > start_pg) {
3145 /*
3146 * we found a range of pages that must be filled
3147 * if the last page in this range is the last page of the file
3148 * we may have to clip the size of it to keep from reading past
3149 * the end of the last physical block associated with the file
3150 */
3151 upl_offset = start_pg * PAGE_SIZE;
3152 io_size = (last_pg - start_pg) * PAGE_SIZE;
3153
3154 if ((upl_f_offset + upl_offset + io_size) > filesize)
3155 io_size = filesize - (upl_f_offset + upl_offset);
3156
3157 /*
3158 * issue an asynchronous read to cluster_io
3159 */
3160 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
b4c24cb9 3161 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
1c79356b 3162
9bccf70c
A
3163 issued_io = 1;
3164 }
1c79356b 3165 }
9bccf70c
A
3166 if (issued_io == 0)
3167 ubc_upl_abort(upl, 0);
3168
3169 io_size = upl_size - start_offset;
1c79356b
A
3170
3171 if (io_size > resid)
3172 io_size = resid;
3173 f_offset += io_size;
3174 resid -= io_size;
3175 }
9bccf70c 3176
1c79356b
A
3177 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3178 (int)f_offset, resid, retval, 0, 0);
3179
3180 return(retval);
3181}
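/*
 * Illustrative note: because the upl is created with UPL_RET_ONLY_ABSENT,
 * pages already resident are skipped, and the reads that do get issued are
 * asynchronous and tagged CL_AGE... so this routine just warms the cache for
 * the caller's hinted range without blocking
 */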
3182
3183
9bccf70c 3184int
1c79356b
A
3185cluster_push(vp)
3186 struct vnode *vp;
9bccf70c
A
3187{
3188 int retval;
3189
3190 if (!UBCINFOEXISTS(vp) || vp->v_clen == 0) {
3191 vp->v_flag &= ~VHASDIRTY;
3192 return(0);
3193 }
3194
3195 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3196 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3197
3198 if (vp->v_flag & VHASDIRTY) {
3199 daddr_t start_pg;
3200 daddr_t last_pg;
3201 daddr_t end_pg;
3202
3203 start_pg = vp->v_cstart;
3204 end_pg = vp->v_lastw;
3205
3206 vp->v_flag &= ~VHASDIRTY;
3207 vp->v_clen = 0;
3208
3209 while (start_pg < end_pg) {
3210 last_pg = start_pg + MAX_UPL_TRANSFER;
3211
3212 if (last_pg > end_pg)
3213 last_pg = end_pg;
3214
3215 cluster_push_x(vp, ubc_getsize(vp), start_pg, last_pg, 0);
3216
3217 start_pg = last_pg;
3218 }
3219 return (1);
3220 }
3221 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
3222
3223 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3224 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
3225
3226 return (retval);
3227}
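/*
 * Hedged usage sketch (not authoritative): a filesystem would typically call
 * cluster_push() from its fsync or last-close path to flush any delayed-write
 * clusters (or the coarser VHASDIRTY range) before touching metadata.  The
 * wrapper name below is purely illustrative.
 */
#if 0
static int
example_fs_flush(struct vnode *vp)
{
	/* a zero return means there was nothing tracked to push */
	return (cluster_push(vp));
}
#endif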
3228
3229
3230static int
3231cluster_try_push(vp, EOF, can_delay, push_all)
3232 struct vnode *vp;
3233 off_t EOF;
3234 int can_delay;
3235 int push_all;
3236{
3237 int cl_index;
3238 int cl_index1;
3239 int min_index;
3240 int cl_len;
3241 int cl_total;
3242 int cl_pushed;
3243 struct v_cluster l_clusters[MAX_CLUSTERS];
3244
3245 /*
3246 * make a local 'sorted' copy of the clusters
3247 * and clear vp->v_clen so that new clusters can
3248 * be developed
3249 */
3250 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3251 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3252 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3253 continue;
3254 if (min_index == -1)
3255 min_index = cl_index1;
3256 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3257 min_index = cl_index1;
3258 }
3259 if (min_index == -1)
3260 break;
3261 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3262 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3263
3264 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3265 }
3266 cl_len = cl_index;
3267 vp->v_clen = 0;
3268
3269 for (cl_pushed = 0, cl_index = 0; cl_index < cl_len; cl_index++) {
3270 /*
3271 * try to push each cluster in turn... cluster_push_x may not
3272 * push the cluster if can_delay is TRUE and the cluster doesn't
3273 * meet the criteria for an immediate push
3274 */
3275 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3276 l_clusters[cl_index].start_pg = 0;
3277 l_clusters[cl_index].last_pg = 0;
3278
3279 cl_pushed++;
3280
3281 if (push_all == 0)
3282 break;
3283 }
3284 }
3285 if (cl_len > cl_pushed) {
3286 /*
3287 * we didn't push all of the clusters, so
3288 * lets try to merge them back in to the vnode
3289 */
3290 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3291 /*
3292 * we picked up some new clusters while we were trying to
3293 * push the old ones (I don't think this can happen because
3294 * I'm holding the lock, but just in case)... the sum of the
3295 * leftovers plus the new cluster count exceeds our ability
3296 * to represent them, so fall back to the VHASDIRTY mechanism
3297 */
3298 for (cl_index = 0; cl_index < cl_len; cl_index++) {
3299 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3300 continue;
3301
3302 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3303 vp->v_cstart = l_clusters[cl_index].start_pg;
3304 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3305 vp->v_lastw = l_clusters[cl_index].last_pg;
3306 }
3307 vp->v_flag |= VHASDIRTY;
3308 } else {
3309 /*
3310 * we've got room to merge the leftovers back in
3311 * just append them starting at the next 'hole'
3312 * represented by vp->v_clen
3313 */
3314 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3315 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3316 continue;
3317
3318 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3319 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3320
3321 if (cl_index1 == 0) {
3322 vp->v_cstart = l_clusters[cl_index].start_pg;
3323 vp->v_lastw = l_clusters[cl_index].last_pg;
3324 } else {
3325 if (l_clusters[cl_index].start_pg < vp->v_cstart)
3326 vp->v_cstart = l_clusters[cl_index].start_pg;
3327 if (l_clusters[cl_index].last_pg > vp->v_lastw)
3328 vp->v_lastw = l_clusters[cl_index].last_pg;
3329 }
3330 cl_index1++;
3331 }
3332 /*
3333 * update the cluster count
3334 */
3335 vp->v_clen = cl_index1;
3336 }
3337 }
3338 return(MAX_CLUSTERS - vp->v_clen);
3339}
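/*
 * Illustrative note: the loop above is a small selection sort... it snapshots
 * vp->v_clusters[] into l_clusters[] in ascending start_pg order and zeroes
 * v_clen so new clusters can form while the old ones are pushed.  The return
 * value is the number of free cluster slots (MAX_CLUSTERS - vp->v_clen);
 * cluster_write() treats 0 as "no room was freed" and falls back to the
 * VHASDIRTY mechanism
 */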
3340
3341
3342
3343static int
3344cluster_push_x(vp, EOF, first, last, can_delay)
3345 struct vnode *vp;
3346 off_t EOF;
3347 daddr_t first;
3348 daddr_t last;
3349 int can_delay;
1c79356b 3350{
1c79356b
A
3351 upl_page_info_t *pl;
3352 upl_t upl;
3353 vm_offset_t upl_offset;
3354 int upl_size;
3355 off_t upl_f_offset;
3356 int pages_in_upl;
3357 int start_pg;
3358 int last_pg;
3359 int io_size;
3360 int io_flags;
3361 int size;
3362 kern_return_t kret;
3363
3364
9bccf70c
A
3365 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3366 vp->v_clen, first, last, EOF, 0);
3367
3368 if ((pages_in_upl = last - first) == 0) {
3369 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 3370
9bccf70c
A
3371 return (1);
3372 }
1c79356b 3373 upl_size = pages_in_upl * PAGE_SIZE;
9bccf70c 3374 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
1c79356b 3375
9bccf70c
A
3376 if (upl_f_offset + upl_size >= EOF) {
3377
3378 if (upl_f_offset >= EOF) {
3379 /*
3380 * must have truncated the file and missed
3381 * clearing a dangling cluster (i.e. it's completely
3382 * beyond the new EOF)
3383 */
3384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3385
3386 return(1);
3387 }
3388 size = EOF - upl_f_offset;
1c79356b 3389
9bccf70c
A
3390 upl_size = (size + (PAGE_SIZE - 1) ) & ~(PAGE_SIZE - 1);
3391 pages_in_upl = upl_size / PAGE_SIZE;
3392 } else {
3393 if (can_delay && (pages_in_upl < (MAX_UPL_TRANSFER - (MAX_UPL_TRANSFER / 2))))
3394 return(0);
3395 size = upl_size;
3396 }
0b4e3aa0
A
3397 kret = ubc_create_upl(vp,
3398 upl_f_offset,
3399 upl_size,
3400 &upl,
9bccf70c
A
3401 &pl,
3402 UPL_RET_ONLY_DIRTY);
1c79356b
A
3403 if (kret != KERN_SUCCESS)
3404 panic("cluster_push: failed to get pagelist");
3405
9bccf70c
A
3406 if (can_delay) {
3407 int num_of_dirty;
3408
3409 for (num_of_dirty = 0, start_pg = 0; start_pg < pages_in_upl; start_pg++) {
3410 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3411 num_of_dirty++;
3412 }
3413 if (num_of_dirty < pages_in_upl / 2) {
3414 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3415
3416 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 0, 2, num_of_dirty, (pages_in_upl / 2), 0);
3417
3418 return(0);
3419 }
3420 }
1c79356b
A
3421 last_pg = 0;
3422
3423 while (size) {
3424
3425 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3426 if (upl_valid_page(pl, start_pg) && upl_dirty_page(pl, start_pg))
3427 break;
3428 }
3429 if (start_pg > last_pg) {
0b4e3aa0 3430 io_size = (start_pg - last_pg) * PAGE_SIZE;
1c79356b 3431
0b4e3aa0
A
3432 ubc_upl_abort_range(upl, last_pg * PAGE_SIZE, io_size,
3433 UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
3434
3435 if (io_size < size)
3436 size -= io_size;
3437 else
3438 break;
3439 }
3440 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3441 if (!upl_valid_page(pl, last_pg) || !upl_dirty_page(pl, last_pg))
3442 break;
3443 }
3444 upl_offset = start_pg * PAGE_SIZE;
3445
3446 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3447
0b4e3aa0
A
3448 if (vp->v_flag & VNOCACHE_DATA)
3449 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC | CL_DUMP;
1c79356b
A
3450 else
3451 io_flags = CL_COMMIT | CL_AGE | CL_ASYNC;
3452
0b4e3aa0
A
3453 while (vp->v_numoutput >= ASYNC_THROTTLE) {
3454 vp->v_flag |= VTHROTTLED;
3455 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_push", 0);
3456 }
b4c24cb9 3457 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
1c79356b
A
3458
3459 size -= io_size;
3460 }
9bccf70c
A
3461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3462
1c79356b
A
3463 return(1);
3464}
b4c24cb9
A
3465
3466
3467
3468static int
de355530 3469cluster_align_phys_io(struct vnode *vp, struct uio *uio, vm_offset_t usr_paddr, int xsize, int devblocksize, int flags)
b4c24cb9
A
3470{
3471 struct iovec *iov;
3472 upl_page_info_t *pl;
3473 upl_t upl;
de355530 3474 vm_offset_t ubc_paddr;
b4c24cb9
A
3475 kern_return_t kret;
3476 int error = 0;
3477
3478 iov = uio->uio_iov;
3479
3480 kret = ubc_create_upl(vp,
3481 uio->uio_offset & ~PAGE_MASK_64,
3482 PAGE_SIZE,
3483 &upl,
3484 &pl,
3485 UPL_FLAGS_NONE);
3486
3487 if (kret != KERN_SUCCESS)
3488 return(EINVAL);
3489
3490 if (!upl_valid_page(pl, 0)) {
3491 /*
3492 * issue a synchronous read to cluster_io
3493 */
3494 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3495 CL_READ, (struct buf *)0, (struct clios *)0);
3496 if (error) {
3497 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3498
3499 return(error);
3500 }
3501 }
de355530 3502 ubc_paddr = (vm_offset_t)upl_phys_page(pl, 0) + (int)(uio->uio_offset & PAGE_MASK_64);
b4c24cb9 3503
de355530
A
3504 if (flags & CL_READ)
3505 copyp2p(ubc_paddr, usr_paddr, xsize, 2);
3506 else
3507 copyp2p(usr_paddr, ubc_paddr, xsize, 1);
3508
3509 if ( !(flags & CL_READ) || upl_dirty_page(pl, 0)) {
3510 /*
3511 * issue a synchronous write to cluster_io
3512 */
3513 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3514 0, (struct buf *)0, (struct clios *)0);
3515 }
3516 if (error == 0) {
3517 uio->uio_offset += xsize;
3518 iov->iov_base += xsize;
3519 iov->iov_len -= xsize;
3520 uio->uio_resid -= xsize;
3521 }
3522 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3523
3524 return (error);
b4c24cb9 3525}
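/*
 * Illustrative note: this helper does the read-modify-write for the unaligned
 * head or tail of a physically contiguous transfer... the page containing
 * uio->uio_offset is brought into the UBC (read synchronously if not already
 * valid), the partial block is copied between that page and the caller's
 * physical buffer with copyp2p(), a synchronous write pushes the page back
 * out when this is the write side of the fixup or the cached page was already
 * dirty, and the page is then dumped from the cache
 */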