apple/xnu (xnu-517.7.7): bsd/vfs/vfs_cluster.c
/*
 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.10 (Berkeley) 3/28/95
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>
#include <machine/machine_routines.h>

#include <sys/ubc.h>
#include <vm/vm_pageout.h>

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>

#include <sys/kdebug.h>

#define CL_READ       0x01
#define CL_ASYNC      0x02
#define CL_COMMIT     0x04
#define CL_PAGEOUT    0x10
#define CL_AGE        0x20
#define CL_DUMP       0x40
#define CL_NOZERO     0x80
#define CL_PAGEIN     0x100
#define CL_DEV_MEMORY 0x200
#define CL_PRESERVE   0x400
#define CL_THROTTLE   0x800

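/*
 * struct clios tracks the state of a stream of cluster I/Os issued
 * against a single request: cluster_io() bumps io_issued as buffers
 * are handed to the driver, cluster_iodone() bumps io_completed as
 * they finish, and a waiter sets io_wanted before sleeping so the
 * completion path knows to wake it up.
 */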
struct clios {
	u_int  io_completed;       /* amount of io that has currently completed */
	u_int  io_issued;          /* amount of io that was successfully issued */
	int    io_error;           /* error code of first error encountered */
	int    io_wanted;          /* someone is sleeping waiting for a change in state */
};

static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
		int size, struct buf *bp);
static int cluster_read_x(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_write_x(struct vnode *vp, struct uio *uio,
		off_t oldEOF, off_t newEOF, off_t headOff,
		off_t tailOff, int devblocksize, int flags);
static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_phys_read(struct vnode *vp, struct uio *uio,
		off_t filesize, int devblocksize, int flags);
static int cluster_phys_write(struct vnode *vp, struct uio *uio,
		off_t newEOF, int devblocksize, int flags);
static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
		addr64_t usr_paddr, int xsize, int devblocksize, int flags);
static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);

static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);

static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
static kern_return_t vfs_drt_control(void **cmapp, int op_type);

int ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));


/*
 * throttle the number of async writes that
 * can be outstanding on a single vnode
 * before we issue a synchronous write
 */
#define ASYNC_THROTTLE        18
#define HARD_THROTTLE_MAXCNT  1
#define HARD_THROTTLE_MAXSIZE (64 * 1024)

int hard_throttle_on_root = 0;
struct timeval priority_IO_timestamp_for_root;

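/*
 * cluster_hard_throttle_on() reports whether I/O to this vnode should be
 * held to the hard-throttle limits: it returns 1 for the root device when
 * hard_throttle_on_root is set, or when a priority I/O was timestamped
 * within the last 300 msecs, and 0 otherwise.
 */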
static int
cluster_hard_throttle_on(vp)
	struct vnode *vp;
{
	static struct timeval hard_throttle_maxelapsed = { 0, 300000 };

	if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
		struct timeval elapsed;

		if (hard_throttle_on_root)
			return(1);

		elapsed = time;
		timevalsub(&elapsed, &priority_IO_timestamp_for_root);

		if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
			return(1);
	}
	return(0);
}

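/*
 * cluster_iodone() is the completion handler for a chained cluster
 * transaction.  It returns early until every buf in the chain is B_DONE,
 * then rolls up the error/resid totals, frees the component bufs,
 * zero-fills any partial page past EOF, wakes throttled writers and any
 * clios waiter, finishes the caller's real_bp if one was attached, and
 * commits or aborts the underlying UPL as appropriate.
 */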
static int
cluster_iodone(bp)
	struct buf *bp;
{
	int           b_flags;
	int           error;
	int           total_size;
	int           total_resid;
	int           upl_offset;
	int           zero_offset;
	upl_t         upl;
	struct buf   *cbp;
	struct buf   *cbp_head;
	struct buf   *cbp_next;
	struct buf   *real_bp;
	struct vnode *vp;
	struct clios *iostate;
	int           commit_size;
	int           pg_offset;


	cbp_head = (struct buf *)(bp->b_trans_head);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
		     (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);

	for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
		/*
		 * all I/O requests that are part of this transaction
		 * have to complete before we can process it
		 */
		if ( !(cbp->b_flags & B_DONE)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);

			return 0;
		}
	}
	error       = 0;
	total_size  = 0;
	total_resid = 0;

	cbp         = cbp_head;
	upl_offset  = cbp->b_uploffset;
	upl         = cbp->b_pagelist;
	b_flags     = cbp->b_flags;
	real_bp     = cbp->b_real_bp;
	vp          = cbp->b_vp;
	zero_offset = cbp->b_validend;
	iostate     = (struct clios *)cbp->b_iostate;

	while (cbp) {
		if ((cbp->b_flags & B_ERROR) && error == 0)
			error = cbp->b_error;

		total_resid += cbp->b_resid;
		total_size  += cbp->b_bcount;

		cbp_next = cbp->b_trans_next;

		free_io_buf(cbp);

		cbp = cbp_next;
	}
	if (zero_offset)
		cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);

	if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
		vp->v_flag &= ~VTHROTTLED;
		wakeup((caddr_t)&vp->v_numoutput);
	}
	if (iostate) {
		/*
		 * someone has issued multiple I/Os asynchronously
		 * and is waiting for them to complete (streaming)
		 */
		if (error && iostate->io_error == 0)
			iostate->io_error = error;

		iostate->io_completed += total_size;

		if (iostate->io_wanted) {
			/*
			 * someone is waiting for the state of
			 * this io stream to change
			 */
			iostate->io_wanted = 0;
			wakeup((caddr_t)&iostate->io_wanted);
		}
	}
	if ((b_flags & B_NEED_IODONE) && real_bp) {
		if (error) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error = error;
		}
		real_bp->b_resid = total_resid;

		biodone(real_bp);
	}
	if (error == 0 && total_resid)
		error = EIO;

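	/*
	 * if the caller asked us to commit the pages (CL_COMMIT), either
	 * commit or abort the portion of the UPL covered by this transaction,
	 * depending on whether the I/O succeeded
	 */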
	if (b_flags & B_COMMIT_UPL) {
		pg_offset   = upl_offset & PAGE_MASK;
		commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (error || (b_flags & B_NOCACHE)) {
			int upl_abort_code;

			if ((b_flags & B_PAGEOUT) && (error != ENXIO))   /* transient error */
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
			else if (b_flags & B_PGIN)
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
			else
				upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

			ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
					    upl_abort_code);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     0x80000000|upl_abort_code, 0);

		} else {
			int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;

			if (b_flags & B_PHYS) {
				if (b_flags & B_READ)
					upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
			} else if ( !(b_flags & B_PAGEOUT))
				upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;

			if (b_flags & B_AGE)
				upl_commit_flags |= UPL_COMMIT_INACTIVATE;

			ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
					     upl_commit_flags);

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
				     (int)upl, upl_offset - pg_offset, commit_size,
				     upl_commit_flags, 0);
		}
	} else
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
			     (int)upl, upl_offset, 0, error, 0);

	return (error);
}

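/*
 * cluster_zero() zeroes 'size' bytes of the UPL starting at 'upl_offset'.
 * If the buffer has no mapped b_data, the pages are zeroed physically
 * through the UPL's page list (bzero_phys); otherwise a plain bzero of
 * the mapped address is used.
 */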
static void
cluster_zero(upl, upl_offset, size, bp)
	upl_t        upl;
	vm_offset_t  upl_offset;
	int          size;
	struct buf  *bp;
{
	upl_page_info_t *pl;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
		     upl_offset, size, (int)bp, 0, 0);

	if (bp == NULL || bp->b_data == NULL) {

		pl = ubc_upl_pageinfo(upl);

		while (size) {
			int       page_offset;
			int       page_index;
			addr64_t  zero_addr;
			int       zero_cnt;

			page_index  = upl_offset / PAGE_SIZE;
			page_offset = upl_offset & PAGE_MASK;

			zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
			zero_cnt  = min(PAGE_SIZE - page_offset, size);

			bzero_phys(zero_addr, zero_cnt);

			size       -= zero_cnt;
			upl_offset += zero_cnt;
		}
	} else
		bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
		     upl_offset, size, 0, 0, 0);
}

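/*
 * cluster_io() is the common I/O engine for the cluster layer: it carves
 * the byte range described by (upl, upl_offset, f_offset, size) into
 * device-sized chunks using VOP_CMAP, wires each chunk into an io buf,
 * chains the bufs into transactions, and hands them to VOP_STRATEGY,
 * optionally throttling async writes and tracking progress through the
 * caller's clios structure.
 */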
static int
cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
	struct vnode *vp;
	upl_t         upl;
	vm_offset_t   upl_offset;
	off_t         f_offset;
	int           non_rounded_size;
	int           devblocksize;
	int           flags;
	struct buf   *real_bp;
	struct clios *iostate;
{
	struct buf  *cbp;
	u_int        size;
	u_int        io_size;
	int          io_flags;
	int          error = 0;
	int          retval = 0;
	struct buf  *cbp_head = 0;
	struct buf  *cbp_tail = 0;
	int          buf_count = 0;
	int          pg_count;
	int          pg_offset;
	u_int        max_iosize;
	u_int        max_vectors;
	int          priv;
	int          zero_offset = 0;
	int          async_throttle;

	if (devblocksize)
		size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
	else
		size = non_rounded_size;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
		     (int)f_offset, size, upl_offset, flags, 0);

	if (flags & CL_READ) {
		io_flags = (B_VECTORLIST | B_READ);

		vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
	} else {
		io_flags = (B_VECTORLIST | B_WRITEINPROG);

		vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
	}
	/*
	 * make sure the maximum iosize is at least the size of a page
	 * and that it is a multiple of the page size
	 */
	max_iosize &= ~PAGE_MASK;

	if (flags & CL_THROTTLE) {
		if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
			if (max_iosize > HARD_THROTTLE_MAXSIZE)
				max_iosize = HARD_THROTTLE_MAXSIZE;
			async_throttle = HARD_THROTTLE_MAXCNT;
		} else
			async_throttle = ASYNC_THROTTLE;
	}
	if (flags & CL_AGE)
		io_flags |= B_AGE;
	if (flags & CL_DUMP)
		io_flags |= B_NOCACHE;
	if (flags & CL_PAGEIN)
		io_flags |= B_PGIN;
	if (flags & CL_PAGEOUT)
		io_flags |= B_PAGEOUT;
	if (flags & CL_COMMIT)
		io_flags |= B_COMMIT_UPL;
	if (flags & CL_PRESERVE)
		io_flags |= B_PHYS;

	if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
		/*
		 * we are going to end up with a page that we can't complete
		 * (the file size wasn't a multiple of PAGE_SIZE and we're
		 * trying to read to the end of the file), so we'll go ahead
		 * and zero out the portion of the page we can't read in from
		 * the file
		 */
		zero_offset = upl_offset + non_rounded_size;
	}
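	/*
	 * issue the request in pieces: each pass maps the next chunk with
	 * VOP_CMAP, handles holes and page alignment, and queues a buf for
	 * the chunk; a transaction is fired off once it fills up or the
	 * request is exhausted (see start_io below)
	 */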
	while (size) {
		int      vsize;
		int      i;
		int      pg_resid;
		int      num_contig;
		daddr_t  lblkno;
		daddr_t  blkno;

		if (size > max_iosize)
			io_size = max_iosize;
		else
			io_size = size;

		if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
			if (error == EOPNOTSUPP)
				panic("VOP_CMAP Unimplemented");
			break;
		}

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
			     (int)f_offset, (int)blkno, io_size, zero_offset, 0);

		if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
			if (flags & CL_PAGEOUT) {
				error = EINVAL;
				break;
			};

			/* Try paging out the page individually before
			   giving up entirely and dumping it (it could
			   be mapped in a "hole" and require allocation
			   before the I/O)
			 */
			ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
			if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
				error = EINVAL;
				break;
			};

			f_offset   += PAGE_SIZE_64;
			upl_offset += PAGE_SIZE;
			size       -= PAGE_SIZE;
			continue;
		}
		lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
		/*
		 * we have now figured out how much I/O we can do - this is in 'io_size'
		 * pg_offset is the starting point in the first page for the I/O
		 * pg_count is the number of full and partial pages that 'io_size' encompasses
		 */
		pg_offset = upl_offset & PAGE_MASK;

		if (flags & CL_DEV_MEMORY) {
			/*
			 * currently, can't deal with reading 'holes' in file
			 */
			if ((long)blkno == -1) {
				error = EINVAL;
				break;
			}
			/*
			 * treat physical requests as one 'giant' page
			 */
			pg_count = 1;
		} else
			pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;

		if ((flags & CL_READ) && (long)blkno == -1) {
			int bytes_to_zero;

			/*
			 * if we're reading and blkno == -1, then we've got a
			 * 'hole' in the file that we need to deal with by zeroing
			 * out the affected area in the upl
			 */
			if (zero_offset && io_size == size) {
				/*
				 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
				 * then 'zero_offset' will be non-zero
				 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
				 * (indicated by the io_size finishing off the I/O request for this UPL)
				 * then we're not going to issue an I/O for the
				 * last page in this upl... we need to zero both the hole and the tail
				 * of the page beyond the EOF, since the delayed zero-fill won't kick in
				 */
				bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;

				zero_offset = 0;
			} else
				bytes_to_zero = io_size;

			cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);

			if (cbp_head)
				/*
				 * if there is a current I/O chain pending
				 * then the first page of the group we just zero'd
				 * will be handled by the I/O completion if the zero
				 * fill started in the middle of the page
				 */
				pg_count = (io_size - pg_offset) / PAGE_SIZE;
			else {
				/*
				 * no pending I/O to pick up that first page
				 * so we have to make sure it gets committed
				 * here.
				 * set the pg_offset to 0 so that the upl_commit_range
				 * starts with this page
				 */
				pg_count = (io_size + pg_offset) / PAGE_SIZE;
				pg_offset = 0;
			}
			if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
				/*
				 * if we're done with the request for this UPL
				 * then we have to make sure to commit the last page
				 * even if we only partially zero-filled it
				 */
				pg_count++;

			if (pg_count) {
				if (pg_offset)
					pg_resid = PAGE_SIZE - pg_offset;
				else
					pg_resid = 0;

				if (flags & CL_COMMIT)
					ubc_upl_commit_range(upl,
							     (upl_offset + pg_resid) & ~PAGE_MASK,
							     pg_count * PAGE_SIZE,
							     UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
			}
			upl_offset += io_size;
			f_offset   += io_size;
			size       -= io_size;

			if (cbp_head && pg_count)
				goto start_io;
			continue;

		} else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
			real_bp->b_blkno = blkno;
		}

		if (pg_count > max_vectors) {
			io_size -= (pg_count - max_vectors) * PAGE_SIZE;

			if (io_size < 0) {
				io_size  = PAGE_SIZE - pg_offset;
				pg_count = 1;
			} else
				pg_count = max_vectors;
		}

		if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
			/*
			 * if we're not targeting a virtual device i.e. a disk image
			 * it's safe to dip into the reserve pool since real devices
			 * can complete this I/O request without requiring additional
			 * bufs from the alloc_io_buf pool
			 */
			priv = 1;
		else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
			/*
			 * Throttle the speculative IO
			 */
			priv = 0;
		else
			priv = 1;

		cbp = alloc_io_buf(vp, priv);


		if (flags & CL_PAGEOUT) {
			for (i = 0; i < pg_count; i++) {
				int         s;
				struct buf *bp;

				s = splbio();
				if (bp = incore(vp, lblkno + i)) {
					if (!ISSET(bp->b_flags, B_BUSY)) {
						bremfree(bp);
						SET(bp->b_flags, (B_BUSY | B_INVAL));
						splx(s);
						brelse(bp);
					} else
						panic("BUSY bp found in cluster_io");
				}
				splx(s);
			}
		}
		if (flags & CL_ASYNC) {
			cbp->b_flags |= (B_CALL | B_ASYNC);
			cbp->b_iodone = (void *)cluster_iodone;
		}
		cbp->b_flags |= io_flags;

		cbp->b_lblkno     = lblkno;
		cbp->b_blkno      = blkno;
		cbp->b_bcount     = io_size;
		cbp->b_pagelist   = upl;
		cbp->b_uploffset  = upl_offset;
		cbp->b_trans_next = (struct buf *)0;

		if (cbp->b_iostate = (void *)iostate)
			/*
			 * caller wants to track the state of this
			 * io... bump the amount issued against this stream
			 */
			iostate->io_issued += io_size;

		if (flags & CL_READ)
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
		else
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
				     cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);

		if (cbp_head) {
			cbp_tail->b_trans_next = cbp;
			cbp_tail = cbp;
		} else {
			cbp_head = cbp;
			cbp_tail = cbp;
		}
		(struct buf *)(cbp->b_trans_head) = cbp_head;
		buf_count++;

		upl_offset += io_size;
		f_offset   += io_size;
		size       -= io_size;

		if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
			/*
			 * if we have no more I/O to issue or
			 * the current I/O we've prepared fully
			 * completes the last page in this request
			 * and it's either an ASYNC request or
			 * we've already accumulated more than 8 I/O's into
			 * this transaction and it's not an I/O directed to
			 * special DEVICE memory
			 * then go ahead and issue the I/O
			 */
start_io:
			if (real_bp) {
				cbp_head->b_flags |= B_NEED_IODONE;
				cbp_head->b_real_bp = real_bp;
			} else
				cbp_head->b_real_bp = (struct buf *)NULL;

			if (size == 0) {
				/*
				 * we're about to issue the last I/O for this upl
				 * if this was a read to the eof and the eof doesn't
				 * finish on a page boundary, then we need to zero-fill
				 * the rest of the page....
				 */
				cbp_head->b_validend = zero_offset;
			} else
				cbp_head->b_validend = 0;

			if (flags & CL_THROTTLE) {
				while (vp->v_numoutput >= async_throttle) {
					vp->v_flag |= VTHROTTLED;
					tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
				}
			}
			for (cbp = cbp_head; cbp;) {
				struct buf * cbp_next;

				if (io_flags & B_WRITEINPROG)
					cbp->b_vp->v_numoutput++;

				cbp_next = cbp->b_trans_next;

				(void) VOP_STRATEGY(cbp);
				cbp = cbp_next;
			}
			if ( !(flags & CL_ASYNC)) {
				for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
					biowait(cbp);

				if (error = cluster_iodone(cbp_head)) {
					if ((flags & CL_PAGEOUT) && (error == ENXIO))
						retval = 0;	/* drop the error */
					else
						retval = error;
					error  = 0;
				}
			}
			cbp_head = (struct buf *)0;
			cbp_tail = (struct buf *)0;

			buf_count = 0;
		}
	}
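	/*
	 * if an error stopped us short, unwind any bufs still queued on the
	 * unissued chain, back the issued count out of the clios state, and
	 * commit or abort the unprocessed portion of the UPL
	 */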
	if (error) {
		int abort_size;

		io_size = 0;

		for (cbp = cbp_head; cbp;) {
			struct buf * cbp_next;

			upl_offset -= cbp->b_bcount;
			size       += cbp->b_bcount;
			io_size    += cbp->b_bcount;

			cbp_next = cbp->b_trans_next;
			free_io_buf(cbp);
			cbp = cbp_next;
		}
		if (iostate) {
			/*
			 * update the error condition for this stream
			 * since we never really issued the io
			 * just go ahead and adjust it back
			 */
			if (iostate->io_error == 0)
				iostate->io_error = error;
			iostate->io_issued -= io_size;

			if (iostate->io_wanted) {
				/*
				 * someone is waiting for the state of
				 * this io stream to change
				 */
				iostate->io_wanted = 0;
				wakeup((caddr_t)&iostate->io_wanted);
			}
		}
		pg_offset  = upl_offset & PAGE_MASK;
		abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;

		if (flags & CL_COMMIT) {
			int upl_abort_code;

			if (flags & CL_PRESERVE) {
				ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
						     UPL_COMMIT_FREE_ON_EMPTY);
			} else {
				if ((flags & CL_PAGEOUT) && (error != ENXIO))   /* transient error */
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
				else if (flags & CL_PAGEIN)
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
				else
					upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;

				ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
						    upl_abort_code);
			}
			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
				     (int)upl, upl_offset - pg_offset, abort_size, error, 0);
		}
		if (real_bp) {
			real_bp->b_flags |= B_ERROR;
			real_bp->b_error  = error;

			biodone(real_bp);
		}
		if (retval == 0)
			retval = error;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
		     (int)f_offset, size, upl_offset, retval, 0);

	return (retval);
}

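/*
 * cluster_rd_prefetch() clips the requested prefetch to MAX_UPL_TRANSFER
 * pages and to the end of the file, then kicks it off via advisory_read();
 * it returns the number of pages it asked for so the caller can advance
 * its read-ahead window.
 */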
static int
cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
	struct vnode *vp;
	off_t         f_offset;
	u_int         size;
	off_t         filesize;
	int           devblocksize;
{
	int pages_in_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
		     (int)f_offset, size, (int)filesize, 0, 0);

	if (f_offset >= filesize) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
			     (int)f_offset, 0, 0, 0, 0);
		return(0);
	}
	if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
		size = (MAX_UPL_TRANSFER * PAGE_SIZE);
	else
		size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;

	if ((off_t)size > (filesize - f_offset))
		size = filesize - f_offset;
	pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;

	advisory_read(vp, filesize, f_offset, size, devblocksize);

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
		     (int)f_offset + size, pages_in_prefetch, 0, 1, 0);

	return (pages_in_prefetch);
}

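/*
 * cluster_rd_ahead() maintains the per-vnode sequential read-ahead window
 * (v_ralen / v_maxra / v_lastr): it bails out if the access pattern isn't
 * sequential or the requested blocks are already covered, otherwise it
 * grows the window (doubling it, up to MAX_UPL_TRANSFER pages) and
 * prefetches the next range with cluster_rd_prefetch().
 */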
static void
cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
	struct vnode *vp;
	daddr_t       b_lblkno;
	daddr_t       e_lblkno;
	off_t         filesize;
	int           devblocksize;
{
	daddr_t  r_lblkno;
	off_t    f_offset;
	int      size_of_prefetch;

	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
		     b_lblkno, e_lblkno, vp->v_lastr, 0, 0);

	if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
		return;
	}
	if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
				 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
		vp->v_ralen = 0;
		vp->v_maxra = 0;

		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);

		return;
	}
	if (e_lblkno < vp->v_maxra) {
		if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {

			KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
				     vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
			return;
		}
	}
	r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
	f_offset = (off_t)r_lblkno * PAGE_SIZE_64;

	size_of_prefetch = 0;

	ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);

	if (size_of_prefetch) {
		KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
			     vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
		return;
	}
	if (f_offset < filesize) {
		vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;

		if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
			vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);

		size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);

		if (size_of_prefetch)
			vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
	}
	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
		     vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
}
914
9bccf70c 915int
916cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
917 struct vnode *vp;
918 upl_t upl;
919 vm_offset_t upl_offset;
920 off_t f_offset;
921 int size;
922 off_t filesize;
923 int devblocksize;
924 int flags;
925{
926 int io_size;
55e303ae 927 int rounded_size;
1c79356b 928 off_t max_size;
929 int local_flags;
930
931 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
932 /*
933 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
934 * then we don't want to enforce this throttle... if we do, we can
935 * potentially deadlock since we're stalling the pageout thread at a time
936 * when the disk image might need additional memory (which won't be available
937 * if the pageout thread can't run)... instead we'll just depend on the throttle
938 * that the pageout thread now has in place to deal with external files
939 */
940 local_flags = CL_PAGEOUT;
941 else
942 local_flags = CL_PAGEOUT | CL_THROTTLE;
943
944 if ((flags & UPL_IOSYNC) == 0)
945 local_flags |= CL_ASYNC;
946 if ((flags & UPL_NOCOMMIT) == 0)
947 local_flags |= CL_COMMIT;
948
949
950 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
951 (int)f_offset, size, (int)filesize, local_flags, 0);
952
953 /*
954 * If they didn't specify any I/O, then we are done...
955 * we can't issue an abort because we don't know how
956 * big the upl really is
957 */
958 if (size <= 0)
959 return (EINVAL);
960
961 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
962 if (local_flags & CL_COMMIT)
9bccf70c 963 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
964 return (EROFS);
965 }
966 /*
967 * can't page-in from a negative offset
968 * or if we're starting beyond the EOF
969 * or if the file offset isn't page aligned
970 * or the size requested isn't a multiple of PAGE_SIZE
971 */
972 if (f_offset < 0 || f_offset >= filesize ||
973 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
974 if (local_flags & CL_COMMIT)
975 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
976 return (EINVAL);
977 }
978 max_size = filesize - f_offset;
979
980 if (size < max_size)
981 io_size = size;
982 else
9bccf70c 983 io_size = max_size;
1c79356b 984
55e303ae 985 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 986
55e303ae 987 if (size > rounded_size) {
0b4e3aa0 988 if (local_flags & CL_COMMIT)
55e303ae 989 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
990 UPL_ABORT_FREE_ON_EMPTY);
991 }
55e303ae 992 vp->v_flag |= VHASBEENPAGED;
1c79356b 993
9bccf70c 994 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
b4c24cb9 995 local_flags, (struct buf *)0, (struct clios *)0));
996}
997
9bccf70c 998int
999cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
1000 struct vnode *vp;
1001 upl_t upl;
1002 vm_offset_t upl_offset;
1003 off_t f_offset;
1004 int size;
1005 off_t filesize;
1006 int devblocksize;
1007 int flags;
1008{
1009 u_int io_size;
9bccf70c 1010 int rounded_size;
1011 off_t max_size;
1012 int retval;
1013 int local_flags = 0;
1c79356b 1014
1015 if (upl == NULL || size < 0)
1016 panic("cluster_pagein: NULL upl passed in");
1c79356b 1017
1018 if ((flags & UPL_IOSYNC) == 0)
1019 local_flags |= CL_ASYNC;
1c79356b 1020 if ((flags & UPL_NOCOMMIT) == 0)
1021 local_flags |= CL_COMMIT;
1022
1023
1024 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1025 (int)f_offset, size, (int)filesize, local_flags, 0);
1026
1027 /*
1028 * can't page-in from a negative offset
1029 * or if we're starting beyond the EOF
1030 * or if the file offset isn't page aligned
1031 * or the size requested isn't a multiple of PAGE_SIZE
1032 */
1033 if (f_offset < 0 || f_offset >= filesize ||
1034 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1035 if (local_flags & CL_COMMIT)
1036 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1037 return (EINVAL);
1038 }
1039 max_size = filesize - f_offset;
1040
1041 if (size < max_size)
1042 io_size = size;
1043 else
9bccf70c 1044 io_size = max_size;
1c79356b 1045
9bccf70c 1046 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 1047
1048 if (size > rounded_size && (local_flags & CL_COMMIT))
1049 ubc_upl_abort_range(upl, upl_offset + rounded_size,
55e303ae 1050 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1051
1052 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
b4c24cb9 1053 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1054
1055 if (retval == 0) {
1056 int b_lblkno;
1057 int e_lblkno;
1058
1059 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1060 e_lblkno = (int)
1061 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1062
9bccf70c 1063 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1064 /*
1065 * we haven't read the last page in of the file yet
1066 * so let's try to read ahead if we're in
1067 * a sequential access pattern
1068 */
0b4e3aa0 1069 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1070 }
1071 vp->v_lastr = e_lblkno;
1072 }
1073 return (retval);
1074}
1075
9bccf70c 1076int
1077cluster_bp(bp)
1078 struct buf *bp;
1079{
1080 off_t f_offset;
1081 int flags;
1082
1083 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1084 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1085
1086 if (bp->b_pagelist == (upl_t) 0)
1087 panic("cluster_bp: can't handle NULL upl yet\n");
1088 if (bp->b_flags & B_READ)
9bccf70c 1089 flags = CL_ASYNC | CL_READ;
1c79356b 1090 else
9bccf70c 1091 flags = CL_ASYNC;
1092
1093 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1094
b4c24cb9 1095 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1096}
1097
9bccf70c 1098int
1099cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1100 struct vnode *vp;
1101 struct uio *uio;
1102 off_t oldEOF;
1103 off_t newEOF;
1104 off_t headOff;
1105 off_t tailOff;
1106 int devblocksize;
1107 int flags;
1108{
1c79356b
A
1109 int prev_resid;
1110 int clip_size;
1111 off_t max_io_size;
1112 struct iovec *iov;
0b4e3aa0 1113 int upl_size;
0b4e3aa0
A
1114 int upl_flags;
1115 upl_t upl;
1c79356b
A
1116 int retval = 0;
1117
55e303ae
A
1118
1119 if (vp->v_flag & VHASBEENPAGED)
1120 {
1121 /*
1122 * this vnode had pages cleaned to it by
1123 * the pager which indicates that either
1124 * it's not very 'hot', or the system is
1125 * being overwhelmed by a lot of dirty
1126 * data being delayed in the VM cache...
1127 * in either event, we'll push our remaining
1128 * delayed data at this point... this will
1129 * be more efficient than paging out 1 page at
1130 * a time, and will also act as a throttle
1131 * by delaying this client from writing any
1132 * more data until all his delayed data has
1133 * at least been queued to the uderlying driver.
1134 */
1135 cluster_push(vp);
1136
1137 vp->v_flag &= ~VHASBEENPAGED;
1138 }
1c79356b 1139
b4c24cb9 1140 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1c79356b 1141 {
55e303ae
A
1142 /*
1143 * go do a write through the cache if one of the following is true....
1144 * NOCACHE is not true
1145 * there is no uio structure or it doesn't target USERSPACE
1146 */
1147 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
1c79356b
A
1148 }
1149
1150 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1151 {
55e303ae
A
1152 /*
1153 * we know we have a resid, so this is safe
1154 * skip over any emtpy vectors
1155 */
1c79356b 1156 iov = uio->uio_iov;
55e303ae 1157
1c79356b
A
1158 while (iov->iov_len == 0) {
1159 uio->uio_iov++;
1160 uio->uio_iovcnt--;
1161 iov = uio->uio_iov;
1162 }
55e303ae 1163 upl_size = PAGE_SIZE;
0b4e3aa0 1164 upl_flags = UPL_QUERY_OBJECT_TYPE;
55e303ae 1165
0b4e3aa0
A
1166 if ((vm_map_get_upl(current_map(),
1167 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
55e303ae 1168 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
0b4e3aa0
A
1169 {
1170 /*
1171 * the user app must have passed in an invalid address
1172 */
1173 return (EFAULT);
1174 }
1175
55e303ae
A
1176 /*
1177 * We check every vector target but if it is physically
1178 * contiguous space, we skip the sanity checks.
1179 */
0b4e3aa0
A
1180 if (upl_flags & UPL_PHYS_CONTIG)
1181 {
0b4e3aa0
A
1182 if (flags & IO_HEADZEROFILL)
1183 {
1184 flags &= ~IO_HEADZEROFILL;
1185
1186 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1187 return(retval);
1188 }
1189
b4c24cb9 1190 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
0b4e3aa0
A
1191
1192 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1193 {
55e303ae 1194 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
0b4e3aa0
A
1195 }
1196 }
55e303ae 1197 else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
0b4e3aa0
A
1198 {
1199 /*
55e303ae
A
1200 * we're here because we're don't have a physically contiguous target buffer
1201 * go do a write through the cache if one of the following is true....
1202 * the total xfer size is less than a page...
1203 * we're being asked to ZEROFILL either the head or the tail of the I/O...
0b4e3aa0 1204 */
55e303ae 1205 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
0b4e3aa0 1206 }
55e303ae 1207 else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
1c79356b 1208 {
55e303ae
A
1209 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
1210 {
1211 /*
1212 * Bring the file offset write up to a pagesize boundary
1213 * this will also bring the base address to a page boundary
1214 * since they both are currently on the same offset within a page
1215 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1216 * so the computed clip_size must always be less than the current uio_resid
1217 */
1218 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1219
1220 /*
1221 * Fake the resid going into the cluster_write_x call
1222 * and restore it on the way out.
1223 */
1224 prev_resid = uio->uio_resid;
1225 uio->uio_resid = clip_size;
1226 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1227 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1228 }
1229 else
1230 {
1231 /*
1232 * can't get both the file offset and the buffer offset aligned to a page boundary
1233 * so fire an I/O through the cache for this entire vector
1234 */
1235 clip_size = iov->iov_len;
1236 prev_resid = uio->uio_resid;
1237 uio->uio_resid = clip_size;
1238 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1239 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1240 }
1c79356b
A
1241 }
1242 else
1243 {
1244 /*
1245 * If we come in here, we know the offset into
55e303ae
A
1246 * the file is on a pagesize boundary and the
1247 * target buffer address is also on a page boundary
1c79356b 1248 */
1c79356b
A
1249 max_io_size = newEOF - uio->uio_offset;
1250 clip_size = uio->uio_resid;
1251 if (iov->iov_len < clip_size)
1252 clip_size = iov->iov_len;
1253 if (max_io_size < clip_size)
1254 clip_size = max_io_size;
1255
1256 if (clip_size < PAGE_SIZE)
1257 {
1258 /*
1259 * Take care of tail end of write in this vector
1260 */
1261 prev_resid = uio->uio_resid;
1262 uio->uio_resid = clip_size;
0b4e3aa0 1263 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1c79356b
A
1264 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1265 }
1266 else
1267 {
1268 /* round clip_size down to a multiple of pagesize */
1269 clip_size = clip_size & ~(PAGE_MASK);
1270 prev_resid = uio->uio_resid;
1271 uio->uio_resid = clip_size;
0b4e3aa0 1272 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1c79356b 1273 if ((retval == 0) && uio->uio_resid)
0b4e3aa0 1274 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1c79356b
A
1275 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1276 }
1277 } /* end else */
1278 } /* end while */
1279 return(retval);
1280}
1281
b4c24cb9 1282
9bccf70c 1283static int
0b4e3aa0 1284cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1c79356b
A
1285 struct vnode *vp;
1286 struct uio *uio;
1287 off_t newEOF;
1288 int devblocksize;
1289 int flags;
1290{
1291 upl_t upl;
1292 upl_page_info_t *pl;
1293 off_t upl_f_offset;
1294 vm_offset_t upl_offset;
1295 off_t max_io_size;
1296 int io_size;
d7e50217 1297 int io_flag;
1c79356b
A
1298 int upl_size;
1299 int upl_needed_size;
1300 int pages_in_pl;
1301 int upl_flags;
1302 kern_return_t kret;
1303 struct iovec *iov;
1304 int i;
1305 int force_data_sync;
1306 int error = 0;
d7e50217 1307 struct clios iostate;
1c79356b
A
1308
1309 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1310 (int)uio->uio_offset, (int)uio->uio_resid,
1311 (int)newEOF, devblocksize, 0);
1312
1313 /*
1314 * When we enter this routine, we know
1315 * -- the offset into the file is on a pagesize boundary
1316 * -- the resid is a page multiple
1317 * -- the resid will not exceed iov_len
1318 */
143cc14e 1319 cluster_try_push(vp, newEOF, 0, 1);
1c79356b 1320
d7e50217
A
1321 iostate.io_completed = 0;
1322 iostate.io_issued = 0;
1323 iostate.io_error = 0;
1324 iostate.io_wanted = 0;
1325
1c79356b 1326 iov = uio->uio_iov;
1c79356b 1327
0b4e3aa0 1328 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
d7e50217 1329 io_size = uio->uio_resid;
1c79356b 1330
d7e50217
A
1331 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1332 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1333
55e303ae 1334 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
d7e50217
A
1335 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1336
1337 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1338 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1339
1340 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1341 pages_in_pl = 0;
1342 upl_size = upl_needed_size;
1343 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1344 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
d7e50217
A
1345
1346 kret = vm_map_get_upl(current_map(),
1347 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1348 &upl_size,
1349 &upl,
1350 NULL,
1351 &pages_in_pl,
1352 &upl_flags,
1353 force_data_sync);
1354
1355 if (kret != KERN_SUCCESS) {
1356 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1357 0, 0, 0, kret, 0);
d7e50217
A
1358 /*
1359 * cluster_nocopy_write: failed to get pagelist
1360 *
1361 * we may have already spun some portion of this request
1362 * off as async requests... we need to wait for the I/O
1363 * to complete before returning
1364 */
1365 goto wait_for_writes;
1366 }
1367 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1368 pages_in_pl = upl_size / PAGE_SIZE;
1c79356b 1369
d7e50217
A
1370 for (i = 0; i < pages_in_pl; i++) {
1371 if (!upl_valid_page(pl, i))
1372 break;
1373 }
1374 if (i == pages_in_pl)
1375 break;
1c79356b 1376
d7e50217
A
1377 /*
1378 * didn't get all the pages back that we
1379 * needed... release this upl and try again
1380 */
1381 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1382 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1383 }
d7e50217
A
1384 if (force_data_sync >= 3) {
1385 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1386 i, pages_in_pl, upl_size, kret, 0);
d7e50217
A
1387 /*
1388 * for some reason, we couldn't acquire a hold on all
1389 * the pages needed in the user's address space
1390 *
1391 * we may have already spun some portion of this request
1392 * off as async requests... we need to wait for the I/O
1393 * to complete before returning
1394 */
1395 goto wait_for_writes;
1c79356b 1396 }
0b4e3aa0 1397
d7e50217
A
1398 /*
1399 * Consider the possibility that upl_size wasn't satisfied.
1400 */
1401 if (upl_size != upl_needed_size)
1402 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 1403
d7e50217
A
1404 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1405 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1c79356b 1406
d7e50217
A
1407 if (io_size == 0) {
1408 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1409 UPL_ABORT_FREE_ON_EMPTY);
d7e50217
A
1410 /*
1411 * we may have already spun some portion of this request
1412 * off as async requests... we need to wait for the I/O
1413 * to complete before returning
1414 */
1415 goto wait_for_writes;
1416 }
1417 /*
1418 * Now look for pages already in the cache
1419 * and throw them away.
55e303ae
A
1420 * uio->uio_offset is page aligned within the file
1421 * io_size is a multiple of PAGE_SIZE
d7e50217 1422 */
55e303ae 1423 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1c79356b 1424
d7e50217
A
1425 /*
1426 * we want push out these writes asynchronously so that we can overlap
1427 * the preparation of the next I/O
1428 * if there are already too many outstanding writes
1429 * wait until some complete before issuing the next
1430 */
1431 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1432 iostate.io_wanted = 1;
1433 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1434 }
1435 if (iostate.io_error) {
1436 /*
1437 * one of the earlier writes we issued ran into a hard error
1438 * don't issue any more writes, cleanup the UPL
1439 * that was just created but not used, then
1440 * go wait for all writes that are part of this stream
1441 * to complete before returning the error to the caller
1442 */
1443 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1444 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1445
d7e50217
A
1446 goto wait_for_writes;
1447 }
55e303ae 1448 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1c79356b 1449
d7e50217
A
1450 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1451 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1c79356b 1452
d7e50217
A
1453 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1454 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
7b1edb79 1455
d7e50217
A
1456 iov->iov_len -= io_size;
1457 iov->iov_base += io_size;
1458 uio->uio_resid -= io_size;
1459 uio->uio_offset += io_size;
1c79356b 1460
d7e50217
A
1461 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1462 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1c79356b
A
1463
1464 } /* end while */
1465
d7e50217
A
1466wait_for_writes:
1467 /*
1468 * make sure all async writes issued as part of this stream
1469 * have completed before we return
1470 */
1471 while (iostate.io_issued != iostate.io_completed) {
1472 iostate.io_wanted = 1;
1473 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1474 }
1475 if (iostate.io_error)
1476 error = iostate.io_error;
1c79356b
A
1477
1478 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1479 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1480
1481 return (error);
1482}
1483
b4c24cb9 1484
9bccf70c 1485static int
b4c24cb9 1486cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
0b4e3aa0
A
1487 struct vnode *vp;
1488 struct uio *uio;
143cc14e 1489 off_t newEOF;
b4c24cb9
A
1490 int devblocksize;
1491 int flags;
0b4e3aa0 1492{
b4c24cb9 1493 upl_page_info_t *pl;
55e303ae 1494 addr64_t src_paddr;
0b4e3aa0
A
1495 upl_t upl;
1496 vm_offset_t upl_offset;
b4c24cb9 1497 int tail_size;
0b4e3aa0
A
1498 int io_size;
1499 int upl_size;
1500 int upl_needed_size;
1501 int pages_in_pl;
1502 int upl_flags;
1503 kern_return_t kret;
1504 struct iovec *iov;
1505 int error = 0;
1506
1507 /*
1508 * When we enter this routine, we know
1509 * -- the resid will not exceed iov_len
1510 * -- the vector target address is physcially contiguous
1511 */
143cc14e 1512 cluster_try_push(vp, newEOF, 0, 1);
0b4e3aa0
A
1513
1514 iov = uio->uio_iov;
1515 io_size = iov->iov_len;
55e303ae 1516 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
0b4e3aa0
A
1517 upl_needed_size = upl_offset + io_size;
1518
1519 pages_in_pl = 0;
1520 upl_size = upl_needed_size;
9bccf70c 1521 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1522 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0
A
1523
1524 kret = vm_map_get_upl(current_map(),
1525 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1526 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1527
b4c24cb9
A
1528 if (kret != KERN_SUCCESS) {
1529 /*
1530 * cluster_phys_write: failed to get pagelist
1531 * note: return kret here
1532 */
0b4e3aa0 1533 return(EINVAL);
b4c24cb9 1534 }
0b4e3aa0
A
1535 /*
1536 * Consider the possibility that upl_size wasn't satisfied.
1537 * This is a failure in the physical memory case.
1538 */
b4c24cb9
A
1539 if (upl_size < upl_needed_size) {
1540 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1541 return(EINVAL);
1542 }
1543 pl = ubc_upl_pageinfo(upl);
0b4e3aa0 1544
55e303ae 1545 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
0b4e3aa0 1546
b4c24cb9
A
1547 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1548 int head_size;
0b4e3aa0 1549
b4c24cb9 1550 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
0b4e3aa0 1551
b4c24cb9
A
1552 if (head_size > io_size)
1553 head_size = io_size;
1554
1555 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1556
1557 if (error) {
1558 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1559
1560 return(EINVAL);
1561 }
1562 upl_offset += head_size;
1563 src_paddr += head_size;
1564 io_size -= head_size;
0b4e3aa0 1565 }
b4c24cb9
A
1566 tail_size = io_size & (devblocksize - 1);
1567 io_size -= tail_size;
1568
1569 if (io_size) {
1570 /*
1571 * issue a synchronous write to cluster_io
1572 */
1573 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1574 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1575 }
1576 if (error == 0) {
1577 /*
1578 * The cluster_io write completed successfully,
1579 * update the uio structure
1580 */
1581 uio->uio_resid -= io_size;
1582 iov->iov_len -= io_size;
1583 iov->iov_base += io_size;
1584 uio->uio_offset += io_size;
1585 src_paddr += io_size;
1586
1587 if (tail_size)
1588 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1589 }
1590 /*
1591 * just release our hold on the physically contiguous
1592 * region without changing any state
1593 */
1594 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
1595
1596 return (error);
1597}
1598
b4c24cb9 1599
9bccf70c 1600static int
0b4e3aa0 1601cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1c79356b
A
1602 struct vnode *vp;
1603 struct uio *uio;
1604 off_t oldEOF;
1605 off_t newEOF;
1606 off_t headOff;
1607 off_t tailOff;
1608 int devblocksize;
1609 int flags;
1610{
1611 upl_page_info_t *pl;
1612 upl_t upl;
1613 vm_offset_t upl_offset;
1614 int upl_size;
1615 off_t upl_f_offset;
1616 int pages_in_upl;
1617 int start_offset;
1618 int xfer_resid;
1619 int io_size;
1c79356b 1620 int io_flags;
1c79356b
A
1621 int io_offset;
1622 int bytes_to_zero;
1623 int bytes_to_move;
1624 kern_return_t kret;
1625 int retval = 0;
1626 int uio_resid;
1627 long long total_size;
1628 long long zero_cnt;
1629 off_t zero_off;
1630 long long zero_cnt1;
1631 off_t zero_off1;
1632 daddr_t start_blkno;
1633 daddr_t last_blkno;
55e303ae
A
1634 int intersection;
1635
1c79356b
A
1636
1637 if (uio) {
1638 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1639 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1640
1641 uio_resid = uio->uio_resid;
1642 } else {
1643 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1644 0, 0, (int)oldEOF, (int)newEOF, 0);
1645
1646 uio_resid = 0;
1647 }
1648 zero_cnt = 0;
1649 zero_cnt1 = 0;
1650
1651 if (flags & IO_HEADZEROFILL) {
1652 /*
1653 * some filesystems (HFS is one) don't support unallocated holes within a file...
1654 * so we zero fill the intervening space between the old EOF and the offset
1655 * where the next chunk of real data begins.... ftruncate will also use this
1656 * routine to zero fill to the new EOF when growing a file... in this case, the
1657 * uio structure will not be provided
1658 */
1659 if (uio) {
1660 if (headOff < uio->uio_offset) {
1661 zero_cnt = uio->uio_offset - headOff;
1662 zero_off = headOff;
1663 }
1664 } else if (headOff < newEOF) {
1665 zero_cnt = newEOF - headOff;
1666 zero_off = headOff;
1667 }
1668 }
1669 if (flags & IO_TAILZEROFILL) {
1670 if (uio) {
1671 zero_off1 = uio->uio_offset + uio->uio_resid;
1672
1673 if (zero_off1 < tailOff)
1674 zero_cnt1 = tailOff - zero_off1;
1675 }
1676 }
55e303ae 1677 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1c79356b
A
1678 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1679 retval, 0, 0, 0, 0);
1680 return (0);
55e303ae 1681 }
1c79356b
A
1682
1683 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1684 /*
1685 * for this iteration of the loop, figure out where our starting point is
1686 */
1687 if (zero_cnt) {
1688 start_offset = (int)(zero_off & PAGE_MASK_64);
1689 upl_f_offset = zero_off - start_offset;
1690 } else if (uio_resid) {
1691 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1692 upl_f_offset = uio->uio_offset - start_offset;
1693 } else {
1694 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1695 upl_f_offset = zero_off1 - start_offset;
1696 }
1697 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1698 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1699
0b4e3aa0
A
1700 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1701 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1702
55e303ae
A
1703 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1704
1705 if (uio && !(vp->v_flag & VNOCACHE_DATA) &&
1706 (flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0) {
1707 /*
1708 * assumption... total_size <= uio_resid
1709 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1710 */
1711 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1712 total_size -= start_offset;
1713 xfer_resid = total_size;
1714
1715 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1716
1717 if (retval)
1718 break;
1719
1720 uio_resid -= (total_size - xfer_resid);
1721 total_size = xfer_resid;
1722 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1723 upl_f_offset = uio->uio_offset - start_offset;
1724
1725 if (total_size == 0) {
1726 if (start_offset) {
1727 /*
1728 * the write did not finish on a page boundary
1729 * which will leave upl_f_offset pointing to the
1730 * beginning of the last page written instead of
1731 * the page beyond it... bump it in this case
1732 * so that the cluster code records the last page
1733 * written as dirty
1734 */
1735 upl_f_offset += PAGE_SIZE_64;
1736 }
1737 upl_size = 0;
1738
1739 goto check_cluster;
1740 }
1741 }
1c79356b
A
1742 /*
1743 * compute the size of the upl needed to encompass
1744 * the requested write... limit each call to cluster_io
0b4e3aa0
A
1745 * to the maximum UPL size... cluster_io will clip if
1746 * this exceeds the maximum io_size for the device...
1747 * make sure to account for
1c79356b
A
1748 * a starting offset that's not page aligned
1749 */
1750 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1751
0b4e3aa0
A
1752 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1753 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
1754
1755 pages_in_upl = upl_size / PAGE_SIZE;
1756 io_size = upl_size - start_offset;
1757
1758 if ((long long)io_size > total_size)
1759 io_size = total_size;
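/*
 * worked example with hypothetical numbers (assuming PAGE_SIZE == 4096):
 * start_offset = 0x345 and total_size = 0x2000 give
 * upl_size = (0x345 + 0x2000 + 0xfff) & ~0xfff = 0x3000 (3 pages),
 * io_size = 0x3000 - 0x345 = 0x2cbb, which the test above then clips
 * back down to total_size (0x2000)
 */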
1760
55e303ae
A
1761 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
1762
1c79356b 1763
0b4e3aa0
A
1764 kret = ubc_create_upl(vp,
1765 upl_f_offset,
1766 upl_size,
1767 &upl,
1768 &pl,
55e303ae 1769 UPL_SET_LITE);
1c79356b
A
1770 if (kret != KERN_SUCCESS)
1771 panic("cluster_write: failed to get pagelist");
1772
55e303ae
A
1773 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
1774 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1c79356b
A
1775
1776 if (start_offset && !upl_valid_page(pl, 0)) {
0b4e3aa0 1777 int read_size;
1c79356b 1778
0b4e3aa0 1779 /*
1c79356b
A
1780 * we're starting in the middle of the first page of the upl
1781 * and the page isn't currently valid, so we're going to have
1782 * to read it in first... this is a synchronous operation
1783 */
1784 read_size = PAGE_SIZE;
1785
9bccf70c 1786 if ((upl_f_offset + read_size) > newEOF)
1c79356b 1787 read_size = newEOF - upl_f_offset;
9bccf70c
A
1788
1789 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
b4c24cb9 1790 CL_READ, (struct buf *)0, (struct clios *)0);
1c79356b 1791 if (retval) {
0b4e3aa0 1792 /*
1c79356b
A
1793 * we had an error during the read which causes us to abort
1794 * the current cluster_write request... before we do, we need
1795 * to release the rest of the pages in the upl without modifying
1796 * their state and mark the failed page in error
1797 */
0b4e3aa0 1798 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
9bccf70c 1799 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1800
1801 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1802 (int)upl, 0, 0, retval, 0);
1c79356b
A
1803 break;
1804 }
1805 }
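/*
 * illustrative example (hypothetical numbers): a write beginning at file
 * offset 0x4200 maps a upl whose first page covers 0x4000..0x4fff; if that
 * page isn't already valid in the cache, the synchronous CL_READ above
 * pulls it in first so the bytes at 0x4000..0x41ff aren't lost when the
 * partially overwritten page is eventually pushed back out
 */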
1806 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1807 /*
1808 * the last offset we're writing to in this upl does not end on a page
1809 * boundary... if it's not beyond the old EOF, then we'll also need to
1810 * pre-read this page in if it isn't already valid
1811 */
1812 upl_offset = upl_size - PAGE_SIZE;
1813
1814 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1815 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1816 int read_size;
1817
1818 read_size = PAGE_SIZE;
1819
9bccf70c 1820 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1c79356b 1821 read_size = newEOF - (upl_f_offset + upl_offset);
9bccf70c
A
1822
1823 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
b4c24cb9 1824 CL_READ, (struct buf *)0, (struct clios *)0);
1c79356b 1825 if (retval) {
0b4e3aa0 1826 /*
1c79356b 1827 * we had an error during the read which causes us to abort
0b4e3aa0
A
1828 * the current cluster_write request... before we do, we
1829 * need to release the rest of the pages in the upl without
1830 * modifying their state and mark the failed page in error
1c79356b 1831 */
9bccf70c
A
1832 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1833 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1834
1835 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1836 (int)upl, 0, 0, retval, 0);
1c79356b
A
1837 break;
1838 }
1839 }
1840 }
1c79356b
A
1841 xfer_resid = io_size;
1842 io_offset = start_offset;
1843
1844 while (zero_cnt && xfer_resid) {
1845
1846 if (zero_cnt < (long long)xfer_resid)
1847 bytes_to_zero = zero_cnt;
1848 else
1849 bytes_to_zero = xfer_resid;
1850
9bccf70c 1851 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 1852 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 1853 } else {
9bccf70c
A
1854 int zero_pg_index;
1855
1c79356b 1856 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
9bccf70c
A
1857 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1858
1859 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 1860 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 1861
9bccf70c
A
1862 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1863 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 1864 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
1865 }
1866 }
1867 xfer_resid -= bytes_to_zero;
1868 zero_cnt -= bytes_to_zero;
1869 zero_off += bytes_to_zero;
1870 io_offset += bytes_to_zero;
1871 }
1872 if (xfer_resid && uio_resid) {
1873 bytes_to_move = min(uio_resid, xfer_resid);
1874
55e303ae 1875 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
9bccf70c 1876
1c79356b 1877 if (retval) {
9bccf70c
A
1878
1879 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1880
1881 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1882 (int)upl, 0, 0, retval, 0);
1c79356b
A
1883 } else {
1884 uio_resid -= bytes_to_move;
1885 xfer_resid -= bytes_to_move;
1886 io_offset += bytes_to_move;
1887 }
1888 }
1889 while (xfer_resid && zero_cnt1 && retval == 0) {
1890
1891 if (zero_cnt1 < (long long)xfer_resid)
1892 bytes_to_zero = zero_cnt1;
1893 else
1894 bytes_to_zero = xfer_resid;
1895
9bccf70c 1896 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 1897 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 1898 } else {
9bccf70c
A
1899 int zero_pg_index;
1900
1c79356b 1901 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
9bccf70c
A
1902 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1903
1904 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 1905 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
9bccf70c
A
1906 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1907 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 1908 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
1909 }
1910 }
1911 xfer_resid -= bytes_to_zero;
1912 zero_cnt1 -= bytes_to_zero;
1913 zero_off1 += bytes_to_zero;
1914 io_offset += bytes_to_zero;
1915 }
1916
1917 if (retval == 0) {
9bccf70c 1918 int cl_index;
1c79356b
A
1919 int can_delay;
1920
1921 io_size += start_offset;
1922
9bccf70c 1923 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1c79356b
A
1924 /*
1925 * if we're extending the file with this write
1926 * we'll zero fill the rest of the page so that
1927 * if the file gets extended again in such a way as to leave a
1928 * hole starting at this EOF, we'll have zeros in the correct spot
1929 */
55e303ae 1930 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1c79356b 1931 }
9bccf70c
A
1932 if (flags & IO_SYNC)
1933 /*
1934 * if the IO_SYNC flag is set then we need to
1935 * bypass any clusters and immediately issue
1936 * the I/O
1937 */
1938 goto issue_io;
55e303ae
A
1939check_cluster:
1940 /*
1941 * calculate the last logical block number
1942 * that this delayed I/O encompassed
1943 */
1944 last_blkno = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
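/*
 * illustrative example (hypothetical numbers, assuming PAGE_SIZE == 4096):
 * these "block" numbers are in units of pages... upl_f_offset = 0x10000 and
 * upl_size = 0x3000 give start_blkno = 16 (computed earlier from
 * upl_f_offset / PAGE_SIZE_64) and last_blkno = 19, which appears to act as
 * an exclusive upper bound, i.e. the delayed range covers pages 16..18
 */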
1945
1946 if (vp->v_flag & VHASDIRTY) {
1947
1948 if ( !(vp->v_flag & VNOCACHE_DATA)) {
1949 /*
1950 * we've fallen into the sparse
1951 * cluster method of delaying dirty pages
1952 * first, we need to release the upl if we hold one
1953 * since pages in it may be present in the sparse cluster map
1954 * and may span 2 separate buckets there... if they do and
1955 * we happen to have to flush a bucket to make room and it intersects
1956 * this upl, a deadlock may result on page BUSY
1957 */
1958 if (upl_size)
1959 ubc_upl_commit_range(upl, 0, upl_size,
1960 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1961
1962 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
1963
1964 continue;
1965 }
1966 /*
1967 * must have done cached writes that fell into
1968 * the sparse cluster mechanism... we've switched
1969 * to uncached writes on the file, so go ahead
1970 * and push whatever's in the sparse map
1971 * and switch back to normal clustering
1972 *
1973 * see the comment above concerning a possible deadlock...
1974 */
1975 if (upl_size) {
1976 ubc_upl_commit_range(upl, 0, upl_size,
1977 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1978 /*
1979 * setting upl_size to 0 keeps us from committing a
1980 * second time in the start_new_cluster path
1981 */
1982 upl_size = 0;
1983 }
1984 sparse_cluster_push(vp, ubc_getsize(vp), 1);
1985
1986 /*
1987 * no clusters of either type present at this point
1988 * so just go directly to start_new_cluster since
1989 * we know we need to delay this I/O since we've
1990 * already released the pages back into the cache
1991 * to avoid the deadlock with sparse_cluster_push
1992 */
1993 goto start_new_cluster;
1994 }
1995 upl_offset = 0;
1c79356b 1996
9bccf70c
A
1997 if (vp->v_clen == 0)
1998 /*
1999 * no clusters currently present
2000 */
2001 goto start_new_cluster;
1c79356b 2002
9bccf70c 2003 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1c79356b 2004 /*
55e303ae
A
2005 * check each cluster that we currently hold...
2006 * try to merge some or all of this write into
2007 * one or more of the existing clusters... if
2008 * any portion of the write remains, start a
2009 * new cluster
1c79356b 2010 */
9bccf70c
A
2011 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
2012 /*
2013 * the current write starts at or after the current cluster
2014 */
2015 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
2016 /*
2017 * we have a write that fits entirely
2018 * within the existing cluster limits
2019 */
9bccf70c 2020 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1c79356b 2021 /*
9bccf70c 2022 * update our idea of where the cluster ends
1c79356b 2023 */
9bccf70c
A
2024 vp->v_clusters[cl_index].last_pg = last_blkno;
2025 break;
1c79356b 2026 }
9bccf70c 2027 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
2028 /*
2029 * we have a write that starts in the middle of the current cluster
55e303ae
A
2030 * but extends beyond the cluster's limit... we know this because
2031 * of the previous checks
2032 * we'll extend the current cluster to the max
2033 * and update the start_blkno for the current write to reflect that
2034 * the head of it was absorbed into this cluster...
2035 * note that we'll always have a leftover tail in this case since
2036 * full absorption would have occurred in the clause above
1c79356b 2037 */
55e303ae
A
2038 vp->v_clusters[cl_index].last_pg = vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER;
2039
2040 if (upl_size) {
2041 int start_pg_in_upl;
2042
2043 start_pg_in_upl = upl_f_offset / PAGE_SIZE_64;
2044
2045 if (start_pg_in_upl < vp->v_clusters[cl_index].last_pg) {
2046 intersection = (vp->v_clusters[cl_index].last_pg - start_pg_in_upl) * PAGE_SIZE;
2047
2048 ubc_upl_commit_range(upl, upl_offset, intersection,
2049 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2050 upl_f_offset += intersection;
2051 upl_offset += intersection;
2052 upl_size -= intersection;
2053 }
2054 }
2055 start_blkno = vp->v_clusters[cl_index].last_pg;
1c79356b
A
2056 }
2057 /*
55e303ae
A
2058 * we come here for the case where the current write starts
2059 * beyond the limit of the existing cluster or we have a leftover
2060 * tail after a partial absorption
9bccf70c
A
2061 *
2062 * in either case, we'll check the remaining clusters before
2063 * starting a new one
1c79356b 2064 */
9bccf70c 2065 } else {
1c79356b 2066 /*
55e303ae 2067 * the current write starts in front of the cluster we're currently considering
1c79356b 2068 */
55e303ae 2069 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1c79356b 2070 /*
55e303ae
A
2071 * we can just merge the new request into
2072 * this cluster and leave it in the cache
2073 * since the resulting cluster is still
2074 * less than the maximum allowable size
1c79356b 2075 */
9bccf70c 2076 vp->v_clusters[cl_index].start_pg = start_blkno;
1c79356b 2077
9bccf70c
A
2078 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
2079 /*
2080 * the current write completely
55e303ae
A
2081 * envelops the existing cluster and since
2082 * each write is limited to at most MAX_UPL_TRANSFER pages
2083 * we can just use the start and last blocknos of the write
2084 * to generate the cluster limits
9bccf70c
A
2085 */
2086 vp->v_clusters[cl_index].last_pg = last_blkno;
2087 }
2088 break;
1c79356b 2089 }
9bccf70c 2090
1c79356b 2091 /*
9bccf70c
A
2092 * if we were to combine this write with the current cluster
2093 * we would exceed the cluster size limit.... so,
2094 * let's see if there's any overlap of the new I/O with
55e303ae
A
2095 * the cluster we're currently considering... in fact, we'll
2096 * stretch the cluster out to its full limit and see if we
2097 * get an intersection with the current write
9bccf70c 2098 *
1c79356b 2099 */
55e303ae 2100 if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
1c79356b 2101 /*
55e303ae
A
2102 * the current write extends into the proposed cluster
2103 * clip the length of the current write after first combining its
2104 * tail with the newly shaped cluster
1c79356b 2105 */
55e303ae
A
2106 vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;
2107
2108 if (upl_size) {
2109 intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;
2110
2111 if (intersection > upl_size)
2112 /*
2113 * because the current write may consist of a number of pages found in the cache
2114 * which are not part of the UPL, we may have an intersection that exceeds
2115 * the size of the UPL that is also part of this write
2116 */
2117 intersection = upl_size;
2118
2119 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2120 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2121 upl_size -= intersection;
2122 }
2123 last_blkno = vp->v_clusters[cl_index].start_pg;
2124 }
9bccf70c
A
2125 /*
2126 * if we get here, there was no way to merge
55e303ae
A
2127 * any portion of this write with this cluster
2128 * or we could only merge part of it which
2129 * will leave a tail...
9bccf70c
A
2130 * we'll check the remaining clusters before starting a new one
2131 */
1c79356b 2132 }
9bccf70c
A
2133 }
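/*
 * illustrative summary of the merge cases above (hypothetical numbers,
 * assuming MAX_UPL_TRANSFER is, say, 256 pages) for an existing cluster
 * covering pages [100, 200):
 * a write over pages [150, 300) stays under the 100 + 256 limit and simply
 * grows the cluster to [100, 300)...
 * a write over [150, 400) grows the cluster to its limit [100, 356) and
 * leaves a tail starting at page 356 to be checked against the remaining
 * clusters...
 * a write over [50, 150) starts in front and, since 200 - 50 <= 256, is
 * merged by pulling start_pg back, giving [50, 200)
 */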
2134 if (cl_index < vp->v_clen)
2135 /*
55e303ae
A
2136 * we found an existing cluster(s) that we
2137 * could entirely merge this I/O into
9bccf70c
A
2138 */
2139 goto delay_io;
2140
2141 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2142 /*
2143 * we didn't find an existing cluster to
2144 * merge into, but there's room to start
1c79356b
A
2145 * a new one
2146 */
9bccf70c 2147 goto start_new_cluster;
1c79356b 2148
9bccf70c
A
2149 /*
2150 * no existing cluster to merge with and no
2151 * room to start a new one... we'll try
55e303ae
A
2152 * pushing one of the existing ones... if none of
2153 * them are able to be pushed, we'll switch
2154 * to the sparse cluster mechanism
2155 * cluster_try_push updates v_clen to the
2156 * number of remaining clusters... and
2157 * returns the number of currently unused clusters
9bccf70c
A
2158 */
2159 if (vp->v_flag & VNOCACHE_DATA)
2160 can_delay = 0;
2161 else
2162 can_delay = 1;
2163
55e303ae
A
2164 if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
2165 /*
2166 * no more room in the normal cluster mechanism
2167 * so let's switch to the more expansive but expensive
2168 * sparse mechanism....
2169 * first, we need to release the upl if we hold one
2170 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2171 * and may span 2 separate buckets there... if they do and
2172 * we happen to have to flush a bucket to make room and it intersects
2173 * this upl, a deadlock may result on page BUSY
2174 */
2175 if (upl_size)
2176 ubc_upl_commit_range(upl, upl_offset, upl_size,
2177 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2178
2179 sparse_cluster_switch(vp, newEOF);
2180 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
2181
2182 continue;
9bccf70c 2183 }
55e303ae
A
2184 /*
2185 * we pushed one cluster successfully, so we must be sequentially writing this file
2186 * otherwise, we would have failed and fallen into the sparse cluster support
2187 * so let's take the opportunity to push out additional clusters as long as we
2188 * remain below the throttle... this will give us better I/O locality if we're
2189 * in a copy loop (i.e. we won't jump back and forth between the read and write points)...
2190 * however, we don't want to push so much out that the write throttle kicks in and
2191 * hangs this thread up until some of the I/O completes...
2192 */
2193 while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
2194 cluster_try_push(vp, newEOF, 0, 0);
2195
9bccf70c 2196start_new_cluster:
55e303ae 2197 if (vp->v_clen == 0)
9bccf70c 2198 vp->v_ciosiz = devblocksize;
55e303ae 2199
9bccf70c
A
2200 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2201 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2202 vp->v_clen++;
9bccf70c 2203
55e303ae
A
2204delay_io:
2205 if (upl_size)
2206 ubc_upl_commit_range(upl, upl_offset, upl_size,
2207 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
9bccf70c
A
2208 continue;
2209issue_io:
2210 /*
2211 * in order to maintain some semblance of coherency with mapped writes
2212 * we need to write the cluster back out as a multiple of the PAGESIZE
2213 * unless the cluster encompasses the last page of the file... in this
2214 * case we'll round out to the nearest device block boundary
2215 */
2216 io_size = upl_size;
2217
2218 if ((upl_f_offset + io_size) > newEOF) {
2219 io_size = newEOF - upl_f_offset;
2220 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1c79356b 2221 }
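/*
 * illustrative example (hypothetical numbers): with devblocksize = 512 and
 * newEOF - upl_f_offset = 4660 bytes, the rounding above issues 5120 bytes
 * (10 device blocks) rather than a full page's worth, since this cluster
 * contains the last page of the file
 */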
9bccf70c 2222
0b4e3aa0 2223 if (flags & IO_SYNC)
55e303ae 2224 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
1c79356b 2225 else
55e303ae 2226 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;
1c79356b
A
2227
2228 if (vp->v_flag & VNOCACHE_DATA)
2229 io_flags |= CL_DUMP;
2230
9bccf70c 2231 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
b4c24cb9 2232 io_flags, (struct buf *)0, (struct clios *)0);
1c79356b
A
2233 }
2234 }
2235 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
55e303ae 2236 retval, 0, uio_resid, 0, 0);
1c79356b
A
2237
2238 return (retval);
2239}
2240
9bccf70c 2241int
1c79356b
A
2242cluster_read(vp, uio, filesize, devblocksize, flags)
2243 struct vnode *vp;
2244 struct uio *uio;
2245 off_t filesize;
2246 int devblocksize;
2247 int flags;
2248{
1c79356b
A
2249 int prev_resid;
2250 int clip_size;
2251 off_t max_io_size;
2252 struct iovec *iov;
0b4e3aa0 2253 int upl_size;
0b4e3aa0
A
2254 int upl_flags;
2255 upl_t upl;
1c79356b
A
2256 int retval = 0;
2257
1c79356b 2258
0b4e3aa0 2259 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1c79356b 2260 {
55e303ae
A
2261 /*
2262 * go do a read through the cache if one of the following is true....
2263 * NOCACHE is not true
2264 * the uio request doesn't target USERSPACE
2265 */
2266 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
1c79356b
A
2267 }
2268
2269 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2270 {
55e303ae
A
2271 /*
2272 * we know we have a resid, so this is safe
2273 * skip over any empty vectors
2274 */
1c79356b 2275 iov = uio->uio_iov;
55e303ae 2276
1c79356b
A
2277 while (iov->iov_len == 0) {
2278 uio->uio_iov++;
2279 uio->uio_iovcnt--;
2280 iov = uio->uio_iov;
2281 }
55e303ae 2282 upl_size = PAGE_SIZE;
0b4e3aa0 2283 upl_flags = UPL_QUERY_OBJECT_TYPE;
55e303ae
A
2284
2285 if ((vm_map_get_upl(current_map(),
0b4e3aa0 2286 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
55e303ae 2287 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
0b4e3aa0
A
2288 {
2289 /*
2290 * the user app must have passed in an invalid address
2291 */
2292 return (EFAULT);
2293 }
2294
55e303ae
A
2295 /*
2296 * We check every vector target, but if it is physically
2297 * contiguous space, we skip the sanity checks.
2298 */
0b4e3aa0
A
2299 if (upl_flags & UPL_PHYS_CONTIG)
2300 {
b4c24cb9 2301 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
0b4e3aa0 2302 }
55e303ae 2303 else if (uio->uio_resid < PAGE_SIZE)
0b4e3aa0
A
2304 {
2305 /*
55e303ae
A
2306 * we're here because we don't have a physically contiguous target buffer
2307 * go do a read through the cache if
2308 * the total xfer size is less than a page...
1c79356b 2309 */
55e303ae 2310 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
1c79356b 2311 }
55e303ae 2312 else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
1c79356b 2313 {
55e303ae
A
2314 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
2315 {
2316 /*
2317 * Bring the file offset read up to a pagesize boundary
2318 * this will also bring the base address to a page boundary
2319 * since they both are currently on the same offset within a page
2320 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2321 * so the computed clip_size must always be less than the current uio_resid
2322 */
2323 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
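/*
 * worked example with hypothetical numbers (assuming PAGE_SIZE == 4096):
 * if uio->uio_offset and iov->iov_base both end in 0xa00,
 * clip_size = 0x1000 - 0xa00 = 0x600... pushing those 0x600 bytes through
 * the cached path leaves both the file offset and the user buffer page
 * aligned for the nocopy path on the next iteration
 */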
2324
2325 /*
2326 * Fake the resid going into the cluster_read_x call
2327 * and restore it on the way out.
2328 */
2329 prev_resid = uio->uio_resid;
2330 uio->uio_resid = clip_size;
2331 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2332 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2333 }
2334 else
2335 {
2336 /*
2337 * can't get both the file offset and the buffer offset aligned to a page boundary
2338 * so fire an I/O through the cache for this entire vector
2339 */
2340 clip_size = iov->iov_len;
2341 prev_resid = uio->uio_resid;
2342 uio->uio_resid = clip_size;
2343 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2344 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2345 }
1c79356b
A
2346 }
2347 else
2348 {
2349 /*
2350 * If we come in here, we know the offset into
2351 * the file is on a pagesize boundary
2352 */
2353
2354 max_io_size = filesize - uio->uio_offset;
2355 clip_size = uio->uio_resid;
2356 if (iov->iov_len < clip_size)
2357 clip_size = iov->iov_len;
2358 if (max_io_size < clip_size)
2359 clip_size = (int)max_io_size;
2360
2361 if (clip_size < PAGE_SIZE)
2362 {
2363 /*
2364 * Take care of the tail end of the read in this vector.
2365 */
2366 prev_resid = uio->uio_resid;
2367 uio->uio_resid = clip_size;
0b4e3aa0 2368 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2369 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2370 }
2371 else
2372 {
2373 /* round clip_size down to a multiple of pagesize */
2374 clip_size = clip_size & ~(PAGE_MASK);
2375 prev_resid = uio->uio_resid;
2376 uio->uio_resid = clip_size;
0b4e3aa0 2377 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
1c79356b 2378 if ((retval==0) && uio->uio_resid)
0b4e3aa0 2379 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2380 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2381 }
2382 } /* end else */
2383 } /* end while */
2384
1c79356b
A
2385 return(retval);
2386}
2387
9bccf70c 2388static int
0b4e3aa0 2389cluster_read_x(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2390 struct vnode *vp;
2391 struct uio *uio;
2392 off_t filesize;
2393 int devblocksize;
2394 int flags;
2395{
2396 upl_page_info_t *pl;
2397 upl_t upl;
2398 vm_offset_t upl_offset;
2399 int upl_size;
2400 off_t upl_f_offset;
2401 int start_offset;
2402 int start_pg;
2403 int last_pg;
2404 int uio_last;
2405 int pages_in_upl;
2406 off_t max_size;
55e303ae
A
2407 off_t last_ioread_offset;
2408 off_t last_request_offset;
2409 u_int size_of_prefetch;
1c79356b 2410 int io_size;
1c79356b 2411 kern_return_t kret;
1c79356b
A
2412 int error = 0;
2413 int retval = 0;
55e303ae
A
2414 u_int b_lblkno;
2415 u_int e_lblkno;
2416 struct clios iostate;
2417 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2418 u_int rd_ahead_enabled = 1;
2419 u_int prefetch_enabled = 1;
2420
2421
2422 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2423 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2424
2425 if (cluster_hard_throttle_on(vp)) {
2426 rd_ahead_enabled = 0;
2427 prefetch_enabled = 0;
2428
2429 max_rd_size = HARD_THROTTLE_MAXSIZE;
2430 }
2431 if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
2432 rd_ahead_enabled = 0;
2433
2434 last_request_offset = uio->uio_offset + uio->uio_resid;
2435
2436 if (last_request_offset > filesize)
2437 last_request_offset = filesize;
2438 b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
2439 e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);
2440
2441 if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
2442 /*
2443 * determine if we already have a read-ahead in the pipe courtesy of the
2444 * last read system call that was issued...
2445 * if so, pick up its extent to determine where we should start
2446 * with respect to any read-ahead that might be necessary to
2447 * garner all the data needed to complete this read system call
2448 */
2449 last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
1c79356b 2450
55e303ae
A
2451 if (last_ioread_offset < uio->uio_offset)
2452 last_ioread_offset = (off_t)0;
2453 else if (last_ioread_offset > last_request_offset)
2454 last_ioread_offset = last_request_offset;
2455 } else
2456 last_ioread_offset = (off_t)0;
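/*
 * illustrative example (hypothetical numbers; v_maxra is assumed here to
 * track the last page already read ahead): if the previous read left
 * v_maxra = 99, last_ioread_offset starts at byte offset 100 * PAGE_SIZE,
 * i.e. just past the data already in flight, and is then clamped to lie
 * between uio->uio_offset and last_request_offset by the branch above
 */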
1c79356b
A
2457
2458 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2459 /*
2460 * compute the size of the upl needed to encompass
2461 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2462 * to the maximum UPL size... cluster_io will clip if
2463 * this exceeds the maximum io_size for the device...
2464 * make sure to account for
1c79356b
A
2465 * a starting offset that's not page aligned
2466 */
2467 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2468 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2469 max_size = filesize - uio->uio_offset;
2470
0b4e3aa0 2471 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
1c79356b
A
2472 io_size = uio->uio_resid;
2473 else
2474 io_size = max_size;
9bccf70c 2475
55e303ae 2476 if (!(vp->v_flag & VNOCACHE_DATA)) {
1c79356b 2477
55e303ae
A
2478 while (io_size) {
2479 u_int io_resid;
2480 u_int io_requested;
1c79356b 2481
55e303ae
A
2482 /*
2483 * if we keep finding the pages we need already in the cache, then
2484 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2485 * to determine that we have all the pages we need... once we miss in
2486 * the cache and have issued an I/O, then we'll assume that we're likely
2487 * to continue to miss in the cache and it's to our advantage to try and prefetch
2488 */
2489 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2490 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2491 /*
2492 * we've already issued I/O for this request and
2493 * there's still work to do and
2494 * our prefetch stream is running dry, so issue a
2495 * pre-fetch I/O... the I/O latency will overlap
2496 * with the copying of the data
2497 */
2498 if (size_of_prefetch > max_rd_size)
2499 size_of_prefetch = max_rd_size;
1c79356b 2500
55e303ae 2501 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
1c79356b 2502
55e303ae
A
2503 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2504
2505 if (last_ioread_offset > last_request_offset)
2506 last_ioread_offset = last_request_offset;
2507 }
2508 }
2509 /*
2510 * limit the size of the copy we're about to do so that
2511 * we can notice that our I/O pipe is running dry and
2512 * get the next I/O issued before it does go dry
2513 */
2514 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2515 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2516 else
2517 io_resid = io_size;
1c79356b 2518
55e303ae 2519 io_requested = io_resid;
1c79356b 2520
55e303ae 2521 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1c79356b 2522
55e303ae 2523 io_size -= (io_requested - io_resid);
1c79356b 2524
55e303ae
A
2525 if (retval || io_resid)
2526 /*
2527 * if we run into a real error or
2528 * a page that is not in the cache
2529 * we need to leave streaming mode
2530 */
2531 break;
2532
2533 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2534 /*
2535 * we're already finished the I/O for this read request
2536 * let's see if we should do a read-ahead
2537 */
2538 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2539 }
1c79356b 2540 }
1c79356b
A
2541 if (retval)
2542 break;
1c79356b 2543 if (io_size == 0) {
55e303ae
A
2544 if (e_lblkno < vp->v_lastr)
2545 vp->v_maxra = 0;
2546 vp->v_lastr = e_lblkno;
1c79356b
A
2547
2548 break;
2549 }
55e303ae
A
2550 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2551 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2552 max_size = filesize - uio->uio_offset;
1c79356b 2553 }
55e303ae
A
2554 if (io_size > max_rd_size)
2555 io_size = max_rd_size;
2556
1c79356b 2557 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
55e303ae
A
2558
2559 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2560 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
1c79356b
A
2561 pages_in_upl = upl_size / PAGE_SIZE;
2562
2563 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
9bccf70c 2564 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 2565
0b4e3aa0
A
2566 kret = ubc_create_upl(vp,
2567 upl_f_offset,
2568 upl_size,
2569 &upl,
2570 &pl,
55e303ae 2571 UPL_SET_LITE);
1c79356b
A
2572 if (kret != KERN_SUCCESS)
2573 panic("cluster_read: failed to get pagelist");
2574
1c79356b 2575 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
9bccf70c 2576 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
2577
2578 /*
2579 * scan from the beginning of the upl looking for the first
2580 * non-valid page.... this will become the first page in
2581 * the request we're going to make to 'cluster_io'... if all
2582 * of the pages are valid, we won't call through to 'cluster_io'
2583 */
2584 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2585 if (!upl_valid_page(pl, start_pg))
2586 break;
2587 }
2588
2589 /*
2590 * scan from the starting invalid page looking for a valid
2591 * page before the end of the upl is reached, if we
2592 * find one, then it will be the last page of the request to
2593 * 'cluster_io'
2594 */
2595 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2596 if (upl_valid_page(pl, last_pg))
2597 break;
2598 }
55e303ae
A
2599 iostate.io_completed = 0;
2600 iostate.io_issued = 0;
2601 iostate.io_error = 0;
2602 iostate.io_wanted = 0;
1c79356b
A
2603
2604 if (start_pg < last_pg) {
2605 /*
2606 * we found a range of 'invalid' pages that must be filled
2607 * if the last page in this range is the last page of the file
2608 * we may have to clip the size of it to keep from reading past
2609 * the end of the last physical block associated with the file
2610 */
2611 upl_offset = start_pg * PAGE_SIZE;
2612 io_size = (last_pg - start_pg) * PAGE_SIZE;
2613
9bccf70c 2614 if ((upl_f_offset + upl_offset + io_size) > filesize)
1c79356b 2615 io_size = filesize - (upl_f_offset + upl_offset);
9bccf70c 2616
1c79356b 2617 /*
55e303ae 2618 * issue an asynchronous read to cluster_io
1c79356b
A
2619 */
2620
2621 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
55e303ae 2622 io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
1c79356b
A
2623 }
2624 if (error == 0) {
2625 /*
2626 * if the read completed successfully, or there was no I/O request
55e303ae
A
2627 * issued, then copy the data into user land via 'cluster_copy_upl_data'
2628 * we'll first add on any 'valid'
1c79356b
A
2629 * pages that were present in the upl when we acquired it.
2630 */
2631 u_int val_size;
1c79356b
A
2632
2633 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2634 if (!upl_valid_page(pl, uio_last))
2635 break;
2636 }
2637 /*
2638 * compute size to transfer this round... if uio->uio_resid is
55e303ae 2639 * still non-zero after this attempt, we'll loop around and
1c79356b
A
2640 * set up for another I/O.
2641 */
2642 val_size = (uio_last * PAGE_SIZE) - start_offset;
2643
55e303ae 2644 if (val_size > max_size)
1c79356b
A
2645 val_size = max_size;
2646
55e303ae 2647 if (val_size > uio->uio_resid)
1c79356b
A
2648 val_size = uio->uio_resid;
2649
55e303ae
A
2650 if (last_ioread_offset == 0)
2651 last_ioread_offset = uio->uio_offset + val_size;
1c79356b 2652
55e303ae 2653 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
1c79356b 2654 /*
55e303ae
A
2655 * if there's still I/O left to do for this request, and...
2656 * we're not in hard throttle mode, then issue a
2657 * pre-fetch I/O... the I/O latency will overlap
1c79356b
A
2658 * with the copying of the data
2659 */
55e303ae 2660 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
1c79356b 2661
55e303ae
A
2662 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2663
2664 if (last_ioread_offset > last_request_offset)
2665 last_ioread_offset = last_request_offset;
1c79356b 2666
55e303ae
A
2667 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2668 /*
2669 * this transfer will finish this request, so...
2670 * let's try to read ahead if we're in
2671 * a sequential access pattern and we haven't
2672 * explicitly disabled it
2673 */
2674 if (rd_ahead_enabled)
2675 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b 2676
55e303ae
A
2677 if (e_lblkno < vp->v_lastr)
2678 vp->v_maxra = 0;
2679 vp->v_lastr = e_lblkno;
9bccf70c 2680 }
55e303ae
A
2681 while (iostate.io_issued != iostate.io_completed) {
2682 iostate.io_wanted = 1;
2683 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
2684 }
2685 if (iostate.io_error)
2686 error = iostate.io_error;
9bccf70c 2687 else
55e303ae 2688 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
1c79356b
A
2689 }
2690 if (start_pg < last_pg) {
2691 /*
2692 * compute the range of pages that we actually issued an I/O for
2693 * and either commit them as valid if the I/O succeeded
2694 * or abort them if the I/O failed
2695 */
2696 io_size = (last_pg - start_pg) * PAGE_SIZE;
2697
2698 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2699 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2700
2701 if (error || (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0 2702 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
1c79356b
A
2703 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2704 else
0b4e3aa0 2705 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
55e303ae
A
2706 UPL_COMMIT_CLEAR_DIRTY |
2707 UPL_COMMIT_FREE_ON_EMPTY |
2708 UPL_COMMIT_INACTIVATE);
1c79356b
A
2709
2710 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2711 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2712 }
2713 if ((last_pg - start_pg) < pages_in_upl) {
2714 int cur_pg;
2715 int commit_flags;
2716
2717 /*
2718 * the set of pages that we issued an I/O for did not encompass
2719 * the entire upl... so just release these without modifying
55e303ae 2720 * their state
1c79356b
A
2721 */
2722 if (error)
9bccf70c 2723 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2724 else {
0b4e3aa0 2725 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2726 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
1c79356b 2727
0b4e3aa0
A
2728 if (start_pg) {
2729 /*
2730 * we found some already valid pages at the beginning of
2731 * the upl commit these back to the inactive list with
2732 * reference cleared
2733 */
2734 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2735 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2736 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2737
2738 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2739 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2740
2741 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2742 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2743 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2744 else
0b4e3aa0
A
2745 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2746 PAGE_SIZE, commit_flags);
1c79356b
A
2747 }
2748 }
2749 if (last_pg < uio_last) {
0b4e3aa0
A
2750 /*
2751 * we found some already valid pages immediately after the
2752 * pages we issued I/O for, commit these back to the
2753 * inactive list with reference cleared
2754 */
2755 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2756 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2757 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2758
2759 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2760 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2761
2762 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2763 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2764 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2765 else
0b4e3aa0
A
2766 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2767 PAGE_SIZE, commit_flags);
1c79356b
A
2768 }
2769 }
2770 if (uio_last < pages_in_upl) {
0b4e3aa0
A
2771 /*
2772 * there were some invalid pages beyond the valid pages
2773 * that we didn't issue an I/O for, just release them
2774 * unchanged
1c79356b 2775 */
9bccf70c
A
2776 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2777 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2778 }
2779
0b4e3aa0 2780 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2781 (int)upl, -1, -1, 0, 0);
1c79356b
A
2782 }
2783 }
2784 if (retval == 0)
2785 retval = error;
2786 }
55e303ae
A
2787 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2788 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1c79356b
A
2789
2790 return (retval);
2791}
2792
b4c24cb9 2793
9bccf70c 2794static int
0b4e3aa0 2795cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2796 struct vnode *vp;
2797 struct uio *uio;
2798 off_t filesize;
2799 int devblocksize;
2800 int flags;
2801{
2802 upl_t upl;
2803 upl_page_info_t *pl;
1c79356b 2804 vm_offset_t upl_offset;
1c79356b
A
2805 off_t max_io_size;
2806 int io_size;
2807 int upl_size;
2808 int upl_needed_size;
2809 int pages_in_pl;
1c79356b
A
2810 int upl_flags;
2811 kern_return_t kret;
1c79356b
A
2812 struct iovec *iov;
2813 int i;
2814 int force_data_sync;
1c79356b 2815 int retval = 0;
d7e50217 2816 struct clios iostate;
55e303ae
A
2817 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2818 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
2819
1c79356b
A
2820
2821 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2822 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2823
2824 /*
2825 * When we enter this routine, we know
2826 * -- the offset into the file is on a pagesize boundary
2827 * -- the resid is a page multiple
2828 * -- the resid will not exceed iov_len
2829 */
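/*
 * illustrative example (hypothetical numbers): these preconditions are
 * arranged by cluster_read(), which only takes this path once the file
 * offset and user buffer are page aligned and which rounds clip_size down
 * to a page multiple first... e.g. a request for 0x20000 bytes at file
 * offset 0x10000 into a page aligned buffer arrives here unchanged, while
 * an unaligned head would already have been read through the cache
 */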
2830
d7e50217
A
2831 iostate.io_completed = 0;
2832 iostate.io_issued = 0;
2833 iostate.io_error = 0;
2834 iostate.io_wanted = 0;
2835
1c79356b 2836 iov = uio->uio_iov;
d7e50217 2837
55e303ae
A
2838 if (cluster_hard_throttle_on(vp)) {
2839 max_rd_size = HARD_THROTTLE_MAXSIZE;
2840 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
2841 }
1c79356b
A
2842 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2843
d7e50217 2844 max_io_size = filesize - uio->uio_offset;
0b4e3aa0 2845
d7e50217
A
2846 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2847 io_size = max_io_size;
2848 else
2849 io_size = uio->uio_resid;
1c79356b 2850
d7e50217
A
2851 /*
2852 * First look for pages already in the cache
2853 * and move them to user space.
2854 */
55e303ae 2855 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
1c79356b 2856
d7e50217
A
2857 if (retval) {
2858 /*
2859 * we may have already spun some portion of this request
2860 * off as async requests... we need to wait for the I/O
2861 * to complete before returning
2862 */
2863 goto wait_for_reads;
0b4e3aa0 2864 }
d7e50217
A
2865 /*
2866 * If we are already finished with this read, then return
2867 */
2868 if (io_size == 0) {
2869 /*
2870 * we may have already spun some portion of this request
2871 * off as async requests... we need to wait for the I/O
2872 * to complete before returning
2873 */
2874 goto wait_for_reads;
2875 }
2876 max_io_size = io_size;
2877
55e303ae
A
2878 if (max_io_size > max_rd_size)
2879 max_io_size = max_rd_size;
2880
d7e50217 2881 io_size = 0;
1c79356b 2882
55e303ae
A
2883 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
2884
d7e50217
A
2885 if (io_size == 0)
2886 /*
2887 * we may have already spun some portion of this request
2888 * off as async requests... we need to wait for the I/O
2889 * to complete before returning
2890 */
2891 goto wait_for_reads;
1c79356b 2892
55e303ae 2893 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
d7e50217 2894 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1c79356b 2895
d7e50217
A
2896 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2897 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1c79356b 2898
d7e50217
A
2899 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2900 pages_in_pl = 0;
2901 upl_size = upl_needed_size;
55e303ae 2902 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1c79356b 2903
d7e50217
A
2904 kret = vm_map_get_upl(current_map(),
2905 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2906 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
1c79356b 2907
d7e50217
A
2908 if (kret != KERN_SUCCESS) {
2909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2910 (int)upl_offset, upl_size, io_size, kret, 0);
d7e50217
A
2911 /*
2912 * cluster_nocopy_read: failed to get pagelist
2913 *
2914 * we may have already spun some portion of this request
2915 * off as async requests... we need to wait for the I/O
2916 * to complete before returning
2917 */
2918 goto wait_for_reads;
2919 }
2920 pages_in_pl = upl_size / PAGE_SIZE;
2921 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 2922
d7e50217
A
2923 for (i = 0; i < pages_in_pl; i++) {
2924 if (!upl_valid_page(pl, i))
2925 break;
2926 }
2927 if (i == pages_in_pl)
2928 break;
0b4e3aa0 2929
d7e50217
A
2930 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2931 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2932 }
d7e50217
A
2933 if (force_data_sync >= 3) {
2934 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2935 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2936
d7e50217
A
2937 goto wait_for_reads;
2938 }
2939 /*
2940 * Consider the possibility that upl_size wasn't satisfied.
2941 */
2942 if (upl_size != upl_needed_size)
2943 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 2944
d7e50217
A
2945 if (io_size == 0) {
2946 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2947 UPL_ABORT_FREE_ON_EMPTY);
2948 goto wait_for_reads;
2949 }
2950 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2951 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2952
d7e50217
A
2953 /*
2954 * request asynchronously so that we can overlap
2955 * the preparation of the next I/O
2956 * if there are already too many outstanding reads
2957 * wait until some have completed before issuing the next read
2958 */
55e303ae 2959 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
d7e50217
A
2960 iostate.io_wanted = 1;
2961 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2962 }
2963 if (iostate.io_error) {
2964 /*
2965 * one of the earlier reads we issued ran into a hard error
2966 * don't issue any more reads, cleanup the UPL
2967 * that was just created but not used, then
2968 * go wait for any other reads to complete before
2969 * returning the error to the caller
2970 */
2971 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2972 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2973
d7e50217
A
2974 goto wait_for_reads;
2975 }
2976 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
55e303ae 2977 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
1c79356b 2978
55e303ae 2979 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
d7e50217
A
2980 io_size, devblocksize,
2981 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2982 (struct buf *)0, &iostate);
1c79356b 2983
d7e50217
A
2984 /*
2985 * update the uio structure
2986 */
2987 iov->iov_base += io_size;
2988 iov->iov_len -= io_size;
2989 uio->uio_resid -= io_size;
2990 uio->uio_offset += io_size;
1c79356b 2991
d7e50217
A
2992 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2993 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
1c79356b
A
2994
2995 } /* end while */
2996
d7e50217
A
2997wait_for_reads:
2998 /*
2999 * make sure all async reads that are part of this stream
3000 * have completed before we return
3001 */
3002 while (iostate.io_issued != iostate.io_completed) {
3003 iostate.io_wanted = 1;
3004 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
3005 }
3006 if (iostate.io_error)
3007 retval = iostate.io_error;
1c79356b
A
3008
3009 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3010 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
3011
3012 return (retval);
3013}
3014
3015
9bccf70c 3016static int
b4c24cb9 3017cluster_phys_read(vp, uio, filesize, devblocksize, flags)
0b4e3aa0
A
3018 struct vnode *vp;
3019 struct uio *uio;
3020 off_t filesize;
b4c24cb9
A
3021 int devblocksize;
3022 int flags;
0b4e3aa0 3023{
b4c24cb9 3024 upl_page_info_t *pl;
0b4e3aa0
A
3025 upl_t upl;
3026 vm_offset_t upl_offset;
55e303ae 3027 addr64_t dst_paddr;
0b4e3aa0
A
3028 off_t max_size;
3029 int io_size;
b4c24cb9 3030 int tail_size;
0b4e3aa0
A
3031 int upl_size;
3032 int upl_needed_size;
3033 int pages_in_pl;
3034 int upl_flags;
3035 kern_return_t kret;
3036 struct iovec *iov;
b4c24cb9 3037 struct clios iostate;
0b4e3aa0
A
3038 int error;
3039
3040 /*
3041 * When we enter this routine, we know
3042 * -- the resid will not exceed iov_len
3043 * -- the target address is physically contiguous
3044 */
3045
3046 iov = uio->uio_iov;
3047
3048 max_size = filesize - uio->uio_offset;
3049
b4c24cb9
A
3050 if (max_size > (off_t)((unsigned int)iov->iov_len))
3051 io_size = iov->iov_len;
0b4e3aa0 3052 else
b4c24cb9 3053 io_size = max_size;
0b4e3aa0 3054
55e303ae 3055 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
0b4e3aa0
A
3056 upl_needed_size = upl_offset + io_size;
3057
b4c24cb9 3058 error = 0;
0b4e3aa0
A
3059 pages_in_pl = 0;
3060 upl_size = upl_needed_size;
55e303ae 3061 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0
A
3062
3063 kret = vm_map_get_upl(current_map(),
3064 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
3065 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3066
b4c24cb9
A
3067 if (kret != KERN_SUCCESS) {
3068 /*
3069 * cluster_phys_read: failed to get pagelist
3070 */
3071 return(EINVAL);
3072 }
3073 if (upl_size < upl_needed_size) {
3074 /*
3075 * The upl_size wasn't satisfied.
3076 */
3077 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3078
3079 return(EINVAL);
3080 }
3081 pl = ubc_upl_pageinfo(upl);
3082
55e303ae 3083 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
0b4e3aa0 3084
b4c24cb9
A
3085 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3086 int head_size;
3087
3088 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3089
3090 if (head_size > io_size)
3091 head_size = io_size;
3092
3093 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
3094
3095 if (error) {
3096 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3097
3098 return(EINVAL);
3099 }
3100 upl_offset += head_size;
3101 dst_paddr += head_size;
3102 io_size -= head_size;
3103 }
3104 tail_size = io_size & (devblocksize - 1);
3105 io_size -= tail_size;
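/*
 * worked example with hypothetical numbers: with devblocksize = 512,
 * uio->uio_offset = 0x1234 and an initial io_size of 10000 bytes,
 * head_size = 512 - 0x34 = 460 bytes are handled by cluster_align_phys_io
 * above, leaving 9540 bytes, of which tail_size = 9540 & 511 = 324 is
 * deferred and io_size = 9216 (18 whole device blocks) is issued by the
 * loop below
 */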
3106
3107 iostate.io_completed = 0;
3108 iostate.io_issued = 0;
3109 iostate.io_error = 0;
3110 iostate.io_wanted = 0;
3111
3112 while (io_size && error == 0) {
3113 int xsize;
3114
3115 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3116 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3117 else
3118 xsize = io_size;
3119 /*
3120 * request asynchronously so that we can overlap
3121 * the preparation of the next I/O... we'll do
3122 * the commit after all the I/O has completed
3123 * since its all issued against the same UPL
3124 * if there are already too many outstanding reads
d7e50217 3125 * wait until some have completed before issuing the next
b4c24cb9
A
3126 */
3127 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3128 iostate.io_wanted = 1;
3129 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3130 }
3131
3132 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3133 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3134 (struct buf *)0, &iostate);
3135 /*
3136 * The cluster_io read was issued successfully,
3137 * update the uio structure
3138 */
3139 if (error == 0) {
3140 uio->uio_resid -= xsize;
3141 iov->iov_len -= xsize;
3142 iov->iov_base += xsize;
3143 uio->uio_offset += xsize;
3144 dst_paddr += xsize;
3145 upl_offset += xsize;
3146 io_size -= xsize;
3147 }
3148 }
0b4e3aa0 3149 /*
d7e50217
A
3150 * make sure all async reads that are part of this stream
3151 * have completed before we proceed
0b4e3aa0 3152 */
b4c24cb9
A
3153 while (iostate.io_issued != iostate.io_completed) {
3154 iostate.io_wanted = 1;
3155 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3156 }
3157 if (iostate.io_error) {
3158 error = iostate.io_error;
3159 }
3160 if (error == 0 && tail_size)
3161 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
0b4e3aa0
A
3162
3163 /*
b4c24cb9
A
3164 * just release our hold on the physically contiguous
3165 * region without changing any state
0b4e3aa0 3166 */
b4c24cb9 3167 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
3168
3169 return (error);
3170}
1c79356b 3171
b4c24cb9 3172
1c79356b
A
3173/*
3174 * generate advisory I/O's in the largest chunks possible
3175 * the completed pages will be released into the VM cache
3176 */
9bccf70c 3177int
1c79356b
A
3178advisory_read(vp, filesize, f_offset, resid, devblocksize)
3179 struct vnode *vp;
3180 off_t filesize;
3181 off_t f_offset;
3182 int resid;
3183 int devblocksize;
3184{
1c79356b
A
3185 upl_page_info_t *pl;
3186 upl_t upl;
3187 vm_offset_t upl_offset;
3188 int upl_size;
3189 off_t upl_f_offset;
3190 int start_offset;
3191 int start_pg;
3192 int last_pg;
3193 int pages_in_upl;
3194 off_t max_size;
3195 int io_size;
3196 kern_return_t kret;
3197 int retval = 0;
9bccf70c 3198 int issued_io;
55e303ae 3199 int skip_range;
1c79356b
A
3200
3201 if (!UBCINFOEXISTS(vp))
3202 return(EINVAL);
3203
1c79356b
A
3204 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3205 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3206
3207 while (resid && f_offset < filesize && retval == 0) {
3208 /*
3209 * compute the size of the upl needed to encompass
3210 * the requested read... limit each call to cluster_io
0b4e3aa0
A
3211 * to the maximum UPL size... cluster_io will clip if
3212 * this exceeds the maximum io_size for the device...
3213 * make sure to account for
1c79356b
A
3214 * a starting offset that's not page aligned
3215 */
3216 start_offset = (int)(f_offset & PAGE_MASK_64);
3217 upl_f_offset = f_offset - (off_t)start_offset;
3218 max_size = filesize - f_offset;
3219
3220 if (resid < max_size)
3221 io_size = resid;
3222 else
3223 io_size = max_size;
3224
3225 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
3226 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3227 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
55e303ae
A
3228
3229 skip_range = 0;
3230 /*
3231 * return the number of contiguously present pages in the cache
3232 * starting at upl_f_offset within the file
3233 */
3234 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3235
3236 if (skip_range) {
3237 /*
3238 * skip over pages already present in the cache
3239 */
3240 io_size = skip_range - start_offset;
3241
3242 f_offset += io_size;
3243 resid -= io_size;
3244
3245 if (skip_range == upl_size)
3246 continue;
3247 /*
3248 * have to issue some real I/O
3249 * at this point, we know it's starting on a page boundary
3250 * because we've skipped over at least the first page in the request
3251 */
3252 start_offset = 0;
3253 upl_f_offset += skip_range;
3254 upl_size -= skip_range;
3255 }
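/*
 * illustrative example (hypothetical numbers; skip_range is treated as a
 * byte count by the arithmetic above): with upl_f_offset = 0x10000,
 * start_offset = 0x200 and the first two pages already present,
 * skip_range = 0x2000, so f_offset advances by 0x1e00 to 0x12000,
 * upl_f_offset becomes 0x12000 and upl_size shrinks by 0x2000 before any
 * real I/O is considered
 */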
1c79356b
A
3256 pages_in_upl = upl_size / PAGE_SIZE;
3257
55e303ae
A
3258 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3259 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3260
0b4e3aa0
A
3261 kret = ubc_create_upl(vp,
3262 upl_f_offset,
3263 upl_size,
3264 &upl,
3265 &pl,
55e303ae 3266 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
1c79356b 3267 if (kret != KERN_SUCCESS)
9bccf70c
A
3268 return(retval);
3269 issued_io = 0;
1c79356b
A
3270
3271 /*
9bccf70c
A
3272 * before we start marching forward, we must make sure we end on
3273 * a present page, otherwise we will be working with a freed
3274 * upl
1c79356b 3275 */
9bccf70c
A
3276 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3277 if (upl_page_present(pl, last_pg))
3278 break;
1c79356b 3279 }
9bccf70c 3280 pages_in_upl = last_pg + 1;
1c79356b 3281
1c79356b 3282
55e303ae 3283 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
9bccf70c
A
3284 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3285
3286
3287 for (last_pg = 0; last_pg < pages_in_upl; ) {
1c79356b 3288 /*
9bccf70c
A
3289 * scan from the beginning of the upl looking for the first
3290 * page that is present.... this will become the first page in
3291 * the request we're going to make to 'cluster_io'... if all
3292 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 3293 */
9bccf70c
A
3294 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3295 if (upl_page_present(pl, start_pg))
3296 break;
1c79356b 3297 }
1c79356b 3298
1c79356b 3299 /*
9bccf70c
A
3300 * scan from the starting present page looking for an absent
3301 * page before the end of the upl is reached, if we
3302 * find one, then it will terminate the range of pages being
3303 * presented to 'cluster_io'
1c79356b 3304 */
9bccf70c
A
3305 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3306 if (!upl_page_present(pl, last_pg))
3307 break;
3308 }
3309
3310 if (last_pg > start_pg) {
3311 /*
3312 * we found a range of pages that must be filled
3313 * if the last page in this range is the last page of the file
3314 * we may have to clip the size of it to keep from reading past
3315 * the end of the last physical block associated with the file
3316 */
3317 upl_offset = start_pg * PAGE_SIZE;
3318 io_size = (last_pg - start_pg) * PAGE_SIZE;
3319
3320 if ((upl_f_offset + upl_offset + io_size) > filesize)
3321 io_size = filesize - (upl_f_offset + upl_offset);
3322
3323 /*
3324 * issue an asynchronous read to cluster_io
3325 */
3326 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
b4c24cb9 3327 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
1c79356b 3328
9bccf70c
A
3329 issued_io = 1;
3330 }
1c79356b 3331 }
9bccf70c
A
3332 if (issued_io == 0)
3333 ubc_upl_abort(upl, 0);
3334
3335 io_size = upl_size - start_offset;
1c79356b
A
3336
3337 if (io_size > resid)
3338 io_size = resid;
3339 f_offset += io_size;
3340 resid -= io_size;
3341 }
9bccf70c 3342
1c79356b
A
3343 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3344 (int)f_offset, resid, retval, 0, 0);
3345
3346 return(retval);
3347}
3348
3349
9bccf70c 3350int
1c79356b
A
3351cluster_push(vp)
3352 struct vnode *vp;
9bccf70c
A
3353{
3354 int retval;
3355
55e303ae 3356 if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))
9bccf70c 3357 return(0);
9bccf70c
A
3358
3359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3360 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3361
3362 if (vp->v_flag & VHASDIRTY) {
55e303ae 3363 sparse_cluster_push(vp, ubc_getsize(vp), 1);
9bccf70c 3364
9bccf70c 3365 vp->v_clen = 0;
55e303ae
A
3366 retval = 1;
3367 } else
3368 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
9bccf70c 3369
55e303ae
A
3370 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3371 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
9bccf70c 3372
55e303ae
A
3373 return (retval);
3374}
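/*
 * hypothetical caller sketch (not taken from any particular filesystem):
 * a vnode's fsync or invalidate path would typically flush any pending
 * write-behind clusters before issuing its own synchronous I/O, e.g.
 *
 *	if (UBCINFOEXISTS(vp))
 *		(void) cluster_push(vp);
 *
 * cluster_push returns 0 when the vnode has no cluster state to flush
 */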
9bccf70c 3375
9bccf70c 3376
55e303ae
A
3377int
3378cluster_release(vp)
3379 struct vnode *vp;
3380{
3381 off_t offset;
3382 u_int length;
9bccf70c 3383
55e303ae 3384 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
9bccf70c 3385
55e303ae
A
3386 if (vp->v_flag & VHASDIRTY) {
3387 vfs_drt_control(&(vp->v_scmap), 0);
3388
3389 vp->v_flag &= ~VHASDIRTY;
3390 }
3391 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
9bccf70c
A
3392}
3393
3394
3395static int
3396cluster_try_push(vp, EOF, can_delay, push_all)
3397 struct vnode *vp;
3398 off_t EOF;
3399 int can_delay;
3400 int push_all;
3401{
3402 int cl_index;
3403 int cl_index1;
3404 int min_index;
3405 int cl_len;
3406 int cl_total;
55e303ae 3407 int cl_pushed = 0;
9bccf70c
A
3408 struct v_cluster l_clusters[MAX_CLUSTERS];
3409
3410 /*
3411 * make a local 'sorted' copy of the clusters
3412 * and clear vp->v_clen so that new clusters can
3413 * be developed
3414 */
3415 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3416 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3417 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3418 continue;
3419 if (min_index == -1)
3420 min_index = cl_index1;
3421 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3422 min_index = cl_index1;
3423 }
3424 if (min_index == -1)
3425 break;
3426 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3427 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3428
3429 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3430 }
3431 cl_len = cl_index;
3432 vp->v_clen = 0;
3433
55e303ae
A
3434 if (can_delay && cl_len == MAX_CLUSTERS) {
3435 int i;
3436
3437 /*
3438 * determine if we appear to be writing the file sequentially
3439 * if not, by returning without having pushed any clusters
3440 * we will cause this vnode to be pushed into the sparse cluster mechanism
3441 * used for managing more random I/O patterns
3442 *
3443 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3444 * that's why we're in try_push with can_delay true...
3445 *
3446 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3447 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3448 * so we can just make a simple pass through up, to but not including the last one...
3449 * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
3450 * are sequential
3451 *
3452 * we let the last one be partial as long as it was adjacent to the previous one...
3453 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3454 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3455 */
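		/*
		 * for example (hypothetical page numbers, with MAX_CLUSTERS of, say, 4
		 * and MAX_UPL_TRANSFER of, say, 256 pages): clusters [0,256) [256,512)
		 * [512,768) [768,900) pass the check below... the first three are full
		 * and each ends where the next begins, and the last is adjacent even
		 * though partial... a short first cluster such as [0,200), or a gap such
		 * as [300,556) following [0,256), sends us to 'dont_try' so that nothing
		 * is pushed and this vnode falls over to the sparse cluster mechanism
		 */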
3456 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3457 if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
3458 goto dont_try;
3459 if (l_clusters[i].last_pg != l_clusters[i+1].start_pg)
3460 goto dont_try;
3461 }
3462 }
3463 for (cl_index = 0; cl_index < cl_len; cl_index++) {
9bccf70c
A
3464 /*
3465 * try to push each cluster in turn... cluster_push_x may not
3466 * push the cluster if can_delay is TRUE and the cluster doesn't
3467	 * meet the criteria for an immediate push
3468 */
3469 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3470 l_clusters[cl_index].start_pg = 0;
3471 l_clusters[cl_index].last_pg = 0;
3472
3473 cl_pushed++;
3474
3475 if (push_all == 0)
3476 break;
3477 }
3478 }
55e303ae 3479dont_try:
9bccf70c
A
3480 if (cl_len > cl_pushed) {
3481 /*
3482 * we didn't push all of the clusters, so
3483 * lets try to merge them back in to the vnode
3484 */
3485 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3486 /*
3487 * we picked up some new clusters while we were trying to
3488 * push the old ones (I don't think this can happen because
3489 * I'm holding the lock, but just in case)... the sum of the
3490 * leftovers plus the new cluster count exceeds our ability
55e303ae 3491 * to represent them, so switch to the sparse cluster mechanism
9bccf70c 3492 */
55e303ae
A
3493
3494 /*
3495 * first collect the new clusters sitting in the vp
3496 */
3497 sparse_cluster_switch(vp, EOF);
3498
3499 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
9bccf70c
A
3500 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3501 continue;
55e303ae
A
3502 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3503 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
9bccf70c 3504
55e303ae 3505 cl_index1++;
9bccf70c 3506 }
55e303ae
A
3507 /*
3508 * update the cluster count
3509 */
3510 vp->v_clen = cl_index1;
3511
3512 /*
3513 * and collect the original clusters that were moved into the
3514 * local storage for sorting purposes
3515 */
3516 sparse_cluster_switch(vp, EOF);
3517
9bccf70c
A
3518 } else {
3519 /*
3520 * we've got room to merge the leftovers back in
3521 * just append them starting at the next 'hole'
3522 * represented by vp->v_clen
3523 */
3524 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3525 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3526 continue;
3527
3528 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3529 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3530
9bccf70c
A
3531 cl_index1++;
3532 }
3533 /*
3534 * update the cluster count
3535 */
3536 vp->v_clen = cl_index1;
3537 }
3538 }
3539 return(MAX_CLUSTERS - vp->v_clen);
3540}
3541
3542
3543
3544static int
3545cluster_push_x(vp, EOF, first, last, can_delay)
3546 struct vnode *vp;
3547 off_t EOF;
3548 daddr_t first;
3549 daddr_t last;
3550 int can_delay;
1c79356b 3551{
1c79356b
A
3552 upl_page_info_t *pl;
3553 upl_t upl;
3554 vm_offset_t upl_offset;
3555 int upl_size;
3556 off_t upl_f_offset;
3557 int pages_in_upl;
3558 int start_pg;
3559 int last_pg;
3560 int io_size;
3561 int io_flags;
55e303ae 3562 int upl_flags;
1c79356b
A
3563 int size;
3564 kern_return_t kret;
3565
3566
9bccf70c
A
3567 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3568 vp->v_clen, first, last, EOF, 0);
3569
3570 if ((pages_in_upl = last - first) == 0) {
3571 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 3572
9bccf70c
A
3573 return (1);
3574 }
1c79356b 3575 upl_size = pages_in_upl * PAGE_SIZE;
9bccf70c 3576 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
1c79356b 3577
9bccf70c
A
3578 if (upl_f_offset + upl_size >= EOF) {
3579
3580 if (upl_f_offset >= EOF) {
3581 /*
3582 * must have truncated the file and missed
3583 * clearing a dangling cluster (i.e. it's completely
3584	 * beyond the new EOF)
3585 */
3586 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3587
3588 return(1);
3589 }
3590 size = EOF - upl_f_offset;
1c79356b 3591
55e303ae 3592 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
9bccf70c 3593 pages_in_upl = upl_size / PAGE_SIZE;
55e303ae 3594 } else
9bccf70c 3595 size = upl_size;
55e303ae
A
3596
3597 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
3598
3599 if (vp->v_flag & VNOCACHE_DATA)
3600 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
3601 else
3602 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
3603
0b4e3aa0
A
3604 kret = ubc_create_upl(vp,
3605 upl_f_offset,
3606 upl_size,
3607 &upl,
9bccf70c 3608 &pl,
55e303ae 3609 upl_flags);
1c79356b
A
3610 if (kret != KERN_SUCCESS)
3611 panic("cluster_push: failed to get pagelist");
3612
55e303ae 3613 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
9bccf70c 3614
55e303ae
A
3615 /*
3616 * since we only asked for the dirty pages back
3617 * it's possible that we may only get a few or even none, so...
3618 * before we start marching forward, we must make sure we know
3619 * where the last present page is in the UPL, otherwise we could
3620 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
3621 * employed by commit_range and abort_range.
3622 */
3623 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3624 if (upl_page_present(pl, last_pg))
3625 break;
9bccf70c 3626 }
55e303ae 3627 pages_in_upl = last_pg + 1;
1c79356b 3628
55e303ae
A
3629 if (pages_in_upl == 0) {
3630 ubc_upl_abort(upl, 0);
1c79356b 3631
55e303ae
A
3632 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
3633 return(1);
3634 }
3635
3636 for (last_pg = 0; last_pg < pages_in_upl; ) {
3637 /*
3638 * find the next dirty page in the UPL
3639 * this will become the first page in the
3640 * next I/O to generate
3641 */
1c79356b 3642 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
55e303ae 3643 if (upl_dirty_page(pl, start_pg))
1c79356b 3644 break;
55e303ae
A
3645 if (upl_page_present(pl, start_pg))
3646 /*
3647	 * RET_ONLY_DIRTY will return non-dirty 'precious' pages...
3648 * just release these unchanged since we're not going
3649 * to steal them or change their state
3650 */
3651 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3652 }
55e303ae
A
3653 if (start_pg >= pages_in_upl)
3654 /*
3655 * done... no more dirty pages to push
3656 */
3657 break;
3658 if (start_pg > last_pg)
3659 /*
3660 * skipped over some non-dirty pages
3661 */
3662 size -= ((start_pg - last_pg) * PAGE_SIZE);
1c79356b 3663
55e303ae
A
3664 /*
3665 * find a range of dirty pages to write
3666 */
1c79356b 3667 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
55e303ae 3668 if (!upl_dirty_page(pl, last_pg))
1c79356b
A
3669 break;
3670 }
3671 upl_offset = start_pg * PAGE_SIZE;
3672
3673 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3674
0b4e3aa0 3675 if (vp->v_flag & VNOCACHE_DATA)
55e303ae 3676 io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
1c79356b 3677 else
55e303ae 3678 io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;
1c79356b 3679
b4c24cb9 3680 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
1c79356b
A
3681
3682 size -= io_size;
3683 }
9bccf70c
A
3684 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3685
1c79356b
A
3686 return(1);
3687}
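/*
 * to illustrate the dirty page scan in cluster_push_x above with a hypothetical
 * 6 page UPL whose returned pages look like [clean, dirty, dirty, absent, dirty, clean]:
 * page 0 is present but not dirty, so it is aborted by itself; pages 1-2 form the
 * first I/O; page 3 is absent and simply skipped (after adjusting 'size'); page 4
 * goes out as a second I/O; and page 5, again present but clean, is released
 * unchanged before the loop terminates
 */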
b4c24cb9
A
3688
3689
b4c24cb9 3690static int
55e303ae 3691sparse_cluster_switch(struct vnode *vp, off_t EOF)
b4c24cb9 3692{
55e303ae 3693 int cl_index;
b4c24cb9 3694
55e303ae 3695 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
b4c24cb9 3696
55e303ae
A
3697 if ( !(vp->v_flag & VHASDIRTY)) {
3698 vp->v_flag |= VHASDIRTY;
3699 vp->v_scdirty = 0;
3700 vp->v_scmap = 0;
3701 }
3702 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3703 int flags;
3704 int start_pg;
3705 int last_pg;
b4c24cb9 3706
55e303ae 3707 for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {
b4c24cb9 3708
55e303ae
A
3709 if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
3710 if (flags & UPL_POP_DIRTY)
3711 sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
3712 }
3713 }
3714 }
3715 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3716}
3717
3718
3719static int
3720sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)
3721{
3722 daddr_t first;
3723 daddr_t last;
3724 off_t offset;
3725 u_int length;
3726
3727 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);
3728
3729 if (push_all)
3730 vfs_drt_control(&(vp->v_scmap), 1);
3731
3732 for (;;) {
3733 if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
3734 vp->v_flag &= ~VHASDIRTY;
3735 vp->v_clen = 0;
3736 break;
3737 }
3738 first = (daddr_t)(offset / PAGE_SIZE_64);
3739 last = (daddr_t)((offset + length) / PAGE_SIZE_64);
3740
3741 cluster_push_x(vp, EOF, first, last, 0);
3742
3743 vp->v_scdirty -= (last - first);
3744
3745 if (push_all == 0)
3746 break;
3747 }
3748 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3749}
3750
3751
3752static int
3753sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)
3754{
3755 u_int new_dirty;
3756 u_int length;
3757 off_t offset;
3758
3759 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);
3760
3761 offset = (off_t)first * PAGE_SIZE_64;
3762 length = (last - first) * PAGE_SIZE;
3763
3764 while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
3765 /*
3766 * no room left in the map
3767 * only a partial update was done
3768 * push out some pages and try again
3769 */
3770 vp->v_scdirty += new_dirty;
3771
3772 sparse_cluster_push(vp, EOF, 0);
3773
3774 offset += (new_dirty * PAGE_SIZE_64);
3775 length -= (new_dirty * PAGE_SIZE);
3776 }
3777 vp->v_scdirty += new_dirty;
3778
3779 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3780}
3781
3782
3783static int
3784cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
3785{
3786 struct iovec *iov;
3787 upl_page_info_t *pl;
3788 upl_t upl;
3789 addr64_t ubc_paddr;
3790 kern_return_t kret;
3791 int error = 0;
3792
3793 iov = uio->uio_iov;
3794
3795 kret = ubc_create_upl(vp,
3796 uio->uio_offset & ~PAGE_MASK_64,
3797 PAGE_SIZE,
3798 &upl,
3799 &pl,
3800 UPL_SET_LITE);
3801
3802 if (kret != KERN_SUCCESS)
3803 return(EINVAL);
3804
3805 if (!upl_valid_page(pl, 0)) {
3806 /*
3807 * issue a synchronous read to cluster_io
3808 */
3809 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3810 CL_READ, (struct buf *)0, (struct clios *)0);
3811 if (error) {
b4c24cb9
A
3812 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3813
3814 return(error);
3815 }
3816 }
55e303ae 3817 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
b4c24cb9 3818
55e303ae
A
3819/*
3820 * NOTE: There is no prototype for the following in BSD. It, and the
3821 * definitions of cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc, will be found in
3822 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
3823 * way to do so without exporting them to kexts as well.
3824 */
de355530 3825 if (flags & CL_READ)
55e303ae
A
3826// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
3827 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
de355530 3828 else
4a249263
A
3829// copypv(usr_paddr, ubc_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
3830 copypv(usr_paddr, ubc_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
55e303ae
A
3831
3832 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
3833 /*
3834 * issue a synchronous write to cluster_io
3835 */
3836 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3837 0, (struct buf *)0, (struct clios *)0);
de355530
A
3838 }
3839 if (error == 0) {
55e303ae 3840 uio->uio_offset += xsize;
de355530
A
3841 iov->iov_base += xsize;
3842 iov->iov_len -= xsize;
3843 uio->uio_resid -= xsize;
3844 }
3845 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
55e303ae
A
3846
3847 return (error);
3848}
3849
3850
3851
3852int
3853cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
3854{
3855 int pg_offset;
3856 int pg_index;
3857 int csize;
3858 int segflg;
3859 int retval = 0;
3860 upl_page_info_t *pl;
3861 boolean_t funnel_state = FALSE;
3862
3863
3864 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3865 (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);
3866
3867 if (xsize >= (16 * 1024))
3868 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3869
3870 segflg = uio->uio_segflg;
3871
3872 switch(segflg) {
3873
3874 case UIO_USERSPACE:
3875 case UIO_USERISPACE:
3876 uio->uio_segflg = UIO_PHYS_USERSPACE;
3877 break;
3878
3879 case UIO_SYSSPACE:
3880 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3881 break;
3882 }
3883 pl = ubc_upl_pageinfo(upl);
3884
3885 pg_index = upl_offset / PAGE_SIZE;
3886 pg_offset = upl_offset & PAGE_MASK;
3887 csize = min(PAGE_SIZE - pg_offset, xsize);
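	/*
	 * e.g. (hypothetical values, 4096 byte pages): upl_offset = 0x2800 and
	 * xsize = 0x3000 give pg_index = 2, pg_offset = 0x800 and csize = 0x800,
	 * so the loop below copies 0x800 bytes from page 2, full pages from
	 * pages 3 and 4, and the final 0x800 bytes from page 5
	 */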
3888
3889 while (xsize && retval == 0) {
3890 addr64_t paddr;
3891
3892 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
de355530 3893
55e303ae
A
3894 retval = uiomove64(paddr, csize, uio);
3895
3896 pg_index += 1;
3897 pg_offset = 0;
3898 xsize -= csize;
3899 csize = min(PAGE_SIZE, xsize);
3900 }
3901 uio->uio_segflg = segflg;
3902
3903 if (funnel_state == TRUE)
3904 thread_funnel_set(kernel_flock, TRUE);
3905
3906 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3907 (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
3908
3909 return (retval);
3910}
3911
3912
3913int
3914cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)
3915{
3916 int segflg;
3917 int io_size;
3918 int xsize;
3919 int start_offset;
3920 off_t f_offset;
3921 int retval = 0;
3922 memory_object_control_t control;
3923 int op_flags = UPL_POP_SET | UPL_POP_BUSY;
3924 boolean_t funnel_state = FALSE;
3925
3926
3927 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3928 (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);
3929
3930 control = ubc_getobject(vp, UBC_FLAGS_NONE);
3931 if (control == MEMORY_OBJECT_CONTROL_NULL) {
3932 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3933 (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
3934
3935 return(0);
3936 }
3937 if (mark_dirty)
3938 op_flags |= UPL_POP_DIRTY;
3939
3940 segflg = uio->uio_segflg;
3941
3942 switch(segflg) {
3943
3944 case UIO_USERSPACE:
3945 case UIO_USERISPACE:
3946 uio->uio_segflg = UIO_PHYS_USERSPACE;
3947 break;
3948
3949 case UIO_SYSSPACE:
3950 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3951 break;
3952 }
3953 io_size = *io_resid;
3954 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3955 f_offset = uio->uio_offset - start_offset;
3956 xsize = min(PAGE_SIZE - start_offset, io_size);
3957
3958 while (io_size && retval == 0) {
3959 ppnum_t pgframe;
3960
3961 if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
3962 break;
3963
3964 if (funnel_state == FALSE && io_size >= (16 * 1024))
3965 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3966
3967 retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);
3968
3969 ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
3970
3971 io_size -= xsize;
3972 start_offset = 0;
3973 f_offset = uio->uio_offset;
3974 xsize = min(PAGE_SIZE, io_size);
3975 }
3976 uio->uio_segflg = segflg;
3977 *io_resid = io_size;
3978
3979 if (funnel_state == TRUE)
3980 thread_funnel_set(kernel_flock, TRUE);
3981
3982 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3983 (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
3984
3985 return(retval);
3986}
3987
3988
3989int
3990is_file_clean(struct vnode *vp, off_t filesize)
3991{
3992 off_t f_offset;
3993 int flags;
3994 int total_dirty = 0;
3995
3996 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
3997 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
3998 if (flags & UPL_POP_DIRTY) {
3999 total_dirty++;
4000 }
4001 }
4002 }
4003 if (total_dirty)
4004 return(EINVAL);
4005
4006 return (0);
4007}
4008
4009
4010
4011/*
4012 * Dirty region tracking/clustering mechanism.
4013 *
4014 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4015 * dirty regions within a larger space (file). It is primarily intended to
4016 * support clustering in large files with many dirty areas.
4017 *
4018 * The implementation assumes that the dirty regions are pages.
4019 *
4020 * To represent dirty pages within the file, we store bit vectors in a
4021 * variable-size circular hash.
4022 */
4023
4024/*
4025 * Bitvector size. This determines the number of pages we group in a
4026 * single hashtable entry. Each hashtable entry is aligned to this
4027 * size within the file.
4028 */
4029#define DRT_BITVECTOR_PAGES 256
4030
4031/*
4032 * File offset handling.
4033 *
4034 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4035 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4036 */
4037#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4038#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
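/*
 * for example (assuming a 4096 byte PAGE_SIZE): DRT_BITVECTOR_PAGES pages of
 * 4096 bytes give each hashtable entry a 1MB (0x100000 byte) span, so
 * DRT_ADDRESS_MASK is ~0xfffff... a dirty page at file offset 0x123456 is
 * aligned to the entry address 0x100000 and occupies bit
 * (0x123456 - 0x100000) / 4096 = 35 of that entry's bitvector
 */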
4039
4040/*
4041 * Hashtable address field handling.
4042 *
4043 * The low-order bits of each hash entry's control word hold the page count,
4044 * while the high-order bits hold the aligned file address; packing both conserves space.
4045 *
4046 * DRT_HASH_COUNT_MASK must be large enough to store the range
4047 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4048 * to indicate that the bucket is actually unoccupied.
4049 */
4050#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4051#define DRT_HASH_SET_ADDRESS(scm, i, a) \
4052 do { \
4053 (scm)->scm_hashtable[(i)].dhe_control = \
4054 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4055 } while (0)
4056#define DRT_HASH_COUNT_MASK 0x1ff
4057#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4058#define DRT_HASH_SET_COUNT(scm, i, c) \
4059 do { \
4060 (scm)->scm_hashtable[(i)].dhe_control = \
4061 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4062 } while (0)
4063#define DRT_HASH_CLEAR(scm, i) \
4064 do { \
4065 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4066 } while (0)
4067#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4068#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
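/*
 * e.g. a bucket's count field legitimately holds 0..DRT_BITVECTOR_PAGES
 * (0..256) dirty pages; the one leftover encodable value, DRT_HASH_COUNT_MASK
 * itself (0x1ff == 511), is what DRT_HASH_VACATE stores and DRT_HASH_VACANT
 * tests for to mark a bucket as unoccupied
 */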
4069#define DRT_HASH_COPY(oscm, oi, scm, i) \
4070 do { \
4071 (scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control; \
4072 DRT_BITVECTOR_COPY(oscm, oi, scm, i); \
4073 } while(0);
4074
4075
4076/*
4077 * Hash table moduli.
4078 *
4079 * Since the hashtable entry's size is dependent on the size of
4080 * the bitvector, and since the hashtable size is constrained to
4081 * both being prime and fitting within the desired allocation
4082 * size, these values need to be manually determined.
4083 *
4084 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4085 *
4086 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4087 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4088 */
4089#define DRT_HASH_SMALL_MODULUS 23
4090#define DRT_HASH_LARGE_MODULUS 401
4091
4092#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4093#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
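/*
 * checking the arithmetic: an entry is a 64-bit control word plus a
 * (256 / 32) * 4 = 32 byte bitvector, i.e. 40 bytes; 23 * 40 = 920 and
 * 401 * 40 = 16040, so both tables (along with the small clustermap header)
 * fit their 1024 and 16384 byte allocations, and both moduli are prime
 */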
4094
4095/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4096
4097/*
4098 * Hashtable bitvector handling.
4099 *
4100 * Bitvector fields are 32 bits long.
4101 */
4102
4103#define DRT_HASH_SET_BIT(scm, i, bit) \
4104 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4105
4106#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4107 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4108
4109#define DRT_HASH_TEST_BIT(scm, i, bit) \
4110 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4111
4112#define DRT_BITVECTOR_CLEAR(scm, i) \
4113 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4114
4115#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4116 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4117 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4118 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4119
4120
4121
4122/*
4123 * Hashtable entry.
4124 */
4125struct vfs_drt_hashentry {
4126 u_int64_t dhe_control;
4127 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4128};
4129
4130/*
4131 * Dirty Region Tracking structure.
4132 *
4133 * The hashtable is allocated entirely inside the DRT structure.
4134 *
4135 * The hash is a simple circular prime modulus arrangement, the structure
4136 * is resized from small to large if it overflows.
4137 */
4138
4139struct vfs_drt_clustermap {
4140 u_int32_t scm_magic; /* sanity/detection */
4141#define DRT_SCM_MAGIC 0x12020003
4142 u_int32_t scm_modulus; /* current ring size */
4143 u_int32_t scm_buckets; /* number of occupied buckets */
4144 u_int32_t scm_lastclean; /* last entry we cleaned */
4145 u_int32_t scm_iskips; /* number of slot skips */
4146
4147 struct vfs_drt_hashentry scm_hashtable[0];
4148};
4149
4150
4151#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4152#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
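/*
 * e.g. with the small table (modulus 23), the 1MB aligned address 0x100000
 * hashes to bucket 0x100000 % 23 == 6; if that bucket already holds a
 * different address, vfs_drt_get_index probes forward one bucket at a time
 * via DRT_HASH_NEXT until it finds a vacant or reusable slot
 */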
4153
4154/*
4155 * Debugging codes and arguments.
4156 */
4157#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4158#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4159#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4160#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4161#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4162 * dirty */
4163 /* 0, setcount */
4164 /* 1 (clean, no map) */
4165 /* 2 (map alloc fail) */
4166 /* 3, resid (partial) */
4167#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4168#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4169 * lastclean, iskips */
4170
4171
4172static void vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
4173static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4174static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4175static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4176 u_int64_t offset, int *indexp);
4177static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4178 u_int64_t offset,
4179 int *indexp,
4180 int recursed);
4181static kern_return_t vfs_drt_do_mark_pages(
4182 void **cmapp,
4183 u_int64_t offset,
4184 u_int length,
4185 int *setcountp,
4186 int dirty);
4187static void vfs_drt_trace(
4188 struct vfs_drt_clustermap *cmap,
4189 int code,
4190 int arg1,
4191 int arg2,
4192 int arg3,
4193 int arg4);
4194
4195
4196/*
4197 * Allocate and initialise a sparse cluster map.
4198 *
4199 * Will allocate a new map, or resize/compact an existing one.
4200 *
4201 * XXX we should probably have at least one intermediate map size,
4202 * as the 1:16 ratio seems a bit drastic.
4203 */
4204static kern_return_t
4205vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4206{
4207 struct vfs_drt_clustermap *cmap, *ocmap;
4208 kern_return_t kret;
4209 u_int64_t offset;
4210 int nsize, i, active_buckets, index, copycount;
4211
4212 ocmap = NULL;
4213 if (cmapp != NULL)
4214 ocmap = *cmapp;
4215
4216 /*
4217 * Decide on the size of the new map.
4218 */
4219 if (ocmap == NULL) {
4220 nsize = DRT_HASH_SMALL_MODULUS;
4221 } else {
4222 /* count the number of active buckets in the old map */
4223 active_buckets = 0;
4224 for (i = 0; i < ocmap->scm_modulus; i++) {
4225 if (!DRT_HASH_VACANT(ocmap, i) &&
4226 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4227 active_buckets++;
4228 }
4229 /*
4230 * If we're currently using the small allocation, check to
4231 * see whether we should grow to the large one.
4232 */
4233 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4234 /* if the ring is nearly full */
4235 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4236 nsize = DRT_HASH_LARGE_MODULUS;
4237 } else {
4238 nsize = DRT_HASH_SMALL_MODULUS;
4239 }
4240 } else {
4241 /* already using the large modulus */
4242 nsize = DRT_HASH_LARGE_MODULUS;
4243 /*
4244 * If the ring is completely full, there's
4245 * nothing useful for us to do. Behave as
4246 * though we had compacted into the new
4247 * array and return.
4248 */
4249 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4250 return(KERN_SUCCESS);
4251 }
4252 }
4253
4254 /*
4255 * Allocate and initialise the new map.
4256 */
4257
4258 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4259 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4260 if (kret != KERN_SUCCESS)
4261 return(kret);
4262 cmap->scm_magic = DRT_SCM_MAGIC;
4263 cmap->scm_modulus = nsize;
4264 cmap->scm_buckets = 0;
4265 cmap->scm_lastclean = 0;
4266 cmap->scm_iskips = 0;
4267 for (i = 0; i < cmap->scm_modulus; i++) {
4268 DRT_HASH_CLEAR(cmap, i);
4269 DRT_HASH_VACATE(cmap, i);
4270 DRT_BITVECTOR_CLEAR(cmap, i);
4271 }
4272
4273 /*
4274 * If there's an old map, re-hash entries from it into the new map.
4275 */
4276 copycount = 0;
4277 if (ocmap != NULL) {
4278 for (i = 0; i < ocmap->scm_modulus; i++) {
4279 /* skip empty buckets */
4280 if (DRT_HASH_VACANT(ocmap, i) ||
4281 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4282 continue;
4283 /* get new index */
4284 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4285 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4286 if (kret != KERN_SUCCESS) {
4287 /* XXX need to bail out gracefully here */
4288 panic("vfs_drt: new cluster map mysteriously too small");
4289 }
4290 /* copy */
4291 DRT_HASH_COPY(ocmap, i, cmap, index);
4292 copycount++;
4293 }
4294 }
4295
4296 /* log what we've done */
4297 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4298
4299 /*
4300 * It's important to ensure that *cmapp always points to
4301 * a valid map, so we must overwrite it before freeing
4302 * the old map.
4303 */
4304 *cmapp = cmap;
4305 if (ocmap != NULL) {
4306 /* emit stats into trace buffer */
4307 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4308 ocmap->scm_modulus,
4309 ocmap->scm_buckets,
4310 ocmap->scm_lastclean,
4311 ocmap->scm_iskips);
4312
4313 vfs_drt_free_map(ocmap);
4314 }
4315 return(KERN_SUCCESS);
4316}
4317
4318
4319/*
4320 * Free a sparse cluster map.
4321 */
4322static kern_return_t
4323vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4324{
4325 kern_return_t ret;
4326
4327 kmem_free(kernel_map, (vm_offset_t)cmap,
4328 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4329 return(KERN_SUCCESS);
4330}
4331
4332
4333/*
4334 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4335 */
4336static kern_return_t
4337vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4338{
4339 kern_return_t kret;
4340 int index, i, tries;
4341
4342 offset = DRT_ALIGN_ADDRESS(offset);
4343 index = DRT_HASH(cmap, offset);
4344
4345 /* traverse the hashtable */
4346 for (i = 0; i < cmap->scm_modulus; i++) {
4347
4348 /*
4349 * If the slot is vacant, we can stop.
4350 */
4351 if (DRT_HASH_VACANT(cmap, index))
4352 break;
4353
4354 /*
4355 * If the address matches our offset, we have success.
4356 */
4357 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4358 *indexp = index;
4359 return(KERN_SUCCESS);
4360 }
4361
4362 /*
4363 * Move to the next slot, try again.
4364 */
4365 index = DRT_HASH_NEXT(cmap, index);
4366 }
4367 /*
4368 * It's not there.
4369 */
4370 return(KERN_FAILURE);
4371}
4372
4373/*
4374 * Find the hashtable slot for the supplied offset. If we haven't allocated
4375 * one yet, allocate one and populate the address field. Note that the new
4376 * entry's page count will be zero, so it is still technically free; thus,
4377 * in the case where we are called to clean pages, the slot will remain free.
4378 */
4379static kern_return_t
4380vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4381{
4382 struct vfs_drt_clustermap *cmap;
4383 kern_return_t kret;
4384 int index, i;
4385
4386 cmap = *cmapp;
4387
4388 /* look for an existing entry */
4389 kret = vfs_drt_search_index(cmap, offset, indexp);
4390 if (kret == KERN_SUCCESS)
4391 return(kret);
4392
4393 /* need to allocate an entry */
4394 offset = DRT_ALIGN_ADDRESS(offset);
4395 index = DRT_HASH(cmap, offset);
4396
4397 /* scan from the index forwards looking for a vacant slot */
4398 for (i = 0; i < cmap->scm_modulus; i++) {
4399 /* slot vacant? */
4400 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4401 cmap->scm_buckets++;
4402 if (index < cmap->scm_lastclean)
4403 cmap->scm_lastclean = index;
4404 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4405 DRT_HASH_SET_COUNT(cmap, index, 0);
4406 DRT_BITVECTOR_CLEAR(cmap, index);
4407 *indexp = index;
4408 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4409 return(KERN_SUCCESS);
4410 }
4411 cmap->scm_iskips += i;
4412 index = DRT_HASH_NEXT(cmap, index);
4413 }
4414
4415 /*
4416 * We haven't found a vacant slot, so the map is full. If we're not
4417 * already recursed, try reallocating/compacting it.
4418 */
4419 if (recursed)
4420 return(KERN_FAILURE);
4421 kret = vfs_drt_alloc_map(cmapp);
4422 if (kret == KERN_SUCCESS) {
4423 /* now try to insert again */
4424 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4425 }
4426 return(kret);
4427}
4428
4429/*
4430 * Implementation of set dirty/clean.
4431 *
4432 * In the 'clean' case, not finding a map is OK.
4433 */
4434static kern_return_t
4435vfs_drt_do_mark_pages(
4436 void **private,
4437 u_int64_t offset,
4438 u_int length,
4439 int *setcountp,
4440 int dirty)
4441{
4442 struct vfs_drt_clustermap *cmap, **cmapp;
4443 kern_return_t kret;
4444 int i, index, pgoff, pgcount, setcount, ecount;
4445
4446 cmapp = (struct vfs_drt_clustermap **)private;
4447 cmap = *cmapp;
4448
4449 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4450
4451 if (setcountp != NULL)
4452 *setcountp = 0;
4453
4454 /* allocate a cluster map if we don't already have one */
4455 if (cmap == NULL) {
4456 /* no cluster map, nothing to clean */
4457 if (!dirty) {
4458 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4459 return(KERN_SUCCESS);
4460 }
4461 kret = vfs_drt_alloc_map(cmapp);
4462 if (kret != KERN_SUCCESS) {
4463 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4464 return(kret);
4465 }
4466 }
4467 setcount = 0;
4468
4469 /*
4470 * Iterate over the length of the region.
4471 */
4472 while (length > 0) {
4473 /*
4474 * Get the hashtable index for this offset.
4475 *
4476 * XXX this will add blank entries if we are clearing a range
4477 * that hasn't been dirtied.
4478 */
4479 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4480 cmap = *cmapp; /* may have changed! */
4481 /* this may be a partial-success return */
4482 if (kret != KERN_SUCCESS) {
4483 if (setcountp != NULL)
4484 *setcountp = setcount;
4485 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4486
4487 return(kret);
4488 }
4489
4490 /*
4491 * Work out how many pages we're modifying in this
4492 * hashtable entry.
4493 */
4494 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4495 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4496
4497 /*
4498 * Iterate over pages, dirty/clearing as we go.
4499 */
4500 ecount = DRT_HASH_GET_COUNT(cmap, index);
4501 for (i = 0; i < pgcount; i++) {
4502 if (dirty) {
4503 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4504 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4505 ecount++;
4506 setcount++;
4507 }
4508 } else {
4509 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4510 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4511 ecount--;
4512 setcount++;
4513 }
4514 }
4515 }
4516 DRT_HASH_SET_COUNT(cmap, index, ecount);
4517next:
4518 offset += pgcount * PAGE_SIZE;
4519 length -= pgcount * PAGE_SIZE;
4520 }
4521 if (setcountp != NULL)
4522 *setcountp = setcount;
4523
4524 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
4525
4526 return(KERN_SUCCESS);
4527}
4528
4529/*
4530 * Mark a set of pages as dirty/clean.
4531 *
4532 * This is a public interface.
4533 *
4534 * cmapp
4535 * Pointer to storage suitable for holding a pointer. Note that
4536 * this must either be NULL or a value set by this function.
4537 *
4538 * size
4539 * Current file size in bytes.
4540 *
4541 * offset
4542 * Offset of the first page to be marked as dirty, in bytes. Must be
4543 * page-aligned.
4544 *
4545 * length
4546 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
4547 *
4548 * setcountp
4549 * Number of pages newly marked dirty by this call (optional).
4550 *
4551 * Returns KERN_SUCCESS if all the pages were successfully marked.
4552 */
4553static kern_return_t
4554vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
4555{
4556 /* XXX size unused, drop from interface */
4557 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
4558}
4559
4560static kern_return_t
4561vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
4562{
4563 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
4564}
4565
4566/*
4567 * Get a cluster of dirty pages.
4568 *
4569 * This is a public interface.
4570 *
4571 * cmapp
4572 * Pointer to storage managed by drt_mark_pages. Note that this must
4573 * be NULL or a value set by drt_mark_pages.
4574 *
4575 * offsetp
4576 * Returns the byte offset into the file of the first page in the cluster.
4577 *
4578 * lengthp
4579 * Returns the length in bytes of the cluster of dirty pages.
4580 *
4581 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
4582 * are no dirty pages meeting the minimum size criteria. Private storage will
4583 * be released if there are no more dirty pages left in the map.
4584 *
4585 */
4586static kern_return_t
4587vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
4588{
4589 struct vfs_drt_clustermap *cmap;
4590 u_int64_t offset;
4591 u_int length;
4592 int index, i, j, fs, ls;
4593
4594 /* sanity */
4595 if ((cmapp == NULL) || (*cmapp == NULL))
4596 return(KERN_FAILURE);
4597 cmap = *cmapp;
4598
4599 /* walk the hashtable */
4600 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
4601 index = DRT_HASH(cmap, offset);
4602
4603 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
4604 continue;
4605
4606 /* scan the bitfield for a string of bits */
4607 fs = -1;
4608
4609 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4610 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
4611 fs = i;
4612 break;
4613 }
4614 }
4615 if (fs == -1) {
4616 /* didn't find any bits set */
4617 panic("vfs_drt: entry summary count > 0 but no bits set in map");
4618 }
4619 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
4620 if (!DRT_HASH_TEST_BIT(cmap, index, i))
4621 break;
4622 }
4623
4624 /* compute offset and length, mark pages clean */
4625 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
4626 length = ls * PAGE_SIZE;
4627 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
4628 cmap->scm_lastclean = index;
4629
4630 /* return successful */
4631 *offsetp = (off_t)offset;
4632 *lengthp = length;
4633
4634 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
4635 return(KERN_SUCCESS);
4636 }
4637 /*
4638 * We didn't find anything... hashtable is empty
4639 * emit stats into trace buffer and
4640 * then free it
4641 */
4642 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4643 cmap->scm_modulus,
4644 cmap->scm_buckets,
4645 cmap->scm_lastclean,
4646 cmap->scm_iskips);
4647
4648 vfs_drt_free_map(cmap);
4649 *cmapp = NULL;
4650
4651 return(KERN_FAILURE);
4652}
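/*
 * minimal usage sketch (illustrative only; it mirrors the way the
 * sparse_cluster_* routines above drive this interface, with 'scmap'
 * standing in for vp->v_scmap and the offset/length names hypothetical):
 *
 *	void *scmap = NULL;
 *	int new_dirty;
 *	off_t offset;
 *	u_int length;
 *
 *	(void) vfs_drt_mark_pages(&scmap, page_aligned_offset, nbytes, &new_dirty);
 *
 *	while (vfs_drt_get_cluster(&scmap, &offset, &length) == KERN_SUCCESS)
 *		... push out the dirty range [offset, offset + length) ...
 *
 * the map frees itself once the last dirty cluster has been handed back;
 * vfs_drt_control(&scmap, 0) discards it explicitly, as cluster_release does
 */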
4653
4654
4655static kern_return_t
4656vfs_drt_control(void **cmapp, int op_type)
4657{
4658 struct vfs_drt_clustermap *cmap;
4659
4660 /* sanity */
4661 if ((cmapp == NULL) || (*cmapp == NULL))
4662 return(KERN_FAILURE);
4663 cmap = *cmapp;
4664
4665 switch (op_type) {
4666 case 0:
4667 /* emit stats into trace buffer */
4668 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4669 cmap->scm_modulus,
4670 cmap->scm_buckets,
4671 cmap->scm_lastclean,
4672 cmap->scm_iskips);
4673
4674 vfs_drt_free_map(cmap);
4675 *cmapp = NULL;
4676 break;
4677
4678 case 1:
4679 cmap->scm_lastclean = 0;
4680 break;
4681 }
4682 return(KERN_SUCCESS);
4683}
4684
4685
4686
4687/*
4688 * Emit a summary of the state of the clustermap into the trace buffer
4689 * along with some caller-provided data.
4690 */
4691static void
4692vfs_drt_trace(struct vfs_drt_clustermap *cmap, int code, int arg1, int arg2, int arg3, int arg4)
4693{
4694 KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
4695}
4696
4697/*
4698 * Perform basic sanity check on the hash entry summary count
4699 * vs. the actual bits set in the entry.
4700 */
4701static void
4702vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
4703{
4704 int index, i;
4705 int bits_on;
4706
4707 for (index = 0; index < cmap->scm_modulus; index++) {
4708 if (DRT_HASH_VACANT(cmap, index))
4709 continue;
4710
4711 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4712 if (DRT_HASH_TEST_BIT(cmap, index, i))
4713 bits_on++;
4714 }
4715 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
4716 panic("bits_on = %d, index = %d\n", bits_on, index);
4717 }
b4c24cb9 4718}