]> git.saurik.com Git - apple/xnu.git/blame - bsd/vfs/vfs_cluster.c
xnu-517.3.7.tar.gz
[apple/xnu.git] / bsd / vfs / vfs_cluster.c
CommitLineData
1c79356b 1/*
9bccf70c 2 * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
1c79356b
A
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
43866e37 6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
1c79356b 7 *
43866e37
A
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
1c79356b
A
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
43866e37
A
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
1c79356b
A
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
26/*
27 * Copyright (c) 1993
28 * The Regents of the University of California. All rights reserved.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 * 1. Redistributions of source code must retain the above copyright
34 * notice, this list of conditions and the following disclaimer.
35 * 2. Redistributions in binary form must reproduce the above copyright
36 * notice, this list of conditions and the following disclaimer in the
37 * documentation and/or other materials provided with the distribution.
38 * 3. All advertising materials mentioning features or use of this software
39 * must display the following acknowledgement:
40 * This product includes software developed by the University of
41 * California, Berkeley and its contributors.
42 * 4. Neither the name of the University nor the names of its contributors
43 * may be used to endorse or promote products derived from this software
44 * without specific prior written permission.
45 *
46 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
47 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
50 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56 * SUCH DAMAGE.
57 *
58 * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
59 */
60
61#include <sys/param.h>
62#include <sys/proc.h>
63#include <sys/buf.h>
64#include <sys/vnode.h>
65#include <sys/mount.h>
66#include <sys/trace.h>
67#include <sys/malloc.h>
55e303ae
A
68#include <sys/time.h>
69#include <sys/kernel.h>
1c79356b
A
70#include <sys/resourcevar.h>
71#include <libkern/libkern.h>
55e303ae 72#include <machine/machine_routines.h>
1c79356b
A
73
74#include <sys/ubc.h>
75#include <vm/vm_pageout.h>
1c79356b 76
55e303ae
A
77#include <mach/mach_types.h>
78#include <mach/memory_object_types.h>
79
1c79356b
A
80#include <sys/kdebug.h>
81
55e303ae 82
1c79356b
A
/*
 * flags understood by cluster_io()
 */
#define CL_READ		0x01	/* issue a read (B_READ); otherwise the I/O is a write */
#define CL_ASYNC	0x02	/* don't wait; completion is handled by cluster_iodone */
#define CL_COMMIT	0x04	/* commit/abort the upl range when the I/O finishes */
#define CL_PAGEOUT	0x10	/* I/O is a pageout (B_PAGEOUT) */
#define CL_AGE		0x20	/* mark the bufs B_AGE */
#define CL_DUMP		0x40	/* mark the bufs B_NOCACHE */
#define CL_NOZERO	0x80	/* on a read, don't zero-fill the tail of the last page */
#define CL_PAGEIN	0x100	/* I/O is a pagein (B_PGIN) */
#define CL_DEV_MEMORY	0x200	/* treat the request as one 'giant' page */
#define CL_PRESERVE	0x400	/* mark the bufs B_PHYS */
#define CL_THROTTLE	0x800	/* limit the writes outstanding on the vnode */
b4c24cb9 94
d7e50217 95
b4c24cb9 96struct clios {
d7e50217
A
97 u_int io_completed; /* amount of io that has currently completed */
98 u_int io_issued; /* amount of io that was successfully issued */
99 int io_error; /* error code of first error encountered */
100 int io_wanted; /* someone is sleeping waiting for a change in state */
b4c24cb9
A
101};
102
1c79356b 103
9bccf70c
A
104static void cluster_zero(upl_t upl, vm_offset_t upl_offset,
105 int size, struct buf *bp);
106static int cluster_read_x(struct vnode *vp, struct uio *uio,
107 off_t filesize, int devblocksize, int flags);
108static int cluster_write_x(struct vnode *vp, struct uio *uio,
109 off_t oldEOF, off_t newEOF, off_t headOff,
110 off_t tailOff, int devblocksize, int flags);
111static int cluster_nocopy_read(struct vnode *vp, struct uio *uio,
112 off_t filesize, int devblocksize, int flags);
113static int cluster_nocopy_write(struct vnode *vp, struct uio *uio,
114 off_t newEOF, int devblocksize, int flags);
115static int cluster_phys_read(struct vnode *vp, struct uio *uio,
b4c24cb9
A
116 off_t filesize, int devblocksize, int flags);
117static int cluster_phys_write(struct vnode *vp, struct uio *uio,
118 off_t newEOF, int devblocksize, int flags);
119static int cluster_align_phys_io(struct vnode *vp, struct uio *uio,
55e303ae 120 addr64_t usr_paddr, int xsize, int devblocksize, int flags);
9bccf70c 121static int cluster_push_x(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last, int can_delay);
55e303ae
A
122static int cluster_try_push(struct vnode *vp, off_t EOF, int can_delay, int push_all);
123
124static int sparse_cluster_switch(struct vnode *vp, off_t EOF);
125static int sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all);
126static int sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last);
127
128static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp);
129static kern_return_t vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length);
130static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
131static kern_return_t vfs_drt_control(void **cmapp, int op_type);
132
133int ubc_page_op_with_control __P((memory_object_control_t, off_t, int, ppnum_t *, int *));
9bccf70c
A
134
135
1c79356b
A
/*
 * limits on the number of writes that may be outstanding
 * on a single vnode before cluster_io() makes the issuer wait
 *
 * ASYNC_THROTTLE is the normal limit... when
 * cluster_hard_throttle_on() reports that the hard throttle
 * applies, the limit drops to HARD_THROTTLE_MAXCNT and the
 * size of each I/O is capped at HARD_THROTTLE_MAXSIZE bytes
 */
#define ASYNC_THROTTLE		18
#define HARD_THROTTLE_MAXCNT	1
#define HARD_THROTTLE_MAXSIZE	(64 * 1024)

/*
 * non-zero forces the hard throttle on for the root device
 */
int	hard_throttle_on_root = 0;

/*
 * compared against the current time by cluster_hard_throttle_on()
 */
struct timeval	priority_IO_timestamp_for_root;
147
148
149static int
150cluster_hard_throttle_on(vp)
151 struct vnode *vp;
152{
153 static struct timeval hard_throttle_maxelapsed = { 0, 300000 };
154
155 if (vp->v_mount->mnt_kern_flag & MNTK_ROOTDEV) {
156 struct timeval elapsed;
157
158 if (hard_throttle_on_root)
159 return(1);
160
161 elapsed = time;
162 timevalsub(&elapsed, &priority_IO_timestamp_for_root);
163
164 if (timevalcmp(&elapsed, &hard_throttle_maxelapsed, <))
165 return(1);
166 }
167 return(0);
168}
169
1c79356b
A
170
171static int
172cluster_iodone(bp)
173 struct buf *bp;
174{
175 int b_flags;
176 int error;
177 int total_size;
178 int total_resid;
179 int upl_offset;
9bccf70c 180 int zero_offset;
1c79356b
A
181 upl_t upl;
182 struct buf *cbp;
183 struct buf *cbp_head;
184 struct buf *cbp_next;
185 struct buf *real_bp;
0b4e3aa0 186 struct vnode *vp;
b4c24cb9 187 struct clios *iostate;
1c79356b
A
188 int commit_size;
189 int pg_offset;
190
191
192 cbp_head = (struct buf *)(bp->b_trans_head);
193
194 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_START,
9bccf70c 195 (int)cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1c79356b
A
196
197 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) {
198 /*
199 * all I/O requests that are part of this transaction
200 * have to complete before we can process it
201 */
202 if ( !(cbp->b_flags & B_DONE)) {
203
204 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
9bccf70c 205 (int)cbp_head, (int)cbp, cbp->b_bcount, cbp->b_flags, 0);
1c79356b
A
206
207 return 0;
208 }
209 }
210 error = 0;
211 total_size = 0;
212 total_resid = 0;
213
214 cbp = cbp_head;
215 upl_offset = cbp->b_uploffset;
216 upl = cbp->b_pagelist;
217 b_flags = cbp->b_flags;
218 real_bp = cbp->b_real_bp;
0b4e3aa0 219 vp = cbp->b_vp;
9bccf70c 220 zero_offset= cbp->b_validend;
b4c24cb9 221 iostate = (struct clios *)cbp->b_iostate;
1c79356b
A
222
223 while (cbp) {
1c79356b
A
224 if ((cbp->b_flags & B_ERROR) && error == 0)
225 error = cbp->b_error;
226
227 total_resid += cbp->b_resid;
228 total_size += cbp->b_bcount;
229
230 cbp_next = cbp->b_trans_next;
231
232 free_io_buf(cbp);
233
234 cbp = cbp_next;
235 }
b4c24cb9
A
236 if (zero_offset)
237 cluster_zero(upl, zero_offset, PAGE_SIZE - (zero_offset & PAGE_MASK), real_bp);
238
0b4e3aa0
A
239 if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= (ASYNC_THROTTLE / 3))) {
240 vp->v_flag &= ~VTHROTTLED;
241 wakeup((caddr_t)&vp->v_numoutput);
242 }
b4c24cb9 243 if (iostate) {
d7e50217
A
244 /*
245 * someone has issued multiple I/Os asynchrounsly
246 * and is waiting for them to complete (streaming)
247 */
248 if (error && iostate->io_error == 0)
249 iostate->io_error = error;
9bccf70c 250
b4c24cb9
A
251 iostate->io_completed += total_size;
252
253 if (iostate->io_wanted) {
d7e50217
A
254 /*
255 * someone is waiting for the state of
256 * this io stream to change
257 */
b4c24cb9
A
258 iostate->io_wanted = 0;
259 wakeup((caddr_t)&iostate->io_wanted);
260 }
261 }
1c79356b
A
262 if ((b_flags & B_NEED_IODONE) && real_bp) {
263 if (error) {
264 real_bp->b_flags |= B_ERROR;
265 real_bp->b_error = error;
266 }
267 real_bp->b_resid = total_resid;
268
269 biodone(real_bp);
270 }
271 if (error == 0 && total_resid)
272 error = EIO;
273
274 if (b_flags & B_COMMIT_UPL) {
b4c24cb9 275 pg_offset = upl_offset & PAGE_MASK;
55e303ae 276 commit_size = (pg_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 277
55e303ae 278 if (error || (b_flags & B_NOCACHE)) {
1c79356b
A
279 int upl_abort_code;
280
55e303ae 281 if ((b_flags & B_PAGEOUT) && (error != ENXIO)) /* transient error */
1c79356b 282 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
0b4e3aa0
A
283 else if (b_flags & B_PGIN)
284 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
1c79356b
A
285 else
286 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
287
0b4e3aa0
A
288 ubc_upl_abort_range(upl, upl_offset - pg_offset, commit_size,
289 upl_abort_code);
1c79356b
A
290
291 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
9bccf70c 292 (int)upl, upl_offset - pg_offset, commit_size,
1c79356b
A
293 0x80000000|upl_abort_code, 0);
294
295 } else {
296 int upl_commit_flags = UPL_COMMIT_FREE_ON_EMPTY;
297
55e303ae
A
298 if (b_flags & B_PHYS) {
299 if (b_flags & B_READ)
300 upl_commit_flags |= UPL_COMMIT_SET_DIRTY;
301 } else if ( !(b_flags & B_PAGEOUT))
1c79356b 302 upl_commit_flags |= UPL_COMMIT_CLEAR_DIRTY;
55e303ae 303
1c79356b
A
304 if (b_flags & B_AGE)
305 upl_commit_flags |= UPL_COMMIT_INACTIVATE;
306
0b4e3aa0
A
307 ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size,
308 upl_commit_flags);
1c79356b 309
0b4e3aa0 310 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
9bccf70c 311 (int)upl, upl_offset - pg_offset, commit_size,
1c79356b
A
312 upl_commit_flags, 0);
313 }
314 } else
315 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END,
9bccf70c 316 (int)upl, upl_offset, 0, error, 0);
1c79356b
A
317
318 return (error);
319}
320
321
322static void
9bccf70c 323cluster_zero(upl, upl_offset, size, bp)
1c79356b
A
324 upl_t upl;
325 vm_offset_t upl_offset;
326 int size;
1c79356b
A
327 struct buf *bp;
328{
55e303ae 329 upl_page_info_t *pl;
1c79356b 330
55e303ae 331 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_START,
9bccf70c
A
332 upl_offset, size, (int)bp, 0, 0);
333
334 if (bp == NULL || bp->b_data == NULL) {
9bccf70c 335
55e303ae
A
336 pl = ubc_upl_pageinfo(upl);
337
338 while (size) {
339 int page_offset;
340 int page_index;
341 addr64_t zero_addr;
342 int zero_cnt;
343
344 page_index = upl_offset / PAGE_SIZE;
345 page_offset = upl_offset & PAGE_MASK;
346
347 zero_addr = ((addr64_t)upl_phys_page(pl, page_index) << 12) + page_offset;
348 zero_cnt = min(PAGE_SIZE - page_offset, size);
349
350 bzero_phys(zero_addr, zero_cnt);
351
352 size -= zero_cnt;
353 upl_offset += zero_cnt;
354 }
1c79356b 355 } else
55e303ae 356 bzero((caddr_t)((vm_offset_t)bp->b_data + upl_offset), size);
1c79356b 357
55e303ae
A
358 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 23)) | DBG_FUNC_END,
359 upl_offset, size, 0, 0, 0);
1c79356b
A
360}
361
1c79356b 362static int
b4c24cb9 363cluster_io(vp, upl, upl_offset, f_offset, non_rounded_size, devblocksize, flags, real_bp, iostate)
1c79356b
A
364 struct vnode *vp;
365 upl_t upl;
366 vm_offset_t upl_offset;
367 off_t f_offset;
9bccf70c
A
368 int non_rounded_size;
369 int devblocksize;
1c79356b
A
370 int flags;
371 struct buf *real_bp;
b4c24cb9 372 struct clios *iostate;
1c79356b
A
373{
374 struct buf *cbp;
b4c24cb9
A
375 u_int size;
376 u_int io_size;
1c79356b
A
377 int io_flags;
378 int error = 0;
379 int retval = 0;
380 struct buf *cbp_head = 0;
381 struct buf *cbp_tail = 0;
9bccf70c 382 int buf_count = 0;
1c79356b
A
383 int pg_count;
384 int pg_offset;
9bccf70c
A
385 u_int max_iosize;
386 u_int max_vectors;
0b4e3aa0 387 int priv;
9bccf70c 388 int zero_offset = 0;
55e303ae
A
389 int async_throttle;
390
391 if (devblocksize)
392 size = (non_rounded_size + (devblocksize - 1)) & ~(devblocksize - 1);
393 else
394 size = non_rounded_size;
395
396 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_START,
397 (int)f_offset, size, upl_offset, flags, 0);
398
1c79356b 399
0b4e3aa0 400 if (flags & CL_READ) {
1c79356b 401 io_flags = (B_VECTORLIST | B_READ);
0b4e3aa0
A
402
403 vfs_io_attributes(vp, B_READ, &max_iosize, &max_vectors);
404 } else {
1c79356b
A
405 io_flags = (B_VECTORLIST | B_WRITEINPROG);
406
0b4e3aa0
A
407 vfs_io_attributes(vp, B_WRITE, &max_iosize, &max_vectors);
408 }
55e303ae
A
409 /*
410 * make sure the maximum iosize are at least the size of a page
411 * and that they are multiples of the page size
412 */
413 max_iosize &= ~PAGE_MASK;
414
415 if (flags & CL_THROTTLE) {
416 if ( !(flags & CL_PAGEOUT) && cluster_hard_throttle_on(vp)) {
417 if (max_iosize > HARD_THROTTLE_MAXSIZE)
418 max_iosize = HARD_THROTTLE_MAXSIZE;
419 async_throttle = HARD_THROTTLE_MAXCNT;
420 } else
421 async_throttle = ASYNC_THROTTLE;
422 }
1c79356b
A
423 if (flags & CL_AGE)
424 io_flags |= B_AGE;
425 if (flags & CL_DUMP)
426 io_flags |= B_NOCACHE;
0b4e3aa0
A
427 if (flags & CL_PAGEIN)
428 io_flags |= B_PGIN;
b4c24cb9
A
429 if (flags & CL_PAGEOUT)
430 io_flags |= B_PAGEOUT;
431 if (flags & CL_COMMIT)
432 io_flags |= B_COMMIT_UPL;
433 if (flags & CL_PRESERVE)
434 io_flags |= B_PHYS;
1c79356b 435
9bccf70c 436 if ((flags & CL_READ) && ((upl_offset + non_rounded_size) & PAGE_MASK) && (!(flags & CL_NOZERO))) {
1c79356b
A
437 /*
438 * then we are going to end up
439 * with a page that we can't complete (the file size wasn't a multiple
440 * of PAGE_SIZE and we're trying to read to the end of the file
441 * so we'll go ahead and zero out the portion of the page we can't
442 * read in from the file
443 */
9bccf70c 444 zero_offset = upl_offset + non_rounded_size;
1c79356b
A
445 }
446 while (size) {
de355530
A
447 int vsize;
448 int i;
1c79356b
A
449 int pg_resid;
450 int num_contig;
451 daddr_t lblkno;
452 daddr_t blkno;
453
0b4e3aa0
A
454 if (size > max_iosize)
455 io_size = max_iosize;
1c79356b
A
456 else
457 io_size = size;
458
b4c24cb9 459 if (error = VOP_CMAP(vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL)) {
1c79356b
A
460 if (error == EOPNOTSUPP)
461 panic("VOP_CMAP Unimplemented");
462 break;
463 }
464
465 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 24)) | DBG_FUNC_NONE,
9bccf70c 466 (int)f_offset, (int)blkno, io_size, zero_offset, 0);
1c79356b
A
467
468 if ( (!(flags & CL_READ) && (long)blkno == -1) || io_size == 0) {
0b4e3aa0
A
469 if (flags & CL_PAGEOUT) {
470 error = EINVAL;
471 break;
472 };
473
474 /* Try paging out the page individually before
475 giving up entirely and dumping it (it could
476 be mapped in a "hole" and require allocation
477 before the I/O:
478 */
55e303ae 479 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
480 if (ubc_pushdirty_range(vp, f_offset, PAGE_SIZE_64) == 0) {
481 error = EINVAL;
482 break;
483 };
484
0b4e3aa0 485 f_offset += PAGE_SIZE_64;
55e303ae
A
486 upl_offset += PAGE_SIZE;
487 size -= PAGE_SIZE;
0b4e3aa0 488 continue;
1c79356b
A
489 }
490 lblkno = (daddr_t)(f_offset / PAGE_SIZE_64);
491 /*
492 * we have now figured out how much I/O we can do - this is in 'io_size'
1c79356b
A
493 * pg_offset is the starting point in the first page for the I/O
494 * pg_count is the number of full and partial pages that 'io_size' encompasses
495 */
1c79356b 496 pg_offset = upl_offset & PAGE_MASK;
1c79356b 497
0b4e3aa0
A
498 if (flags & CL_DEV_MEMORY) {
499 /*
500 * currently, can't deal with reading 'holes' in file
501 */
502 if ((long)blkno == -1) {
503 error = EINVAL;
504 break;
505 }
506 /*
507 * treat physical requests as one 'giant' page
508 */
509 pg_count = 1;
55e303ae
A
510 } else
511 pg_count = (io_size + pg_offset + (PAGE_SIZE - 1)) / PAGE_SIZE;
512
1c79356b 513 if ((flags & CL_READ) && (long)blkno == -1) {
9bccf70c
A
514 int bytes_to_zero;
515
1c79356b
A
516 /*
517 * if we're reading and blkno == -1, then we've got a
518 * 'hole' in the file that we need to deal with by zeroing
519 * out the affected area in the upl
520 */
9bccf70c
A
521 if (zero_offset && io_size == size) {
522 /*
523 * if this upl contains the EOF and it is not a multiple of PAGE_SIZE
524 * than 'zero_offset' will be non-zero
525 * if the 'hole' returned by VOP_CMAP extends all the way to the eof
526 * (indicated by the io_size finishing off the I/O request for this UPL)
527 * than we're not going to issue an I/O for the
528 * last page in this upl... we need to zero both the hole and the tail
529 * of the page beyond the EOF, since the delayed zero-fill won't kick in
530 */
531 bytes_to_zero = (((upl_offset + io_size) + (PAGE_SIZE - 1)) & ~PAGE_MASK) - upl_offset;
1c79356b 532
9bccf70c
A
533 zero_offset = 0;
534 } else
535 bytes_to_zero = io_size;
1c79356b 536
9bccf70c
A
537 cluster_zero(upl, upl_offset, bytes_to_zero, real_bp);
538
539 if (cbp_head)
540 /*
541 * if there is a current I/O chain pending
542 * then the first page of the group we just zero'd
543 * will be handled by the I/O completion if the zero
544 * fill started in the middle of the page
545 */
546 pg_count = (io_size - pg_offset) / PAGE_SIZE;
547 else {
548 /*
549 * no pending I/O to pick up that first page
550 * so, we have to make sure it gets committed
551 * here.
552 * set the pg_offset to 0 so that the upl_commit_range
553 * starts with this page
554 */
555 pg_count = (io_size + pg_offset) / PAGE_SIZE;
556 pg_offset = 0;
557 }
1c79356b 558 if (io_size == size && ((upl_offset + io_size) & PAGE_MASK))
9bccf70c
A
559 /*
560 * if we're done with the request for this UPL
561 * then we have to make sure to commit the last page
562 * even if we only partially zero-filled it
563 */
1c79356b
A
564 pg_count++;
565
566 if (pg_count) {
567 if (pg_offset)
568 pg_resid = PAGE_SIZE - pg_offset;
569 else
570 pg_resid = 0;
9bccf70c 571
1c79356b 572 if (flags & CL_COMMIT)
0b4e3aa0 573 ubc_upl_commit_range(upl,
9bccf70c 574 (upl_offset + pg_resid) & ~PAGE_MASK,
0b4e3aa0
A
575 pg_count * PAGE_SIZE,
576 UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_FREE_ON_EMPTY);
1c79356b
A
577 }
578 upl_offset += io_size;
579 f_offset += io_size;
580 size -= io_size;
581
9bccf70c 582 if (cbp_head && pg_count)
1c79356b
A
583 goto start_io;
584 continue;
9bccf70c 585
1c79356b
A
586 } else if (real_bp && (real_bp->b_blkno == real_bp->b_lblkno)) {
587 real_bp->b_blkno = blkno;
588 }
0b4e3aa0 589
55e303ae
A
590 if (pg_count > max_vectors) {
591 io_size -= (pg_count - max_vectors) * PAGE_SIZE;
0b4e3aa0 592
55e303ae
A
593 if (io_size < 0) {
594 io_size = PAGE_SIZE - pg_offset;
595 pg_count = 1;
596 } else
597 pg_count = max_vectors;
1c79356b 598 }
1c79356b 599
55e303ae
A
600 if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV))
601 /*
602 * if we're not targeting a virtual device i.e. a disk image
603 * it's safe to dip into the reserve pool since real devices
604 * can complete this I/O request without requiring additional
605 * bufs from the alloc_io_buf pool
606 */
607 priv = 1;
608 else if ((flags & CL_ASYNC) && !(flags & CL_PAGEOUT))
609 /*
610 * Throttle the speculative IO
611 */
0b4e3aa0
A
612 priv = 0;
613 else
614 priv = 1;
615
616 cbp = alloc_io_buf(vp, priv);
1c79356b 617
de355530 618
55e303ae
A
619 if (flags & CL_PAGEOUT) {
620 for (i = 0; i < pg_count; i++) {
1c79356b
A
621 int s;
622 struct buf *bp;
623
624 s = splbio();
625 if (bp = incore(vp, lblkno + i)) {
626 if (!ISSET(bp->b_flags, B_BUSY)) {
627 bremfree(bp);
628 SET(bp->b_flags, (B_BUSY | B_INVAL));
629 splx(s);
630 brelse(bp);
631 } else
632 panic("BUSY bp found in cluster_io");
633 }
634 splx(s);
635 }
1c79356b 636 }
b4c24cb9
A
637 if (flags & CL_ASYNC) {
638 cbp->b_flags |= (B_CALL | B_ASYNC);
639 cbp->b_iodone = (void *)cluster_iodone;
640 }
1c79356b
A
641 cbp->b_flags |= io_flags;
642
643 cbp->b_lblkno = lblkno;
644 cbp->b_blkno = blkno;
645 cbp->b_bcount = io_size;
646 cbp->b_pagelist = upl;
647 cbp->b_uploffset = upl_offset;
648 cbp->b_trans_next = (struct buf *)0;
649
b4c24cb9 650 if (cbp->b_iostate = (void *)iostate)
d7e50217
A
651 /*
652 * caller wants to track the state of this
653 * io... bump the amount issued against this stream
654 */
b4c24cb9
A
655 iostate->io_issued += io_size;
656
1c79356b
A
657 if (flags & CL_READ)
658 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 26)) | DBG_FUNC_NONE,
659 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
660 else
661 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 27)) | DBG_FUNC_NONE,
662 cbp->b_lblkno, cbp->b_blkno, upl_offset, io_size, 0);
663
664 if (cbp_head) {
665 cbp_tail->b_trans_next = cbp;
666 cbp_tail = cbp;
667 } else {
668 cbp_head = cbp;
669 cbp_tail = cbp;
670 }
671 (struct buf *)(cbp->b_trans_head) = cbp_head;
9bccf70c 672 buf_count++;
1c79356b
A
673
674 upl_offset += io_size;
675 f_offset += io_size;
676 size -= io_size;
677
9bccf70c 678 if ( (!(upl_offset & PAGE_MASK) && !(flags & CL_DEV_MEMORY) && ((flags & CL_ASYNC) || buf_count > 8)) || size == 0) {
1c79356b
A
679 /*
680 * if we have no more I/O to issue or
681 * the current I/O we've prepared fully
682 * completes the last page in this request
9bccf70c
A
683 * and it's either an ASYNC request or
684 * we've already accumulated more than 8 I/O's into
685 * this transaction and it's not an I/O directed to
686 * special DEVICE memory
1c79356b
A
687 * then go ahead and issue the I/O
688 */
689start_io:
1c79356b
A
690 if (real_bp) {
691 cbp_head->b_flags |= B_NEED_IODONE;
692 cbp_head->b_real_bp = real_bp;
9bccf70c
A
693 } else
694 cbp_head->b_real_bp = (struct buf *)NULL;
1c79356b 695
9bccf70c
A
696 if (size == 0) {
697 /*
698 * we're about to issue the last I/O for this upl
699 * if this was a read to the eof and the eof doesn't
700 * finish on a page boundary, than we need to zero-fill
701 * the rest of the page....
702 */
703 cbp_head->b_validend = zero_offset;
704 } else
705 cbp_head->b_validend = 0;
706
55e303ae
A
707 if (flags & CL_THROTTLE) {
708 while (vp->v_numoutput >= async_throttle) {
709 vp->v_flag |= VTHROTTLED;
710 tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "cluster_io", 0);
711 }
712 }
1c79356b
A
713 for (cbp = cbp_head; cbp;) {
714 struct buf * cbp_next;
715
716 if (io_flags & B_WRITEINPROG)
717 cbp->b_vp->v_numoutput++;
718
719 cbp_next = cbp->b_trans_next;
9bccf70c 720
1c79356b
A
721 (void) VOP_STRATEGY(cbp);
722 cbp = cbp_next;
723 }
724 if ( !(flags & CL_ASYNC)) {
725 for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next)
726 biowait(cbp);
727
728 if (error = cluster_iodone(cbp_head)) {
9bccf70c
A
729 if ((flags & CL_PAGEOUT) && (error == ENXIO))
730 retval = 0; /* drop the error */
731 else
732 retval = error;
1c79356b
A
733 error = 0;
734 }
735 }
736 cbp_head = (struct buf *)0;
737 cbp_tail = (struct buf *)0;
9bccf70c
A
738
739 buf_count = 0;
1c79356b
A
740 }
741 }
742 if (error) {
0b4e3aa0
A
743 int abort_size;
744
b4c24cb9
A
745 io_size = 0;
746
1c79356b
A
747 for (cbp = cbp_head; cbp;) {
748 struct buf * cbp_next;
749
0b4e3aa0
A
750 upl_offset -= cbp->b_bcount;
751 size += cbp->b_bcount;
b4c24cb9 752 io_size += cbp->b_bcount;
0b4e3aa0 753
1c79356b
A
754 cbp_next = cbp->b_trans_next;
755 free_io_buf(cbp);
756 cbp = cbp_next;
1c79356b 757 }
b4c24cb9 758 if (iostate) {
d7e50217
A
759 /*
760 * update the error condition for this stream
761 * since we never really issued the io
762 * just go ahead and adjust it back
763 */
764 if (iostate->io_error == 0)
b4c24cb9 765 iostate->io_error = error;
b4c24cb9
A
766 iostate->io_issued -= io_size;
767
768 if (iostate->io_wanted) {
d7e50217
A
769 /*
770 * someone is waiting for the state of
771 * this io stream to change
772 */
b4c24cb9
A
773 iostate->io_wanted = 0;
774 wakeup((caddr_t)&iostate->io_wanted);
775 }
776 }
0b4e3aa0 777 pg_offset = upl_offset & PAGE_MASK;
55e303ae 778 abort_size = (size + pg_offset + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b
A
779
780 if (flags & CL_COMMIT) {
781 int upl_abort_code;
782
55e303ae
A
783 if (flags & CL_PRESERVE) {
784 ubc_upl_commit_range(upl, upl_offset - pg_offset, abort_size,
785 UPL_COMMIT_FREE_ON_EMPTY);
786 } else {
787 if ((flags & CL_PAGEOUT) && (error != ENXIO)) /* transient error */
788 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
789 else if (flags & CL_PAGEIN)
790 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR;
791 else
792 upl_abort_code = UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_DUMP_PAGES;
1c79356b 793
55e303ae 794 ubc_upl_abort_range(upl, upl_offset - pg_offset, abort_size,
0b4e3aa0 795 upl_abort_code);
55e303ae 796 }
1c79356b 797 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 28)) | DBG_FUNC_NONE,
9bccf70c 798 (int)upl, upl_offset - pg_offset, abort_size, error, 0);
1c79356b
A
799 }
800 if (real_bp) {
801 real_bp->b_flags |= B_ERROR;
802 real_bp->b_error = error;
803
804 biodone(real_bp);
805 }
806 if (retval == 0)
807 retval = error;
808 }
809 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 22)) | DBG_FUNC_END,
810 (int)f_offset, size, upl_offset, retval, 0);
811
812 return (retval);
813}
814
815
816static int
0b4e3aa0 817cluster_rd_prefetch(vp, f_offset, size, filesize, devblocksize)
1c79356b 818 struct vnode *vp;
1c79356b
A
819 off_t f_offset;
820 u_int size;
821 off_t filesize;
822 int devblocksize;
823{
55e303ae 824 int pages_in_prefetch;
1c79356b
A
825
826 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_START,
827 (int)f_offset, size, (int)filesize, 0, 0);
828
829 if (f_offset >= filesize) {
830 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
831 (int)f_offset, 0, 0, 0, 0);
832 return(0);
833 }
0b4e3aa0 834 if (size > (MAX_UPL_TRANSFER * PAGE_SIZE))
55e303ae 835 size = (MAX_UPL_TRANSFER * PAGE_SIZE);
1c79356b 836 else
55e303ae 837 size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 838
9bccf70c
A
839 if ((off_t)size > (filesize - f_offset))
840 size = filesize - f_offset;
55e303ae 841 pages_in_prefetch = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
1c79356b 842
55e303ae 843 advisory_read(vp, filesize, f_offset, size, devblocksize);
1c79356b
A
844
845 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 49)) | DBG_FUNC_END,
55e303ae 846 (int)f_offset + size, pages_in_prefetch, 0, 1, 0);
1c79356b 847
55e303ae 848 return (pages_in_prefetch);
1c79356b
A
849}
850
851
852
853static void
0b4e3aa0 854cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize)
1c79356b 855 struct vnode *vp;
1c79356b
A
856 daddr_t b_lblkno;
857 daddr_t e_lblkno;
858 off_t filesize;
859 int devblocksize;
860{
861 daddr_t r_lblkno;
862 off_t f_offset;
863 int size_of_prefetch;
1c79356b
A
864
865 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_START,
866 b_lblkno, e_lblkno, vp->v_lastr, 0, 0);
867
868 if (b_lblkno == vp->v_lastr && b_lblkno == e_lblkno) {
869 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
870 vp->v_ralen, vp->v_maxra, vp->v_lastr, 0, 0);
871 return;
872 }
9bccf70c
A
873 if (vp->v_lastr == -1 || (b_lblkno != vp->v_lastr && b_lblkno != (vp->v_lastr + 1) &&
874 (b_lblkno != (vp->v_maxra + 1) || vp->v_ralen == 0))) {
1c79356b
A
875 vp->v_ralen = 0;
876 vp->v_maxra = 0;
877
878 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
879 vp->v_ralen, vp->v_maxra, vp->v_lastr, 1, 0);
880
881 return;
882 }
1c79356b 883 if (e_lblkno < vp->v_maxra) {
55e303ae 884 if ((vp->v_maxra - e_lblkno) > (MAX_UPL_TRANSFER / 4)) {
1c79356b
A
885
886 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
887 vp->v_ralen, vp->v_maxra, vp->v_lastr, 2, 0);
888 return;
889 }
890 }
891 r_lblkno = max(e_lblkno, vp->v_maxra) + 1;
892 f_offset = (off_t)r_lblkno * PAGE_SIZE_64;
893
55e303ae
A
894 size_of_prefetch = 0;
895
896 ubc_range_op(vp, f_offset, f_offset + PAGE_SIZE_64, UPL_ROP_PRESENT, &size_of_prefetch);
897
898 if (size_of_prefetch) {
899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
900 vp->v_ralen, vp->v_maxra, vp->v_lastr, 3, 0);
901 return;
902 }
9bccf70c 903 if (f_offset < filesize) {
55e303ae
A
904 vp->v_ralen = vp->v_ralen ? min(MAX_UPL_TRANSFER, vp->v_ralen << 1) : 1;
905
906 if (((e_lblkno + 1) - b_lblkno) > vp->v_ralen)
907 vp->v_ralen = min(MAX_UPL_TRANSFER, (e_lblkno + 1) - b_lblkno);
908
909 size_of_prefetch = cluster_rd_prefetch(vp, f_offset, vp->v_ralen * PAGE_SIZE, filesize, devblocksize);
1c79356b 910
9bccf70c
A
911 if (size_of_prefetch)
912 vp->v_maxra = (r_lblkno + size_of_prefetch) - 1;
913 }
1c79356b 914 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END,
55e303ae 915 vp->v_ralen, vp->v_maxra, vp->v_lastr, 4, 0);
1c79356b
A
916}
917
9bccf70c 918int
1c79356b
A
919cluster_pageout(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
920 struct vnode *vp;
921 upl_t upl;
922 vm_offset_t upl_offset;
923 off_t f_offset;
924 int size;
925 off_t filesize;
926 int devblocksize;
927 int flags;
928{
929 int io_size;
55e303ae 930 int rounded_size;
1c79356b 931 off_t max_size;
55e303ae
A
932 int local_flags;
933
934 if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV)
935 /*
936 * if we know we're issuing this I/O to a virtual device (i.e. disk image)
937 * then we don't want to enforce this throttle... if we do, we can
938 * potentially deadlock since we're stalling the pageout thread at a time
939 * when the disk image might need additional memory (which won't be available
940 * if the pageout thread can't run)... instead we'll just depend on the throttle
941 * that the pageout thread now has in place to deal with external files
942 */
943 local_flags = CL_PAGEOUT;
944 else
945 local_flags = CL_PAGEOUT | CL_THROTTLE;
1c79356b
A
946
947 if ((flags & UPL_IOSYNC) == 0)
948 local_flags |= CL_ASYNC;
949 if ((flags & UPL_NOCOMMIT) == 0)
950 local_flags |= CL_COMMIT;
951
1c79356b
A
952
953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE,
954 (int)f_offset, size, (int)filesize, local_flags, 0);
955
956 /*
957 * If they didn't specify any I/O, then we are done...
958 * we can't issue an abort because we don't know how
959 * big the upl really is
960 */
961 if (size <= 0)
962 return (EINVAL);
963
964 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
965 if (local_flags & CL_COMMIT)
9bccf70c 966 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
967 return (EROFS);
968 }
969 /*
970 * can't page-in from a negative offset
971 * or if we're starting beyond the EOF
972 * or if the file offset isn't page aligned
973 * or the size requested isn't a multiple of PAGE_SIZE
974 */
975 if (f_offset < 0 || f_offset >= filesize ||
976 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK)) {
0b4e3aa0
A
977 if (local_flags & CL_COMMIT)
978 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
979 return (EINVAL);
980 }
981 max_size = filesize - f_offset;
982
983 if (size < max_size)
984 io_size = size;
985 else
9bccf70c 986 io_size = max_size;
1c79356b 987
55e303ae 988 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 989
55e303ae 990 if (size > rounded_size) {
0b4e3aa0 991 if (local_flags & CL_COMMIT)
55e303ae 992 ubc_upl_abort_range(upl, upl_offset + rounded_size, size - rounded_size,
1c79356b
A
993 UPL_ABORT_FREE_ON_EMPTY);
994 }
55e303ae 995 vp->v_flag |= VHASBEENPAGED;
1c79356b 996
9bccf70c 997 return (cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
b4c24cb9 998 local_flags, (struct buf *)0, (struct clios *)0));
1c79356b
A
999}
1000
9bccf70c 1001int
1c79356b
A
1002cluster_pagein(vp, upl, upl_offset, f_offset, size, filesize, devblocksize, flags)
1003 struct vnode *vp;
1004 upl_t upl;
1005 vm_offset_t upl_offset;
1006 off_t f_offset;
1007 int size;
1008 off_t filesize;
1009 int devblocksize;
1010 int flags;
1011{
1012 u_int io_size;
9bccf70c 1013 int rounded_size;
1c79356b
A
1014 off_t max_size;
1015 int retval;
1016 int local_flags = 0;
1c79356b 1017
9bccf70c
A
1018 if (upl == NULL || size < 0)
1019 panic("cluster_pagein: NULL upl passed in");
1c79356b 1020
9bccf70c
A
1021 if ((flags & UPL_IOSYNC) == 0)
1022 local_flags |= CL_ASYNC;
1c79356b 1023 if ((flags & UPL_NOCOMMIT) == 0)
9bccf70c
A
1024 local_flags |= CL_COMMIT;
1025
1c79356b
A
1026
1027 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE,
1028 (int)f_offset, size, (int)filesize, local_flags, 0);
1029
1030 /*
1031 * can't page-in from a negative offset
1032 * or if we're starting beyond the EOF
1033 * or if the file offset isn't page aligned
1034 * or the size requested isn't a multiple of PAGE_SIZE
1035 */
1036 if (f_offset < 0 || f_offset >= filesize ||
9bccf70c
A
1037 (f_offset & PAGE_MASK_64) || (size & PAGE_MASK) || (upl_offset & PAGE_MASK)) {
1038 if (local_flags & CL_COMMIT)
1039 ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
1c79356b
A
1040 return (EINVAL);
1041 }
1042 max_size = filesize - f_offset;
1043
1044 if (size < max_size)
1045 io_size = size;
1046 else
9bccf70c 1047 io_size = max_size;
1c79356b 1048
9bccf70c 1049 rounded_size = (io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1c79356b 1050
9bccf70c
A
1051 if (size > rounded_size && (local_flags & CL_COMMIT))
1052 ubc_upl_abort_range(upl, upl_offset + rounded_size,
55e303ae 1053 size - rounded_size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
9bccf70c
A
1054
1055 retval = cluster_io(vp, upl, upl_offset, f_offset, io_size, devblocksize,
b4c24cb9 1056 local_flags | CL_READ | CL_PAGEIN, (struct buf *)0, (struct clios *)0);
1c79356b
A
1057
1058 if (retval == 0) {
1059 int b_lblkno;
1060 int e_lblkno;
1061
1062 b_lblkno = (int)(f_offset / PAGE_SIZE_64);
1063 e_lblkno = (int)
1064 ((f_offset + ((off_t)io_size - 1)) / PAGE_SIZE_64);
1065
9bccf70c 1066 if (!(flags & UPL_NORDAHEAD) && !(vp->v_flag & VRAOFF) && rounded_size == PAGE_SIZE) {
1c79356b
A
1067 /*
1068 * we haven't read the last page in of the file yet
1069 * so let's try to read ahead if we're in
1070 * a sequential access pattern
1071 */
0b4e3aa0 1072 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b
A
1073 }
1074 vp->v_lastr = e_lblkno;
1075 }
1076 return (retval);
1077}
1078
9bccf70c 1079int
1c79356b
A
1080cluster_bp(bp)
1081 struct buf *bp;
1082{
1083 off_t f_offset;
1084 int flags;
1085
9bccf70c
A
1086 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 19)) | DBG_FUNC_START,
1087 (int)bp, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0);
1088
1c79356b
A
1089 if (bp->b_pagelist == (upl_t) 0)
1090 panic("cluster_bp: can't handle NULL upl yet\n");
1091 if (bp->b_flags & B_READ)
9bccf70c 1092 flags = CL_ASYNC | CL_READ;
1c79356b 1093 else
9bccf70c 1094 flags = CL_ASYNC;
1c79356b
A
1095
1096 f_offset = ubc_blktooff(bp->b_vp, bp->b_lblkno);
1097
b4c24cb9 1098 return (cluster_io(bp->b_vp, bp->b_pagelist, 0, f_offset, bp->b_bcount, 0, flags, bp, (struct clios *)0));
1c79356b
A
1099}
1100
9bccf70c 1101int
1c79356b
A
1102cluster_write(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1103 struct vnode *vp;
1104 struct uio *uio;
1105 off_t oldEOF;
1106 off_t newEOF;
1107 off_t headOff;
1108 off_t tailOff;
1109 int devblocksize;
1110 int flags;
1111{
1c79356b
A
1112 int prev_resid;
1113 int clip_size;
1114 off_t max_io_size;
1115 struct iovec *iov;
0b4e3aa0 1116 int upl_size;
0b4e3aa0
A
1117 int upl_flags;
1118 upl_t upl;
1c79356b
A
1119 int retval = 0;
1120
55e303ae
A
1121
1122 if (vp->v_flag & VHASBEENPAGED)
1123 {
1124 /*
1125 * this vnode had pages cleaned to it by
1126 * the pager which indicates that either
1127 * it's not very 'hot', or the system is
1128 * being overwhelmed by a lot of dirty
1129 * data being delayed in the VM cache...
1130 * in either event, we'll push our remaining
1131 * delayed data at this point... this will
1132 * be more efficient than paging out 1 page at
1133 * a time, and will also act as a throttle
1134 * by delaying this client from writing any
1135 * more data until all his delayed data has
1136 * at least been queued to the uderlying driver.
1137 */
1138 cluster_push(vp);
1139
1140 vp->v_flag &= ~VHASBEENPAGED;
1141 }
1c79356b 1142
b4c24cb9 1143 if ( (!(vp->v_flag & VNOCACHE_DATA)) || (!uio) || (uio->uio_segflg != UIO_USERSPACE))
1c79356b 1144 {
55e303ae
A
1145 /*
1146 * go do a write through the cache if one of the following is true....
1147 * NOCACHE is not true
1148 * there is no uio structure or it doesn't target USERSPACE
1149 */
1150 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
1c79356b
A
1151 }
1152
1153 while (uio->uio_resid && uio->uio_offset < newEOF && retval == 0)
1154 {
55e303ae
A
1155 /*
1156 * we know we have a resid, so this is safe
1157 * skip over any emtpy vectors
1158 */
1c79356b 1159 iov = uio->uio_iov;
55e303ae 1160
1c79356b
A
1161 while (iov->iov_len == 0) {
1162 uio->uio_iov++;
1163 uio->uio_iovcnt--;
1164 iov = uio->uio_iov;
1165 }
55e303ae 1166 upl_size = PAGE_SIZE;
0b4e3aa0 1167 upl_flags = UPL_QUERY_OBJECT_TYPE;
55e303ae 1168
0b4e3aa0
A
1169 if ((vm_map_get_upl(current_map(),
1170 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
55e303ae 1171 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
0b4e3aa0
A
1172 {
1173 /*
1174 * the user app must have passed in an invalid address
1175 */
1176 return (EFAULT);
1177 }
1178
55e303ae
A
1179 /*
1180 * We check every vector target but if it is physically
1181 * contiguous space, we skip the sanity checks.
1182 */
0b4e3aa0
A
1183 if (upl_flags & UPL_PHYS_CONTIG)
1184 {
0b4e3aa0
A
1185 if (flags & IO_HEADZEROFILL)
1186 {
1187 flags &= ~IO_HEADZEROFILL;
1188
1189 if (retval = cluster_write_x(vp, (struct uio *)0, 0, uio->uio_offset, headOff, 0, devblocksize, IO_HEADZEROFILL))
1190 return(retval);
1191 }
1192
b4c24cb9 1193 retval = cluster_phys_write(vp, uio, newEOF, devblocksize, flags);
0b4e3aa0
A
1194
1195 if (uio->uio_resid == 0 && (flags & IO_TAILZEROFILL))
1196 {
55e303ae 1197 return (cluster_write_x(vp, (struct uio *)0, 0, tailOff, uio->uio_offset, 0, devblocksize, IO_HEADZEROFILL));
0b4e3aa0
A
1198 }
1199 }
55e303ae 1200 else if ((uio->uio_resid < PAGE_SIZE) || (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)))
0b4e3aa0
A
1201 {
1202 /*
55e303ae
A
1203 * we're here because we're don't have a physically contiguous target buffer
1204 * go do a write through the cache if one of the following is true....
1205 * the total xfer size is less than a page...
1206 * we're being asked to ZEROFILL either the head or the tail of the I/O...
0b4e3aa0 1207 */
55e303ae 1208 return (cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags));
0b4e3aa0 1209 }
55e303ae 1210 else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
1c79356b 1211 {
55e303ae
A
1212 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
1213 {
1214 /*
1215 * Bring the file offset write up to a pagesize boundary
1216 * this will also bring the base address to a page boundary
1217 * since they both are currently on the same offset within a page
1218 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
1219 * so the computed clip_size must always be less than the current uio_resid
1220 */
1221 clip_size = (PAGE_SIZE - (uio->uio_offset & PAGE_MASK_64));
1222
1223 /*
1224 * Fake the resid going into the cluster_write_x call
1225 * and restore it on the way out.
1226 */
1227 prev_resid = uio->uio_resid;
1228 uio->uio_resid = clip_size;
1229 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1230 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1231 }
1232 else
1233 {
1234 /*
1235 * can't get both the file offset and the buffer offset aligned to a page boundary
1236 * so fire an I/O through the cache for this entire vector
1237 */
1238 clip_size = iov->iov_len;
1239 prev_resid = uio->uio_resid;
1240 uio->uio_resid = clip_size;
1241 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1242 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1243 }
1c79356b
A
1244 }
1245 else
1246 {
1247 /*
1248 * If we come in here, we know the offset into
55e303ae
A
1249 * the file is on a pagesize boundary and the
1250 * target buffer address is also on a page boundary
1c79356b 1251 */
1c79356b
A
1252 max_io_size = newEOF - uio->uio_offset;
1253 clip_size = uio->uio_resid;
1254 if (iov->iov_len < clip_size)
1255 clip_size = iov->iov_len;
1256 if (max_io_size < clip_size)
1257 clip_size = max_io_size;
1258
1259 if (clip_size < PAGE_SIZE)
1260 {
1261 /*
1262 * Take care of tail end of write in this vector
1263 */
1264 prev_resid = uio->uio_resid;
1265 uio->uio_resid = clip_size;
0b4e3aa0 1266 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1c79356b
A
1267 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1268 }
1269 else
1270 {
1271 /* round clip_size down to a multiple of pagesize */
1272 clip_size = clip_size & ~(PAGE_MASK);
1273 prev_resid = uio->uio_resid;
1274 uio->uio_resid = clip_size;
0b4e3aa0 1275 retval = cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags);
1c79356b 1276 if ((retval == 0) && uio->uio_resid)
0b4e3aa0 1277 retval = cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags);
1c79356b
A
1278 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
1279 }
1280 } /* end else */
1281 } /* end while */
1282 return(retval);
1283}
1284
b4c24cb9 1285
9bccf70c 1286static int
0b4e3aa0 1287cluster_nocopy_write(vp, uio, newEOF, devblocksize, flags)
1c79356b
A
1288 struct vnode *vp;
1289 struct uio *uio;
1290 off_t newEOF;
1291 int devblocksize;
1292 int flags;
1293{
1294 upl_t upl;
1295 upl_page_info_t *pl;
1296 off_t upl_f_offset;
1297 vm_offset_t upl_offset;
1298 off_t max_io_size;
1299 int io_size;
d7e50217 1300 int io_flag;
1c79356b
A
1301 int upl_size;
1302 int upl_needed_size;
1303 int pages_in_pl;
1304 int upl_flags;
1305 kern_return_t kret;
1306 struct iovec *iov;
1307 int i;
1308 int force_data_sync;
1309 int error = 0;
d7e50217 1310 struct clios iostate;
1c79356b
A
1311
1312 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_START,
1313 (int)uio->uio_offset, (int)uio->uio_resid,
1314 (int)newEOF, devblocksize, 0);
1315
1316 /*
1317 * When we enter this routine, we know
1318 * -- the offset into the file is on a pagesize boundary
1319 * -- the resid is a page multiple
1320 * -- the resid will not exceed iov_len
1321 */
143cc14e 1322 cluster_try_push(vp, newEOF, 0, 1);
1c79356b 1323
d7e50217
A
1324 iostate.io_completed = 0;
1325 iostate.io_issued = 0;
1326 iostate.io_error = 0;
1327 iostate.io_wanted = 0;
1328
1c79356b 1329 iov = uio->uio_iov;
1c79356b 1330
0b4e3aa0 1331 while (uio->uio_resid && uio->uio_offset < newEOF && error == 0) {
d7e50217 1332 io_size = uio->uio_resid;
1c79356b 1333
d7e50217
A
1334 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1335 io_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1336
55e303ae 1337 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
d7e50217
A
1338 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1339
1340 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_START,
1341 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1342
1343 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
1344 pages_in_pl = 0;
1345 upl_size = upl_needed_size;
1346 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1347 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
d7e50217
A
1348
1349 kret = vm_map_get_upl(current_map(),
1350 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1351 &upl_size,
1352 &upl,
1353 NULL,
1354 &pages_in_pl,
1355 &upl_flags,
1356 force_data_sync);
1357
1358 if (kret != KERN_SUCCESS) {
1359 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1360 0, 0, 0, kret, 0);
d7e50217
A
1361 /*
1362 * cluster_nocopy_write: failed to get pagelist
1363 *
1364 * we may have already spun some portion of this request
1365 * off as async requests... we need to wait for the I/O
1366 * to complete before returning
1367 */
1368 goto wait_for_writes;
1369 }
1370 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1371 pages_in_pl = upl_size / PAGE_SIZE;
1c79356b 1372
d7e50217
A
1373 for (i = 0; i < pages_in_pl; i++) {
1374 if (!upl_valid_page(pl, i))
1375 break;
1376 }
1377 if (i == pages_in_pl)
1378 break;
1c79356b 1379
d7e50217
A
1380 /*
1381 * didn't get all the pages back that we
1382 * needed... release this upl and try again
1383 */
1384 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1385 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1386 }
d7e50217
A
1387 if (force_data_sync >= 3) {
1388 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1389 i, pages_in_pl, upl_size, kret, 0);
d7e50217
A
1390 /*
1391 * for some reason, we couldn't acquire a hold on all
1392 * the pages needed in the user's address space
1393 *
1394 * we may have already spun some portion of this request
1395 * off as async requests... we need to wait for the I/O
1396 * to complete before returning
1397 */
1398 goto wait_for_writes;
1c79356b 1399 }
0b4e3aa0 1400
d7e50217
A
1401 /*
1402 * Consider the possibility that upl_size wasn't satisfied.
1403 */
1404 if (upl_size != upl_needed_size)
1405 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 1406
d7e50217
A
1407 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 76)) | DBG_FUNC_END,
1408 (int)upl_offset, upl_size, (int)iov->iov_base, io_size, 0);
1c79356b 1409
d7e50217
A
1410 if (io_size == 0) {
1411 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1412 UPL_ABORT_FREE_ON_EMPTY);
d7e50217
A
1413 /*
1414 * we may have already spun some portion of this request
1415 * off as async requests... we need to wait for the I/O
1416 * to complete before returning
1417 */
1418 goto wait_for_writes;
1419 }
1420 /*
1421 * Now look for pages already in the cache
1422 * and throw them away.
55e303ae
A
1423 * uio->uio_offset is page aligned within the file
1424 * io_size is a multiple of PAGE_SIZE
d7e50217 1425 */
55e303ae 1426 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + io_size, UPL_ROP_DUMP, NULL);
1c79356b 1427
d7e50217
A
1428 /*
1429 * we want push out these writes asynchronously so that we can overlap
1430 * the preparation of the next I/O
1431 * if there are already too many outstanding writes
1432 * wait until some complete before issuing the next
1433 */
1434 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
1435 iostate.io_wanted = 1;
1436 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1437 }
1438 if (iostate.io_error) {
1439 /*
1440 * one of the earlier writes we issued ran into a hard error
1441 * don't issue any more writes, cleanup the UPL
1442 * that was just created but not used, then
1443 * go wait for all writes that are part of this stream
1444 * to complete before returning the error to the caller
1445 */
1446 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
1447 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 1448
d7e50217
A
1449 goto wait_for_writes;
1450 }
55e303ae 1451 io_flag = CL_ASYNC | CL_PRESERVE | CL_COMMIT | CL_THROTTLE;
1c79356b 1452
d7e50217
A
1453 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_START,
1454 (int)upl_offset, (int)uio->uio_offset, io_size, io_flag, 0);
1c79356b 1455
d7e50217
A
1456 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1457 io_size, devblocksize, io_flag, (struct buf *)0, &iostate);
7b1edb79 1458
d7e50217
A
1459 iov->iov_len -= io_size;
1460 iov->iov_base += io_size;
1461 uio->uio_resid -= io_size;
1462 uio->uio_offset += io_size;
1c79356b 1463
d7e50217
A
1464 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 77)) | DBG_FUNC_END,
1465 (int)upl_offset, (int)uio->uio_offset, (int)uio->uio_resid, error, 0);
1c79356b
A
1466
1467 } /* end while */
1468
d7e50217
A
1469wait_for_writes:
1470 /*
1471 * make sure all async writes issued as part of this stream
1472 * have completed before we return
1473 */
1474 while (iostate.io_issued != iostate.io_completed) {
1475 iostate.io_wanted = 1;
1476 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_write", 0);
1477 }
1478 if (iostate.io_error)
1479 error = iostate.io_error;
1c79356b
A
1480
1481 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 75)) | DBG_FUNC_END,
1482 (int)uio->uio_offset, (int)uio->uio_resid, error, 4, 0);
1483
1484 return (error);
1485}
1486
b4c24cb9 1487
9bccf70c 1488static int
b4c24cb9 1489cluster_phys_write(vp, uio, newEOF, devblocksize, flags)
0b4e3aa0
A
1490 struct vnode *vp;
1491 struct uio *uio;
143cc14e 1492 off_t newEOF;
b4c24cb9
A
1493 int devblocksize;
1494 int flags;
0b4e3aa0 1495{
b4c24cb9 1496 upl_page_info_t *pl;
55e303ae 1497 addr64_t src_paddr;
0b4e3aa0
A
1498 upl_t upl;
1499 vm_offset_t upl_offset;
b4c24cb9 1500 int tail_size;
0b4e3aa0
A
1501 int io_size;
1502 int upl_size;
1503 int upl_needed_size;
1504 int pages_in_pl;
1505 int upl_flags;
1506 kern_return_t kret;
1507 struct iovec *iov;
1508 int error = 0;
1509
1510 /*
1511 * When we enter this routine, we know
1512 * -- the resid will not exceed iov_len
1513 * -- the vector target address is physcially contiguous
1514 */
143cc14e 1515 cluster_try_push(vp, newEOF, 0, 1);
0b4e3aa0
A
1516
1517 iov = uio->uio_iov;
1518 io_size = iov->iov_len;
55e303ae 1519 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
0b4e3aa0
A
1520 upl_needed_size = upl_offset + io_size;
1521
1522 pages_in_pl = 0;
1523 upl_size = upl_needed_size;
9bccf70c 1524 upl_flags = UPL_FILE_IO | UPL_COPYOUT_FROM | UPL_NO_SYNC |
55e303ae 1525 UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0
A
1526
1527 kret = vm_map_get_upl(current_map(),
1528 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
1529 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
1530
b4c24cb9
A
1531 if (kret != KERN_SUCCESS) {
1532 /*
1533 * cluster_phys_write: failed to get pagelist
1534 * note: return kret here
1535 */
0b4e3aa0 1536 return(EINVAL);
b4c24cb9 1537 }
0b4e3aa0
A
1538 /*
1539 * Consider the possibility that upl_size wasn't satisfied.
1540 * This is a failure in the physical memory case.
1541 */
b4c24cb9
A
1542 if (upl_size < upl_needed_size) {
1543 kernel_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1544 return(EINVAL);
1545 }
1546 pl = ubc_upl_pageinfo(upl);
0b4e3aa0 1547
55e303ae 1548 src_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
0b4e3aa0 1549
b4c24cb9
A
1550 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
1551 int head_size;
0b4e3aa0 1552
b4c24cb9 1553 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
0b4e3aa0 1554
b4c24cb9
A
1555 if (head_size > io_size)
1556 head_size = io_size;
1557
1558 error = cluster_align_phys_io(vp, uio, src_paddr, head_size, devblocksize, 0);
1559
1560 if (error) {
1561 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1562
1563 return(EINVAL);
1564 }
1565 upl_offset += head_size;
1566 src_paddr += head_size;
1567 io_size -= head_size;
0b4e3aa0 1568 }
b4c24cb9
A
1569 tail_size = io_size & (devblocksize - 1);
1570 io_size -= tail_size;
1571
1572 if (io_size) {
1573 /*
1574 * issue a synchronous write to cluster_io
1575 */
1576 error = cluster_io(vp, upl, upl_offset, uio->uio_offset,
1577 io_size, 0, CL_DEV_MEMORY, (struct buf *)0, (struct clios *)0);
1578 }
1579 if (error == 0) {
1580 /*
1581 * The cluster_io write completed successfully,
1582 * update the uio structure
1583 */
1584 uio->uio_resid -= io_size;
1585 iov->iov_len -= io_size;
1586 iov->iov_base += io_size;
1587 uio->uio_offset += io_size;
1588 src_paddr += io_size;
1589
1590 if (tail_size)
1591 error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, devblocksize, 0);
1592 }
1593 /*
1594 * just release our hold on the physically contiguous
1595 * region without changing any state
1596 */
1597 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
1598
1599 return (error);
1600}
1601
b4c24cb9 1602
9bccf70c 1603static int
0b4e3aa0 1604cluster_write_x(vp, uio, oldEOF, newEOF, headOff, tailOff, devblocksize, flags)
1c79356b
A
1605 struct vnode *vp;
1606 struct uio *uio;
1607 off_t oldEOF;
1608 off_t newEOF;
1609 off_t headOff;
1610 off_t tailOff;
1611 int devblocksize;
1612 int flags;
1613{
1614 upl_page_info_t *pl;
1615 upl_t upl;
1616 vm_offset_t upl_offset;
1617 int upl_size;
1618 off_t upl_f_offset;
1619 int pages_in_upl;
1620 int start_offset;
1621 int xfer_resid;
1622 int io_size;
1c79356b 1623 int io_flags;
1c79356b
A
1624 int io_offset;
1625 int bytes_to_zero;
1626 int bytes_to_move;
1627 kern_return_t kret;
1628 int retval = 0;
1629 int uio_resid;
1630 long long total_size;
1631 long long zero_cnt;
1632 off_t zero_off;
1633 long long zero_cnt1;
1634 off_t zero_off1;
1635 daddr_t start_blkno;
1636 daddr_t last_blkno;
55e303ae
A
1637 int intersection;
1638
1c79356b
A
1639
1640 if (uio) {
1641 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1642 (int)uio->uio_offset, uio->uio_resid, (int)oldEOF, (int)newEOF, 0);
1643
1644 uio_resid = uio->uio_resid;
1645 } else {
1646 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_START,
1647 0, 0, (int)oldEOF, (int)newEOF, 0);
1648
1649 uio_resid = 0;
1650 }
1651 zero_cnt = 0;
1652 zero_cnt1 = 0;
1653
1654 if (flags & IO_HEADZEROFILL) {
1655 /*
1656 * some filesystems (HFS is one) don't support unallocated holes within a file...
1657 * so we zero fill the intervening space between the old EOF and the offset
1658 * where the next chunk of real data begins.... ftruncate will also use this
1659 * routine to zero fill to the new EOF when growing a file... in this case, the
1660 * uio structure will not be provided
1661 */
1662 if (uio) {
1663 if (headOff < uio->uio_offset) {
1664 zero_cnt = uio->uio_offset - headOff;
1665 zero_off = headOff;
1666 }
1667 } else if (headOff < newEOF) {
1668 zero_cnt = newEOF - headOff;
1669 zero_off = headOff;
1670 }
1671 }
1672 if (flags & IO_TAILZEROFILL) {
1673 if (uio) {
1674 zero_off1 = uio->uio_offset + uio->uio_resid;
1675
1676 if (zero_off1 < tailOff)
1677 zero_cnt1 = tailOff - zero_off1;
1678 }
1679 }
55e303ae 1680 if (zero_cnt == 0 && uio == (struct uio *) 0) {
1c79356b
A
1681 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
1682 retval, 0, 0, 0, 0);
1683 return (0);
55e303ae 1684 }
1c79356b
A
1685
1686 while ((total_size = (uio_resid + zero_cnt + zero_cnt1)) && retval == 0) {
1687 /*
1688 * for this iteration of the loop, figure out where our starting point is
1689 */
1690 if (zero_cnt) {
1691 start_offset = (int)(zero_off & PAGE_MASK_64);
1692 upl_f_offset = zero_off - start_offset;
1693 } else if (uio_resid) {
1694 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1695 upl_f_offset = uio->uio_offset - start_offset;
1696 } else {
1697 start_offset = (int)(zero_off1 & PAGE_MASK_64);
1698 upl_f_offset = zero_off1 - start_offset;
1699 }
1700 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 46)) | DBG_FUNC_NONE,
1701 (int)zero_off, (int)zero_cnt, (int)zero_off1, (int)zero_cnt1, 0);
1702
0b4e3aa0
A
1703 if (total_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1704 total_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b 1705
55e303ae
A
1706 start_blkno = (daddr_t)(upl_f_offset / PAGE_SIZE_64);
1707
1708 if (uio && !(vp->v_flag & VNOCACHE_DATA) &&
1709 (flags & (IO_SYNC | IO_HEADZEROFILL | IO_TAILZEROFILL)) == 0) {
1710 /*
1711 * assumption... total_size <= uio_resid
1712 * because IO_HEADZEROFILL and IO_TAILZEROFILL not set
1713 */
1714 if ((start_offset + total_size) > (MAX_UPL_TRANSFER * PAGE_SIZE))
1715 total_size -= start_offset;
1716 xfer_resid = total_size;
1717
1718 retval = cluster_copy_ubc_data(vp, uio, &xfer_resid, 1);
1719
1720 if (retval)
1721 break;
1722
1723 uio_resid -= (total_size - xfer_resid);
1724 total_size = xfer_resid;
1725 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
1726 upl_f_offset = uio->uio_offset - start_offset;
1727
1728 if (total_size == 0) {
1729 if (start_offset) {
1730 /*
1731 * the write did not finish on a page boundary
1732 * which will leave upl_f_offset pointing to the
1733 * beginning of the last page written instead of
1734 * the page beyond it... bump it in this case
1735 * so that the cluster code records the last page
1736 * written as dirty
1737 */
1738 upl_f_offset += PAGE_SIZE_64;
1739 }
1740 upl_size = 0;
1741
1742 goto check_cluster;
1743 }
1744 }
1c79356b
A
1745 /*
1746 * compute the size of the upl needed to encompass
1747 * the requested write... limit each call to cluster_io
0b4e3aa0
A
1748 * to the maximum UPL size... cluster_io will clip if
1749 * this exceeds the maximum io_size for the device,
1750 * make sure to account for
1c79356b
A
1751 * a starting offset that's not page aligned
1752 */
1753 upl_size = (start_offset + total_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
1754
0b4e3aa0
A
1755 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
1756 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
1c79356b
A
1757
1758 pages_in_upl = upl_size / PAGE_SIZE;
1759 io_size = upl_size - start_offset;
1760
1761 if ((long long)io_size > total_size)
1762 io_size = total_size;
1763
55e303ae
A
1764 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, io_size, total_size, 0, 0);
1765
1c79356b 1766
0b4e3aa0
A
1767 kret = ubc_create_upl(vp,
1768 upl_f_offset,
1769 upl_size,
1770 &upl,
1771 &pl,
55e303ae 1772 UPL_SET_LITE);
1c79356b
A
1773 if (kret != KERN_SUCCESS)
1774 panic("cluster_write: failed to get pagelist");
1775
55e303ae
A
1776 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END,
1777 (int)upl, (int)upl_f_offset, start_offset, 0, 0);
1c79356b
A
1778
1779 if (start_offset && !upl_valid_page(pl, 0)) {
0b4e3aa0 1780 int read_size;
1c79356b 1781
0b4e3aa0 1782 /*
1c79356b
A
1783 * we're starting in the middle of the first page of the upl
1784 * and the page isn't currently valid, so we're going to have
1785 * to read it in first... this is a synchronous operation
1786 */
1787 read_size = PAGE_SIZE;
1788
9bccf70c 1789 if ((upl_f_offset + read_size) > newEOF)
1c79356b 1790 read_size = newEOF - upl_f_offset;
9bccf70c
A
1791
1792 retval = cluster_io(vp, upl, 0, upl_f_offset, read_size, devblocksize,
b4c24cb9 1793 CL_READ, (struct buf *)0, (struct clios *)0);
1c79356b 1794 if (retval) {
0b4e3aa0 1795 /*
1c79356b
A
1796 * we had an error during the read which causes us to abort
1797 * the current cluster_write request... before we do, we need
1798 * to release the rest of the pages in the upl without modifying
1799 * there state and mark the failed page in error
1800 */
0b4e3aa0 1801 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
9bccf70c 1802 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1803
1804 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1805 (int)upl, 0, 0, retval, 0);
1c79356b
A
1806 break;
1807 }
1808 }
1809 if ((start_offset == 0 || upl_size > PAGE_SIZE) && ((start_offset + io_size) & PAGE_MASK)) {
1810 /*
1811 * the last offset we're writing to in this upl does not end on a page
1812 * boundary... if it's not beyond the old EOF, then we'll also need to
1813 * pre-read this page in if it isn't already valid
1814 */
1815 upl_offset = upl_size - PAGE_SIZE;
1816
1817 if ((upl_f_offset + start_offset + io_size) < oldEOF &&
1818 !upl_valid_page(pl, upl_offset / PAGE_SIZE)) {
1819 int read_size;
1820
1821 read_size = PAGE_SIZE;
1822
9bccf70c 1823 if ((upl_f_offset + upl_offset + read_size) > newEOF)
1c79356b 1824 read_size = newEOF - (upl_f_offset + upl_offset);
9bccf70c
A
1825
1826 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, read_size, devblocksize,
b4c24cb9 1827 CL_READ, (struct buf *)0, (struct clios *)0);
1c79356b 1828 if (retval) {
0b4e3aa0 1829 /*
1c79356b 1830 * we had an error during the read which causes us to abort
0b4e3aa0
A
1831 * the current cluster_write request... before we do, we
1832 * need to release the rest of the pages in the upl without
1833 * modifying there state and mark the failed page in error
1c79356b 1834 */
9bccf70c
A
1835 ubc_upl_abort_range(upl, upl_offset, PAGE_SIZE, UPL_ABORT_DUMP_PAGES);
1836 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1837
1838 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1839 (int)upl, 0, 0, retval, 0);
1c79356b
A
1840 break;
1841 }
1842 }
1843 }
1c79356b
A
1844 xfer_resid = io_size;
1845 io_offset = start_offset;
1846
1847 while (zero_cnt && xfer_resid) {
1848
1849 if (zero_cnt < (long long)xfer_resid)
1850 bytes_to_zero = zero_cnt;
1851 else
1852 bytes_to_zero = xfer_resid;
1853
9bccf70c 1854 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 1855 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 1856 } else {
9bccf70c
A
1857 int zero_pg_index;
1858
1c79356b 1859 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off & PAGE_MASK_64));
9bccf70c
A
1860 zero_pg_index = (int)((zero_off - upl_f_offset) / PAGE_SIZE_64);
1861
1862 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 1863 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 1864
9bccf70c
A
1865 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1866 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 1867 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
1868 }
1869 }
1870 xfer_resid -= bytes_to_zero;
1871 zero_cnt -= bytes_to_zero;
1872 zero_off += bytes_to_zero;
1873 io_offset += bytes_to_zero;
1874 }
1875 if (xfer_resid && uio_resid) {
1876 bytes_to_move = min(uio_resid, xfer_resid);
1877
55e303ae 1878 retval = cluster_copy_upl_data(uio, upl, io_offset, bytes_to_move);
9bccf70c 1879
1c79356b 1880 if (retval) {
9bccf70c
A
1881
1882 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
1883
1884 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
9bccf70c 1885 (int)upl, 0, 0, retval, 0);
1c79356b
A
1886 } else {
1887 uio_resid -= bytes_to_move;
1888 xfer_resid -= bytes_to_move;
1889 io_offset += bytes_to_move;
1890 }
1891 }
1892 while (xfer_resid && zero_cnt1 && retval == 0) {
1893
1894 if (zero_cnt1 < (long long)xfer_resid)
1895 bytes_to_zero = zero_cnt1;
1896 else
1897 bytes_to_zero = xfer_resid;
1898
9bccf70c 1899 if ( !(flags & (IO_NOZEROVALID | IO_NOZERODIRTY))) {
55e303ae 1900 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b 1901 } else {
9bccf70c
A
1902 int zero_pg_index;
1903
1c79356b 1904 bytes_to_zero = min(bytes_to_zero, PAGE_SIZE - (int)(zero_off1 & PAGE_MASK_64));
9bccf70c
A
1905 zero_pg_index = (int)((zero_off1 - upl_f_offset) / PAGE_SIZE_64);
1906
1907 if ( !upl_valid_page(pl, zero_pg_index)) {
55e303ae 1908 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
9bccf70c
A
1909 } else if ((flags & (IO_NOZERODIRTY | IO_NOZEROVALID)) == IO_NOZERODIRTY &&
1910 !upl_dirty_page(pl, zero_pg_index)) {
55e303ae 1911 cluster_zero(upl, io_offset, bytes_to_zero, NULL);
1c79356b
A
1912 }
1913 }
1914 xfer_resid -= bytes_to_zero;
1915 zero_cnt1 -= bytes_to_zero;
1916 zero_off1 += bytes_to_zero;
1917 io_offset += bytes_to_zero;
1918 }
1919
1920 if (retval == 0) {
9bccf70c 1921 int cl_index;
1c79356b
A
1922 int can_delay;
1923
1924 io_size += start_offset;
1925
9bccf70c 1926 if ((upl_f_offset + io_size) >= newEOF && io_size < upl_size) {
1c79356b
A
1927 /*
1928 * if we're extending the file with this write
1929 * we'll zero fill the rest of the page so that
1930 * if the file gets extended again in such a way as to leave a
1931 * hole starting at this EOF, we'll have zero's in the correct spot
1932 */
55e303ae 1933 cluster_zero(upl, io_size, upl_size - io_size, NULL);
1c79356b 1934 }
9bccf70c
A
1935 if (flags & IO_SYNC)
1936 /*
1937 * if the IO_SYNC flag is set then we need to
1938 * bypass any clusters and immediately issue
1939 * the I/O
1940 */
1941 goto issue_io;
55e303ae
A
1942check_cluster:
1943 /*
1944 * calculate the last logical block number
1945 * that this delayed I/O encompassed
1946 */
1947 last_blkno = (upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64;
1948
1949 if (vp->v_flag & VHASDIRTY) {
1950
1951 if ( !(vp->v_flag & VNOCACHE_DATA)) {
1952 /*
1953 * we've fallen into the sparse
1954 * cluster method of delaying dirty pages
1955 * first, we need to release the upl if we hold one
1956 * since pages in it may be present in the sparse cluster map
1957 * and may span 2 separate buckets there... if they do and
1958 * we happen to have to flush a bucket to make room and it intersects
1959 * this upl, a deadlock may result on page BUSY
1960 */
1961 if (upl_size)
1962 ubc_upl_commit_range(upl, 0, upl_size,
1963 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1964
1965 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
1966
1967 continue;
1968 }
1969 /*
1970 * must have done cached writes that fell into
1971 * the sparse cluster mechanism... we've switched
1972 * to uncached writes on the file, so go ahead
1973 * and push whatever's in the sparse map
1974 * and switch back to normal clustering
1975 *
1976 * see the comment above concerning a possible deadlock...
1977 */
1978 if (upl_size) {
1979 ubc_upl_commit_range(upl, 0, upl_size,
1980 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
1981 /*
1982 * setting upl_size to 0 keeps us from committing a
1983 * second time in the start_new_cluster path
1984 */
1985 upl_size = 0;
1986 }
1987 sparse_cluster_push(vp, ubc_getsize(vp), 1);
1988
1989 /*
1990 * no clusters of either type present at this point
1991 * so just go directly to start_new_cluster since
1992 * we know we need to delay this I/O since we've
1993 * already released the pages back into the cache
1994 * to avoid the deadlock with sparse_cluster_push
1995 */
1996 goto start_new_cluster;
1997 }
1998 upl_offset = 0;
1c79356b 1999
9bccf70c
A
2000 if (vp->v_clen == 0)
2001 /*
2002 * no clusters currently present
2003 */
2004 goto start_new_cluster;
1c79356b 2005
9bccf70c 2006 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
1c79356b 2007 /*
55e303ae
A
2008 * check each cluster that we currently hold
2009 * try to merge some or all of this write into
2010 * one or more of the existing clusters... if
2011 * any portion of the write remains, start a
2012 * new cluster
1c79356b 2013 */
9bccf70c
A
2014 if (start_blkno >= vp->v_clusters[cl_index].start_pg) {
2015 /*
2016 * the current write starts at or after the current cluster
2017 */
2018 if (last_blkno <= (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
2019 /*
2020 * we have a write that fits entirely
2021 * within the existing cluster limits
2022 */
9bccf70c 2023 if (last_blkno > vp->v_clusters[cl_index].last_pg)
1c79356b 2024 /*
9bccf70c 2025 * update our idea of where the cluster ends
1c79356b 2026 */
9bccf70c
A
2027 vp->v_clusters[cl_index].last_pg = last_blkno;
2028 break;
1c79356b 2029 }
9bccf70c 2030 if (start_blkno < (vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER)) {
1c79356b
A
2031 /*
2032 * we have a write that starts in the middle of the current cluster
55e303ae
A
2033 * but extends beyond the cluster's limit... we know this because
2034 * of the previous checks
2035 * we'll extend the current cluster to the max
2036 * and update the start_blkno for the current write to reflect that
2037 * the head of it was absorbed into this cluster...
2038 * note that we'll always have a leftover tail in this case since
2039 * full absorption would have occurred in the clause above
1c79356b 2040 */
55e303ae
A
2041 vp->v_clusters[cl_index].last_pg = vp->v_clusters[cl_index].start_pg + MAX_UPL_TRANSFER;
2042
2043 if (upl_size) {
2044 int start_pg_in_upl;
2045
2046 start_pg_in_upl = upl_f_offset / PAGE_SIZE_64;
2047
2048 if (start_pg_in_upl < vp->v_clusters[cl_index].last_pg) {
2049 intersection = (vp->v_clusters[cl_index].last_pg - start_pg_in_upl) * PAGE_SIZE;
2050
2051 ubc_upl_commit_range(upl, upl_offset, intersection,
2052 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2053 upl_f_offset += intersection;
2054 upl_offset += intersection;
2055 upl_size -= intersection;
2056 }
2057 }
2058 start_blkno = vp->v_clusters[cl_index].last_pg;
1c79356b
A
2059 }
2060 /*
55e303ae
A
2061 * we come here for the case where the current write starts
2062 * beyond the limit of the existing cluster or we have a leftover
2063 * tail after a partial absorption
9bccf70c
A
2064 *
2065 * in either case, we'll check the remaining clusters before
2066 * starting a new one
1c79356b 2067 */
9bccf70c 2068 } else {
1c79356b 2069 /*
55e303ae 2070 * the current write starts in front of the cluster we're currently considering
1c79356b 2071 */
55e303ae 2072 if ((vp->v_clusters[cl_index].last_pg - start_blkno) <= MAX_UPL_TRANSFER) {
1c79356b 2073 /*
55e303ae
A
2074 * we can just merge the new request into
2075 * this cluster and leave it in the cache
2076 * since the resulting cluster is still
2077 * less than the maximum allowable size
1c79356b 2078 */
9bccf70c 2079 vp->v_clusters[cl_index].start_pg = start_blkno;
1c79356b 2080
9bccf70c
A
2081 if (last_blkno > vp->v_clusters[cl_index].last_pg) {
2082 /*
2083 * the current write completely
55e303ae
A
2084 * envelops the existing cluster and since
2085 * each write is limited to at most MAX_UPL_TRANSFER bytes
2086 * we can just use the start and last blocknos of the write
2087 * to generate the cluster limits
9bccf70c
A
2088 */
2089 vp->v_clusters[cl_index].last_pg = last_blkno;
2090 }
2091 break;
1c79356b 2092 }
9bccf70c 2093
1c79356b 2094 /*
9bccf70c
A
2095 * if we were to combine this write with the current cluster
2096 * we would exceed the cluster size limit.... so,
2097 * let's see if there's any overlap of the new I/O with
55e303ae
A
2098 * the cluster we're currently considering... in fact, we'll
2099 * stretch the cluster out to its full limit and see if we
2100 * get an intersection with the current write
9bccf70c 2101 *
1c79356b 2102 */
55e303ae 2103 if (last_blkno > vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER) {
1c79356b 2104 /*
55e303ae
A
2105 * the current write extends into the proposed cluster
2106 * clip the length of the current write after first combining its
2107 * tail with the newly shaped cluster
1c79356b 2108 */
55e303ae
A
2109 vp->v_clusters[cl_index].start_pg = vp->v_clusters[cl_index].last_pg - MAX_UPL_TRANSFER;
2110
2111 if (upl_size) {
2112 intersection = (last_blkno - vp->v_clusters[cl_index].start_pg) * PAGE_SIZE;
2113
2114 if (intersection > upl_size)
2115 /*
2116 * because the current write may consist of a number of pages found in the cache
2117 * which are not part of the UPL, we may have an intersection that exceeds
2118 * the size of the UPL that is also part of this write
2119 */
2120 intersection = upl_size;
2121
2122 ubc_upl_commit_range(upl, upl_offset + (upl_size - intersection), intersection,
2123 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2124 upl_size -= intersection;
2125 }
2126 last_blkno = vp->v_clusters[cl_index].start_pg;
2127 }
9bccf70c
A
2128 /*
2129 * if we get here, there was no way to merge
55e303ae
A
2130 * any portion of this write with this cluster
2131 * or we could only merge part of it which
2132 * will leave a tail...
9bccf70c
A
2133 * we'll check the remaining clusters before starting a new one
2134 */
1c79356b 2135 }
9bccf70c
A
2136 }
2137 if (cl_index < vp->v_clen)
2138 /*
55e303ae
A
2139 * we found an existing cluster(s) that we
2140 * could entirely merge this I/O into
9bccf70c
A
2141 */
2142 goto delay_io;
2143
2144 if (vp->v_clen < MAX_CLUSTERS && !(vp->v_flag & VNOCACHE_DATA))
2145 /*
2146 * we didn't find an existing cluster to
2147 * merge into, but there's room to start
1c79356b
A
2148 * a new one
2149 */
9bccf70c 2150 goto start_new_cluster;
1c79356b 2151
9bccf70c
A
2152 /*
2153 * no existing cluster to merge with and no
2154 * room to start a new one... we'll try
55e303ae
A
2155 * pushing one of the existing ones... if none of
2156 * them are able to be pushed, we'll switch
2157 * to the sparse cluster mechanism
2158 * cluster_try_push updates v_clen to the
2159 * number of remaining clusters... and
2160 * returns the number of currently unused clusters
9bccf70c
A
2161 */
2162 if (vp->v_flag & VNOCACHE_DATA)
2163 can_delay = 0;
2164 else
2165 can_delay = 1;
2166
55e303ae
A
2167 if (cluster_try_push(vp, newEOF, can_delay, 0) == 0) {
2168 /*
2169 * no more room in the normal cluster mechanism
2170 * so let's switch to the more expansive but expensive
2171 * sparse mechanism....
2172 * first, we need to release the upl if we hold one
2173 * since pages in it may be present in the sparse cluster map (after the cluster_switch)
2174 * and may span 2 separate buckets there... if they do and
2175 * we happen to have to flush a bucket to make room and it intersects
2176 * this upl, a deadlock may result on page BUSY
2177 */
2178 if (upl_size)
2179 ubc_upl_commit_range(upl, upl_offset, upl_size,
2180 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2181
2182 sparse_cluster_switch(vp, newEOF);
2183 sparse_cluster_add(vp, newEOF, start_blkno, last_blkno);
2184
2185 continue;
9bccf70c 2186 }
55e303ae
A
2187 /*
2188 * we pushed one cluster successfully, so we must be sequentially writing this file
2189 * otherwise, we would have failed and fallen into the sparse cluster support
2190 * so let's take the opportunity to push out additional clusters as long as we
2191 * remain below the throttle... this will give us better I/O locality if we're
2192 * in a copy loop (i.e. we won't jump back and forth between the read and write points
2193 * however, we don't want to push so much out that the write throttle kicks in and
2194 * hangs this thread up until some of the I/O completes...
2195 */
2196 while (vp->v_clen && (vp->v_numoutput <= (ASYNC_THROTTLE / 2)))
2197 cluster_try_push(vp, newEOF, 0, 0);
2198
9bccf70c 2199start_new_cluster:
55e303ae 2200 if (vp->v_clen == 0)
9bccf70c 2201 vp->v_ciosiz = devblocksize;
55e303ae 2202
9bccf70c
A
2203 vp->v_clusters[vp->v_clen].start_pg = start_blkno;
2204 vp->v_clusters[vp->v_clen].last_pg = last_blkno;
2205 vp->v_clen++;
9bccf70c 2206
55e303ae
A
2207delay_io:
2208 if (upl_size)
2209 ubc_upl_commit_range(upl, upl_offset, upl_size,
2210 UPL_COMMIT_SET_DIRTY | UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
9bccf70c
A
2211 continue;
2212issue_io:
2213 /*
2214 * in order to maintain some semblance of coherency with mapped writes
2215 * we need to write the cluster back out as a multiple of the PAGESIZE
2216 * unless the cluster encompasses the last page of the file... in this
2217 * case we'll round out to the nearest device block boundary
2218 */
2219 io_size = upl_size;
2220
2221 if ((upl_f_offset + io_size) > newEOF) {
2222 io_size = newEOF - upl_f_offset;
2223 io_size = (io_size + (devblocksize - 1)) & ~(devblocksize - 1);
1c79356b 2224 }
9bccf70c 2225
0b4e3aa0 2226 if (flags & IO_SYNC)
55e303ae 2227 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE;
1c79356b 2228 else
55e303ae 2229 io_flags = CL_THROTTLE | CL_COMMIT | CL_AGE | CL_ASYNC;
1c79356b
A
2230
2231 if (vp->v_flag & VNOCACHE_DATA)
2232 io_flags |= CL_DUMP;
2233
9bccf70c 2234 retval = cluster_io(vp, upl, 0, upl_f_offset, io_size, devblocksize,
b4c24cb9 2235 io_flags, (struct buf *)0, (struct clios *)0);
1c79356b
A
2236 }
2237 }
2238 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END,
55e303ae 2239 retval, 0, uio_resid, 0, 0);
1c79356b
A
2240
2241 return (retval);
2242}
2243
9bccf70c 2244int
1c79356b
A
2245cluster_read(vp, uio, filesize, devblocksize, flags)
2246 struct vnode *vp;
2247 struct uio *uio;
2248 off_t filesize;
2249 int devblocksize;
2250 int flags;
2251{
1c79356b
A
2252 int prev_resid;
2253 int clip_size;
2254 off_t max_io_size;
2255 struct iovec *iov;
0b4e3aa0 2256 int upl_size;
0b4e3aa0
A
2257 int upl_flags;
2258 upl_t upl;
1c79356b
A
2259 int retval = 0;
2260
1c79356b 2261
0b4e3aa0 2262 if (!((vp->v_flag & VNOCACHE_DATA) && (uio->uio_segflg == UIO_USERSPACE)))
1c79356b 2263 {
55e303ae
A
2264 /*
2265 * go do a read through the cache if one of the following is true....
2266 * NOCACHE is not true
2267 * the uio request doesn't target USERSPACE
2268 */
2269 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
1c79356b
A
2270 }
2271
2272 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0)
2273 {
55e303ae
A
2274 /*
2275 * we know we have a resid, so this is safe
2276 * skip over any emtpy vectors
2277 */
1c79356b 2278 iov = uio->uio_iov;
55e303ae 2279
1c79356b
A
2280 while (iov->iov_len == 0) {
2281 uio->uio_iov++;
2282 uio->uio_iovcnt--;
2283 iov = uio->uio_iov;
2284 }
55e303ae 2285 upl_size = PAGE_SIZE;
0b4e3aa0 2286 upl_flags = UPL_QUERY_OBJECT_TYPE;
55e303ae
A
2287
2288 if ((vm_map_get_upl(current_map(),
0b4e3aa0 2289 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
55e303ae 2290 &upl_size, &upl, NULL, NULL, &upl_flags, 0)) != KERN_SUCCESS)
0b4e3aa0
A
2291 {
2292 /*
2293 * the user app must have passed in an invalid address
2294 */
2295 return (EFAULT);
2296 }
2297
55e303ae
A
2298 /*
2299 * We check every vector target but if it is physically
2300 * contiguous space, we skip the sanity checks.
2301 */
0b4e3aa0
A
2302 if (upl_flags & UPL_PHYS_CONTIG)
2303 {
b4c24cb9 2304 retval = cluster_phys_read(vp, uio, filesize, devblocksize, flags);
0b4e3aa0 2305 }
55e303ae 2306 else if (uio->uio_resid < PAGE_SIZE)
0b4e3aa0
A
2307 {
2308 /*
55e303ae
A
2309 * we're here because we're don't have a physically contiguous target buffer
2310 * go do a read through the cache if
2311 * the total xfer size is less than a page...
1c79356b 2312 */
55e303ae 2313 return (cluster_read_x(vp, uio, filesize, devblocksize, flags));
1c79356b 2314 }
55e303ae 2315 else if (((int)uio->uio_offset & PAGE_MASK) || ((int)iov->iov_base & PAGE_MASK))
1c79356b 2316 {
55e303ae
A
2317 if (((int)uio->uio_offset & PAGE_MASK) == ((int)iov->iov_base & PAGE_MASK))
2318 {
2319 /*
2320 * Bring the file offset read up to a pagesize boundary
2321 * this will also bring the base address to a page boundary
2322 * since they both are currently on the same offset within a page
2323 * note: if we get here, uio->uio_resid is greater than PAGE_SIZE
2324 * so the computed clip_size must always be less than the current uio_resid
2325 */
2326 clip_size = (PAGE_SIZE - (int)(uio->uio_offset & PAGE_MASK_64));
2327
2328 /*
2329 * Fake the resid going into the cluster_read_x call
2330 * and restore it on the way out.
2331 */
2332 prev_resid = uio->uio_resid;
2333 uio->uio_resid = clip_size;
2334 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2335 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2336 }
2337 else
2338 {
2339 /*
2340 * can't get both the file offset and the buffer offset aligned to a page boundary
2341 * so fire an I/O through the cache for this entire vector
2342 */
2343 clip_size = iov->iov_len;
2344 prev_resid = uio->uio_resid;
2345 uio->uio_resid = clip_size;
2346 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
2347 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2348 }
1c79356b
A
2349 }
2350 else
2351 {
2352 /*
2353 * If we come in here, we know the offset into
2354 * the file is on a pagesize boundary
2355 */
2356
2357 max_io_size = filesize - uio->uio_offset;
2358 clip_size = uio->uio_resid;
2359 if (iov->iov_len < clip_size)
2360 clip_size = iov->iov_len;
2361 if (max_io_size < clip_size)
2362 clip_size = (int)max_io_size;
2363
2364 if (clip_size < PAGE_SIZE)
2365 {
2366 /*
2367 * Take care of the tail end of the read in this vector.
2368 */
2369 prev_resid = uio->uio_resid;
2370 uio->uio_resid = clip_size;
0b4e3aa0 2371 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2372 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2373 }
2374 else
2375 {
2376 /* round clip_size down to a multiple of pagesize */
2377 clip_size = clip_size & ~(PAGE_MASK);
2378 prev_resid = uio->uio_resid;
2379 uio->uio_resid = clip_size;
0b4e3aa0 2380 retval = cluster_nocopy_read(vp, uio, filesize, devblocksize, flags);
1c79356b 2381 if ((retval==0) && uio->uio_resid)
0b4e3aa0 2382 retval = cluster_read_x(vp, uio, filesize, devblocksize, flags);
1c79356b
A
2383 uio->uio_resid = prev_resid - (clip_size - uio->uio_resid);
2384 }
2385 } /* end else */
2386 } /* end while */
2387
1c79356b
A
2388 return(retval);
2389}
2390
9bccf70c 2391static int
0b4e3aa0 2392cluster_read_x(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2393 struct vnode *vp;
2394 struct uio *uio;
2395 off_t filesize;
2396 int devblocksize;
2397 int flags;
2398{
2399 upl_page_info_t *pl;
2400 upl_t upl;
2401 vm_offset_t upl_offset;
2402 int upl_size;
2403 off_t upl_f_offset;
2404 int start_offset;
2405 int start_pg;
2406 int last_pg;
2407 int uio_last;
2408 int pages_in_upl;
2409 off_t max_size;
55e303ae
A
2410 off_t last_ioread_offset;
2411 off_t last_request_offset;
2412 u_int size_of_prefetch;
1c79356b 2413 int io_size;
1c79356b 2414 kern_return_t kret;
1c79356b
A
2415 int error = 0;
2416 int retval = 0;
55e303ae
A
2417 u_int b_lblkno;
2418 u_int e_lblkno;
2419 struct clios iostate;
2420 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2421 u_int rd_ahead_enabled = 1;
2422 u_int prefetch_enabled = 1;
2423
2424
2425 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START,
2426 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2427
2428 if (cluster_hard_throttle_on(vp)) {
2429 rd_ahead_enabled = 0;
2430 prefetch_enabled = 0;
2431
2432 max_rd_size = HARD_THROTTLE_MAXSIZE;
2433 }
2434 if (vp->v_flag & (VRAOFF|VNOCACHE_DATA))
2435 rd_ahead_enabled = 0;
2436
2437 last_request_offset = uio->uio_offset + uio->uio_resid;
2438
2439 if (last_request_offset > filesize)
2440 last_request_offset = filesize;
2441 b_lblkno = (u_int)(uio->uio_offset / PAGE_SIZE_64);
2442 e_lblkno = (u_int)((last_request_offset - 1) / PAGE_SIZE_64);
2443
2444 if (vp->v_ralen && (vp->v_lastr == b_lblkno || (vp->v_lastr + 1) == b_lblkno)) {
2445 /*
2446 * determine if we already have a read-ahead in the pipe courtesy of the
2447 * last read systemcall that was issued...
2448 * if so, pick up it's extent to determine where we should start
2449 * with respect to any read-ahead that might be necessary to
2450 * garner all the data needed to complete this read systemcall
2451 */
2452 last_ioread_offset = (vp->v_maxra * PAGE_SIZE_64) + PAGE_SIZE_64;
1c79356b 2453
55e303ae
A
2454 if (last_ioread_offset < uio->uio_offset)
2455 last_ioread_offset = (off_t)0;
2456 else if (last_ioread_offset > last_request_offset)
2457 last_ioread_offset = last_request_offset;
2458 } else
2459 last_ioread_offset = (off_t)0;
1c79356b
A
2460
2461 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2462 /*
2463 * compute the size of the upl needed to encompass
2464 * the requested read... limit each call to cluster_io
0b4e3aa0
A
2465 * to the maximum UPL size... cluster_io will clip if
2466 * this exceeds the maximum io_size for the device,
2467 * make sure to account for
1c79356b
A
2468 * a starting offset that's not page aligned
2469 */
2470 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2471 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2472 max_size = filesize - uio->uio_offset;
2473
0b4e3aa0 2474 if ((off_t)((unsigned int)uio->uio_resid) < max_size)
1c79356b
A
2475 io_size = uio->uio_resid;
2476 else
2477 io_size = max_size;
9bccf70c 2478
55e303ae 2479 if (!(vp->v_flag & VNOCACHE_DATA)) {
1c79356b 2480
55e303ae
A
2481 while (io_size) {
2482 u_int io_resid;
2483 u_int io_requested;
1c79356b 2484
55e303ae
A
2485 /*
2486 * if we keep finding the pages we need already in the cache, then
2487 * don't bother to call cluster_rd_prefetch since it costs CPU cycles
2488 * to determine that we have all the pages we need... once we miss in
2489 * the cache and have issued an I/O, than we'll assume that we're likely
2490 * to continue to miss in the cache and it's to our advantage to try and prefetch
2491 */
2492 if (last_request_offset && last_ioread_offset && (size_of_prefetch = (last_request_offset - last_ioread_offset))) {
2493 if ((last_ioread_offset - uio->uio_offset) <= max_rd_size && prefetch_enabled) {
2494 /*
2495 * we've already issued I/O for this request and
2496 * there's still work to do and
2497 * our prefetch stream is running dry, so issue a
2498 * pre-fetch I/O... the I/O latency will overlap
2499 * with the copying of the data
2500 */
2501 if (size_of_prefetch > max_rd_size)
2502 size_of_prefetch = max_rd_size;
1c79356b 2503
55e303ae 2504 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
1c79356b 2505
55e303ae
A
2506 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2507
2508 if (last_ioread_offset > last_request_offset)
2509 last_ioread_offset = last_request_offset;
2510 }
2511 }
2512 /*
2513 * limit the size of the copy we're about to do so that
2514 * we can notice that our I/O pipe is running dry and
2515 * get the next I/O issued before it does go dry
2516 */
2517 if (last_ioread_offset && io_size > ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4))
2518 io_resid = ((MAX_UPL_TRANSFER * PAGE_SIZE) / 4);
2519 else
2520 io_resid = io_size;
1c79356b 2521
55e303ae 2522 io_requested = io_resid;
1c79356b 2523
55e303ae 2524 retval = cluster_copy_ubc_data(vp, uio, &io_resid, 0);
1c79356b 2525
55e303ae 2526 io_size -= (io_requested - io_resid);
1c79356b 2527
55e303ae
A
2528 if (retval || io_resid)
2529 /*
2530 * if we run into a real error or
2531 * a page that is not in the cache
2532 * we need to leave streaming mode
2533 */
2534 break;
2535
2536 if ((io_size == 0 || last_ioread_offset == last_request_offset) && rd_ahead_enabled) {
2537 /*
2538 * we're already finished the I/O for this read request
2539 * let's see if we should do a read-ahead
2540 */
2541 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
2542 }
1c79356b 2543 }
1c79356b
A
2544 if (retval)
2545 break;
1c79356b 2546 if (io_size == 0) {
55e303ae
A
2547 if (e_lblkno < vp->v_lastr)
2548 vp->v_maxra = 0;
2549 vp->v_lastr = e_lblkno;
1c79356b
A
2550
2551 break;
2552 }
55e303ae
A
2553 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
2554 upl_f_offset = uio->uio_offset - (off_t)start_offset;
2555 max_size = filesize - uio->uio_offset;
1c79356b 2556 }
55e303ae
A
2557 if (io_size > max_rd_size)
2558 io_size = max_rd_size;
2559
1c79356b 2560 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
55e303ae
A
2561
2562 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE) / 4)
2563 upl_size = (MAX_UPL_TRANSFER * PAGE_SIZE) / 4;
1c79356b
A
2564 pages_in_upl = upl_size / PAGE_SIZE;
2565
2566 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_START,
9bccf70c 2567 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b 2568
0b4e3aa0
A
2569 kret = ubc_create_upl(vp,
2570 upl_f_offset,
2571 upl_size,
2572 &upl,
2573 &pl,
55e303ae 2574 UPL_SET_LITE);
1c79356b
A
2575 if (kret != KERN_SUCCESS)
2576 panic("cluster_read: failed to get pagelist");
2577
1c79356b 2578 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 33)) | DBG_FUNC_END,
9bccf70c 2579 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
1c79356b
A
2580
2581 /*
2582 * scan from the beginning of the upl looking for the first
2583 * non-valid page.... this will become the first page in
2584 * the request we're going to make to 'cluster_io'... if all
2585 * of the pages are valid, we won't call through to 'cluster_io'
2586 */
2587 for (start_pg = 0; start_pg < pages_in_upl; start_pg++) {
2588 if (!upl_valid_page(pl, start_pg))
2589 break;
2590 }
2591
2592 /*
2593 * scan from the starting invalid page looking for a valid
2594 * page before the end of the upl is reached, if we
2595 * find one, then it will be the last page of the request to
2596 * 'cluster_io'
2597 */
2598 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
2599 if (upl_valid_page(pl, last_pg))
2600 break;
2601 }
55e303ae
A
2602 iostate.io_completed = 0;
2603 iostate.io_issued = 0;
2604 iostate.io_error = 0;
2605 iostate.io_wanted = 0;
1c79356b
A
2606
2607 if (start_pg < last_pg) {
2608 /*
2609 * we found a range of 'invalid' pages that must be filled
2610 * if the last page in this range is the last page of the file
2611 * we may have to clip the size of it to keep from reading past
2612 * the end of the last physical block associated with the file
2613 */
2614 upl_offset = start_pg * PAGE_SIZE;
2615 io_size = (last_pg - start_pg) * PAGE_SIZE;
2616
9bccf70c 2617 if ((upl_f_offset + upl_offset + io_size) > filesize)
1c79356b 2618 io_size = filesize - (upl_f_offset + upl_offset);
9bccf70c 2619
1c79356b 2620 /*
55e303ae 2621 * issue an asynchronous read to cluster_io
1c79356b
A
2622 */
2623
2624 error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset,
55e303ae 2625 io_size, devblocksize, CL_READ | CL_ASYNC, (struct buf *)0, &iostate);
1c79356b
A
2626 }
2627 if (error == 0) {
2628 /*
2629 * if the read completed successfully, or there was no I/O request
55e303ae
A
2630 * issued, than copy the data into user land via 'cluster_upl_copy_data'
2631 * we'll first add on any 'valid'
1c79356b
A
2632 * pages that were present in the upl when we acquired it.
2633 */
2634 u_int val_size;
1c79356b
A
2635
2636 for (uio_last = last_pg; uio_last < pages_in_upl; uio_last++) {
2637 if (!upl_valid_page(pl, uio_last))
2638 break;
2639 }
2640 /*
2641 * compute size to transfer this round, if uio->uio_resid is
55e303ae 2642 * still non-zero after this attempt, we'll loop around and
1c79356b
A
2643 * set up for another I/O.
2644 */
2645 val_size = (uio_last * PAGE_SIZE) - start_offset;
2646
55e303ae 2647 if (val_size > max_size)
1c79356b
A
2648 val_size = max_size;
2649
55e303ae 2650 if (val_size > uio->uio_resid)
1c79356b
A
2651 val_size = uio->uio_resid;
2652
55e303ae
A
2653 if (last_ioread_offset == 0)
2654 last_ioread_offset = uio->uio_offset + val_size;
1c79356b 2655
55e303ae 2656 if ((size_of_prefetch = (last_request_offset - last_ioread_offset)) && prefetch_enabled) {
1c79356b 2657 /*
55e303ae
A
2658 * if there's still I/O left to do for this request, and...
2659 * we're not in hard throttle mode, then issue a
2660 * pre-fetch I/O... the I/O latency will overlap
1c79356b
A
2661 * with the copying of the data
2662 */
55e303ae 2663 size_of_prefetch = cluster_rd_prefetch(vp, last_ioread_offset, size_of_prefetch, filesize, devblocksize);
1c79356b 2664
55e303ae
A
2665 last_ioread_offset += (off_t)(size_of_prefetch * PAGE_SIZE);
2666
2667 if (last_ioread_offset > last_request_offset)
2668 last_ioread_offset = last_request_offset;
1c79356b 2669
55e303ae
A
2670 } else if ((uio->uio_offset + val_size) == last_request_offset) {
2671 /*
2672 * this transfer will finish this request, so...
2673 * let's try to read ahead if we're in
2674 * a sequential access pattern and we haven't
2675 * explicitly disabled it
2676 */
2677 if (rd_ahead_enabled)
2678 cluster_rd_ahead(vp, b_lblkno, e_lblkno, filesize, devblocksize);
1c79356b 2679
55e303ae
A
2680 if (e_lblkno < vp->v_lastr)
2681 vp->v_maxra = 0;
2682 vp->v_lastr = e_lblkno;
9bccf70c 2683 }
55e303ae
A
2684 while (iostate.io_issued != iostate.io_completed) {
2685 iostate.io_wanted = 1;
2686 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_read_x", 0);
2687 }
2688 if (iostate.io_error)
2689 error = iostate.io_error;
9bccf70c 2690 else
55e303ae 2691 retval = cluster_copy_upl_data(uio, upl, start_offset, val_size);
1c79356b
A
2692 }
2693 if (start_pg < last_pg) {
2694 /*
2695 * compute the range of pages that we actually issued an I/O for
2696 * and either commit them as valid if the I/O succeeded
2697 * or abort them if the I/O failed
2698 */
2699 io_size = (last_pg - start_pg) * PAGE_SIZE;
2700
2701 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2702 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2703
2704 if (error || (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0 2705 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, io_size,
1c79356b
A
2706 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
2707 else
0b4e3aa0 2708 ubc_upl_commit_range(upl, start_pg * PAGE_SIZE, io_size,
55e303ae
A
2709 UPL_COMMIT_CLEAR_DIRTY |
2710 UPL_COMMIT_FREE_ON_EMPTY |
2711 UPL_COMMIT_INACTIVATE);
1c79356b
A
2712
2713 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2714 (int)upl, start_pg * PAGE_SIZE, io_size, error, 0);
1c79356b
A
2715 }
2716 if ((last_pg - start_pg) < pages_in_upl) {
2717 int cur_pg;
2718 int commit_flags;
2719
2720 /*
2721 * the set of pages that we issued an I/O for did not encompass
2722 * the entire upl... so just release these without modifying
55e303ae 2723 * their state
1c79356b
A
2724 */
2725 if (error)
9bccf70c 2726 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2727 else {
0b4e3aa0 2728 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_START,
9bccf70c 2729 (int)upl, -1, pages_in_upl - (last_pg - start_pg), 0, 0);
1c79356b 2730
0b4e3aa0
A
2731 if (start_pg) {
2732 /*
2733 * we found some already valid pages at the beginning of
2734 * the upl commit these back to the inactive list with
2735 * reference cleared
2736 */
2737 for (cur_pg = 0; cur_pg < start_pg; cur_pg++) {
2738 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2739 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2740
2741 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2742 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2743
2744 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2745 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2746 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2747 else
0b4e3aa0
A
2748 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2749 PAGE_SIZE, commit_flags);
1c79356b
A
2750 }
2751 }
2752 if (last_pg < uio_last) {
0b4e3aa0
A
2753 /*
2754 * we found some already valid pages immediately after the
2755 * pages we issued I/O for, commit these back to the
2756 * inactive list with reference cleared
2757 */
2758 for (cur_pg = last_pg; cur_pg < uio_last; cur_pg++) {
2759 commit_flags = UPL_COMMIT_FREE_ON_EMPTY
2760 | UPL_COMMIT_INACTIVATE;
1c79356b
A
2761
2762 if (upl_dirty_page(pl, cur_pg))
0b4e3aa0 2763 commit_flags |= UPL_COMMIT_SET_DIRTY;
1c79356b
A
2764
2765 if ( !(commit_flags & UPL_COMMIT_SET_DIRTY) && (vp->v_flag & VNOCACHE_DATA))
0b4e3aa0
A
2766 ubc_upl_abort_range(upl, cur_pg * PAGE_SIZE, PAGE_SIZE,
2767 UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2768 else
0b4e3aa0
A
2769 ubc_upl_commit_range(upl, cur_pg * PAGE_SIZE,
2770 PAGE_SIZE, commit_flags);
1c79356b
A
2771 }
2772 }
2773 if (uio_last < pages_in_upl) {
0b4e3aa0
A
2774 /*
2775 * there were some invalid pages beyond the valid pages
2776 * that we didn't issue an I/O for, just release them
2777 * unchanged
1c79356b 2778 */
9bccf70c
A
2779 ubc_upl_abort_range(upl, uio_last * PAGE_SIZE,
2780 (pages_in_upl - uio_last) * PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b
A
2781 }
2782
0b4e3aa0 2783 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 35)) | DBG_FUNC_END,
9bccf70c 2784 (int)upl, -1, -1, 0, 0);
1c79356b
A
2785 }
2786 }
2787 if (retval == 0)
2788 retval = error;
2789 }
55e303ae
A
2790 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END,
2791 (int)uio->uio_offset, uio->uio_resid, vp->v_lastr, retval, 0);
1c79356b
A
2792
2793 return (retval);
2794}
2795
b4c24cb9 2796
9bccf70c 2797static int
0b4e3aa0 2798cluster_nocopy_read(vp, uio, filesize, devblocksize, flags)
1c79356b
A
2799 struct vnode *vp;
2800 struct uio *uio;
2801 off_t filesize;
2802 int devblocksize;
2803 int flags;
2804{
2805 upl_t upl;
2806 upl_page_info_t *pl;
1c79356b 2807 vm_offset_t upl_offset;
1c79356b
A
2808 off_t max_io_size;
2809 int io_size;
2810 int upl_size;
2811 int upl_needed_size;
2812 int pages_in_pl;
1c79356b
A
2813 int upl_flags;
2814 kern_return_t kret;
1c79356b
A
2815 struct iovec *iov;
2816 int i;
2817 int force_data_sync;
1c79356b 2818 int retval = 0;
d7e50217 2819 struct clios iostate;
55e303ae
A
2820 u_int max_rd_size = MAX_UPL_TRANSFER * PAGE_SIZE;
2821 u_int max_rd_ahead = MAX_UPL_TRANSFER * PAGE_SIZE * 2;
2822
1c79356b
A
2823
2824 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_START,
2825 (int)uio->uio_offset, uio->uio_resid, (int)filesize, devblocksize, 0);
2826
2827 /*
2828 * When we enter this routine, we know
2829 * -- the offset into the file is on a pagesize boundary
2830 * -- the resid is a page multiple
2831 * -- the resid will not exceed iov_len
2832 */
2833
d7e50217
A
2834 iostate.io_completed = 0;
2835 iostate.io_issued = 0;
2836 iostate.io_error = 0;
2837 iostate.io_wanted = 0;
2838
1c79356b 2839 iov = uio->uio_iov;
d7e50217 2840
55e303ae
A
2841 if (cluster_hard_throttle_on(vp)) {
2842 max_rd_size = HARD_THROTTLE_MAXSIZE;
2843 max_rd_ahead = HARD_THROTTLE_MAXSIZE - 1;
2844 }
1c79356b
A
2845 while (uio->uio_resid && uio->uio_offset < filesize && retval == 0) {
2846
d7e50217 2847 max_io_size = filesize - uio->uio_offset;
0b4e3aa0 2848
d7e50217
A
2849 if (max_io_size < (off_t)((unsigned int)uio->uio_resid))
2850 io_size = max_io_size;
2851 else
2852 io_size = uio->uio_resid;
1c79356b 2853
d7e50217
A
2854 /*
2855 * First look for pages already in the cache
2856 * and move them to user space.
2857 */
55e303ae 2858 retval = cluster_copy_ubc_data(vp, uio, &io_size, 0);
1c79356b 2859
d7e50217
A
2860 if (retval) {
2861 /*
2862 * we may have already spun some portion of this request
2863 * off as async requests... we need to wait for the I/O
2864 * to complete before returning
2865 */
2866 goto wait_for_reads;
0b4e3aa0 2867 }
d7e50217
A
2868 /*
2869 * If we are already finished with this read, then return
2870 */
2871 if (io_size == 0) {
2872 /*
2873 * we may have already spun some portion of this request
2874 * off as async requests... we need to wait for the I/O
2875 * to complete before returning
2876 */
2877 goto wait_for_reads;
2878 }
2879 max_io_size = io_size;
2880
55e303ae
A
2881 if (max_io_size > max_rd_size)
2882 max_io_size = max_rd_size;
2883
d7e50217 2884 io_size = 0;
1c79356b 2885
55e303ae
A
2886 ubc_range_op(vp, uio->uio_offset, uio->uio_offset + max_io_size, UPL_ROP_ABSENT, &io_size);
2887
d7e50217
A
2888 if (io_size == 0)
2889 /*
2890 * we may have already spun some portion of this request
2891 * off as async requests... we need to wait for the I/O
2892 * to complete before returning
2893 */
2894 goto wait_for_reads;
1c79356b 2895
55e303ae 2896 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
d7e50217 2897 upl_needed_size = (upl_offset + io_size + (PAGE_SIZE -1)) & ~PAGE_MASK;
1c79356b 2898
d7e50217
A
2899 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_START,
2900 (int)upl_offset, upl_needed_size, (int)iov->iov_base, io_size, 0);
1c79356b 2901
d7e50217
A
2902 for (force_data_sync = 0; force_data_sync < 3; force_data_sync++) {
2903 pages_in_pl = 0;
2904 upl_size = upl_needed_size;
55e303ae 2905 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
1c79356b 2906
d7e50217
A
2907 kret = vm_map_get_upl(current_map(),
2908 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
2909 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, force_data_sync);
1c79356b 2910
d7e50217
A
2911 if (kret != KERN_SUCCESS) {
2912 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2913 (int)upl_offset, upl_size, io_size, kret, 0);
d7e50217
A
2914 /*
2915 * cluster_nocopy_read: failed to get pagelist
2916 *
2917 * we may have already spun some portion of this request
2918 * off as async requests... we need to wait for the I/O
2919 * to complete before returning
2920 */
2921 goto wait_for_reads;
2922 }
2923 pages_in_pl = upl_size / PAGE_SIZE;
2924 pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
1c79356b 2925
d7e50217
A
2926 for (i = 0; i < pages_in_pl; i++) {
2927 if (!upl_valid_page(pl, i))
2928 break;
2929 }
2930 if (i == pages_in_pl)
2931 break;
0b4e3aa0 2932
d7e50217
A
2933 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2934 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2935 }
d7e50217
A
2936 if (force_data_sync >= 3) {
2937 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2938 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2939
d7e50217
A
2940 goto wait_for_reads;
2941 }
2942 /*
2943 * Consider the possibility that upl_size wasn't satisfied.
2944 */
2945 if (upl_size != upl_needed_size)
2946 io_size = (upl_size - (int)upl_offset) & ~PAGE_MASK;
1c79356b 2947
d7e50217
A
2948 if (io_size == 0) {
2949 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2950 UPL_ABORT_FREE_ON_EMPTY);
2951 goto wait_for_reads;
2952 }
2953 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 72)) | DBG_FUNC_END,
2954 (int)upl_offset, upl_size, io_size, kret, 0);
1c79356b 2955
d7e50217
A
2956 /*
2957 * request asynchronously so that we can overlap
2958 * the preparation of the next I/O
2959 * if there are already too many outstanding reads
2960 * wait until some have completed before issuing the next read
2961 */
55e303ae 2962 while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) {
d7e50217
A
2963 iostate.io_wanted = 1;
2964 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
2965 }
2966 if (iostate.io_error) {
2967 /*
2968 * one of the earlier reads we issued ran into a hard error
2969 * don't issue any more reads, cleanup the UPL
2970 * that was just created but not used, then
2971 * go wait for any other reads to complete before
2972 * returning the error to the caller
2973 */
2974 ubc_upl_abort_range(upl, (upl_offset & ~PAGE_MASK), upl_size,
2975 UPL_ABORT_FREE_ON_EMPTY);
1c79356b 2976
d7e50217
A
2977 goto wait_for_reads;
2978 }
2979 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_START,
55e303ae 2980 (int)upl, (int)upl_offset, (int)uio->uio_offset, io_size, 0);
1c79356b 2981
55e303ae 2982 retval = cluster_io(vp, upl, upl_offset, uio->uio_offset,
d7e50217
A
2983 io_size, devblocksize,
2984 CL_PRESERVE | CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO,
2985 (struct buf *)0, &iostate);
1c79356b 2986
d7e50217
A
2987 /*
2988 * update the uio structure
2989 */
2990 iov->iov_base += io_size;
2991 iov->iov_len -= io_size;
2992 uio->uio_resid -= io_size;
2993 uio->uio_offset += io_size;
1c79356b 2994
d7e50217
A
2995 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 73)) | DBG_FUNC_END,
2996 (int)upl, (int)uio->uio_offset, (int)uio->uio_resid, retval, 0);
1c79356b
A
2997
2998 } /* end while */
2999
d7e50217
A
3000wait_for_reads:
3001 /*
3002 * make sure all async reads that are part of this stream
3003 * have completed before we return
3004 */
3005 while (iostate.io_issued != iostate.io_completed) {
3006 iostate.io_wanted = 1;
3007 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_nocopy_read", 0);
3008 }
3009 if (iostate.io_error)
3010 retval = iostate.io_error;
1c79356b
A
3011
3012 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 70)) | DBG_FUNC_END,
3013 (int)uio->uio_offset, (int)uio->uio_resid, 6, retval, 0);
3014
3015 return (retval);
3016}
3017
3018
9bccf70c 3019static int
b4c24cb9 3020cluster_phys_read(vp, uio, filesize, devblocksize, flags)
0b4e3aa0
A
3021 struct vnode *vp;
3022 struct uio *uio;
3023 off_t filesize;
b4c24cb9
A
3024 int devblocksize;
3025 int flags;
0b4e3aa0 3026{
b4c24cb9 3027 upl_page_info_t *pl;
0b4e3aa0
A
3028 upl_t upl;
3029 vm_offset_t upl_offset;
55e303ae 3030 addr64_t dst_paddr;
0b4e3aa0
A
3031 off_t max_size;
3032 int io_size;
b4c24cb9 3033 int tail_size;
0b4e3aa0
A
3034 int upl_size;
3035 int upl_needed_size;
3036 int pages_in_pl;
3037 int upl_flags;
3038 kern_return_t kret;
3039 struct iovec *iov;
b4c24cb9 3040 struct clios iostate;
0b4e3aa0
A
3041 int error;
3042
3043 /*
3044 * When we enter this routine, we know
3045 * -- the resid will not exceed iov_len
3046 * -- the target address is physically contiguous
3047 */
3048
3049 iov = uio->uio_iov;
3050
3051 max_size = filesize - uio->uio_offset;
3052
b4c24cb9
A
3053 if (max_size > (off_t)((unsigned int)iov->iov_len))
3054 io_size = iov->iov_len;
0b4e3aa0 3055 else
b4c24cb9 3056 io_size = max_size;
0b4e3aa0 3057
55e303ae 3058 upl_offset = (vm_offset_t)iov->iov_base & PAGE_MASK;
0b4e3aa0
A
3059 upl_needed_size = upl_offset + io_size;
3060
b4c24cb9 3061 error = 0;
0b4e3aa0
A
3062 pages_in_pl = 0;
3063 upl_size = upl_needed_size;
55e303ae 3064 upl_flags = UPL_FILE_IO | UPL_NO_SYNC | UPL_CLEAN_IN_PLACE | UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE;
0b4e3aa0
A
3065
3066 kret = vm_map_get_upl(current_map(),
3067 (vm_offset_t)iov->iov_base & ~PAGE_MASK,
3068 &upl_size, &upl, NULL, &pages_in_pl, &upl_flags, 0);
3069
b4c24cb9
A
3070 if (kret != KERN_SUCCESS) {
3071 /*
3072 * cluster_phys_read: failed to get pagelist
3073 */
3074 return(EINVAL);
3075 }
3076 if (upl_size < upl_needed_size) {
3077 /*
3078 * The upl_size wasn't satisfied.
3079 */
3080 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3081
3082 return(EINVAL);
3083 }
3084 pl = ubc_upl_pageinfo(upl);
3085
55e303ae 3086 dst_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + ((addr64_t)((u_int)iov->iov_base & PAGE_MASK));
0b4e3aa0 3087
b4c24cb9
A
3088 while (((uio->uio_offset & (devblocksize - 1)) || io_size < devblocksize) && io_size) {
3089 int head_size;
3090
3091 head_size = devblocksize - (int)(uio->uio_offset & (devblocksize - 1));
3092
3093 if (head_size > io_size)
3094 head_size = io_size;
3095
3096 error = cluster_align_phys_io(vp, uio, dst_paddr, head_size, devblocksize, CL_READ);
3097
3098 if (error) {
3099 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
3100
3101 return(EINVAL);
3102 }
3103 upl_offset += head_size;
3104 dst_paddr += head_size;
3105 io_size -= head_size;
3106 }
3107 tail_size = io_size & (devblocksize - 1);
3108 io_size -= tail_size;
3109
3110 iostate.io_completed = 0;
3111 iostate.io_issued = 0;
3112 iostate.io_error = 0;
3113 iostate.io_wanted = 0;
3114
3115 while (io_size && error == 0) {
3116 int xsize;
3117
3118 if (io_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3119 xsize = MAX_UPL_TRANSFER * PAGE_SIZE;
3120 else
3121 xsize = io_size;
3122 /*
3123 * request asynchronously so that we can overlap
3124 * the preparation of the next I/O... we'll do
3125 * the commit after all the I/O has completed
3126 * since its all issued against the same UPL
3127 * if there are already too many outstanding reads
d7e50217 3128 * wait until some have completed before issuing the next
b4c24cb9
A
3129 */
3130 while ((iostate.io_issued - iostate.io_completed) > (2 * MAX_UPL_TRANSFER * PAGE_SIZE)) {
3131 iostate.io_wanted = 1;
3132 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3133 }
3134
3135 error = cluster_io(vp, upl, upl_offset, uio->uio_offset, xsize, 0,
3136 CL_READ | CL_NOZERO | CL_DEV_MEMORY | CL_ASYNC,
3137 (struct buf *)0, &iostate);
3138 /*
3139 * The cluster_io read was issued successfully,
3140 * update the uio structure
3141 */
3142 if (error == 0) {
3143 uio->uio_resid -= xsize;
3144 iov->iov_len -= xsize;
3145 iov->iov_base += xsize;
3146 uio->uio_offset += xsize;
3147 dst_paddr += xsize;
3148 upl_offset += xsize;
3149 io_size -= xsize;
3150 }
3151 }
0b4e3aa0 3152 /*
d7e50217
A
3153 * make sure all async reads that are part of this stream
3154 * have completed before we proceed
0b4e3aa0 3155 */
b4c24cb9
A
3156 while (iostate.io_issued != iostate.io_completed) {
3157 iostate.io_wanted = 1;
3158 tsleep((caddr_t)&iostate.io_wanted, PRIBIO + 1, "cluster_phys_read", 0);
3159 }
3160 if (iostate.io_error) {
3161 error = iostate.io_error;
3162 }
3163 if (error == 0 && tail_size)
3164 error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, devblocksize, CL_READ);
0b4e3aa0
A
3165
3166 /*
b4c24cb9
A
3167 * just release our hold on the physically contiguous
3168 * region without changing any state
0b4e3aa0 3169 */
b4c24cb9 3170 ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
0b4e3aa0
A
3171
3172 return (error);
3173}
1c79356b 3174
b4c24cb9 3175
1c79356b
A
3176/*
3177 * generate advisory I/O's in the largest chunks possible
3178 * the completed pages will be released into the VM cache
3179 */
9bccf70c 3180int
1c79356b
A
3181advisory_read(vp, filesize, f_offset, resid, devblocksize)
3182 struct vnode *vp;
3183 off_t filesize;
3184 off_t f_offset;
3185 int resid;
3186 int devblocksize;
3187{
1c79356b
A
3188 upl_page_info_t *pl;
3189 upl_t upl;
3190 vm_offset_t upl_offset;
3191 int upl_size;
3192 off_t upl_f_offset;
3193 int start_offset;
3194 int start_pg;
3195 int last_pg;
3196 int pages_in_upl;
3197 off_t max_size;
3198 int io_size;
3199 kern_return_t kret;
3200 int retval = 0;
9bccf70c 3201 int issued_io;
55e303ae 3202 int skip_range;
1c79356b
A
3203
3204 if (!UBCINFOEXISTS(vp))
3205 return(EINVAL);
3206
1c79356b
A
3207 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_START,
3208 (int)f_offset, resid, (int)filesize, devblocksize, 0);
3209
3210 while (resid && f_offset < filesize && retval == 0) {
3211 /*
3212 * compute the size of the upl needed to encompass
3213 * the requested read... limit each call to cluster_io
0b4e3aa0
A
3214 * to the maximum UPL size... cluster_io will clip if
3215 * this exceeds the maximum io_size for the device,
3216 * make sure to account for
1c79356b
A
3217 * a starting offset that's not page aligned
3218 */
3219 start_offset = (int)(f_offset & PAGE_MASK_64);
3220 upl_f_offset = f_offset - (off_t)start_offset;
3221 max_size = filesize - f_offset;
3222
3223 if (resid < max_size)
3224 io_size = resid;
3225 else
3226 io_size = max_size;
3227
3228 upl_size = (start_offset + io_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
0b4e3aa0
A
3229 if (upl_size > (MAX_UPL_TRANSFER * PAGE_SIZE))
3230 upl_size = MAX_UPL_TRANSFER * PAGE_SIZE;
55e303ae
A
3231
3232 skip_range = 0;
3233 /*
3234 * return the number of contiguously present pages in the cache
3235 * starting at upl_f_offset within the file
3236 */
3237 ubc_range_op(vp, upl_f_offset, upl_f_offset + upl_size, UPL_ROP_PRESENT, &skip_range);
3238
3239 if (skip_range) {
3240 /*
3241 * skip over pages already present in the cache
3242 */
3243 io_size = skip_range - start_offset;
3244
3245 f_offset += io_size;
3246 resid -= io_size;
3247
3248 if (skip_range == upl_size)
3249 continue;
3250 /*
3251 * have to issue some real I/O
3252 * at this point, we know it's starting on a page boundary
3253 * because we've skipped over at least the first page in the request
3254 */
3255 start_offset = 0;
3256 upl_f_offset += skip_range;
3257 upl_size -= skip_range;
3258 }
1c79356b
A
3259 pages_in_upl = upl_size / PAGE_SIZE;
3260
55e303ae
A
3261 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_START,
3262 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3263
0b4e3aa0
A
3264 kret = ubc_create_upl(vp,
3265 upl_f_offset,
3266 upl_size,
3267 &upl,
3268 &pl,
55e303ae 3269 UPL_RET_ONLY_ABSENT | UPL_SET_LITE);
1c79356b 3270 if (kret != KERN_SUCCESS)
9bccf70c
A
3271 return(retval);
3272 issued_io = 0;
1c79356b
A
3273
3274 /*
9bccf70c
A
3275 * before we start marching forward, we must make sure we end on
3276 * a present page, otherwise we will be working with a freed
3277 * upl
1c79356b 3278 */
9bccf70c
A
3279 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3280 if (upl_page_present(pl, last_pg))
3281 break;
1c79356b 3282 }
9bccf70c 3283 pages_in_upl = last_pg + 1;
1c79356b 3284
1c79356b 3285
55e303ae 3286 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 61)) | DBG_FUNC_END,
9bccf70c
A
3287 (int)upl, (int)upl_f_offset, upl_size, start_offset, 0);
3288
3289
3290 for (last_pg = 0; last_pg < pages_in_upl; ) {
1c79356b 3291 /*
9bccf70c
A
3292 * scan from the beginning of the upl looking for the first
3293 * page that is present.... this will become the first page in
3294 * the request we're going to make to 'cluster_io'... if all
3295 * of the pages are absent, we won't call through to 'cluster_io'
1c79356b 3296 */
9bccf70c
A
3297 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
3298 if (upl_page_present(pl, start_pg))
3299 break;
1c79356b 3300 }
1c79356b 3301
1c79356b 3302 /*
9bccf70c
A
3303 * scan from the starting present page looking for an absent
3304 * page before the end of the upl is reached, if we
3305 * find one, then it will terminate the range of pages being
3306 * presented to 'cluster_io'
1c79356b 3307 */
9bccf70c
A
3308 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
3309 if (!upl_page_present(pl, last_pg))
3310 break;
3311 }
3312
3313 if (last_pg > start_pg) {
3314 /*
3315 * we found a range of pages that must be filled
3316 * if the last page in this range is the last page of the file
3317 * we may have to clip the size of it to keep from reading past
3318 * the end of the last physical block associated with the file
3319 */
3320 upl_offset = start_pg * PAGE_SIZE;
3321 io_size = (last_pg - start_pg) * PAGE_SIZE;
3322
3323 if ((upl_f_offset + upl_offset + io_size) > filesize)
3324 io_size = filesize - (upl_f_offset + upl_offset);
3325
3326 /*
3327 * issue an asynchronous read to cluster_io
3328 */
3329 retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, devblocksize,
b4c24cb9 3330 CL_ASYNC | CL_READ | CL_COMMIT | CL_AGE, (struct buf *)0, (struct clios *)0);
1c79356b 3331
9bccf70c
A
3332 issued_io = 1;
3333 }
1c79356b 3334 }
9bccf70c
A
3335 if (issued_io == 0)
3336 ubc_upl_abort(upl, 0);
3337
3338 io_size = upl_size - start_offset;
1c79356b
A
3339
3340 if (io_size > resid)
3341 io_size = resid;
3342 f_offset += io_size;
3343 resid -= io_size;
3344 }
9bccf70c 3345
1c79356b
A
3346 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 60)) | DBG_FUNC_END,
3347 (int)f_offset, resid, retval, 0, 0);
3348
3349 return(retval);
3350}
3351
3352
9bccf70c 3353int
1c79356b
A
3354cluster_push(vp)
3355 struct vnode *vp;
9bccf70c
A
3356{
3357 int retval;
3358
55e303ae 3359 if (!UBCINFOEXISTS(vp) || (vp->v_clen == 0 && !(vp->v_flag & VHASDIRTY)))
9bccf70c 3360 return(0);
9bccf70c
A
3361
3362 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_START,
3363 vp->v_flag & VHASDIRTY, vp->v_clen, 0, 0, 0);
3364
3365 if (vp->v_flag & VHASDIRTY) {
55e303ae 3366 sparse_cluster_push(vp, ubc_getsize(vp), 1);
9bccf70c 3367
9bccf70c 3368 vp->v_clen = 0;
55e303ae
A
3369 retval = 1;
3370 } else
3371 retval = cluster_try_push(vp, ubc_getsize(vp), 0, 1);
9bccf70c 3372
55e303ae
A
3373 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
3374 vp->v_flag & VHASDIRTY, vp->v_clen, retval, 0, 0);
9bccf70c 3375
55e303ae
A
3376 return (retval);
3377}
9bccf70c 3378
9bccf70c 3379
55e303ae
A
3380int
3381cluster_release(vp)
3382 struct vnode *vp;
3383{
3384 off_t offset;
3385 u_int length;
9bccf70c 3386
55e303ae 3387 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
9bccf70c 3388
55e303ae
A
3389 if (vp->v_flag & VHASDIRTY) {
3390 vfs_drt_control(&(vp->v_scmap), 0);
3391
3392 vp->v_flag &= ~VHASDIRTY;
3393 }
3394 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 81)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
9bccf70c
A
3395}
3396
3397
3398static int
3399cluster_try_push(vp, EOF, can_delay, push_all)
3400 struct vnode *vp;
3401 off_t EOF;
3402 int can_delay;
3403 int push_all;
3404{
3405 int cl_index;
3406 int cl_index1;
3407 int min_index;
3408 int cl_len;
3409 int cl_total;
55e303ae 3410 int cl_pushed = 0;
9bccf70c
A
3411 struct v_cluster l_clusters[MAX_CLUSTERS];
3412
3413 /*
3414 * make a local 'sorted' copy of the clusters
3415 * and clear vp->v_clen so that new clusters can
3416 * be developed
3417 */
3418 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3419 for (min_index = -1, cl_index1 = 0; cl_index1 < vp->v_clen; cl_index1++) {
3420 if (vp->v_clusters[cl_index1].start_pg == vp->v_clusters[cl_index1].last_pg)
3421 continue;
3422 if (min_index == -1)
3423 min_index = cl_index1;
3424 else if (vp->v_clusters[cl_index1].start_pg < vp->v_clusters[min_index].start_pg)
3425 min_index = cl_index1;
3426 }
3427 if (min_index == -1)
3428 break;
3429 l_clusters[cl_index].start_pg = vp->v_clusters[min_index].start_pg;
3430 l_clusters[cl_index].last_pg = vp->v_clusters[min_index].last_pg;
3431
3432 vp->v_clusters[min_index].start_pg = vp->v_clusters[min_index].last_pg;
3433 }
3434 cl_len = cl_index;
3435 vp->v_clen = 0;
3436
55e303ae
A
3437 if (can_delay && cl_len == MAX_CLUSTERS) {
3438 int i;
3439
3440 /*
3441 * determine if we appear to be writing the file sequentially
3442 * if not, by returning without having pushed any clusters
3443 * we will cause this vnode to be pushed into the sparse cluster mechanism
3444 * used for managing more random I/O patterns
3445 *
3446 * we know that we've got all clusters currently in use and the next write doesn't fit into one of them...
3447 * that's why we're in try_push with can_delay true...
3448 *
3449 * check to make sure that all the clusters except the last one are 'full'... and that each cluster
3450 * is adjacent to the next (i.e. we're looking for sequential writes) they were sorted above
3451 * so we can just make a simple pass through up, to but not including the last one...
3452 * note that last_pg is not inclusive, so it will be equal to the start_pg of the next cluster if they
3453 * are sequential
3454 *
3455 * we let the last one be partial as long as it was adjacent to the previous one...
3456 * we need to do this to deal with multi-threaded servers that might write an I/O or 2 out
3457 * of order... if this occurs at the tail of the last cluster, we don't want to fall into the sparse cluster world...
3458 */
3459 for (i = 0; i < MAX_CLUSTERS - 1; i++) {
3460 if ((l_clusters[i].last_pg - l_clusters[i].start_pg) != MAX_UPL_TRANSFER)
3461 goto dont_try;
3462 if (l_clusters[i].last_pg != l_clusters[i+1].start_pg)
3463 goto dont_try;
3464 }
3465 }
3466 for (cl_index = 0; cl_index < cl_len; cl_index++) {
9bccf70c
A
3467 /*
3468 * try to push each cluster in turn... cluster_push_x may not
3469 * push the cluster if can_delay is TRUE and the cluster doesn't
3470 * meet the critera for an immediate push
3471 */
3472 if (cluster_push_x(vp, EOF, l_clusters[cl_index].start_pg, l_clusters[cl_index].last_pg, can_delay)) {
3473 l_clusters[cl_index].start_pg = 0;
3474 l_clusters[cl_index].last_pg = 0;
3475
3476 cl_pushed++;
3477
3478 if (push_all == 0)
3479 break;
3480 }
3481 }
55e303ae 3482dont_try:
9bccf70c
A
3483 if (cl_len > cl_pushed) {
3484 /*
3485 * we didn't push all of the clusters, so
3486 * lets try to merge them back in to the vnode
3487 */
3488 if ((MAX_CLUSTERS - vp->v_clen) < (cl_len - cl_pushed)) {
3489 /*
3490 * we picked up some new clusters while we were trying to
3491 * push the old ones (I don't think this can happen because
3492 * I'm holding the lock, but just in case)... the sum of the
3493 * leftovers plus the new cluster count exceeds our ability
55e303ae 3494 * to represent them, so switch to the sparse cluster mechanism
9bccf70c 3495 */
55e303ae
A
3496
3497 /*
3498 * first collect the new clusters sitting in the vp
3499 */
3500 sparse_cluster_switch(vp, EOF);
3501
3502 for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
9bccf70c
A
3503 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3504 continue;
55e303ae
A
3505 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3506 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
9bccf70c 3507
55e303ae 3508 cl_index1++;
9bccf70c 3509 }
55e303ae
A
3510 /*
3511 * update the cluster count
3512 */
3513 vp->v_clen = cl_index1;
3514
3515 /*
3516 * and collect the original clusters that were moved into the
3517 * local storage for sorting purposes
3518 */
3519 sparse_cluster_switch(vp, EOF);
3520
9bccf70c
A
3521 } else {
3522 /*
3523 * we've got room to merge the leftovers back in
3524 * just append them starting at the next 'hole'
3525 * represented by vp->v_clen
3526 */
3527 for (cl_index = 0, cl_index1 = vp->v_clen; cl_index < cl_len; cl_index++) {
3528 if (l_clusters[cl_index].start_pg == l_clusters[cl_index].last_pg)
3529 continue;
3530
3531 vp->v_clusters[cl_index1].start_pg = l_clusters[cl_index].start_pg;
3532 vp->v_clusters[cl_index1].last_pg = l_clusters[cl_index].last_pg;
3533
9bccf70c
A
3534 cl_index1++;
3535 }
3536 /*
3537 * update the cluster count
3538 */
3539 vp->v_clen = cl_index1;
3540 }
3541 }
3542 return(MAX_CLUSTERS - vp->v_clen);
3543}
3544
3545
3546
3547static int
3548cluster_push_x(vp, EOF, first, last, can_delay)
3549 struct vnode *vp;
3550 off_t EOF;
3551 daddr_t first;
3552 daddr_t last;
3553 int can_delay;
1c79356b 3554{
1c79356b
A
3555 upl_page_info_t *pl;
3556 upl_t upl;
3557 vm_offset_t upl_offset;
3558 int upl_size;
3559 off_t upl_f_offset;
3560 int pages_in_upl;
3561 int start_pg;
3562 int last_pg;
3563 int io_size;
3564 int io_flags;
55e303ae 3565 int upl_flags;
1c79356b
A
3566 int size;
3567 kern_return_t kret;
3568
3569
9bccf70c
A
3570 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START,
3571 vp->v_clen, first, last, EOF, 0);
3572
3573 if ((pages_in_upl = last - first) == 0) {
3574 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 0, 0, 0, 0);
1c79356b 3575
9bccf70c
A
3576 return (1);
3577 }
1c79356b 3578 upl_size = pages_in_upl * PAGE_SIZE;
9bccf70c 3579 upl_f_offset = ((off_t)first) * PAGE_SIZE_64;
1c79356b 3580
9bccf70c
A
3581 if (upl_f_offset + upl_size >= EOF) {
3582
3583 if (upl_f_offset >= EOF) {
3584 /*
3585 * must have truncated the file and missed
3586 * clearing a dangling cluster (i.e. it's completely
3587 * beyond the new EOF
3588 */
3589 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 1, 0, 0, 0);
3590
3591 return(1);
3592 }
3593 size = EOF - upl_f_offset;
1c79356b 3594
55e303ae 3595 upl_size = (size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
9bccf70c 3596 pages_in_upl = upl_size / PAGE_SIZE;
55e303ae 3597 } else
9bccf70c 3598 size = upl_size;
55e303ae
A
3599
3600 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
3601
3602 if (vp->v_flag & VNOCACHE_DATA)
3603 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE | UPL_WILL_BE_DUMPED;
3604 else
3605 upl_flags = UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY | UPL_SET_LITE;
3606
0b4e3aa0
A
3607 kret = ubc_create_upl(vp,
3608 upl_f_offset,
3609 upl_size,
3610 &upl,
9bccf70c 3611 &pl,
55e303ae 3612 upl_flags);
1c79356b
A
3613 if (kret != KERN_SUCCESS)
3614 panic("cluster_push: failed to get pagelist");
3615
55e303ae 3616 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_END, (int)upl, upl_f_offset, 0, 0, 0);
9bccf70c 3617
55e303ae
A
3618 /*
3619 * since we only asked for the dirty pages back
3620 * it's possible that we may only get a few or even none, so...
3621 * before we start marching forward, we must make sure we know
3622 * where the last present page is in the UPL, otherwise we could
3623 * end up working with a freed upl due to the FREE_ON_EMPTY semantics
3624 * employed by commit_range and abort_range.
3625 */
3626 for (last_pg = pages_in_upl - 1; last_pg >= 0; last_pg--) {
3627 if (upl_page_present(pl, last_pg))
3628 break;
9bccf70c 3629 }
55e303ae 3630 pages_in_upl = last_pg + 1;
1c79356b 3631
55e303ae
A
3632 if (pages_in_upl == 0) {
3633 ubc_upl_abort(upl, 0);
1c79356b 3634
55e303ae
A
3635 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 2, 0, 0, 0);
3636 return(1);
3637 }
3638
3639 for (last_pg = 0; last_pg < pages_in_upl; ) {
3640 /*
3641 * find the next dirty page in the UPL
3642 * this will become the first page in the
3643 * next I/O to generate
3644 */
1c79356b 3645 for (start_pg = last_pg; start_pg < pages_in_upl; start_pg++) {
55e303ae 3646 if (upl_dirty_page(pl, start_pg))
1c79356b 3647 break;
55e303ae
A
3648 if (upl_page_present(pl, start_pg))
3649 /*
3650 * RET_ONLY_DIRTY will return non-dirty 'precious' pages
3651 * just release these unchanged since we're not going
3652 * to steal them or change their state
3653 */
3654 ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
1c79356b 3655 }
55e303ae
A
3656 if (start_pg >= pages_in_upl)
3657 /*
3658 * done... no more dirty pages to push
3659 */
3660 break;
3661 if (start_pg > last_pg)
3662 /*
3663 * skipped over some non-dirty pages
3664 */
3665 size -= ((start_pg - last_pg) * PAGE_SIZE);
1c79356b 3666
55e303ae
A
3667 /*
3668 * find a range of dirty pages to write
3669 */
1c79356b 3670 for (last_pg = start_pg; last_pg < pages_in_upl; last_pg++) {
55e303ae 3671 if (!upl_dirty_page(pl, last_pg))
1c79356b
A
3672 break;
3673 }
3674 upl_offset = start_pg * PAGE_SIZE;
3675
3676 io_size = min(size, (last_pg - start_pg) * PAGE_SIZE);
3677
0b4e3aa0 3678 if (vp->v_flag & VNOCACHE_DATA)
55e303ae 3679 io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC | CL_DUMP;
1c79356b 3680 else
55e303ae 3681 io_flags = CL_THROTTLE | CL_COMMIT | CL_ASYNC;
1c79356b 3682
b4c24cb9 3683 cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, vp->v_ciosiz, io_flags, (struct buf *)0, (struct clios *)0);
1c79356b
A
3684
3685 size -= io_size;
3686 }
9bccf70c
A
3687 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
3688
1c79356b
A
3689 return(1);
3690}
b4c24cb9
A
3691
3692
b4c24cb9 3693static int
55e303ae 3694sparse_cluster_switch(struct vnode *vp, off_t EOF)
b4c24cb9 3695{
55e303ae 3696 int cl_index;
b4c24cb9 3697
55e303ae 3698 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
b4c24cb9 3699
55e303ae
A
3700 if ( !(vp->v_flag & VHASDIRTY)) {
3701 vp->v_flag |= VHASDIRTY;
3702 vp->v_scdirty = 0;
3703 vp->v_scmap = 0;
3704 }
3705 for (cl_index = 0; cl_index < vp->v_clen; cl_index++) {
3706 int flags;
3707 int start_pg;
3708 int last_pg;
b4c24cb9 3709
55e303ae 3710 for (start_pg = vp->v_clusters[cl_index].start_pg; start_pg < vp->v_clusters[cl_index].last_pg; start_pg++) {
b4c24cb9 3711
55e303ae
A
3712 if (ubc_page_op(vp, (off_t)(((off_t)start_pg) * PAGE_SIZE_64), 0, 0, &flags) == KERN_SUCCESS) {
3713 if (flags & UPL_POP_DIRTY)
3714 sparse_cluster_add(vp, EOF, start_pg, start_pg + 1);
3715 }
3716 }
3717 }
3718 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3719}
3720
3721
3722static int
3723sparse_cluster_push(struct vnode *vp, off_t EOF, int push_all)
3724{
3725 daddr_t first;
3726 daddr_t last;
3727 off_t offset;
3728 u_int length;
3729
3730 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, (int)vp, (int)vp->v_scmap, vp->v_scdirty, push_all, 0);
3731
3732 if (push_all)
3733 vfs_drt_control(&(vp->v_scmap), 1);
3734
3735 for (;;) {
3736 if (vfs_drt_get_cluster(&(vp->v_scmap), &offset, &length) != KERN_SUCCESS) {
3737 vp->v_flag &= ~VHASDIRTY;
3738 vp->v_clen = 0;
3739 break;
3740 }
3741 first = (daddr_t)(offset / PAGE_SIZE_64);
3742 last = (daddr_t)((offset + length) / PAGE_SIZE_64);
3743
3744 cluster_push_x(vp, EOF, first, last, 0);
3745
3746 vp->v_scdirty -= (last - first);
3747
3748 if (push_all == 0)
3749 break;
3750 }
3751 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3752}
3753
3754
3755static int
3756sparse_cluster_add(struct vnode *vp, off_t EOF, daddr_t first, daddr_t last)
3757{
3758 u_int new_dirty;
3759 u_int length;
3760 off_t offset;
3761
3762 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (int)vp->v_scmap, vp->v_scdirty, first, last, 0);
3763
3764 offset = (off_t)first * PAGE_SIZE_64;
3765 length = (last - first) * PAGE_SIZE;
3766
3767 while (vfs_drt_mark_pages(&(vp->v_scmap), offset, length, &new_dirty) != KERN_SUCCESS) {
3768 /*
3769 * no room left in the map
3770 * only a partial update was done
3771 * push out some pages and try again
3772 */
3773 vp->v_scdirty += new_dirty;
3774
3775 sparse_cluster_push(vp, EOF, 0);
3776
3777 offset += (new_dirty * PAGE_SIZE_64);
3778 length -= (new_dirty * PAGE_SIZE);
3779 }
3780 vp->v_scdirty += new_dirty;
3781
3782 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, (int)vp, (int)vp->v_scmap, vp->v_scdirty, 0, 0);
3783}
3784
3785
3786static int
3787cluster_align_phys_io(struct vnode *vp, struct uio *uio, addr64_t usr_paddr, int xsize, int devblocksize, int flags)
3788{
3789 struct iovec *iov;
3790 upl_page_info_t *pl;
3791 upl_t upl;
3792 addr64_t ubc_paddr;
3793 kern_return_t kret;
3794 int error = 0;
3795
3796 iov = uio->uio_iov;
3797
3798 kret = ubc_create_upl(vp,
3799 uio->uio_offset & ~PAGE_MASK_64,
3800 PAGE_SIZE,
3801 &upl,
3802 &pl,
3803 UPL_SET_LITE);
3804
3805 if (kret != KERN_SUCCESS)
3806 return(EINVAL);
3807
3808 if (!upl_valid_page(pl, 0)) {
3809 /*
3810 * issue a synchronous read to cluster_io
3811 */
3812 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3813 CL_READ, (struct buf *)0, (struct clios *)0);
3814 if (error) {
b4c24cb9
A
3815 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
3816
3817 return(error);
3818 }
3819 }
55e303ae 3820 ubc_paddr = ((addr64_t)upl_phys_page(pl, 0) << 12) + (addr64_t)(uio->uio_offset & PAGE_MASK_64);
b4c24cb9 3821
55e303ae
A
3822/*
3823 * NOTE: There is no prototype for the following in BSD. It, and the definitions
3824 * of the defines for cppvPsrc, cppvPsnk, cppvFsnk, and cppvFsrc will be found in
3825 * osfmk/ppc/mappings.h. They are not included here because there appears to be no
3826 * way to do so without exporting them to kexts as well.
3827 */
de355530 3828 if (flags & CL_READ)
55e303ae
A
3829// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsnk); /* Copy physical to physical and flush the destination */
3830 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 4); /* Copy physical to physical and flush the destination */
de355530 3831 else
55e303ae
A
3832// copypv(ubc_paddr, usr_paddr, xsize, cppvPsrc | cppvPsnk | cppvFsrc); /* Copy physical to physical and flush the source */
3833 copypv(ubc_paddr, usr_paddr, xsize, 2 | 1 | 8); /* Copy physical to physical and flush the source */
3834
3835 if ( !(flags & CL_READ) || (upl_valid_page(pl, 0) && upl_dirty_page(pl, 0))) {
3836 /*
3837 * issue a synchronous write to cluster_io
3838 */
3839 error = cluster_io(vp, upl, 0, uio->uio_offset & ~PAGE_MASK_64, PAGE_SIZE, devblocksize,
3840 0, (struct buf *)0, (struct clios *)0);
de355530
A
3841 }
3842 if (error == 0) {
55e303ae 3843 uio->uio_offset += xsize;
de355530
A
3844 iov->iov_base += xsize;
3845 iov->iov_len -= xsize;
3846 uio->uio_resid -= xsize;
3847 }
3848 ubc_upl_abort_range(upl, 0, PAGE_SIZE, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
55e303ae
A
3849
3850 return (error);
3851}
3852
3853
3854
3855int
3856cluster_copy_upl_data(struct uio *uio, upl_t upl, int upl_offset, int xsize)
3857{
3858 int pg_offset;
3859 int pg_index;
3860 int csize;
3861 int segflg;
3862 int retval = 0;
3863 upl_page_info_t *pl;
3864 boolean_t funnel_state = FALSE;
3865
3866
3867 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3868 (int)uio->uio_offset, uio->uio_resid, upl_offset, xsize, 0);
3869
3870 if (xsize >= (16 * 1024))
3871 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3872
3873 segflg = uio->uio_segflg;
3874
3875 switch(segflg) {
3876
3877 case UIO_USERSPACE:
3878 case UIO_USERISPACE:
3879 uio->uio_segflg = UIO_PHYS_USERSPACE;
3880 break;
3881
3882 case UIO_SYSSPACE:
3883 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3884 break;
3885 }
3886 pl = ubc_upl_pageinfo(upl);
3887
3888 pg_index = upl_offset / PAGE_SIZE;
3889 pg_offset = upl_offset & PAGE_MASK;
3890 csize = min(PAGE_SIZE - pg_offset, xsize);
3891
3892 while (xsize && retval == 0) {
3893 addr64_t paddr;
3894
3895 paddr = ((addr64_t)upl_phys_page(pl, pg_index) << 12) + pg_offset;
de355530 3896
55e303ae
A
3897 retval = uiomove64(paddr, csize, uio);
3898
3899 pg_index += 1;
3900 pg_offset = 0;
3901 xsize -= csize;
3902 csize = min(PAGE_SIZE, xsize);
3903 }
3904 uio->uio_segflg = segflg;
3905
3906 if (funnel_state == TRUE)
3907 thread_funnel_set(kernel_flock, TRUE);
3908
3909 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3910 (int)uio->uio_offset, uio->uio_resid, retval, segflg, 0);
3911
3912 return (retval);
3913}
3914
3915
3916int
3917cluster_copy_ubc_data(struct vnode *vp, struct uio *uio, int *io_resid, int mark_dirty)
3918{
3919 int segflg;
3920 int io_size;
3921 int xsize;
3922 int start_offset;
3923 off_t f_offset;
3924 int retval = 0;
3925 memory_object_control_t control;
3926 int op_flags = UPL_POP_SET | UPL_POP_BUSY;
3927 boolean_t funnel_state = FALSE;
3928
3929
3930 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START,
3931 (int)uio->uio_offset, uio->uio_resid, 0, *io_resid, 0);
3932
3933 control = ubc_getobject(vp, UBC_FLAGS_NONE);
3934 if (control == MEMORY_OBJECT_CONTROL_NULL) {
3935 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3936 (int)uio->uio_offset, uio->uio_resid, retval, 3, 0);
3937
3938 return(0);
3939 }
3940 if (mark_dirty)
3941 op_flags |= UPL_POP_DIRTY;
3942
3943 segflg = uio->uio_segflg;
3944
3945 switch(segflg) {
3946
3947 case UIO_USERSPACE:
3948 case UIO_USERISPACE:
3949 uio->uio_segflg = UIO_PHYS_USERSPACE;
3950 break;
3951
3952 case UIO_SYSSPACE:
3953 uio->uio_segflg = UIO_PHYS_SYSSPACE;
3954 break;
3955 }
3956 io_size = *io_resid;
3957 start_offset = (int)(uio->uio_offset & PAGE_MASK_64);
3958 f_offset = uio->uio_offset - start_offset;
3959 xsize = min(PAGE_SIZE - start_offset, io_size);
3960
3961 while (io_size && retval == 0) {
3962 ppnum_t pgframe;
3963
3964 if (ubc_page_op_with_control(control, f_offset, op_flags, &pgframe, 0) != KERN_SUCCESS)
3965 break;
3966
3967 if (funnel_state == FALSE && io_size >= (16 * 1024))
3968 funnel_state = thread_funnel_set(kernel_flock, FALSE);
3969
3970 retval = uiomove64((addr64_t)(((addr64_t)pgframe << 12) + start_offset), xsize, uio);
3971
3972 ubc_page_op_with_control(control, f_offset, UPL_POP_CLR | UPL_POP_BUSY, 0, 0);
3973
3974 io_size -= xsize;
3975 start_offset = 0;
3976 f_offset = uio->uio_offset;
3977 xsize = min(PAGE_SIZE, io_size);
3978 }
3979 uio->uio_segflg = segflg;
3980 *io_resid = io_size;
3981
3982 if (funnel_state == TRUE)
3983 thread_funnel_set(kernel_flock, TRUE);
3984
3985 KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_END,
3986 (int)uio->uio_offset, uio->uio_resid, retval, 0x80000000 | segflg, 0);
3987
3988 return(retval);
3989}
3990
3991
3992int
3993is_file_clean(struct vnode *vp, off_t filesize)
3994{
3995 off_t f_offset;
3996 int flags;
3997 int total_dirty = 0;
3998
3999 for (f_offset = 0; f_offset < filesize; f_offset += PAGE_SIZE_64) {
4000 if (ubc_page_op(vp, f_offset, 0, 0, &flags) == KERN_SUCCESS) {
4001 if (flags & UPL_POP_DIRTY) {
4002 total_dirty++;
4003 }
4004 }
4005 }
4006 if (total_dirty)
4007 return(EINVAL);
4008
4009 return (0);
4010}
4011
4012
4013
4014/*
4015 * Dirty region tracking/clustering mechanism.
4016 *
4017 * This code (vfs_drt_*) provides a mechanism for tracking and clustering
4018 * dirty regions within a larger space (file). It is primarily intended to
4019 * support clustering in large files with many dirty areas.
4020 *
4021 * The implementation assumes that the dirty regions are pages.
4022 *
4023 * To represent dirty pages within the file, we store bit vectors in a
4024 * variable-size circular hash.
4025 */
4026
4027/*
4028 * Bitvector size. This determines the number of pages we group in a
4029 * single hashtable entry. Each hashtable entry is aligned to this
4030 * size within the file.
4031 */
4032#define DRT_BITVECTOR_PAGES 256
4033
4034/*
4035 * File offset handling.
4036 *
4037 * DRT_ADDRESS_MASK is dependent on DRT_BITVECTOR_PAGES;
4038 * the correct formula is ~((DRT_BITVECTOR_PAGES * PAGE_SIZE) - 1)
4039 */
4040#define DRT_ADDRESS_MASK (~((1 << 20) - 1))
4041#define DRT_ALIGN_ADDRESS(addr) ((addr) & DRT_ADDRESS_MASK)
4042
4043/*
4044 * Hashtable address field handling.
4045 *
4046 * The low-order bits of the hashtable address are used to conserve
4047 * space.
4048 *
4049 * DRT_HASH_COUNT_MASK must be large enough to store the range
4050 * 0-DRT_BITVECTOR_PAGES inclusive, as well as have one value
4051 * to indicate that the bucket is actually unoccupied.
4052 */
4053#define DRT_HASH_GET_ADDRESS(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_ADDRESS_MASK)
4054#define DRT_HASH_SET_ADDRESS(scm, i, a) \
4055 do { \
4056 (scm)->scm_hashtable[(i)].dhe_control = \
4057 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_ADDRESS_MASK) | DRT_ALIGN_ADDRESS(a); \
4058 } while (0)
4059#define DRT_HASH_COUNT_MASK 0x1ff
4060#define DRT_HASH_GET_COUNT(scm, i) ((scm)->scm_hashtable[(i)].dhe_control & DRT_HASH_COUNT_MASK)
4061#define DRT_HASH_SET_COUNT(scm, i, c) \
4062 do { \
4063 (scm)->scm_hashtable[(i)].dhe_control = \
4064 ((scm)->scm_hashtable[(i)].dhe_control & ~DRT_HASH_COUNT_MASK) | ((c) & DRT_HASH_COUNT_MASK); \
4065 } while (0)
4066#define DRT_HASH_CLEAR(scm, i) \
4067 do { \
4068 (scm)->scm_hashtable[(i)].dhe_control = 0; \
4069 } while (0)
4070#define DRT_HASH_VACATE(scm, i) DRT_HASH_SET_COUNT((scm), (i), DRT_HASH_COUNT_MASK)
4071#define DRT_HASH_VACANT(scm, i) (DRT_HASH_GET_COUNT((scm), (i)) == DRT_HASH_COUNT_MASK)
/*
 * Copy hashtable entry 'oi' of map 'oscm' (control word and bitvector) into
 * entry 'i' of map 'scm'.  Expands to a single statement; the caller
 * supplies the terminating semicolon, so it is safe in an unbraced if/else.
 */
#define DRT_HASH_COPY(oscm, oi, scm, i)									\
	do {												\
		(scm)->scm_hashtable[(i)].dhe_control = (oscm)->scm_hashtable[(oi)].dhe_control;	\
		DRT_BITVECTOR_COPY(oscm, oi, scm, i);							\
	} while (0)
4077
4078
4079/*
4080 * Hash table moduli.
4081 *
4082 * Since the hashtable entry's size is dependent on the size of
4083 * the bitvector, and since the hashtable size is constrained to
4084 * both being prime and fitting within the desired allocation
4085 * size, these values need to be manually determined.
4086 *
4087 * For DRT_BITVECTOR_PAGES = 256, the entry size is 40 bytes.
4088 *
4089 * The small hashtable allocation is 1024 bytes, so the modulus is 23.
4090 * The large hashtable allocation is 16384 bytes, so the modulus is 401.
4091 */
4092#define DRT_HASH_SMALL_MODULUS 23
4093#define DRT_HASH_LARGE_MODULUS 401
4094
4095#define DRT_SMALL_ALLOCATION 1024 /* 104 bytes spare */
4096#define DRT_LARGE_ALLOCATION 16384 /* 344 bytes spare */
4097
4098/* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
4099
4100/*
4101 * Hashtable bitvector handling.
4102 *
4103 * Bitvector fields are 32 bits long.
4104 */
4105
4106#define DRT_HASH_SET_BIT(scm, i, bit) \
4107 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] |= (1 << ((bit) % 32))
4108
4109#define DRT_HASH_CLEAR_BIT(scm, i, bit) \
4110 (scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] &= ~(1 << ((bit) % 32))
4111
4112#define DRT_HASH_TEST_BIT(scm, i, bit) \
4113 ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
4114
4115#define DRT_BITVECTOR_CLEAR(scm, i) \
4116 bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4117
4118#define DRT_BITVECTOR_COPY(oscm, oi, scm, i) \
4119 bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0], \
4120 &(scm)->scm_hashtable[(i)].dhe_bitvector[0], \
4121 (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
4122
4123
4124
4125/*
4126 * Hashtable entry.
4127 */
4128struct vfs_drt_hashentry {
4129 u_int64_t dhe_control;
4130 u_int32_t dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
4131};
4132
4133/*
4134 * Dirty Region Tracking structure.
4135 *
4136 * The hashtable is allocated entirely inside the DRT structure.
4137 *
4138 * The hash is a simple circular prime modulus arrangement, the structure
4139 * is resized from small to large if it overflows.
4140 */
4141
4142struct vfs_drt_clustermap {
4143 u_int32_t scm_magic; /* sanity/detection */
4144#define DRT_SCM_MAGIC 0x12020003
4145 u_int32_t scm_modulus; /* current ring size */
4146 u_int32_t scm_buckets; /* number of occupied buckets */
4147 u_int32_t scm_lastclean; /* last entry we cleaned */
4148 u_int32_t scm_iskips; /* number of slot skips */
4149
4150 struct vfs_drt_hashentry scm_hashtable[0];
4151};
4152
4153
4154#define DRT_HASH(scm, addr) ((addr) % (scm)->scm_modulus)
4155#define DRT_HASH_NEXT(scm, addr) (((addr) + 1) % (scm)->scm_modulus)
4156
4157/*
4158 * Debugging codes and arguments.
4159 */
4160#define DRT_DEBUG_EMPTYFREE (FSDBG_CODE(DBG_FSRW, 82)) /* nil */
4161#define DRT_DEBUG_RETCLUSTER (FSDBG_CODE(DBG_FSRW, 83)) /* offset, length */
4162#define DRT_DEBUG_ALLOC (FSDBG_CODE(DBG_FSRW, 84)) /* copycount */
4163#define DRT_DEBUG_INSERT (FSDBG_CODE(DBG_FSRW, 85)) /* offset, iskip */
4164#define DRT_DEBUG_MARK (FSDBG_CODE(DBG_FSRW, 86)) /* offset, length,
4165 * dirty */
4166 /* 0, setcount */
4167 /* 1 (clean, no map) */
4168 /* 2 (map alloc fail) */
4169 /* 3, resid (partial) */
4170#define DRT_DEBUG_6 (FSDBG_CODE(DBG_FSRW, 87))
4171#define DRT_DEBUG_SCMDATA (FSDBG_CODE(DBG_FSRW, 88)) /* modulus, buckets,
4172 * lastclean, iskips */
4173
4174
4175static void vfs_drt_sanity(struct vfs_drt_clustermap *cmap);
4176static kern_return_t vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp);
4177static kern_return_t vfs_drt_free_map(struct vfs_drt_clustermap *cmap);
4178static kern_return_t vfs_drt_search_index(struct vfs_drt_clustermap *cmap,
4179 u_int64_t offset, int *indexp);
4180static kern_return_t vfs_drt_get_index(struct vfs_drt_clustermap **cmapp,
4181 u_int64_t offset,
4182 int *indexp,
4183 int recursed);
4184static kern_return_t vfs_drt_do_mark_pages(
4185 void **cmapp,
4186 u_int64_t offset,
4187 u_int length,
4188 int *setcountp,
4189 int dirty);
4190static void vfs_drt_trace(
4191 struct vfs_drt_clustermap *cmap,
4192 int code,
4193 int arg1,
4194 int arg2,
4195 int arg3,
4196 int arg4);
4197
4198
4199/*
4200 * Allocate and initialise a sparse cluster map.
4201 *
4202 * Will allocate a new map, resize or compact an existing map.
4203 *
4204 * XXX we should probably have at least one intermediate map size,
4205 * as the 1:16 ratio seems a bit drastic.
4206 */
4207static kern_return_t
4208vfs_drt_alloc_map(struct vfs_drt_clustermap **cmapp)
4209{
4210 struct vfs_drt_clustermap *cmap, *ocmap;
4211 kern_return_t kret;
4212 u_int64_t offset;
4213 int nsize, i, active_buckets, index, copycount;
4214
4215 ocmap = NULL;
4216 if (cmapp != NULL)
4217 ocmap = *cmapp;
4218
4219 /*
4220 * Decide on the size of the new map.
4221 */
4222 if (ocmap == NULL) {
4223 nsize = DRT_HASH_SMALL_MODULUS;
4224 } else {
4225 /* count the number of active buckets in the old map */
4226 active_buckets = 0;
4227 for (i = 0; i < ocmap->scm_modulus; i++) {
4228 if (!DRT_HASH_VACANT(ocmap, i) &&
4229 (DRT_HASH_GET_COUNT(ocmap, i) != 0))
4230 active_buckets++;
4231 }
4232 /*
4233 * If we're currently using the small allocation, check to
4234 * see whether we should grow to the large one.
4235 */
4236 if (ocmap->scm_modulus == DRT_HASH_SMALL_MODULUS) {
4237 /* if the ring is nearly full */
4238 if (active_buckets > (DRT_HASH_SMALL_MODULUS - 5)) {
4239 nsize = DRT_HASH_LARGE_MODULUS;
4240 } else {
4241 nsize = DRT_HASH_SMALL_MODULUS;
4242 }
4243 } else {
4244 /* already using the large modulus */
4245 nsize = DRT_HASH_LARGE_MODULUS;
4246 /*
4247 * If the ring is completely full, there's
4248 * nothing useful for us to do. Behave as
4249 * though we had compacted into the new
4250 * array and return.
4251 */
4252 if (active_buckets >= DRT_HASH_LARGE_MODULUS)
4253 return(KERN_SUCCESS);
4254 }
4255 }
4256
4257 /*
4258 * Allocate and initialise the new map.
4259 */
4260
4261 kret = kmem_alloc(kernel_map, (vm_offset_t *)&cmap,
4262 (nsize == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4263 if (kret != KERN_SUCCESS)
4264 return(kret);
4265 cmap->scm_magic = DRT_SCM_MAGIC;
4266 cmap->scm_modulus = nsize;
4267 cmap->scm_buckets = 0;
4268 cmap->scm_lastclean = 0;
4269 cmap->scm_iskips = 0;
4270 for (i = 0; i < cmap->scm_modulus; i++) {
4271 DRT_HASH_CLEAR(cmap, i);
4272 DRT_HASH_VACATE(cmap, i);
4273 DRT_BITVECTOR_CLEAR(cmap, i);
4274 }
4275
4276 /*
4277 * If there's an old map, re-hash entries from it into the new map.
4278 */
4279 copycount = 0;
4280 if (ocmap != NULL) {
4281 for (i = 0; i < ocmap->scm_modulus; i++) {
4282 /* skip empty buckets */
4283 if (DRT_HASH_VACANT(ocmap, i) ||
4284 (DRT_HASH_GET_COUNT(ocmap, i) == 0))
4285 continue;
4286 /* get new index */
4287 offset = DRT_HASH_GET_ADDRESS(ocmap, i);
4288 kret = vfs_drt_get_index(&cmap, offset, &index, 1);
4289 if (kret != KERN_SUCCESS) {
4290 /* XXX need to bail out gracefully here */
4291 panic("vfs_drt: new cluster map mysteriously too small");
4292 }
4293 /* copy */
4294 DRT_HASH_COPY(ocmap, i, cmap, index);
4295 copycount++;
4296 }
4297 }
4298
4299 /* log what we've done */
4300 vfs_drt_trace(cmap, DRT_DEBUG_ALLOC, copycount, 0, 0, 0);
4301
4302 /*
4303 * It's important to ensure that *cmapp always points to
4304 * a valid map, so we must overwrite it before freeing
4305 * the old map.
4306 */
4307 *cmapp = cmap;
4308 if (ocmap != NULL) {
4309 /* emit stats into trace buffer */
4310 vfs_drt_trace(ocmap, DRT_DEBUG_SCMDATA,
4311 ocmap->scm_modulus,
4312 ocmap->scm_buckets,
4313 ocmap->scm_lastclean,
4314 ocmap->scm_iskips);
4315
4316 vfs_drt_free_map(ocmap);
4317 }
4318 return(KERN_SUCCESS);
4319}
4320
4321
4322/*
4323 * Free a sparse cluster map.
4324 */
4325static kern_return_t
4326vfs_drt_free_map(struct vfs_drt_clustermap *cmap)
4327{
4328 kern_return_t ret;
4329
4330 kmem_free(kernel_map, (vm_offset_t)cmap,
4331 (cmap->scm_modulus == DRT_HASH_SMALL_MODULUS) ? DRT_SMALL_ALLOCATION : DRT_LARGE_ALLOCATION);
4332 return(KERN_SUCCESS);
4333}
4334
4335
4336/*
4337 * Find the hashtable slot currently occupied by an entry for the supplied offset.
4338 */
4339static kern_return_t
4340vfs_drt_search_index(struct vfs_drt_clustermap *cmap, u_int64_t offset, int *indexp)
4341{
4342 kern_return_t kret;
4343 int index, i, tries;
4344
4345 offset = DRT_ALIGN_ADDRESS(offset);
4346 index = DRT_HASH(cmap, offset);
4347
4348 /* traverse the hashtable */
4349 for (i = 0; i < cmap->scm_modulus; i++) {
4350
4351 /*
4352 * If the slot is vacant, we can stop.
4353 */
4354 if (DRT_HASH_VACANT(cmap, index))
4355 break;
4356
4357 /*
4358 * If the address matches our offset, we have success.
4359 */
4360 if (DRT_HASH_GET_ADDRESS(cmap, index) == offset) {
4361 *indexp = index;
4362 return(KERN_SUCCESS);
4363 }
4364
4365 /*
4366 * Move to the next slot, try again.
4367 */
4368 index = DRT_HASH_NEXT(cmap, index);
4369 }
4370 /*
4371 * It's not there.
4372 */
4373 return(KERN_FAILURE);
4374}
4375
4376/*
4377 * Find the hashtable slot for the supplied offset. If we haven't allocated
4378 * one yet, allocate one and populate the address field. Note that it will
4379 * not have a nonzero page count and thus will still technically be free, so
4380 * in the case where we are called to clean pages, the slot will remain free.
4381 */
4382static kern_return_t
4383vfs_drt_get_index(struct vfs_drt_clustermap **cmapp, u_int64_t offset, int *indexp, int recursed)
4384{
4385 struct vfs_drt_clustermap *cmap;
4386 kern_return_t kret;
4387 int index, i;
4388
4389 cmap = *cmapp;
4390
4391 /* look for an existing entry */
4392 kret = vfs_drt_search_index(cmap, offset, indexp);
4393 if (kret == KERN_SUCCESS)
4394 return(kret);
4395
4396 /* need to allocate an entry */
4397 offset = DRT_ALIGN_ADDRESS(offset);
4398 index = DRT_HASH(cmap, offset);
4399
4400 /* scan from the index forwards looking for a vacant slot */
4401 for (i = 0; i < cmap->scm_modulus; i++) {
4402 /* slot vacant? */
4403 if (DRT_HASH_VACANT(cmap, index) || DRT_HASH_GET_COUNT(cmap,index) == 0) {
4404 cmap->scm_buckets++;
4405 if (index < cmap->scm_lastclean)
4406 cmap->scm_lastclean = index;
4407 DRT_HASH_SET_ADDRESS(cmap, index, offset);
4408 DRT_HASH_SET_COUNT(cmap, index, 0);
4409 DRT_BITVECTOR_CLEAR(cmap, index);
4410 *indexp = index;
4411 vfs_drt_trace(cmap, DRT_DEBUG_INSERT, (int)offset, i, 0, 0);
4412 return(KERN_SUCCESS);
4413 }
4414 cmap->scm_iskips += i;
4415 index = DRT_HASH_NEXT(cmap, index);
4416 }
4417
4418 /*
4419 * We haven't found a vacant slot, so the map is full. If we're not
4420 * already recursed, try reallocating/compacting it.
4421 */
4422 if (recursed)
4423 return(KERN_FAILURE);
4424 kret = vfs_drt_alloc_map(cmapp);
4425 if (kret == KERN_SUCCESS) {
4426 /* now try to insert again */
4427 kret = vfs_drt_get_index(cmapp, offset, indexp, 1);
4428 }
4429 return(kret);
4430}
4431
4432/*
4433 * Implementation of set dirty/clean.
4434 *
4435 * In the 'clean' case, not finding a map is OK.
4436 */
4437static kern_return_t
4438vfs_drt_do_mark_pages(
4439 void **private,
4440 u_int64_t offset,
4441 u_int length,
4442 int *setcountp,
4443 int dirty)
4444{
4445 struct vfs_drt_clustermap *cmap, **cmapp;
4446 kern_return_t kret;
4447 int i, index, pgoff, pgcount, setcount, ecount;
4448
4449 cmapp = (struct vfs_drt_clustermap **)private;
4450 cmap = *cmapp;
4451
4452 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_START, (int)offset, (int)length, dirty, 0);
4453
4454 if (setcountp != NULL)
4455 *setcountp = 0;
4456
4457 /* allocate a cluster map if we don't already have one */
4458 if (cmap == NULL) {
4459 /* no cluster map, nothing to clean */
4460 if (!dirty) {
4461 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 1, 0, 0, 0);
4462 return(KERN_SUCCESS);
4463 }
4464 kret = vfs_drt_alloc_map(cmapp);
4465 if (kret != KERN_SUCCESS) {
4466 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 2, 0, 0, 0);
4467 return(kret);
4468 }
4469 }
4470 setcount = 0;
4471
4472 /*
4473 * Iterate over the length of the region.
4474 */
4475 while (length > 0) {
4476 /*
4477 * Get the hashtable index for this offset.
4478 *
4479 * XXX this will add blank entries if we are clearing a range
4480 * that hasn't been dirtied.
4481 */
4482 kret = vfs_drt_get_index(cmapp, offset, &index, 0);
4483 cmap = *cmapp; /* may have changed! */
4484 /* this may be a partial-success return */
4485 if (kret != KERN_SUCCESS) {
4486 if (setcountp != NULL)
4487 *setcountp = setcount;
4488 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 3, (int)length, 0, 0);
4489
4490 return(kret);
4491 }
4492
4493 /*
4494 * Work out how many pages we're modifying in this
4495 * hashtable entry.
4496 */
4497 pgoff = (offset - DRT_ALIGN_ADDRESS(offset)) / PAGE_SIZE;
4498 pgcount = min((length / PAGE_SIZE), (DRT_BITVECTOR_PAGES - pgoff));
4499
4500 /*
4501 * Iterate over pages, dirty/clearing as we go.
4502 */
4503 ecount = DRT_HASH_GET_COUNT(cmap, index);
4504 for (i = 0; i < pgcount; i++) {
4505 if (dirty) {
4506 if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4507 DRT_HASH_SET_BIT(cmap, index, pgoff + i);
4508 ecount++;
4509 setcount++;
4510 }
4511 } else {
4512 if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
4513 DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
4514 ecount--;
4515 setcount++;
4516 }
4517 }
4518 }
4519 DRT_HASH_SET_COUNT(cmap, index, ecount);
4520next:
4521 offset += pgcount * PAGE_SIZE;
4522 length -= pgcount * PAGE_SIZE;
4523 }
4524 if (setcountp != NULL)
4525 *setcountp = setcount;
4526
4527 vfs_drt_trace(cmap, DRT_DEBUG_MARK | DBG_FUNC_END, 0, setcount, 0, 0);
4528
4529 return(KERN_SUCCESS);
4530}
4531
4532/*
4533 * Mark a set of pages as dirty/clean.
4534 *
4535 * This is a public interface.
4536 *
4537 * cmapp
4538 * Pointer to storage suitable for holding a pointer. Note that
4539 * this must either be NULL or a value set by this function.
4540 *
4541 * size
4542 * Current file size in bytes.
4543 *
4544 * offset
4545 * Offset of the first page to be marked as dirty, in bytes. Must be
4546 * page-aligned.
4547 *
4548 * length
4549 * Length of dirty region, in bytes. Must be a multiple of PAGE_SIZE.
4550 *
4551 * setcountp
4552 * Number of pages newly marked dirty by this call (optional).
4553 *
4554 * Returns KERN_SUCCESS if all the pages were successfully marked.
4555 */
4556static kern_return_t
4557vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, int *setcountp)
4558{
4559 /* XXX size unused, drop from interface */
4560 return(vfs_drt_do_mark_pages(cmapp, offset, length, setcountp, 1));
4561}
4562
4563static kern_return_t
4564vfs_drt_unmark_pages(void **cmapp, off_t offset, u_int length)
4565{
4566 return(vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0));
4567}
4568
4569/*
4570 * Get a cluster of dirty pages.
4571 *
4572 * This is a public interface.
4573 *
4574 * cmapp
4575 * Pointer to storage managed by drt_mark_pages. Note that this must
4576 * be NULL or a value set by drt_mark_pages.
4577 *
4578 * offsetp
4579 * Returns the byte offset into the file of the first page in the cluster.
4580 *
4581 * lengthp
4582 * Returns the length in bytes of the cluster of dirty pages.
4583 *
4584 * Returns success if a cluster was found. If KERN_FAILURE is returned, there
4585 * are no dirty pages meeting the minimum size criteria. Private storage will
4586 * be released if there are no more dirty pages left in the map
4587 *
4588 */
4589static kern_return_t
4590vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
4591{
4592 struct vfs_drt_clustermap *cmap;
4593 u_int64_t offset;
4594 u_int length;
4595 int index, i, j, fs, ls;
4596
4597 /* sanity */
4598 if ((cmapp == NULL) || (*cmapp == NULL))
4599 return(KERN_FAILURE);
4600 cmap = *cmapp;
4601
4602 /* walk the hashtable */
4603 for (offset = 0, j = 0; j < cmap->scm_modulus; offset += (DRT_BITVECTOR_PAGES * PAGE_SIZE), j++) {
4604 index = DRT_HASH(cmap, offset);
4605
4606 if (DRT_HASH_VACANT(cmap, index) || (DRT_HASH_GET_COUNT(cmap, index) == 0))
4607 continue;
4608
4609 /* scan the bitfield for a string of bits */
4610 fs = -1;
4611
4612 for (i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4613 if (DRT_HASH_TEST_BIT(cmap, index, i)) {
4614 fs = i;
4615 break;
4616 }
4617 }
4618 if (fs == -1) {
4619 /* didn't find any bits set */
4620 panic("vfs_drt: entry summary count > 0 but no bits set in map");
4621 }
4622 for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
4623 if (!DRT_HASH_TEST_BIT(cmap, index, i))
4624 break;
4625 }
4626
4627 /* compute offset and length, mark pages clean */
4628 offset = DRT_HASH_GET_ADDRESS(cmap, index) + (PAGE_SIZE * fs);
4629 length = ls * PAGE_SIZE;
4630 vfs_drt_do_mark_pages(cmapp, offset, length, NULL, 0);
4631 cmap->scm_lastclean = index;
4632
4633 /* return successful */
4634 *offsetp = (off_t)offset;
4635 *lengthp = length;
4636
4637 vfs_drt_trace(cmap, DRT_DEBUG_RETCLUSTER, (int)offset, (int)length, 0, 0);
4638 return(KERN_SUCCESS);
4639 }
4640 /*
4641 * We didn't find anything... hashtable is empty
4642 * emit stats into trace buffer and
4643 * then free it
4644 */
4645 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4646 cmap->scm_modulus,
4647 cmap->scm_buckets,
4648 cmap->scm_lastclean,
4649 cmap->scm_iskips);
4650
4651 vfs_drt_free_map(cmap);
4652 *cmapp = NULL;
4653
4654 return(KERN_FAILURE);
4655}
4656
4657
4658static kern_return_t
4659vfs_drt_control(void **cmapp, int op_type)
4660{
4661 struct vfs_drt_clustermap *cmap;
4662
4663 /* sanity */
4664 if ((cmapp == NULL) || (*cmapp == NULL))
4665 return(KERN_FAILURE);
4666 cmap = *cmapp;
4667
4668 switch (op_type) {
4669 case 0:
4670 /* emit stats into trace buffer */
4671 vfs_drt_trace(cmap, DRT_DEBUG_SCMDATA,
4672 cmap->scm_modulus,
4673 cmap->scm_buckets,
4674 cmap->scm_lastclean,
4675 cmap->scm_iskips);
4676
4677 vfs_drt_free_map(cmap);
4678 *cmapp = NULL;
4679 break;
4680
4681 case 1:
4682 cmap->scm_lastclean = 0;
4683 break;
4684 }
4685 return(KERN_SUCCESS);
4686}
4687
4688
4689
4690/*
4691 * Emit a summary of the state of the clustermap into the trace buffer
4692 * along with some caller-provided data.
4693 */
static void
vfs_drt_trace(
	struct vfs_drt_clustermap *cmap,
	int		code,
	int		arg1,
	int		arg2,
	int		arg3,
	int		arg4)
{
	/* cmap is accepted but not referenced; only the caller's values are logged */
	KERNEL_DEBUG(code, arg1, arg2, arg3, arg4, 0);
}
4699
4700/*
4701 * Perform basic sanity check on the hash entry summary count
4702 * vs. the actual bits set in the entry.
4703 */
4704static void
4705vfs_drt_sanity(struct vfs_drt_clustermap *cmap)
4706{
4707 int index, i;
4708 int bits_on;
4709
4710 for (index = 0; index < cmap->scm_modulus; index++) {
4711 if (DRT_HASH_VACANT(cmap, index))
4712 continue;
4713
4714 for (bits_on = 0, i = 0; i < DRT_BITVECTOR_PAGES; i++) {
4715 if (DRT_HASH_TEST_BIT(cmap, index, i))
4716 bits_on++;
4717 }
4718 if (bits_on != DRT_HASH_GET_COUNT(cmap, index))
4719 panic("bits_on = %d, index = %d\n", bits_on, index);
4720 }
b4c24cb9 4721}