2 * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 * Mach Operating System
30 * Copyright (c) 1987 Carnegie-Mellon University
31 * All rights reserved. The CMU software License Agreement specifies
32 * the terms and conditions for use and redistribution.
37 * "Swap" pager that pages to/from vnodes. Also
38 * handles demand paging from files.
42 #include <mach/boolean.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
47 #include <sys/kauth.h>
50 #include <sys/vnode_internal.h>
51 #include <sys/namei.h>
52 #include <sys/mount_internal.h> /* needs internal due to fhandle_t */
53 #include <sys/ubc_internal.h>
55 #include <sys/disk.h> /* For DKIOC calls */
57 #include <mach/mach_types.h>
58 #include <mach/memory_object_types.h>
59 #include <mach/memory_object_control.h>
60 #include <mach/vm_map.h>
61 #include <mach/mach_vm.h>
65 #include <vm/vm_map.h>
66 #include <vm/vm_kern.h>
67 #include <kern/zalloc.h>
68 #include <kern/kalloc.h>
69 #include <libkern/libkern.h>
71 #include <vm/vnode_pager.h>
72 #include <vm/vm_pageout.h>
74 #include <kern/assert.h>
75 #include <sys/kdebug.h>
76 #include <machine/spl.h>
78 #include <nfs/rpcv2.h>
79 #include <nfs/nfsproto.h>
82 #include <vm/vm_protos.h>
/*
 * vnode_pager_throttle: looks up the BSD per-thread info of the current
 * thread and, when its uu_lowpri_window field is non-zero, calls
 * throttle_lowpri_io(TRUE).
 * NOTE(review): this listing is a lossy extraction; the return type, the
 * braces and the declaration of ut are not visible here.
 */
86 vnode_pager_throttle()
90 ut
= get_bsdthread_info(current_thread());
/* Only enter the low-priority I/O throttle when a window value is set. */
92 if (ut
->uu_lowpri_window
)
93 throttle_lowpri_io(TRUE
);
/*
 * vnode_pager_isSSD: tests the MNTK_SSD bit in mnt_kern_flag of the mount
 * that vp belongs to (vp->v_mount).
 * NOTE(review): the statements producing the return value are missing from
 * this listing; presumably a true value is returned when the bit is set -
 * confirm against the complete file.  vp->v_mount is dereferenced without a
 * NULL check in the visible code.
 */
98 vnode_pager_isSSD(vnode_t vp
)
100 if (vp
->v_mount
->mnt_kern_flag
& MNTK_SSD
)
/*
 * vnode_pager_isinuse: compares the v_usecount of vp against its
 * v_kusecount.
 * NOTE(review): the return statements are missing from this listing;
 * presumably the vnode is reported as in use when v_usecount is the larger
 * of the two - confirm against the complete file.
 */
107 vnode_pager_isinuse(struct vnode
*vp
)
109 if (vp
->v_usecount
> vp
->v_kusecount
)
/*
 * vnode_pager_return_hard_throttle_limit: thin wrapper that forwards vp,
 * limit and hard_throttle unchanged to cluster_hard_throttle_limit() and
 * returns whatever that call returns.
 * NOTE(review): the return type line is not visible in this listing.
 */
115 vnode_pager_return_hard_throttle_limit(struct vnode
*vp
, uint32_t *limit
, uint32_t hard_throttle
)
117 return(cluster_hard_throttle_limit(vp
, limit
, hard_throttle
));
/*
 * vnode_pager_get_filesize: returns ubc_getsize(vp) converted to
 * vm_object_offset_t.
 * NOTE(review): the return type line is not visible in this listing;
 * presumably it matches the cast below - confirm.
 */
121 vnode_pager_get_filesize(struct vnode
*vp
)
124 return (vm_object_offset_t
) ubc_getsize(vp
);
/*
 * vnode_pager_get_pathname: asks vn_getpath() for the path of vp.
 * The value behind length_p is read into the int len before the call and
 * len is written back through length_p afterwards, so length_p is used in
 * both directions.
 * NOTE(review): the parameter list, the local declarations, the check of
 * error and the return statements are missing from this listing;
 * presumably length_p carries the buffer size in and the path length out -
 * confirm against the complete file.
 */
128 vnode_pager_get_pathname(
/* vn_getpath takes an int length, hence the narrowing cast. */
135 len
= (int) *length_p
;
136 error
= vn_getpath(vp
, pathname
, &len
);
/* Hand the (possibly updated) length back to the caller. */
140 *length_p
= (vm_size_t
) len
;
/*
 * vnode_pager_get_filename: stores vp->v_name into *filename.
 * The visible code hands out the pointer held by the vnode; no copy of the
 * string is made here.
 * NOTE(review): the first parameter line and the return statement are
 * missing from this listing.
 */
145 vnode_pager_get_filename(
147 const char **filename
)
149 *filename
= vp
->v_name
;
/*
 * vnode_pager_get_cs_blobs: stores the result of ubc_get_cs_blobs(vp) into
 * *blobs.
 * NOTE(review): the parameter list and the return statement are missing
 * from this listing.
 */
154 vnode_pager_get_cs_blobs(
158 *blobs
= ubc_get_cs_blobs(vp
);
164 * Used to call the DKIOCUNMAP ioctl on the underlying disk device for the specified vnode.
165 * Trims the region at offset bytes into the file, for length bytes.
167 * Care must be taken to ensure that the vnode is sufficiently reference counted at the time this
168 * function is called; no iocounts or usecounts are taken on the vnode.
169 * This function is non-idempotent in error cases; we cannot un-discard the blocks if only some of them
170 * are successfully discarded.
/*
 * vnode_trim: walks the byte range that starts at offset and spans length
 * bytes of vp, maps it to device extents with VNOP_BLOCKMAP and issues one
 * DKIOCUNMAP ioctl per extent on the device vnode of the mount
 * (vp->v_mount->mnt_devvp).
 *
 * Visible steps:
 *   1. query the device block size with DKIOCGETBLOCKSIZE;
 *   2. loop while trimmed < length: map the next run, build a single
 *      dk_extent_t (offset = io_blockno * blocksize, length = io_bytecount),
 *      wrap it in a dk_unmap_t with extentsCount 1 and send DKIOCUNMAP;
 *   3. advance current_offset and trimmed, and shrink remaining_length, by
 *      io_bytecount.
 *
 * NOTE(review): this listing is a lossy extraction.  The parameter list,
 * the declarations of trimmed, error, devvp, extent and unmap, the checks
 * of error after each call and the final return are not visible.
 * NOTE(review): with only the visible statements, an io_bytecount of 0 from
 * VNOP_BLOCKMAP would leave trimmed unchanged and the loop would not make
 * progress - confirm how the complete file handles that.
 */
172 u_int32_t
vnode_trim (
177 daddr64_t io_blockno
; /* Block number corresponding to the start of the extent */
178 size_t io_bytecount
; /* Number of bytes in current extent for the specified range */
/* Cursor into the file and the number of bytes still to be mapped. */
180 off_t current_offset
= offset
;
181 size_t remaining_length
= length
;
183 u_int32_t blocksize
= 0;
189 /* Get the underlying device vnode */
190 devvp
= vp
->v_mount
->mnt_devvp
;
192 /* Figure out the underlying device block size */
193 error
= VNOP_IOCTL(devvp
, DKIOCGETBLOCKSIZE
, (caddr_t
)&blocksize
, 0, vfs_context_kernel());
199 * We may not get the entire range from offset -> offset+length in a single
200 * extent from the blockmap call. Keep looping/going until we are sure we've hit
201 * the whole range or if we encounter an error.
203 while (trimmed
< length
) {
205 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
206 * specified offset. It returns blocks in contiguous chunks, so if the logical range is
207 * broken into multiple extents, it must be called multiple times, increasing the offset
208 * in each call to ensure that the entire range is covered.
210 error
= VNOP_BLOCKMAP (vp
, current_offset
, remaining_length
,
211 &io_blockno
, &io_bytecount
, NULL
, VNODE_READ
, NULL
);
217 * We have a contiguous run. Prepare & issue the ioctl for the device.
218 * the DKIOCUNMAP ioctl takes offset in bytes from the start of the device.
220 memset (&extent
, 0, sizeof(dk_extent_t
));
221 memset (&unmap
, 0, sizeof(dk_unmap_t
));
/* Both factors are widened to 64 bits before the multiply. */
222 extent
.offset
= (uint64_t) io_blockno
* (u_int64_t
) blocksize
;
223 extent
.length
= io_bytecount
;
224 unmap
.extents
= &extent
;
225 unmap
.extentsCount
= 1;
226 error
= VNOP_IOCTL(devvp
, DKIOCUNMAP
, (caddr_t
)&unmap
, 0, vfs_context_kernel());
/* Account for the extent just handled and move on to the next one. */
231 remaining_length
= remaining_length
- io_bytecount
;
232 trimmed
= trimmed
+ io_bytecount
;
233 current_offset
= current_offset
+ io_bytecount
;
/*
 * vnode_pageout: pushes pages of vp out through VNOP_PAGEOUT.
 * Visible parameters are vp, upl_offset and f_offset; the code below also
 * reads upl, size and flags.  result starts as PAGER_SUCCESS and is set to
 * PAGER_ERROR on the failure paths shown below.
 *
 * Visible phases:
 *   - bail out when the vnode has no UBC info;
 *   - UPL_VNODE_PAGER clear: pass the caller UPL straight to VNOP_PAGEOUT;
 *   - filesystem advertises VFC_VFSVNOP_PAGEOUTV2: call VNOP_PAGEOUT with a
 *     NULL UPL;
 *   - otherwise obtain the UPL (ubc_create_upl / ubc_upl_pageinfo), handle
 *     the zero-length-file case, trim trailing non-present pages and walk
 *     the remaining pages, issuing VNOP_PAGEOUT for dirty ones.
 *
 * NOTE(review): this listing is a lossy extraction - several parameters,
 * the local declarations, braces, else/goto/continue/break statements and
 * the final return are not visible, so the control flow between the
 * fragments below has to be confirmed against the complete file.
 */
242 vnode_pageout(struct vnode
*vp
,
244 upl_offset_t upl_offset
,
245 vm_object_offset_t f_offset
,
250 int result
= PAGER_SUCCESS
;
259 vfs_context_t ctx
= vfs_context_current(); /* pager context */
264 result
= PAGER_ERROR
;
/*
 * No UBC info on the vnode: fail, and abort the caller UPL range unless
 * UPL_NOCOMMIT is set.
 */
269 if (UBCINFOEXISTS(vp
) == 0) {
270 result
= PAGER_ERROR
;
273 if (upl
&& !(flags
& UPL_NOCOMMIT
))
274 ubc_upl_abort_range(upl
, upl_offset
, size
, UPL_ABORT_FREE_ON_EMPTY
);
277 if ( !(flags
& UPL_VNODE_PAGER
)) {
279 * This is a pageout from the default pager,
280 * just go ahead and call vnop_pageout since
281 * it has already sorted out the dirty ranges
283 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
284 (MACHDBG_CODE(DBG_MACH_VM
, 1)) | DBG_FUNC_START
,
/* Any non-zero VNOP_PAGEOUT return is reported as PAGER_ERROR. */
287 if ( (error_ret
= VNOP_PAGEOUT(vp
, upl
, upl_offset
, (off_t
)f_offset
,
288 (size_t)size
, flags
, ctx
)) )
289 result
= PAGER_ERROR
;
291 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
292 (MACHDBG_CODE(DBG_MACH_VM
, 1)) | DBG_FUNC_END
,
300 if (vp
->v_mount
->mnt_vtable
->vfc_vfsflags
& VFC_VFSVNOP_PAGEOUTV2
) {
302 * filesystem has requested the new form of VNOP_PAGEOUT for file
303 * backed objects... we will not grab the UPL befofe calling VNOP_PAGEOUT...
304 * it is the fileystem's responsibility to grab the range we're denoting
305 * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first
306 * take any locks it needs, before effectively locking the pages into a UPL...
308 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
309 (MACHDBG_CODE(DBG_MACH_VM
, 1)) | DBG_FUNC_START
,
310 size
, (int)f_offset
, 0, 0, 0);
312 if ( (error_ret
= VNOP_PAGEOUT(vp
, NULL
, upl_offset
, (off_t
)f_offset
,
313 size
, flags
, ctx
)) ) {
314 result
= PAGER_ERROR
;
316 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
317 (MACHDBG_CODE(DBG_MACH_VM
, 1)) | DBG_FUNC_END
,
/* Choose the UPL request flags: msync or pageout, dirty pages only. */
322 if (flags
& UPL_MSYNC
)
323 request_flags
= UPL_UBC_MSYNC
| UPL_RET_ONLY_DIRTY
;
325 request_flags
= UPL_UBC_PAGEOUT
| UPL_RET_ONLY_DIRTY
;
/* Build the UPL here for size bytes starting at f_offset. */
327 if (ubc_create_upl(vp
, f_offset
, size
, &upl
, &pl
, request_flags
) != KERN_SUCCESS
) {
328 result
= PAGER_ERROR
;
334 pl
= ubc_upl_pageinfo(upl
);
337 * we come here for pageouts to 'real' files and
338 * for msyncs... the upl may not contain any
339 * dirty pages.. it's our responsibility to sort
340 * through it and find the 'runs' of dirty pages
341 * to call VNOP_PAGEOUT on...
343 if (ubc_getsize(vp
) == 0) {
345 * if the file has been effectively deleted, then
346 * we need to go through the UPL and invalidate any
347 * buffer headers we might have that reference any
/* One page per iteration; offset tracks the UPL, f_offset tracks the file. */
350 for (offset
= upl_offset
; isize
; isize
-= PAGE_SIZE
, offset
+= PAGE_SIZE
) {
352 if (vp
->v_tag
== VT_NFS
)
353 /* check with nfs if page is OK to drop */
354 error
= nfs_buf_page_inval(vp
, (off_t
)f_offset
);
358 blkno
= ubc_offtoblk(vp
, (off_t
)f_offset
);
359 error
= buf_invalblkno(vp
, blkno
, 0);
362 if ( !(flags
& UPL_NOCOMMIT
))
363 ubc_upl_abort_range(upl
, offset
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
366 result
= PAGER_ERROR
;
368 } else if ( !(flags
& UPL_NOCOMMIT
)) {
369 ubc_upl_commit_range(upl
, offset
, PAGE_SIZE
, UPL_COMMIT_FREE_ON_EMPTY
);
371 f_offset
+= PAGE_SIZE
;
376 * Ignore any non-present pages at the end of the
377 * UPL so that we aren't looking at a upl that
378 * may already have been freed by the preceeding
379 * aborts/completions.
381 base_index
= upl_offset
/ PAGE_SIZE
;
/* Scan backwards from the last page index towards base_index. */
383 for (pg_index
= (upl_offset
+ isize
) / PAGE_SIZE
; pg_index
> base_index
;) {
384 if (upl_page_present(pl
, --pg_index
))
386 if (pg_index
== base_index
) {
388 * no pages were returned, so release
389 * our hold on the upl and leave
391 if ( !(flags
& UPL_NOCOMMIT
))
392 ubc_upl_abort_range(upl
, upl_offset
, isize
, UPL_ABORT_FREE_ON_EMPTY
);
/* Shrink isize so that it ends at the last present page found above. */
397 isize
= ((pg_index
+ 1) - base_index
) * PAGE_SIZE
;
400 pg_index
= base_index
;
406 if ( !upl_page_present(pl
, pg_index
)) {
408 * we asked for RET_ONLY_DIRTY, so it's possible
409 * to get back empty slots in the UPL
410 * just skip over them
412 f_offset
+= PAGE_SIZE
;
419 if ( !upl_dirty_page(pl
, pg_index
)) {
421 * if the page is not dirty and reached here it is
422 * marked precious or it is due to invalidation in
423 * memory_object_lock request as part of truncation
424 * We also get here from vm_object_terminate()
425 * So all you need to do in these
426 * cases is to invalidate incore buffer if it is there
427 * Note we must not sleep here if the buffer is busy - that is
428 * a lock inversion which causes deadlock.
431 if (vp
->v_tag
== VT_NFS
)
432 /* check with nfs if page is OK to drop */
433 error
= nfs_buf_page_inval(vp
, (off_t
)f_offset
);
437 blkno
= ubc_offtoblk(vp
, (off_t
)f_offset
);
438 error
= buf_invalblkno(vp
, blkno
, 0);
441 if ( !(flags
& UPL_NOCOMMIT
))
442 ubc_upl_abort_range(upl
, offset
, PAGE_SIZE
, UPL_ABORT_FREE_ON_EMPTY
);
445 result
= PAGER_ERROR
;
447 } else if ( !(flags
& UPL_NOCOMMIT
)) {
448 ubc_upl_commit_range(upl
, offset
, PAGE_SIZE
, UPL_COMMIT_FREE_ON_EMPTY
);
450 f_offset
+= PAGE_SIZE
;
/*
 * Dirty page: measure how many following pages are dirty as well
 * (num_of_pages), then push that many pages (xsize bytes) with a single
 * VNOP_PAGEOUT call.
 * NOTE(review): the loop that advances num_of_pages is not visible here.
 */
458 xsize
= isize
- PAGE_SIZE
;
461 if ( !upl_dirty_page(pl
, pg_index
+ num_of_pages
))
466 xsize
= num_of_pages
* PAGE_SIZE
;
468 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
469 (MACHDBG_CODE(DBG_MACH_VM
, 1)) | DBG_FUNC_START
,
470 xsize
, (int)f_offset
, 0, 0, 0);
472 if ( (error
= VNOP_PAGEOUT(vp
, upl
, offset
, (off_t
)f_offset
,
473 xsize
, flags
, ctx
)) ) {
476 result
= PAGER_ERROR
;
478 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE
,
479 (MACHDBG_CODE(DBG_MACH_VM
, 1)) | DBG_FUNC_END
,
/* Step past the run that was just written. */
485 pg_index
+= num_of_pages
;
/*
 * Page-in routine built around VNOP_PAGEIN.
 * NOTE(review): the line carrying the function name is missing from this
 * listing; presumably this is vnode_pagein - confirm against the complete
 * file.  Declarations, braces, else/goto/break statements and the final
 * return are missing as well, so the control flow between the fragments
 * below must be confirmed too.
 *
 * Visible parameters are upl_offset and f_offset; the code also reads vp,
 * upl, size and flags.  result starts as PAGER_SUCCESS.
 *
 * Visible phases:
 *   - fail when the vnode has no UBC info (aborting the caller UPL with
 *     UPL_ABORT_ERROR when upl and must_commit are both set);
 *   - when no UPL was passed in: clear UPL_NOCOMMIT from flags, reject
 *     sizes above MAX_UPL_SIZE pages, then either call VNOP_PAGEIN with a
 *     NULL UPL (VFC_VFSVNOP_PAGEINV2 filesystems) or create a UPL with
 *     UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT;
 *   - trim trailing non-present pages, then walk the UPL: skip missing
 *     pages, abort runs of already valid pages, and issue VNOP_PAGEIN on
 *     runs of present, not yet valid pages.
 */
499 upl_offset_t upl_offset
,
500 vm_object_offset_t f_offset
,
506 int result
= PAGER_SUCCESS
;
515 if (flags
& UPL_NOCOMMIT
)
518 if (UBCINFOEXISTS(vp
) == 0) {
519 result
= PAGER_ERROR
;
522 if (upl
&& must_commit
)
523 ubc_upl_abort_range(upl
, upl_offset
, size
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
);
/* Caller supplied no UPL: this routine has to obtain one itself. */
527 if (upl
== (upl_t
)NULL
) {
528 flags
&= ~UPL_NOCOMMIT
;
/* Requests larger than MAX_UPL_SIZE pages are rejected. */
530 if (size
> (MAX_UPL_SIZE
* PAGE_SIZE
)) {
531 result
= PAGER_ERROR
;
535 if (vp
->v_mount
->mnt_vtable
->vfc_vfsflags
& VFC_VFSVNOP_PAGEINV2
) {
537 * filesystem has requested the new form of VNOP_PAGEIN for file
538 * backed objects... we will not grab the UPL befofe calling VNOP_PAGEIN...
539 * it is the fileystem's responsibility to grab the range we're denoting
540 * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first
541 * take any locks it needs, before effectively locking the pages into a UPL...
542 * so we pass a NULL into the filesystem instead of a UPL pointer... the 'upl_offset'
543 * is used to identify the "must have" page in the extent... the filesystem is free
544 * to clip the extent to better fit the underlying FS blocksize if it desires as
545 * long as it continues to include the "must have" page... 'f_offset' + 'upl_offset'
546 * identifies that page
548 if ( (error
= VNOP_PAGEIN(vp
, NULL
, upl_offset
, (off_t
)f_offset
,
549 size
, flags
, vfs_context_current())) ) {
550 result
= PAGER_ERROR
;
/*
 * The return value of ubc_create_upl is not examined in the visible code;
 * failure is detected through upl still being NULL below.
 */
555 ubc_create_upl(vp
, f_offset
, size
, &upl
, &pl
, UPL_UBC_PAGEIN
| UPL_RET_ONLY_ABSENT
);
557 if (upl
== (upl_t
)NULL
) {
558 result
= PAGER_ABSENT
;
559 error
= PAGER_ABSENT
;
/* Mark the single page at upl_offset as needed in the new UPL. */
562 ubc_upl_range_needed(upl
, upl_offset
/ PAGE_SIZE
, 1);
568 * if we get here, we've created the upl and
569 * are responsible for commiting/aborting it
570 * regardless of what the caller has passed in
574 pl
= ubc_upl_pageinfo(upl
);
575 first_pg
= upl_offset
/ PAGE_SIZE
;
577 pages_in_upl
= size
/ PAGE_SIZE
;
578 DTRACE_VM2(pgpgin
, int, pages_in_upl
, (uint64_t *), NULL
);
581 * before we start marching forward, we must make sure we end on
582 * a present page, otherwise we will be working with a freed
/* Scan backwards from the last page index down to first_pg. */
585 for (last_pg
= pages_in_upl
- 1; last_pg
>= first_pg
; last_pg
--) {
586 if (upl_page_present(pl
, last_pg
))
588 if (last_pg
== first_pg
) {
590 * empty UPL, no pages are present
593 ubc_upl_abort_range(upl
, upl_offset
, size
, UPL_ABORT_FREE_ON_EMPTY
);
/* From here on the UPL is treated as ending at the last present page. */
597 pages_in_upl
= last_pg
+ 1;
600 while (last_pg
< pages_in_upl
) {
602 * skip over missing pages...
604 for ( ; last_pg
< pages_in_upl
; last_pg
++) {
605 if (upl_page_present(pl
, last_pg
))
609 * skip over 'valid' pages... we don't want to issue I/O for these
611 for (start_pg
= last_pg
; last_pg
< pages_in_upl
; last_pg
++) {
612 if (!upl_valid_page(pl
, last_pg
))
615 if (last_pg
> start_pg
) {
617 * we've found a range of valid pages
618 * if we've got COMMIT responsibility
619 * commit this range of pages back to the
622 xsize
= (last_pg
- start_pg
) * PAGE_SIZE
;
625 ubc_upl_abort_range(upl
, start_pg
* PAGE_SIZE
, xsize
, UPL_ABORT_FREE_ON_EMPTY
);
627 if (last_pg
== pages_in_upl
)
629 * we're done... all pages that were present
630 * have either had I/O issued on them or
631 * were aborted unchanged...
635 if (!upl_page_present(pl
, last_pg
)) {
637 * we found a range of valid pages
638 * terminated by a missing page...
639 * bump index to the next page and continue on
645 * scan from the found invalid page looking for a valid
646 * or non-present page before the end of the upl is reached, if we
647 * find one, then it will be the last page of the request to
650 for (start_pg
= last_pg
; last_pg
< pages_in_upl
; last_pg
++) {
651 if (upl_valid_page(pl
, last_pg
) || !upl_page_present(pl
, last_pg
))
654 if (last_pg
> start_pg
) {
/* Byte length of the run and its byte offset within the UPL. */
656 xsize
= (last_pg
- start_pg
) * PAGE_SIZE
;
657 xoff
= start_pg
* PAGE_SIZE
;
659 if ( (error
= VNOP_PAGEIN(vp
, upl
, (upl_offset_t
) xoff
,
660 (off_t
)f_offset
+ xoff
,
661 xsize
, flags
, vfs_context_current())) ) {
663 * Usually this UPL will be aborted/committed by the lower cluster layer.
665 * a) In the case of decmpfs, however, we may return an error (EAGAIN) to avoid
666 * a deadlock with another thread already inflating the file.
668 * b) In the case of content protection, EPERM is a valid error and we should respect it.
670 * In those cases, we must take care of our UPL at this layer itself.
/* EAGAIN aborts the run with UPL_ABORT_RESTART rather than UPL_ABORT_ERROR. */
673 if(error
== EAGAIN
) {
674 ubc_upl_abort_range(upl
, (upl_offset_t
) xoff
, xsize
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_RESTART
);
678 ubc_upl_abort_range(upl
, (upl_offset_t
) xoff
, xsize
, UPL_ABORT_FREE_ON_EMPTY
| UPL_ABORT_ERROR
);
682 result
= PAGER_ERROR
;
/*
 * vnode_pager_shutdown: walks all MAX_BACKING_STORE entries of
 * bs_port_table, reads the vp field of each entry and resets that field
 * to 0.
 * NOTE(review): this listing is a lossy extraction; the declarations, the
 * test guarding the reset and the call that drops the reference mentioned
 * in the comment below are not visible here.
 */
696 vnode_pager_shutdown(void)
701 for(i
= 0; i
< MAX_BACKING_STORE
; i
++) {
702 vp
= (vnode_t
)(bs_port_table
[i
]).vp
;
/* Clear the table slot; vp still holds the value that was read above. */
704 (bs_port_table
[i
]).vp
= 0;
706 /* get rid of macx_swapon() reference */
/*
 * upl_get_internal_page_list: function wrapper around the
 * UPL_GET_INTERNAL_PAGE_LIST macro; returns the macro result for upl.
 * NOTE(review): the return type line is not visible in this listing.
 */
714 upl_get_internal_page_list(upl_t upl
)
716 return(UPL_GET_INTERNAL_PAGE_LIST(upl
));