/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * "Swap" pager that pages to/from vnodes.  Also
 * handles demand paging from files.
 */
#include <mach/boolean.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kauth.h>
#include <sys/vnode_internal.h>
#include <sys/namei.h>
#include <sys/mount_internal.h> /* needs internal due to fhandle_t */
#include <sys/ubc_internal.h>
#include <sys/disk.h>           /* For DKIOC calls */

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/mach_vm.h>

#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <kern/zalloc.h>
#include <libkern/libkern.h>

#include <vm/vnode_pager.h>
#include <vm/vm_pageout.h>

#include <kern/assert.h>
#include <sys/kdebug.h>
#include <nfs/nfs_conf.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>

#include <vm/vm_protos.h>

#include <vfs/vfs_disk_conditioner.h>
void
vnode_pager_throttle(void)
{
	struct uthread *ut;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window) {
		throttle_lowpri_io(1);
	}
}
boolean_t
vnode_pager_isSSD(vnode_t vp)
{
	return disk_conditioner_mount_is_ssd(vp->v_mount);
}
void
vnode_pager_issue_reprioritize_io(struct vnode *devvp, uint64_t blkno, uint32_t len, int priority)
{
	u_int32_t       blocksize = 0;
	dk_extent_t     extent;
	dk_set_tier_t   set_tier;
	int             error = 0;

	error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blocksize, 0, vfs_context_kernel());
	if (error) {
		return;
	}

	memset(&extent, 0, sizeof(dk_extent_t));
	memset(&set_tier, 0, sizeof(dk_set_tier_t));

	extent.offset = blkno * (u_int64_t) blocksize;
	extent.length = len;

	set_tier.extents = &extent;
	set_tier.extentsCount = 1;
	set_tier.tier = (uint8_t)priority;

	error = VNOP_IOCTL(devvp, DKIOCSETTIER, (caddr_t)&set_tier, 0, vfs_context_kernel());
	return;
}
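/*
 * Illustrative note (editorial, not from the original source): with a
 * 4096-byte device block size, expediting a 64 KiB run that starts at
 * device block 1000 would be issued as
 *
 *	vnode_pager_issue_reprioritize_io(devvp, 1000, 65536, tier);
 *
 * which DKIOCSETTIER sees as a single dk_extent_t of
 * { .offset = 1000 * 4096, .length = 65536 } retagged to 'tier'.
 */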
void
vnode_pager_was_dirtied(
	struct vnode            *vp,
	vm_object_offset_t      s_offset,
	vm_object_offset_t      e_offset)
{
	cluster_update_state(vp, s_offset, e_offset, TRUE);
}
uint32_t
vnode_pager_isinuse(struct vnode *vp)
{
	if (vp->v_usecount > vp->v_kusecount) {
		return 1;
	}
	return 0;
}
uint32_t
vnode_pager_return_throttle_io_limit(struct vnode *vp, uint32_t *limit)
{
	return cluster_throttle_io_limit(vp, limit);
}
vm_object_offset_t
vnode_pager_get_filesize(struct vnode *vp)
{
	return (vm_object_offset_t) ubc_getsize(vp);
}
extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
kern_return_t
vnode_pager_get_name(
	struct vnode    *vp,
	char            *pathname,
	vm_size_t       pathname_len,
	char            *filename,
	vm_size_t       filename_len,
	boolean_t       *truncated_path_p)
{
	*truncated_path_p = FALSE;
	if (pathname != NULL) {
		/* get the path name */
		safe_getpath(vp, NULL,
		    pathname, (int) pathname_len,
		    truncated_path_p);
	}
	if ((pathname == NULL || *truncated_path_p) &&
	    filename != NULL) {
		/* get the file name */
		const char *name;

		name = vnode_getname_printable(vp);
		strlcpy(filename, name, (size_t) filename_len);
		vnode_putname_printable(name);
	}
	return KERN_SUCCESS;
}
kern_return_t
vnode_pager_get_mtime(
	struct vnode    *vp,
	struct timespec *current_mtime,
	struct timespec *cs_mtime)
{
	vnode_mtime(vp, current_mtime, vfs_context_current());
	if (cs_mtime != NULL) {
		ubc_get_cs_mtime(vp, cs_mtime);
	}
	return KERN_SUCCESS;
}
kern_return_t
vnode_pager_get_cs_blobs(
	struct vnode    *vp,
	void            **blobs)
{
	*blobs = ubc_get_cs_blobs(vp);
	return KERN_SUCCESS;
}
/*
 * Used to call the DKIOCUNMAP ioctl on the underlying disk device for the specified vnode.
 * Trims the region at offset bytes into the file, for length bytes.
 *
 * Care must be taken to ensure that the vnode is sufficiently reference counted at the time this
 * function is called; no iocounts or usecounts are taken on the vnode.
 * This function is non-idempotent in error cases; we cannot un-discard the blocks if only some of them
 * are successfully discarded.
 */
u_int32_t
vnode_trim(
	struct vnode    *vp,
	off_t           offset,
	size_t          length)
{
	daddr64_t       io_blockno;      /* Block number corresponding to the start of the extent */
	size_t          io_bytecount;    /* Number of bytes in current extent for the specified range */
	size_t          trimmed = 0;
	off_t           current_offset = offset;
	size_t          remaining_length = length;
	int             error = 0;
	u_int32_t       blocksize = 0;
	struct vnode    *devvp;
	dk_extent_t     extent;
	dk_unmap_t      unmap;

	/* Get the underlying device vnode */
	devvp = vp->v_mount->mnt_devvp;

	/* Figure out the underlying device block size */
	error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blocksize, 0, vfs_context_kernel());
	if (error) {
		goto trim_exit;
	}
	/*
	 * We may not get the entire range from offset -> offset+length in a single
	 * extent from the blockmap call.  Keep looping until we are sure we've hit
	 * the whole range or we encounter an error.
	 */
	while (trimmed < length) {
		/*
		 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
		 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
		 * broken into multiple extents, it must be called multiple times, increasing the offset
		 * in each call to ensure that the entire range is covered.
		 */
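		/*
		 * Worked example (editorial, not from the original source):
		 * trimming 64 KiB at file offset 0 where the file occupies two
		 * discontiguous 32 KiB extents.  The first VNOP_BLOCKMAP call
		 * returns the first extent's start in io_blockno with
		 * io_bytecount = 32768, so one DKIOCUNMAP covers it and
		 * current_offset advances to 32768.  The second iteration maps
		 * and unmaps the remaining extent, and the loop exits once
		 * trimmed == length.
		 */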
		error = VNOP_BLOCKMAP(vp, current_offset, remaining_length,
		    &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);

		if (error) {
			goto trim_exit;
		}
		/*
		 * We have a contiguous run.  Prepare & issue the ioctl for the device.
		 * The DKIOCUNMAP ioctl takes offsets in bytes from the start of the device.
		 */
		memset(&extent, 0, sizeof(dk_extent_t));
		memset(&unmap, 0, sizeof(dk_unmap_t));
		extent.offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
		extent.length = io_bytecount;
		unmap.extents = &extent;
		unmap.extentsCount = 1;
		error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());

		if (error) {
			goto trim_exit;
		}
		remaining_length = remaining_length - io_bytecount;
		trimmed = trimmed + io_bytecount;
		current_offset = current_offset + io_bytecount;
	}
trim_exit:

	return error;
}
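/*
 * Illustrative sketch (editorial, not from the original source): since
 * vnode_trim() takes no iocount or usecount of its own, a hypothetical
 * caller discarding a deleted file's blocks must already hold a reference:
 *
 *	// caller already holds an iocount on vp (e.g. via vnode_getwithref)
 *	u_int32_t err = vnode_trim(vp, 0, (size_t)file_size);
 *	// on error, leading extents may already be unmapped (non-idempotent)
 */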
pager_return_t
vnode_pageout(struct vnode *vp,
    upl_t                   upl,
    upl_offset_t            upl_offset,
    vm_object_offset_t      f_offset,
    upl_size_t              size,
    int                     flags,
    int                     *errorp)
{
	int             result = PAGER_SUCCESS;
	int             error = 0;
	int             error_ret = 0;
	daddr64_t       blkno;
	int             isize;
	int             pg_index;
	int             base_index;
	upl_offset_t    offset;
	upl_page_info_t *pl;
	vfs_context_t   ctx = vfs_context_current();    /* pager context */

	isize = (int)size;

	/*
	 * This call is non-blocking and does not ever fail but it can
	 * only be made when there is other explicit synchronization
	 * with reclaiming of the vnode which, in this path, is provided
	 * by the paging in progress counter.
	 *
	 * In addition, this may also be entered via explicit ubc_msync
	 * calls or vm_swapfile_io where the existing iocount provides
	 * the necessary synchronization. Ideally we would not take an
	 * additional iocount here in the cases where an explicit iocount
	 * has already been taken but this call doesn't cause a deadlock
	 * as other forms of vnode_get* might if this thread has already
	 * taken an iocount.
	 */
	error = vnode_getalways_from_pager(vp);
	if (error != 0) {
		/* This can't happen */
		panic("vnode_getalways returned %d for vp %p", error, vp);
	}

	if (isize <= 0) {
		result    = PAGER_ERROR;
		error_ret = EINVAL;
		goto out;
	}

	if (UBCINFOEXISTS(vp) == 0) {
		result    = PAGER_ERROR;
		error_ret = EINVAL;

		if (upl && !(flags & UPL_NOCOMMIT)) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		}
		goto out;
	}
	if (!(flags & UPL_VNODE_PAGER)) {
		/*
		 * This is a pageout from the default pager,
		 * just go ahead and call vnop_pageout since
		 * it has already sorted out the dirty ranges
		 */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START,
		    size, 1, 0, 0, 0);

		if ((error_ret = VNOP_PAGEOUT(vp, upl, upl_offset, (off_t)f_offset,
		    (size_t)size, flags, ctx))) {
			result = PAGER_ERROR;
		}
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END,
		    size, 1, 0, 0, 0);

		goto out;
	}

	if (upl == NULL) {
		int             request_flags;
		if (vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSVNOP_PAGEOUTV2) {
			/*
			 * filesystem has requested the new form of VNOP_PAGEOUT for file
			 * backed objects... we will not grab the UPL before calling VNOP_PAGEOUT...
			 * it is the filesystem's responsibility to grab the range we're denoting
			 * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first
			 * take any locks it needs, before effectively locking the pages into a UPL...
			 */
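			/*
			 * Illustrative sketch (editorial, not from the original
			 * source): on the filesystem side, a hypothetical
			 * vnop_pageout for a PAGEOUTV2 filesystem sees a NULL
			 * UPL and builds one itself, roughly:
			 *
			 *	lck_rw_lock_shared(fs_lock);    // hypothetical FS lock
			 *	ubc_create_upl(vp, f_offset, size, &upl, &pl,
			 *	    UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY);
			 *	// ...write the dirty runs, then commit/abort upl...
			 */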
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START,
			    size, (int)f_offset, 0, 0, 0);

			if ((error_ret = VNOP_PAGEOUT(vp, NULL, upl_offset, (off_t)f_offset,
			    size, flags, ctx))) {
				result = PAGER_ERROR;
			}
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END,
			    size, 0, 0, 0, 0);

			goto out;
		}
		if (flags & UPL_MSYNC) {
			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
		} else {
			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
		}

		if (ubc_create_upl_kernel(vp, f_offset, size, &upl, &pl, request_flags, VM_KERN_MEMORY_FILE) != KERN_SUCCESS) {
			result    = PAGER_ERROR;
			error_ret = EINVAL;
			goto out;
		}
		upl_offset = 0;
	} else {
		pl = ubc_upl_pageinfo(upl);
	}
	/*
	 * Ignore any non-present pages at the end of the
	 * UPL so that we aren't looking at a upl that
	 * may already have been freed by the preceding
	 * aborts/completions.
	 */
	base_index = upl_offset / PAGE_SIZE;

	for (pg_index = (upl_offset + isize) / PAGE_SIZE; pg_index > base_index;) {
		if (upl_page_present(pl, --pg_index)) {
			break;
		}
		if (pg_index == base_index) {
			/*
			 * no pages were returned, so release
			 * our hold on the upl and leave
			 */
			if (!(flags & UPL_NOCOMMIT)) {
				ubc_upl_abort_range(upl, upl_offset, isize, UPL_ABORT_FREE_ON_EMPTY);
			}

			goto out;
		}
	}
	isize = ((pg_index + 1) - base_index) * PAGE_SIZE;
	/*
	 * we come here for pageouts to 'real' files and
	 * for msyncs... the upl may not contain any
	 * dirty pages... it's our responsibility to sort
	 * through it and find the 'runs' of dirty pages
	 * to call VNOP_PAGEOUT on...
	 */
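	/*
	 * Worked example (editorial, not from the original source): with
	 * RET_ONLY_DIRTY the page list can look like [D][D][hole][clean][D].
	 * The scan below issues one VNOP_PAGEOUT for the two-page dirty run,
	 * skips the hole, invalidates and commits (or aborts) the clean page,
	 * then issues a second VNOP_PAGEOUT for the final dirty page.
	 */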
	if (ubc_getsize(vp) == 0) {
		/*
		 * if the file has been effectively deleted, then
		 * we need to go through the UPL and invalidate any
		 * buffer headers we might have that reference any
		 * of its pages
		 */
		for (offset = upl_offset; isize; isize -= PAGE_SIZE, offset += PAGE_SIZE) {
#if CONFIG_NFS_CLIENT
			if (vp->v_tag == VT_NFS) {
				/* check with nfs if page is OK to drop */
				error = nfs_buf_page_inval(vp, (off_t)f_offset);
			} else
#endif /* CONFIG_NFS_CLIENT */
			{
				blkno = ubc_offtoblk(vp, (off_t)f_offset);
				error = buf_invalblkno(vp, blkno, 0);
			}
			if (error) {
				if (!(flags & UPL_NOCOMMIT)) {
					ubc_upl_abort_range(upl, offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
				if (error_ret == 0) {
					error_ret = error;
				}
				result = PAGER_ERROR;
			} else if (!(flags & UPL_NOCOMMIT)) {
				ubc_upl_commit_range(upl, offset, PAGE_SIZE, UPL_COMMIT_FREE_ON_EMPTY);
			}
			f_offset += PAGE_SIZE;
		}
		goto out;
	}

	offset = upl_offset;
	pg_index = base_index;
	while (isize) {
		int  xsize;
		int  num_of_pages;

		if (!upl_page_present(pl, pg_index)) {
			/*
			 * we asked for RET_ONLY_DIRTY, so it's possible
			 * to get back empty slots in the UPL
			 * just skip over them
			 */
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		if (!upl_dirty_page(pl, pg_index)) {
			/*
			 * if the page is not dirty and reached here it is
			 * marked precious or it is due to invalidation in
			 * memory_object_lock request as part of truncation.
			 * We also get here from vm_object_terminate().
			 * So all we need to do in these cases is to
			 * invalidate the incore buffer if it is there.
			 * Note we must not sleep here if the buffer is busy - that is
			 * a lock inversion which causes deadlock.
			 */
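			/*
			 * Editorial note (not from the original source): this is
			 * why buf_invalblkno() is called below with a flags value
			 * of 0 rather than BUF_WAIT; a busy buffer surfaces as an
			 * error instead of blocking this thread.
			 */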
#if CONFIG_NFS_CLIENT
			if (vp->v_tag == VT_NFS) {
				/* check with nfs if page is OK to drop */
				error = nfs_buf_page_inval(vp, (off_t)f_offset);
			} else
#endif /* CONFIG_NFS_CLIENT */
			{
				blkno = ubc_offtoblk(vp, (off_t)f_offset);
				error = buf_invalblkno(vp, blkno, 0);
			}
			if (error) {
				if (!(flags & UPL_NOCOMMIT)) {
					ubc_upl_abort_range(upl, offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
				if (error_ret == 0) {
					error_ret = error;
				}
				result = PAGER_ERROR;
			} else if (!(flags & UPL_NOCOMMIT)) {
				ubc_upl_commit_range(upl, offset, PAGE_SIZE, UPL_COMMIT_FREE_ON_EMPTY);
			}
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		num_of_pages = 1;
		xsize = isize - PAGE_SIZE;

		while (xsize) {
			if (!upl_dirty_page(pl, pg_index + num_of_pages)) {
				break;
			}
			num_of_pages++;
			xsize -= PAGE_SIZE;
		}
		xsize = num_of_pages * PAGE_SIZE;

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START,
		    xsize, (int)f_offset, 0, 0, 0);

		if ((error = VNOP_PAGEOUT(vp, upl, offset, (off_t)f_offset,
		    xsize, flags, ctx))) {
			if (error_ret == 0) {
				error_ret = error;
			}
			result = PAGER_ERROR;
		}
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END,
		    xsize, 0, 0, 0, 0);

		f_offset += xsize;
		offset   += xsize;
		isize    -= xsize;
		pg_index += num_of_pages;
	}
out:
	vnode_put_from_pager(vp);

	if (errorp) {
		*errorp = error_ret;
	}

	return result;
}
pager_return_t
vnode_pagein(
	struct vnode            *vp,
	upl_t                   upl,
	upl_offset_t            upl_offset,
	vm_object_offset_t      f_offset,
	upl_size_t              size,
	int                     flags,
	int                     *errorp)
{
	upl_page_info_t *pl;
	int             result = PAGER_SUCCESS;
	int             error = 0;
	int             pages_in_upl;
	int             start_pg;
	int             last_pg;
	int             first_pg;
	int             xsize;
	int             must_commit = 1;
	int             ignore_valid_page_check = 0;

	if (flags & UPL_NOCOMMIT) {
		must_commit = 0;
	}

	if (flags & UPL_IGNORE_VALID_PAGE_CHECK) {
		ignore_valid_page_check = 1;
	}
	/*
	 * This call is non-blocking and does not ever fail but it can
	 * only be made when there is other explicit synchronization
	 * with reclaiming of the vnode which, in this path, is provided
	 * by the paging in progress counter.
	 *
	 * In addition, this may also be entered via vm_swapfile_io
	 * where the existing iocount provides the necessary synchronization.
	 * Ideally we would not take an additional iocount here in the cases
	 * where an explicit iocount has already been taken but this call
	 * doesn't cause a deadlock as other forms of vnode_get* might if
	 * this thread has already taken an iocount.
	 */
	error = vnode_getalways_from_pager(vp);
	if (error != 0) {
		/* This can't happen */
		panic("vnode_getalways returned %d for vp %p", error, vp);
	}
	if (UBCINFOEXISTS(vp) == 0) {
		result = PAGER_ERROR;
		error  = PAGER_ERROR;

		if (upl && must_commit) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		}
		goto out;
	}
	if (upl == (upl_t)NULL) {
		flags &= ~UPL_NOCOMMIT;

		if (size > MAX_UPL_SIZE_BYTES) {
			result = PAGER_ERROR;
			error  = PAGER_ERROR;
			goto out;
		}
		if (vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSVNOP_PAGEINV2) {
			/*
			 * filesystem has requested the new form of VNOP_PAGEIN for file
			 * backed objects... we will not grab the UPL before calling VNOP_PAGEIN...
			 * it is the filesystem's responsibility to grab the range we're denoting
			 * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first
			 * take any locks it needs, before effectively locking the pages into a UPL...
			 * so we pass a NULL into the filesystem instead of a UPL pointer... the 'upl_offset'
			 * is used to identify the "must have" page in the extent... the filesystem is free
			 * to clip the extent to better fit the underlying FS blocksize if it desires as
			 * long as it continues to include the "must have" page... 'f_offset' + 'upl_offset'
			 * identifies that page
			 */
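			/*
			 * Worked example (editorial, not from the original
			 * source): with f_offset = 0x10000, upl_offset = 0x2000
			 * and size = 0x8000, the "must have" page lives at file
			 * offset 0x12000.  A filesystem using 16 KiB blocks may
			 * clip the extent to 0x10000..0x14000, so long as the
			 * page at 0x12000 stays inside what it pages in.
			 */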
			if ((error = VNOP_PAGEIN(vp, NULL, upl_offset, (off_t)f_offset,
			    size, flags, vfs_context_current()))) {
				set_thread_pagein_error(current_thread(), error);
				result = PAGER_ERROR;
				error  = PAGER_ERROR;
			}
			goto out;
		}
		ubc_create_upl_kernel(vp, f_offset, size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT, VM_KERN_MEMORY_FILE);

		if (upl == (upl_t)NULL) {
			result = PAGER_ABSENT;
			error = PAGER_ABSENT;
			goto out;
		}
		ubc_upl_range_needed(upl, upl_offset / PAGE_SIZE, 1);

		upl_offset = 0;
		first_pg = 0;
		/*
		 * if we get here, we've created the upl and
		 * are responsible for committing/aborting it
		 * regardless of what the caller has passed in
		 */
		must_commit = 1;
	} else {
		pl = ubc_upl_pageinfo(upl);
		first_pg = upl_offset / PAGE_SIZE;
	}
	pages_in_upl = size / PAGE_SIZE;
	DTRACE_VM2(pgpgin, int, pages_in_upl, (uint64_t *), NULL);
	/*
	 * before we start marching forward, we must make sure we end on
	 * a present page, otherwise we will be working with a freed
	 * upl
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= first_pg; last_pg--) {
		if (upl_page_present(pl, last_pg)) {
			break;
		}
		if (last_pg == first_pg) {
			/*
			 * empty UPL, no pages are present
			 */
			if (must_commit) {
				ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
			}
			goto out;
		}
	}
	pages_in_upl = last_pg + 1;
	last_pg = first_pg;
	while (last_pg < pages_in_upl) {
		/*
		 * skip over missing pages...
		 */
		for (; last_pg < pages_in_upl; last_pg++) {
			if (upl_page_present(pl, last_pg)) {
				break;
			}
		}
		if (ignore_valid_page_check == 1) {
			start_pg = last_pg;
		} else {
			/*
			 * skip over 'valid' pages... we don't want to issue I/O for these
			 */
			for (start_pg = last_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_valid_page(pl, last_pg)) {
					break;
				}
			}
		}
		if (last_pg > start_pg) {
			/*
			 * we've found a range of valid pages
			 * if we've got COMMIT responsibility
			 * commit this range of pages back to the
			 * cache unchanged
			 */
			xsize = (last_pg - start_pg) * PAGE_SIZE;

			if (must_commit) {
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, xsize, UPL_ABORT_FREE_ON_EMPTY);
			}
		}
		if (last_pg == pages_in_upl) {
			/*
			 * we're done... all pages that were present
			 * have either had I/O issued on them or
			 * were aborted unchanged...
			 */
			break;
		}
		if (!upl_page_present(pl, last_pg)) {
			/*
			 * we found a range of valid pages
			 * terminated by a missing page...
			 * bump index to the next page and continue on
			 */
			last_pg++;
			continue;
		}
		/*
		 * scan from the found invalid page looking for a valid
		 * or non-present page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (start_pg = last_pg; last_pg < pages_in_upl; last_pg++) {
			if ((!ignore_valid_page_check && upl_valid_page(pl, last_pg)) || !upl_page_present(pl, last_pg)) {
				break;
			}
		}
		if (last_pg > start_pg) {
			int xoff;

			xsize = (last_pg - start_pg) * PAGE_SIZE;
			xoff  = start_pg * PAGE_SIZE;
			if ((error = VNOP_PAGEIN(vp, upl, (upl_offset_t) xoff,
			    (off_t)f_offset + xoff,
			    xsize, flags, vfs_context_current()))) {
				/*
				 * Usually this UPL will be aborted/committed by the lower cluster layer.
				 *
				 * a) In the case of decmpfs, however, we may return an error (EAGAIN) to avoid
				 *    a deadlock with another thread already inflating the file.
				 *
				 * b) In the case of content protection, EPERM is a valid error and we should respect it.
				 *
				 * In those cases, we must take care of our UPL at this layer itself.
				 */
				if (must_commit) {
					if (error == EAGAIN) {
						ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
					}
					if (error == EPERM) {
						ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
					}
				}
				set_thread_pagein_error(current_thread(), error);
				result = PAGER_ERROR;
				error  = PAGER_ERROR;
			}
		}
	}
out:
	vnode_put_from_pager(vp);

	if (errorp) {
		*errorp = result;
	}

	return result;
}
void *
upl_get_internal_page_list(upl_t upl)
{
	return UPL_GET_INTERNAL_PAGE_LIST(upl);
}