X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/15129b1c8dbb3650c63b70adb1cad9af601c6c17..ea3f04195ba4a5034c9c8e9b726d4f7ce96f1832:/bsd/vm/vm_compressor_backing_file.c

diff --git a/bsd/vm/vm_compressor_backing_file.c b/bsd/vm/vm_compressor_backing_file.c
index fe74a47ea..b908529dc 100644
--- a/bsd/vm/vm_compressor_backing_file.c
+++ b/bsd/vm/vm_compressor_backing_file.c
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
@@ -11,10 +11,10 @@
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
@@ -39,25 +39,44 @@
 #include <sys/disk.h>
 #include <vm/vm_protos.h>
 #include <vm/vm_pageout.h>
+#include <sys/content_protection.h>
 
 void vm_swapfile_open(const char *path, vnode_t *vp);
 void vm_swapfile_close(uint64_t path, vnode_t vp);
-int vm_swapfile_preallocate(vnode_t vp, uint64_t *size);
+int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
 uint64_t vm_swapfile_get_blksize(vnode_t vp);
 uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
-int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags);
+int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
+int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);
+
+#if CONFIG_FREEZE
+int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
+#endif /* CONFIG_FREEZE */
+
 
 void
 vm_swapfile_open(const char *path, vnode_t *vp)
 {
 	int error = 0;
-	vfs_context_t	ctx = vfs_context_current();
+	vfs_context_t   ctx = vfs_context_kernel();
 
-	if ((error = vnode_open(path, (O_CREAT | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
+	if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
 		printf("Failed to open swap file %d\n", error);
 		*vp = NULL;
 		return;
-	}	
+	}
+
+	/*
+	 * If MNTK_NOSWAP is set on the mount, opening the swap file should fail.
+	 * To avoid a race on the mount we only make this check after creating the
+	 * vnode.
+	 */
+	if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) {
+		vnode_put(*vp);
+		vm_swapfile_close((uint64_t)path, *vp);
+		*vp = NULL;
+		return;
+	}
 
 	vnode_put(*vp);
 }
@@ -65,128 +84,147 @@ vm_swapfile_open(const char *path, vnode_t *vp)
 uint64_t
 vm_swapfile_get_blksize(vnode_t vp)
 {
-	return ((uint64_t)vfs_devblocksize(vnode_mount(vp)));
+	return (uint64_t)vfs_devblocksize(vnode_mount(vp));
 }
 
 uint64_t
 vm_swapfile_get_transfer_size(vnode_t vp)
 {
-	return((uint64_t)vp->v_mount->mnt_vfsstat.f_iosize);
+	return (uint64_t)vp->v_mount->mnt_vfsstat.f_iosize;
 }
 
-int unlink1(vfs_context_t, struct nameidata *, int);
+int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
 
 void
 vm_swapfile_close(uint64_t path_addr, vnode_t vp)
 {
-	struct nameidata nd;
-	vfs_context_t context = vfs_context_current();
-	int error = 0;
+	vfs_context_t context = vfs_context_kernel();
+	int error;
 
 	vnode_getwithref(vp);
 	vnode_close(vp, 0, context);
-	
-	NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_SYSSPACE,
-	       path_addr, context);
 
-	error = unlink1(context, &nd, 0);
+	error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr),
+	    UIO_SYSSPACE, 0);
+
+#if DEVELOPMENT || DEBUG
+	if (error) {
+		printf("%s : unlink of %s failed with error %d", __FUNCTION__,
+		    (char *)path_addr, error);
+	}
+#endif
 }
 
 int
-vm_swapfile_preallocate(vnode_t vp, uint64_t *size)
+vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
 {
-	int		error = 0;
-	uint64_t	file_size = 0;
-	vfs_context_t	ctx = NULL;
+	int             error = 0;
+	uint64_t        file_size = 0;
+	vfs_context_t   ctx = NULL;
+#if CONFIG_FREEZE
+	struct vnode_attr va;
+#endif /* CONFIG_FREEZE */
 
+	ctx = vfs_context_kernel();
 
-	ctx = vfs_context_current();
-
-#if CONFIG_PROTECT
-	{
-#if 0	// <rdar://11771612>
+	error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);
 
-		if ((error = cp_vnode_setclass(vp, PROTECTION_CLASS_F))) {
-			if(config_protect_bug) {
-				printf("swap protection class set failed with %d\n", error);
-			} else {
-				panic("swap protection class set failed with %d\n", error);
-			}
-		}
-#endif
-		/* initialize content protection keys manually */
-		if ((error = cp_handle_vnop(vp, CP_WRITE_ACCESS, 0)) != 0) {
-			printf("Content Protection key failure on swap: %d\n", error);
-			vnode_put(vp);
-			vp = NULL;
-			goto done;
- 		}
+	if (error) {
+		printf("vnode_setsize for swap files failed: %d\n", error);
+		goto done;
 	}
-#endif
-
-	/*
-  	 * This check exists because dynamic_pager creates the 1st swapfile,
-	 * swapfile0, for us from user-space in a supported manner (with IO_NOZEROFILL etc).
-	 * 
-	 * If dynamic_pager, in the future, discontinues creating that file,
-	 * then we need to change this check to a panic / assert or return an error.
-	 * That's because we can't be sure if the file has been created correctly.
-	 */
 
-	if ((error = vnode_size(vp, (off_t*) &file_size, ctx)) != 0) {
+	error = vnode_size(vp, (off_t*) &file_size, ctx);
 
-		printf("vnode_size (existing files) for swap files failed: %d\n", error);
+	if (error) {
+		printf("vnode_size (new file) for swap file failed: %d\n", error);
 		goto done;
-	} else {
-	
-		if (file_size == 0) {
+	}
+	assert(file_size == *size);
 
-			error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);
-		
-			if (error) {
-				printf("vnode_setsize for swap files failed: %d\n", error);
-				goto done;
-			}
-		} else {
+	if (pin != NULL && *pin != FALSE) {
+		error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx);
 
-			*size = file_size;
+		if (error) {
+			printf("pin for swap files failed: %d,  file_size = %lld\n", error, file_size);
+			/* this is not fatal, carry on with files wherever they landed */
+			*pin = FALSE;
+			error = 0;
 		}
 	}
 
 	vnode_lock_spin(vp);
 	SET(vp->v_flag, VSWAP);
 	vnode_unlock(vp);
+
+#if CONFIG_FREEZE
+	VATTR_INIT(&va);
+	VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
+	error = VNOP_SETATTR(vp, &va, ctx);
+
+	if (error) {
+		printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error);
+		goto done;
+	}
+#endif /* CONFIG_FREEZE */
+
 done:
 	return error;
 }
 
+
+int
+vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size)
+{
+	int error = 0;
+	vfs_context_t ctx;
+
+	ctx = vfs_context_kernel();
+
+	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset,
+	    UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));
+
+	return error;
+}
+
+
+
 int
-vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags)
+vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
 {
 	int error = 0;
 	uint64_t io_size = npages * PAGE_SIZE_64;
 #if 1
-	kern_return_t	kr = KERN_SUCCESS;
-	upl_t		upl = NULL;
-	unsigned int	count = 0;
-	int		upl_create_flags = 0, upl_control_flags = 0;
-	upl_size_t	upl_size = 0;
+	kern_return_t   kr = KERN_SUCCESS;
+	upl_t           upl = NULL;
+	unsigned int    count = 0;
+	upl_control_flags_t upl_create_flags = 0;
+	int             upl_control_flags = 0;
+	upl_size_t      upl_size = 0;
 
 	upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;
-	upl_control_flags = UPL_IOSYNC | UPL_PAGING_ENCRYPTED;
+
+	if (upl_iodone == NULL) {
+		upl_control_flags = UPL_IOSYNC;
+	}
+
+#if ENCRYPTED_SWAP
+	upl_control_flags |= UPL_PAGING_ENCRYPTED;
+#endif
 
 	if ((flags & SWAP_READ) == FALSE) {
 		upl_create_flags |= UPL_COPYOUT_FROM;
 	}
- 
+
 	upl_size = io_size;
 	kr = vm_map_create_upl( kernel_map,
-				start,
-				&upl_size,
-				&upl,
-				NULL,
-				&count,
-				&upl_create_flags);
+	    start,
+	    &upl_size,
+	    &upl,
+	    NULL,
+	    &count,
+	    &upl_create_flags,
+	    VM_KERN_MEMORY_OSFMK);
 
 	if (kr != KERN_SUCCESS || (upl_size != io_size)) {
 		panic("vm_map_create_upl failed with %d\n", kr);
@@ -194,12 +232,12 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag
 
 	if (flags & SWAP_READ) {
 		vnode_pagein(vp,
-			      upl,
-			      0,
-			      offset,
-			      io_size,
-			      upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
-			      &error);
+		    upl,
+		    0,
+		    offset,
+		    io_size,
+		    upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
+		    &error);
 		if (error) {
 #if DEBUG
 			printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n", error, vp, offset, io_size);
@@ -207,15 +245,16 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag
 			printf("vm_swapfile_io: vnode_pagein failed with %d.\n", error);
 #endif /* DEBUG */
 		}
-	
 	} else {
+		upl_set_iodone(upl, upl_iodone);
+
 		vnode_pageout(vp,
-			      upl,
-			      0,
-			      offset,
-			      io_size,
-			      upl_control_flags,
-			      &error);
+		    upl,
+		    0,
+		    offset,
+		    io_size,
+		    upl_control_flags,
+		    &error);
 		if (error) {
 #if DEBUG
 			printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n", error, vp, offset, io_size);
@@ -229,9 +268,9 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag
 #else /* 1 */
 	vfs_context_t ctx;
 	ctx = vfs_context_kernel();
-		
+
 	error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset,
-		UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));
+	    UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));
 
 	if (error) {
 		printf("vn_rdwr: Swap I/O failed with %d\n", error);
@@ -241,22 +280,31 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag
 }
 
 
-#define MAX_BATCH_TO_TRIM	256
+#define MAX_BATCH_TO_TRIM       256
 
-u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl)
-{
-	int		error = 0;
-	int		trim_index = 0;
-	u_int32_t	blocksize = 0;
-	struct vnode	*devvp;
-	dk_extent_t	*extents;
-	dk_unmap_t	unmap;
+#define ROUTE_ONLY              0x10            /* if corestorage is present, tell it to just pass */
+                                                /* the DKIOCUNMAP command through w/o acting on it */
+                                                /* this is used by the compressed swap system to reclaim empty space */
 
-	if ( !(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED))
-		return (ENOTSUP);
 
-	if (tl == NULL)
-		return (0);
+u_int32_t
+vnode_trim_list(vnode_t vp, struct trim_list *tl, boolean_t route_only)
+{
+	int             error = 0;
+	int             trim_index = 0;
+	u_int32_t       blocksize = 0;
+	struct vnode    *devvp;
+	dk_extent_t     *extents;
+	dk_unmap_t      unmap;
+	_dk_cs_unmap_t  cs_unmap;
+
+	if (!(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED)) {
+		return ENOTSUP;
+	}
+
+	if (tl == NULL) {
+		return 0;
+	}
 
 	/*
 	 * Get the underlying device vnode and physical block size
@@ -266,21 +314,30 @@ u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl)
 
 	extents = kalloc(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);
 
-	memset (&unmap, 0, sizeof(dk_unmap_t));
-	unmap.extents = extents;
+	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
+		memset(&cs_unmap, 0, sizeof(_dk_cs_unmap_t));
+		cs_unmap.extents = extents;
+
+		if (route_only == TRUE) {
+			cs_unmap.options = ROUTE_ONLY;
+		}
+	} else {
+		memset(&unmap, 0, sizeof(dk_unmap_t));
+		unmap.extents = extents;
+	}
 
 	while (tl) {
-		daddr64_t	io_blockno;	/* Block number corresponding to the start of the extent */
-		size_t		io_bytecount;	/* Number of bytes in current extent for the specified range */
-		size_t		trimmed;
-		size_t		remaining_length;
-		off_t		current_offset; 
+		daddr64_t       io_blockno;     /* Block number corresponding to the start of the extent */
+		size_t          io_bytecount;   /* Number of bytes in current extent for the specified range */
+		size_t          trimmed;
+		size_t          remaining_length;
+		off_t           current_offset;
 
 		current_offset = tl->tl_offset;
 		remaining_length = tl->tl_length;
 		trimmed = 0;
-		
-		/* 
+
+		/*
 		 * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single
 		 * extent from the blockmap call.  Keep looping/going until we are sure we've hit
 		 * the whole range or if we encounter an error.
@@ -288,27 +345,30 @@ u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl)
 		while (trimmed < tl->tl_length) {
 			/*
 			 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
-			 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is 
+			 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
 			 * broken into multiple extents, it must be called multiple times, increasing the offset
 			 * in each call to ensure that the entire range is covered.
 			 */
-			error = VNOP_BLOCKMAP (vp, current_offset, remaining_length, 
-					       &io_blockno, &io_bytecount, NULL, VNODE_READ, NULL);
+			error = VNOP_BLOCKMAP(vp, current_offset, remaining_length,
+			    &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);
 
 			if (error) {
 				goto trim_exit;
 			}
+			if (io_blockno != -1) {
+				extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
+				extents[trim_index].length = io_bytecount;
 
-			extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
-			extents[trim_index].length = io_bytecount;
-
-			trim_index++;
-
+				trim_index++;
+			}
 			if (trim_index == MAX_BATCH_TO_TRIM) {
-
-				unmap.extentsCount = trim_index;
-				error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
-
+				if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
+					cs_unmap.extentsCount = trim_index;
+					error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
+				} else {
+					unmap.extentsCount = trim_index;
+					error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
+				}
 				if (error) {
 					goto trim_exit;
 				}
@@ -321,12 +381,32 @@ u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl)
 		tl = tl->tl_next;
 	}
 	if (trim_index) {
-
-		unmap.extentsCount = trim_index;
-		error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
+		if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
+			cs_unmap.extentsCount = trim_index;
+			error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
+		} else {
+			unmap.extentsCount = trim_index;
+			error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
+		}
 	}
 trim_exit:
 	kfree(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);
 
 	return error;
 }
+
+#if CONFIG_FREEZE
+int
+vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
+{
+	vnode_t         devvp = NULL;
+	vfs_context_t   ctx = vfs_context_kernel();
+	errno_t         err = 0;
+
+	devvp = vp->v_mount->mnt_devvp;
+
+	err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx);
+
+	return err;
+}
+#endif /* CONFIG_FREEZE */