]> git.saurik.com Git - apple/xnu.git/blob - bsd/vm/vm_compressor_backing_file.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / bsd / vm / vm_compressor_backing_file.c
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <stdint.h>
30 #include <sys/fcntl.h>
31 #include <sys/vnode_internal.h>
32 #include <sys/vnode.h>
33 #include <sys/kauth.h>
34 #include <sys/mount_internal.h>
35 #include <sys/buf_internal.h>
36 #include <kern/debug.h>
37 #include <kern/kalloc.h>
38 #include <sys/cprotect.h>
39 #include <sys/disk.h>
40 #include <vm/vm_protos.h>
41 #include <vm/vm_pageout.h>
42 #include <sys/content_protection.h>
43
/*
 * Interfaces exported to the VM swap/compressor subsystem.
 */
void vm_swapfile_open(const char *path, vnode_t *vp);
/*
 * NOTE: "path" is the kernel address of the path string, passed as a
 * uint64_t so it can round-trip through CAST_USER_ADDR_T for unlink1().
 */
void vm_swapfile_close(uint64_t path, vnode_t vp);
int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
uint64_t vm_swapfile_get_blksize(vnode_t vp);
uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);

#if CONFIG_FREEZE
int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
#endif /* CONFIG_FREEZE */
55
56
/*
 * Create (or truncate) and open the swap file at "path", returning its
 * vnode in *vp.  On any failure *vp is set to NULL.  On success the
 * iocount taken by vnode_open has already been dropped via vnode_put,
 * so callers must take their own reference before issuing I/O.
 */
void
vm_swapfile_open(const char *path, vnode_t *vp)
{
	int error = 0;
	vfs_context_t ctx = vfs_context_kernel();

	/* O_TRUNC: discard any stale contents left from a previous use */
	if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) {
		printf("Failed to open swap file %d\n", error);
		*vp = NULL;
		return;
	}

	/*
	 * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail.
	 * To avoid a race on the mount we only make this check after creating the
	 * vnode.
	 */
	if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) {
		/* drop the iocount from vnode_open, then close and unlink the file */
		vnode_put(*vp);
		vm_swapfile_close((uint64_t)path, *vp);
		*vp = NULL;
		return;
	}

	vnode_put(*vp);
}
83
84 uint64_t
85 vm_swapfile_get_blksize(vnode_t vp)
86 {
87 return (uint64_t)vfs_devblocksize(vnode_mount(vp));
88 }
89
90 uint64_t
91 vm_swapfile_get_transfer_size(vnode_t vp)
92 {
93 return (uint64_t)vp->v_mount->mnt_vfsstat.f_iosize;
94 }
95
int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);

/*
 * Close and delete a swap file.  "path_addr" is the kernel address of
 * the path string (as a uint64_t so it can be handed to unlink1() via
 * CAST_USER_ADDR_T with UIO_SYSSPACE).
 */
void
vm_swapfile_close(uint64_t path_addr, vnode_t vp)
{
	vfs_context_t context = vfs_context_kernel();
	int error;

	/* take an iocount so vnode_close has a reference to release */
	vnode_getwithref(vp);
	vnode_close(vp, 0, context);

	error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr),
	    UIO_SYSSPACE, 0);

#if DEVELOPMENT || DEBUG
	/* unlink failure is only reported on development/debug kernels */
	if (error) {
		printf("%s : unlink of %s failed with error %d", __FUNCTION__,
		    (char *)path_addr, error);
	}
#endif
}
117
/*
 * Preallocate a swap file: grow "vp" to *size bytes, optionally pin its
 * blocks via FIOPINSWAP, and mark the vnode as a swap file (VSWAP).
 *
 * pin (in/out, may be NULL): if TRUE on entry, attempt to pin the file.
 * A pin failure is non-fatal — *pin is cleared and preallocation
 * continues with the file wherever it landed.
 *
 * Returns 0 on success, or the errno from the failing operation.
 */
int
vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
{
	int error = 0;
	uint64_t file_size = 0;
	vfs_context_t ctx = NULL;
#if CONFIG_FREEZE
	struct vnode_attr va;
#endif /* CONFIG_FREEZE */

	ctx = vfs_context_kernel();

	/* IO_NOZEROFILL: don't pay to zero the newly-allocated blocks up front */
	error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx);

	if (error) {
		printf("vnode_setsize for swap files failed: %d\n", error);
		goto done;
	}

	/* read the size back to confirm the grow took effect */
	error = vnode_size(vp, (off_t*) &file_size, ctx);

	if (error) {
		printf("vnode_size (new file) for swap file failed: %d\n", error);
		goto done;
	}
	assert(file_size == *size);

	if (pin != NULL && *pin != FALSE) {
		error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx);

		if (error) {
			printf("pin for swap files failed: %d, file_size = %lld\n", error, file_size);
			/* this is not fatal, carry on with files wherever they landed */
			*pin = FALSE;
			error = 0;
		}
	}

	vnode_lock_spin(vp);
	SET(vp->v_flag, VSWAP);
	vnode_unlock(vp);

#if CONFIG_FREEZE
	/* request data-protection class C for the swap file */
	VATTR_INIT(&va);
	VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
	error = VNOP_SETATTR(vp, &va, ctx);

	if (error) {
		printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error);
		goto done;
	}
#endif /* CONFIG_FREEZE */

done:
	return error;
}
174
175
176 int
177 vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size)
178 {
179 int error = 0;
180 vfs_context_t ctx;
181
182 ctx = vfs_context_kernel();
183
184 error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset,
185 UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));
186
187 return error;
188 }
189
190
191
/*
 * Perform swap I/O of "npages" pages between the swap file "vp" at byte
 * "offset" and the kernel virtual range starting at "start".
 *
 * flags: SWAP_READ selects pagein; otherwise the pages are paged out.
 * upl_iodone: optional completion context for pageouts; when NULL the
 * I/O is issued synchronously (UPL_IOSYNC).
 *
 * Returns 0 on success, or the error set by vnode_pagein/vnode_pageout.
 */
int
vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
{
	int error = 0;
	upl_size_t io_size = (upl_size_t) (npages * PAGE_SIZE_64);
#if 1 /* UPL-based path; the #else branch is a disabled vn_rdwr fallback */
	kern_return_t kr = KERN_SUCCESS;
	upl_t upl = NULL;
	unsigned int count = 0;
	upl_control_flags_t upl_create_flags = 0;
	int upl_control_flags = 0;
	upl_size_t upl_size = 0;

	upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;

	/* no completion callback: issue the I/O synchronously */
	if (upl_iodone == NULL) {
		upl_control_flags = UPL_IOSYNC;
	}

#if ENCRYPTED_SWAP
	upl_control_flags |= UPL_PAGING_ENCRYPTED;
#endif

	/* writes copy data out of the address space into the UPL */
	if ((flags & SWAP_READ) == FALSE) {
		upl_create_flags |= UPL_COPYOUT_FROM;
	}

	upl_size = io_size;
	kr = vm_map_create_upl( kernel_map,
	    start,
	    &upl_size,
	    &upl,
	    NULL,
	    &count,
	    &upl_create_flags,
	    VM_KERN_MEMORY_OSFMK);

	/* a short or failed UPL would corrupt swap state; treat as fatal */
	if (kr != KERN_SUCCESS || (upl_size != io_size)) {
		panic("vm_map_create_upl failed with %d\n", kr);
	}

	if (flags & SWAP_READ) {
		vnode_pagein(vp,
		    upl,
		    0,
		    offset,
		    io_size,
		    upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK,
		    &error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%u)\n", error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pagein failed with %d.\n", error);
#endif /* DEBUG */
		}
	} else {
		/* attach the caller's completion context before starting the pageout */
		upl_set_iodone(upl, upl_iodone);

		vnode_pageout(vp,
		    upl,
		    0,
		    offset,
		    io_size,
		    upl_control_flags,
		    &error);
		if (error) {
#if DEBUG
			printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%u)\n", error, vp, offset, io_size);
#else /* DEBUG */
			printf("vm_swapfile_io: vnode_pageout failed with %d.\n", error);
#endif /* DEBUG */
		}
	}

	return error;

#else /* 1 */
	/* disabled fallback: synchronous vn_rdwr directly on the file */
	vfs_context_t ctx;
	ctx = vfs_context_kernel();

	error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset,
	    UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx));

	if (error) {
		printf("vn_rdwr: Swap I/O failed with %d\n", error);
	}
	return error;
#endif /* 1 */
}
282
283
284 #define MAX_BATCH_TO_TRIM 256
285
286 #define ROUTE_ONLY 0x10 /* if corestorage is present, tell it to just pass */
287 /* the DKIOUNMAP command through w/o acting on it */
288 /* this is used by the compressed swap system to reclaim empty space */
289
290
291 u_int32_t
292 vnode_trim_list(vnode_t vp, struct trim_list *tl, boolean_t route_only)
293 {
294 int error = 0;
295 int trim_index = 0;
296 u_int32_t blocksize = 0;
297 struct vnode *devvp;
298 dk_extent_t *extents;
299 dk_unmap_t unmap;
300 _dk_cs_unmap_t cs_unmap;
301
302 if (!(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED)) {
303 return ENOTSUP;
304 }
305
306 if (tl == NULL) {
307 return 0;
308 }
309
310 /*
311 * Get the underlying device vnode and physical block size
312 */
313 devvp = vp->v_mount->mnt_devvp;
314 blocksize = vp->v_mount->mnt_devblocksize;
315
316 extents = kheap_alloc(KHEAP_TEMP,
317 sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM, Z_WAITOK);
318
319 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
320 memset(&cs_unmap, 0, sizeof(_dk_cs_unmap_t));
321 cs_unmap.extents = extents;
322
323 if (route_only == TRUE) {
324 cs_unmap.options = ROUTE_ONLY;
325 }
326 } else {
327 memset(&unmap, 0, sizeof(dk_unmap_t));
328 unmap.extents = extents;
329 }
330
331 while (tl) {
332 daddr64_t io_blockno; /* Block number corresponding to the start of the extent */
333 size_t io_bytecount; /* Number of bytes in current extent for the specified range */
334 size_t trimmed;
335 size_t remaining_length;
336 off_t current_offset;
337
338 current_offset = tl->tl_offset;
339 remaining_length = tl->tl_length;
340 trimmed = 0;
341
342 /*
343 * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single
344 * extent from the blockmap call. Keep looping/going until we are sure we've hit
345 * the whole range or if we encounter an error.
346 */
347 while (trimmed < tl->tl_length) {
348 /*
349 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
350 * specified offset. It returns blocks in contiguous chunks, so if the logical range is
351 * broken into multiple extents, it must be called multiple times, increasing the offset
352 * in each call to ensure that the entire range is covered.
353 */
354 error = VNOP_BLOCKMAP(vp, current_offset, remaining_length,
355 &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);
356
357 if (error) {
358 goto trim_exit;
359 }
360 if (io_blockno != -1) {
361 extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
362 extents[trim_index].length = io_bytecount;
363
364 trim_index++;
365 }
366 if (trim_index == MAX_BATCH_TO_TRIM) {
367 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
368 cs_unmap.extentsCount = trim_index;
369 error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
370 } else {
371 unmap.extentsCount = trim_index;
372 error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
373 }
374 if (error) {
375 goto trim_exit;
376 }
377 trim_index = 0;
378 }
379 trimmed += io_bytecount;
380 current_offset += io_bytecount;
381 remaining_length -= io_bytecount;
382 }
383 tl = tl->tl_next;
384 }
385 if (trim_index) {
386 if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) {
387 cs_unmap.extentsCount = trim_index;
388 error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel());
389 } else {
390 unmap.extentsCount = trim_index;
391 error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());
392 }
393 }
394 trim_exit:
395 kheap_free(KHEAP_TEMP, extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM);
396
397 return error;
398 }
399
#if CONFIG_FREEZE
/*
 * Query the device backing "vp" for the maximum daily swap-write
 * budget (DKIOCGETMAXSWAPWRITE), storing the value in
 * *freeze_daily_budget.  Returns the ioctl result (0 on success).
 */
int
vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
{
	vfs_context_t ctx = vfs_context_kernel();
	vnode_t devvp = vp->v_mount->mnt_devvp;

	return VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE,
	           (caddr_t)freeze_daily_budget, 0, ctx);
}
#endif /* CONFIG_FREEZE */