/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Mach Operating System
 * Copyright (c) 1987 Carnegie-Mellon University
 * All rights reserved.  The CMU software License Agreement specifies
 * the terms and conditions for use and redistribution.
 */
/*
 * "Swap" pager that pages to/from vnodes.  Also
 * handles demand paging from files.
 */
#include <mach/boolean.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kauth.h>
#include <sys/vnode_internal.h>
#include <sys/namei.h>
#include <sys/mount_internal.h> /* needs internal due to fhandle_t */
#include <sys/ubc_internal.h>
#include <sys/disk.h>           /* For DKIOC calls */

#include <mach/mach_types.h>
#include <mach/memory_object_types.h>
#include <mach/vm_map.h>
#include <mach/mach_vm.h>

#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <kern/zalloc.h>
#include <libkern/libkern.h>

#include <vm/vnode_pager.h>
#include <vm/vm_pageout.h>

#include <kern/assert.h>
#include <sys/kdebug.h>
#include <nfs/nfs_conf.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>

#include <vm/vm_protos.h>

#include <vfs/vfs_disk_conditioner.h>
void
vnode_pager_throttle(void)
{
	struct uthread *ut;

	ut = get_bsdthread_info(current_thread());

	if (ut->uu_lowpri_window) {
		throttle_lowpri_io(1);
	}
}
boolean_t
vnode_pager_isSSD(vnode_t vp)
{
	return disk_conditioner_mount_is_ssd(vp->v_mount);
}
void
vnode_pager_issue_reprioritize_io(struct vnode *devvp, uint64_t blkno, uint32_t len, int priority)
{
	u_int32_t       blocksize = 0;
	dk_extent_t     extent;
	dk_set_tier_t   set_tier;
	int             error = 0;

	error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blocksize, 0, vfs_context_kernel());
	if (error) {
		return;
	}

	memset(&extent, 0, sizeof(dk_extent_t));
	memset(&set_tier, 0, sizeof(dk_set_tier_t));

	extent.offset = blkno * (u_int64_t) blocksize;
	extent.length = len;

	set_tier.extents = &extent;
	set_tier.extentsCount = 1;
	set_tier.tier = (uint8_t)priority;

	error = VNOP_IOCTL(devvp, DKIOCSETTIER, (caddr_t)&set_tier, 0, vfs_context_kernel());
	return;
}
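/*
 * Illustrative note (editorial, not from the original source): with a
 * 4096-byte device block size, expediting a 64 KiB run that starts at
 * device block 1000 would be issued as
 *
 *	vnode_pager_issue_reprioritize_io(devvp, 1000, 65536, tier);
 *
 * which DKIOCSETTIER sees as a single dk_extent_t of
 * { .offset = 1000 * 4096, .length = 65536 } retagged to 'tier'.
 */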
void
vnode_pager_was_dirtied(
	struct vnode            *vp,
	vm_object_offset_t      s_offset,
	vm_object_offset_t      e_offset)
{
	cluster_update_state(vp, s_offset, e_offset, TRUE);
}
uint32_t
vnode_pager_isinuse(struct vnode *vp)
{
	if (vp->v_usecount > vp->v_kusecount) {
		return 1;
	}
	return 0;
}
uint32_t
vnode_pager_return_throttle_io_limit(struct vnode *vp, uint32_t *limit)
{
	return cluster_throttle_io_limit(vp, limit);
}
vm_object_offset_t
vnode_pager_get_filesize(struct vnode *vp)
{
	return (vm_object_offset_t) ubc_getsize(vp);
}
extern int safe_getpath(struct vnode *dvp, char *leafname, char *path, int _len, int *truncated_path);
kern_return_t
vnode_pager_get_name(
	struct vnode    *vp,
	char            *pathname,
	vm_size_t       pathname_len,
	char            *filename,
	vm_size_t       filename_len,
	boolean_t       *truncated_path_p)
{
	*truncated_path_p = FALSE;
	if (pathname != NULL) {
		/* get the path name */
		safe_getpath(vp, NULL,
		    pathname, (int) pathname_len,
		    truncated_path_p);
	}
	if ((pathname == NULL || *truncated_path_p) &&
	    filename != NULL) {
		/* get the file name */
		const char *name;

		name = vnode_getname_printable(vp);
		strlcpy(filename, name, (size_t) filename_len);
		vnode_putname_printable(name);
	}
	return KERN_SUCCESS;
}
kern_return_t
vnode_pager_get_mtime(
	struct vnode    *vp,
	struct timespec *current_mtime,
	struct timespec *cs_mtime)
{
	vnode_mtime(vp, current_mtime, vfs_context_current());
	if (cs_mtime != NULL) {
		ubc_get_cs_mtime(vp, cs_mtime);
	}
	return KERN_SUCCESS;
}
kern_return_t
vnode_pager_get_cs_blobs(
	struct vnode    *vp,
	void            **blobs)
{
	*blobs = ubc_get_cs_blobs(vp);
	return KERN_SUCCESS;
}
/*
 * Used to call the DKIOCUNMAP ioctl on the underlying disk device for the specified vnode.
 * Trims the region at offset bytes into the file, for length bytes.
 *
 * Care must be taken to ensure that the vnode is sufficiently reference counted at the time this
 * function is called; no iocounts or usecounts are taken on the vnode.
 * This function is non-idempotent in error cases; we cannot un-discard the blocks if only some of them
 * are successfully discarded.
 */
u_int32_t
vnode_trim(
	struct vnode    *vp,
	off_t           offset,
	size_t          length)
{
	daddr64_t       io_blockno;      /* Block number corresponding to the start of the extent */
	size_t          io_bytecount;    /* Number of bytes in current extent for the specified range */
	size_t          trimmed = 0;
	off_t           current_offset = offset;
	size_t          remaining_length = length;
	int             error = 0;
	u_int32_t       blocksize = 0;
	struct vnode    *devvp;
	dk_extent_t     extent;
	dk_unmap_t      unmap;

	/* Get the underlying device vnode */
	devvp = vp->v_mount->mnt_devvp;

	/* Figure out the underlying device block size */
	error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blocksize, 0, vfs_context_kernel());
	if (error) {
		goto trim_exit;
	}
	/*
	 * We may not get the entire range from offset -> offset+length in a single
	 * extent from the blockmap call.  Keep looping until we are sure we've hit
	 * the whole range or we encounter an error.
	 */
	while (trimmed < length) {
		/*
		 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
		 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
		 * broken into multiple extents, it must be called multiple times, increasing the offset
		 * in each call to ensure that the entire range is covered.
		 */
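		/*
		 * Worked example (editorial, not from the original source):
		 * trimming 64 KiB at file offset 0 where the file occupies two
		 * discontiguous 32 KiB extents.  The first VNOP_BLOCKMAP call
		 * returns the first extent's start in io_blockno with
		 * io_bytecount = 32768, so one DKIOCUNMAP covers it and
		 * current_offset advances to 32768.  The second iteration maps
		 * and unmaps the remaining extent, and the loop exits once
		 * trimmed == length.
		 */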
		error = VNOP_BLOCKMAP(vp, current_offset, remaining_length,
		    &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL);

		if (error) {
			goto trim_exit;
		}
		/*
		 * We have a contiguous run.  Prepare & issue the ioctl for the device.
		 * The DKIOCUNMAP ioctl takes offsets in bytes from the start of the device.
		 */
		memset(&extent, 0, sizeof(dk_extent_t));
		memset(&unmap, 0, sizeof(dk_unmap_t));
		extent.offset = (uint64_t) io_blockno * (u_int64_t) blocksize;
		extent.length = io_bytecount;
		unmap.extents = &extent;
		unmap.extentsCount = 1;
		error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel());

		if (error) {
			goto trim_exit;
		}
		remaining_length = remaining_length - io_bytecount;
		trimmed = trimmed + io_bytecount;
		current_offset = current_offset + io_bytecount;
	}
trim_exit:

	return error;
}
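/*
 * Illustrative sketch (editorial, not from the original source): since
 * vnode_trim() takes no iocount or usecount of its own, a hypothetical
 * caller discarding a deleted file's blocks must already hold a reference:
 *
 *	// caller already holds an iocount on vp (e.g. via vnode_getwithref)
 *	u_int32_t err = vnode_trim(vp, 0, (size_t)file_size);
 *	// on error, leading extents may already be unmapped (non-idempotent)
 */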
pager_return_t
vnode_pageout(struct vnode *vp,
    upl_t                   upl,
    upl_offset_t            upl_offset,
    vm_object_offset_t      f_offset,
    upl_size_t              size,
    int                     flags,
    int                     *errorp)
{
	int             result = PAGER_SUCCESS;
	int             error = 0;
	int             error_ret = 0;
	daddr64_t       blkno;
	int             isize;
	int             pg_index;
	int             base_index;
	upl_offset_t    offset;
	upl_page_info_t *pl;
	vfs_context_t   ctx = vfs_context_current();    /* pager context */

	isize = (int)size;

	/*
	 * This call is non-blocking and does not ever fail but it can
	 * only be made when there is other explicit synchronization
	 * with reclaiming of the vnode which, in this path, is provided
	 * by the paging in progress counter.
	 *
	 * In addition, this may also be entered via explicit ubc_msync
	 * calls or vm_swapfile_io where the existing iocount provides
	 * the necessary synchronization. Ideally we would not take an
	 * additional iocount here in the cases where an explicit iocount
	 * has already been taken but this call doesn't cause a deadlock
	 * as other forms of vnode_get* might if this thread has already
	 * taken an iocount.
	 */
	error = vnode_getalways_from_pager(vp);
	if (error != 0) {
		/* This can't happen */
		panic("vnode_getalways returned %d for vp %p", error, vp);
	}

	if (isize <= 0) {
		result    = PAGER_ERROR;
		error_ret = EINVAL;
		goto out;
	}

	if (UBCINFOEXISTS(vp) == 0) {
		result    = PAGER_ERROR;
		error_ret = EINVAL;

		if (upl && !(flags & UPL_NOCOMMIT)) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
		}
		goto out;
	}
	if (!(flags & UPL_VNODE_PAGER)) {
		/*
		 * This is a pageout from the default pager,
		 * just go ahead and call vnop_pageout since
		 * it has already sorted out the dirty ranges
		 */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START,
		    size, 1, 0, 0, 0);

		if ((error_ret = VNOP_PAGEOUT(vp, upl, upl_offset, (off_t)f_offset,
		    (size_t)size, flags, ctx))) {
			result = PAGER_ERROR;
		}
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END,
		    size, 1, 0, 0, 0);

		goto out;
	}

	if (upl == NULL) {
		int             request_flags;
		if (vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSVNOP_PAGEOUTV2) {
			/*
			 * filesystem has requested the new form of VNOP_PAGEOUT for file
			 * backed objects... we will not grab the UPL before calling VNOP_PAGEOUT...
			 * it is the filesystem's responsibility to grab the range we're denoting
			 * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first
			 * take any locks it needs, before effectively locking the pages into a UPL...
			 */
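			/*
			 * Illustrative sketch (editorial, not from the original
			 * source): on the filesystem side, a hypothetical
			 * vnop_pageout for a PAGEOUTV2 filesystem sees a NULL
			 * UPL and builds one itself, roughly:
			 *
			 *	lck_rw_lock_shared(fs_lock);    // hypothetical FS lock
			 *	ubc_create_upl(vp, f_offset, size, &upl, &pl,
			 *	    UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY);
			 *	// ...write the dirty runs, then commit/abort upl...
			 */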
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START,
			    size, (int)f_offset, 0, 0, 0);

			if ((error_ret = VNOP_PAGEOUT(vp, NULL, upl_offset, (off_t)f_offset,
			    size, flags, ctx))) {
				result = PAGER_ERROR;
			}
			KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
			    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END,
			    size, 0, 0, 0, 0);

			goto out;
		}
		if (flags & UPL_MSYNC) {
			request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY;
		} else {
			request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY;
		}

		if (ubc_create_upl_kernel(vp, f_offset, size, &upl, &pl, request_flags, VM_KERN_MEMORY_FILE) != KERN_SUCCESS) {
			result    = PAGER_ERROR;
			error_ret = EINVAL;
			goto out;
		}
		upl_offset = 0;
	} else {
		pl = ubc_upl_pageinfo(upl);
	}
	/*
	 * Ignore any non-present pages at the end of the
	 * UPL so that we aren't looking at a upl that
	 * may already have been freed by the preceding
	 * aborts/completions.
	 */
	base_index = upl_offset / PAGE_SIZE;

	for (pg_index = (upl_offset + isize) / PAGE_SIZE; pg_index > base_index;) {
		if (upl_page_present(pl, --pg_index)) {
			break;
		}
		if (pg_index == base_index) {
			/*
			 * no pages were returned, so release
			 * our hold on the upl and leave
			 */
			if (!(flags & UPL_NOCOMMIT)) {
				ubc_upl_abort_range(upl, upl_offset, isize, UPL_ABORT_FREE_ON_EMPTY);
			}

			goto out;
		}
	}
	isize = ((pg_index + 1) - base_index) * PAGE_SIZE;
	/*
	 * we come here for pageouts to 'real' files and
	 * for msyncs... the upl may not contain any
	 * dirty pages... it's our responsibility to sort
	 * through it and find the 'runs' of dirty pages
	 * to call VNOP_PAGEOUT on...
	 */
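	/*
	 * Worked example (editorial, not from the original source): with
	 * RET_ONLY_DIRTY the page list can look like [D][D][hole][clean][D].
	 * The scan below issues one VNOP_PAGEOUT for the two-page dirty run,
	 * skips the hole, invalidates and commits (or aborts) the clean page,
	 * then issues a second VNOP_PAGEOUT for the final dirty page.
	 */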
	if (ubc_getsize(vp) == 0) {
		/*
		 * if the file has been effectively deleted, then
		 * we need to go through the UPL and invalidate any
		 * buffer headers we might have that reference any
		 * of its pages
		 */
		for (offset = upl_offset; isize; isize -= PAGE_SIZE, offset += PAGE_SIZE) {
#if CONFIG_NFS_CLIENT
			if (vp->v_tag == VT_NFS) {
				/* check with nfs if page is OK to drop */
				error = nfs_buf_page_inval(vp, (off_t)f_offset);
			} else
#endif /* CONFIG_NFS_CLIENT */
			{
				blkno = ubc_offtoblk(vp, (off_t)f_offset);
				error = buf_invalblkno(vp, blkno, 0);
			}
			if (error) {
				if (!(flags & UPL_NOCOMMIT)) {
					ubc_upl_abort_range(upl, offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
				if (error_ret == 0) {
					error_ret = error;
				}
				result = PAGER_ERROR;
			} else if (!(flags & UPL_NOCOMMIT)) {
				ubc_upl_commit_range(upl, offset, PAGE_SIZE, UPL_COMMIT_FREE_ON_EMPTY);
			}
			f_offset += PAGE_SIZE;
		}
		goto out;
	}

	offset = upl_offset;
	pg_index = base_index;
	while (isize) {
		int  xsize;
		int  num_of_pages;

		if (!upl_page_present(pl, pg_index)) {
			/*
			 * we asked for RET_ONLY_DIRTY, so it's possible
			 * to get back empty slots in the UPL
			 * just skip over them
			 */
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		if (!upl_dirty_page(pl, pg_index)) {
			/*
			 * if the page is not dirty and reached here it is
			 * marked precious or it is due to invalidation in
			 * memory_object_lock request as part of truncation.
			 * We also get here from vm_object_terminate().
			 * So all we need to do in these cases is to
			 * invalidate the incore buffer if it is there.
			 * Note we must not sleep here if the buffer is busy - that is
			 * a lock inversion which causes deadlock.
			 */
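			/*
			 * Editorial note (not from the original source): this is
			 * why buf_invalblkno() is called below with a flags value
			 * of 0 rather than BUF_WAIT; a busy buffer surfaces as an
			 * error instead of blocking this thread.
			 */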
#if CONFIG_NFS_CLIENT
			if (vp->v_tag == VT_NFS) {
				/* check with nfs if page is OK to drop */
				error = nfs_buf_page_inval(vp, (off_t)f_offset);
			} else
#endif /* CONFIG_NFS_CLIENT */
			{
				blkno = ubc_offtoblk(vp, (off_t)f_offset);
				error = buf_invalblkno(vp, blkno, 0);
			}
			if (error) {
				if (!(flags & UPL_NOCOMMIT)) {
					ubc_upl_abort_range(upl, offset, PAGE_SIZE, UPL_ABORT_FREE_ON_EMPTY);
				}
				if (error_ret == 0) {
					error_ret = error;
				}
				result = PAGER_ERROR;
			} else if (!(flags & UPL_NOCOMMIT)) {
				ubc_upl_commit_range(upl, offset, PAGE_SIZE, UPL_COMMIT_FREE_ON_EMPTY);
			}
			f_offset += PAGE_SIZE;
			offset   += PAGE_SIZE;
			isize    -= PAGE_SIZE;
			pg_index++;

			continue;
		}
		num_of_pages = 1;
		xsize = isize - PAGE_SIZE;

		while (xsize) {
			if (!upl_dirty_page(pl, pg_index + num_of_pages)) {
				break;
			}
			num_of_pages++;
			xsize -= PAGE_SIZE;
		}
		xsize = num_of_pages * PAGE_SIZE;

		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_START,
		    xsize, (int)f_offset, 0, 0, 0);

		if ((error = VNOP_PAGEOUT(vp, upl, offset, (off_t)f_offset,
		    xsize, flags, ctx))) {
			if (error_ret == 0) {
				error_ret = error;
			}
			result = PAGER_ERROR;
		}
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (MACHDBG_CODE(DBG_MACH_VM, 1)) | DBG_FUNC_END,
		    xsize, 0, 0, 0, 0);

		f_offset += xsize;
		offset   += xsize;
		isize    -= xsize;
		pg_index += num_of_pages;
	}
out:
	vnode_put_from_pager(vp);

	if (errorp) {
		*errorp = error_ret;
	}

	return result;
}
pager_return_t
vnode_pagein(
	struct vnode            *vp,
	upl_t                   upl,
	upl_offset_t            upl_offset,
	vm_object_offset_t      f_offset,
	upl_size_t              size,
	int                     flags,
	int                     *errorp)
{
	upl_page_info_t *pl;
	int             result = PAGER_SUCCESS;
	int             error = 0;
	int             pages_in_upl;
	int             start_pg;
	int             last_pg;
	int             first_pg;
	int             xsize;
	int             must_commit = 1;
	int             ignore_valid_page_check = 0;

	if (flags & UPL_NOCOMMIT) {
		must_commit = 0;
	}

	if (flags & UPL_IGNORE_VALID_PAGE_CHECK) {
		ignore_valid_page_check = 1;
	}
	/*
	 * This call is non-blocking and does not ever fail but it can
	 * only be made when there is other explicit synchronization
	 * with reclaiming of the vnode which, in this path, is provided
	 * by the paging in progress counter.
	 *
	 * In addition, this may also be entered via vm_swapfile_io
	 * where the existing iocount provides the necessary synchronization.
	 * Ideally we would not take an additional iocount here in the cases
	 * where an explicit iocount has already been taken but this call
	 * doesn't cause a deadlock as other forms of vnode_get* might if
	 * this thread has already taken an iocount.
	 */
	error = vnode_getalways_from_pager(vp);
	if (error != 0) {
		/* This can't happen */
		panic("vnode_getalways returned %d for vp %p", error, vp);
	}
	if (UBCINFOEXISTS(vp) == 0) {
		result = PAGER_ERROR;
		error  = PAGER_ERROR;

		if (upl && must_commit) {
			ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
		}
		goto out;
	}
	if (upl == (upl_t)NULL) {
		flags &= ~UPL_NOCOMMIT;

		if (size > MAX_UPL_SIZE_BYTES) {
			result = PAGER_ERROR;
			error  = PAGER_ERROR;
			goto out;
		}
		if (vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSVNOP_PAGEINV2) {
			/*
			 * filesystem has requested the new form of VNOP_PAGEIN for file
			 * backed objects... we will not grab the UPL before calling VNOP_PAGEIN...
			 * it is the filesystem's responsibility to grab the range we're denoting
			 * via 'f_offset' and 'size' into a UPL... this allows the filesystem to first
			 * take any locks it needs, before effectively locking the pages into a UPL...
			 * so we pass a NULL into the filesystem instead of a UPL pointer... the 'upl_offset'
			 * is used to identify the "must have" page in the extent... the filesystem is free
			 * to clip the extent to better fit the underlying FS blocksize if it desires as
			 * long as it continues to include the "must have" page... 'f_offset' + 'upl_offset'
			 * identifies that page
			 */
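			/*
			 * Worked example (editorial, not from the original
			 * source): with f_offset = 0x10000, upl_offset = 0x2000
			 * and size = 0x8000, the "must have" page lives at file
			 * offset 0x12000.  A filesystem using 16 KiB blocks may
			 * clip the extent to 0x10000..0x14000, so long as the
			 * page at 0x12000 stays inside what it pages in.
			 */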
			if ((error = VNOP_PAGEIN(vp, NULL, upl_offset, (off_t)f_offset,
			    size, flags, vfs_context_current()))) {
				set_thread_pagein_error(current_thread(), error);
				result = PAGER_ERROR;
				error  = PAGER_ERROR;
			}
			goto out;
		}
		ubc_create_upl_kernel(vp, f_offset, size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT, VM_KERN_MEMORY_FILE);

		if (upl == (upl_t)NULL) {
			result = PAGER_ABSENT;
			error = PAGER_ABSENT;
			goto out;
		}
		ubc_upl_range_needed(upl, upl_offset / PAGE_SIZE, 1);

		upl_offset = 0;
		first_pg = 0;
		/*
		 * if we get here, we've created the upl and
		 * are responsible for committing/aborting it
		 * regardless of what the caller has passed in
		 */
		must_commit = 1;
	} else {
		pl = ubc_upl_pageinfo(upl);
		first_pg = upl_offset / PAGE_SIZE;
	}
	pages_in_upl = size / PAGE_SIZE;
	DTRACE_VM2(pgpgin, int, pages_in_upl, (uint64_t *), NULL);
	/*
	 * before we start marching forward, we must make sure we end on
	 * a present page, otherwise we will be working with a freed
	 * upl
	 */
	for (last_pg = pages_in_upl - 1; last_pg >= first_pg; last_pg--) {
		if (upl_page_present(pl, last_pg)) {
			break;
		}
		if (last_pg == first_pg) {
			/*
			 * empty UPL, no pages are present
			 */
			if (must_commit) {
				ubc_upl_abort_range(upl, upl_offset, size, UPL_ABORT_FREE_ON_EMPTY);
			}
			goto out;
		}
	}
	pages_in_upl = last_pg + 1;
	last_pg = first_pg;
	while (last_pg < pages_in_upl) {
		/*
		 * skip over missing pages...
		 */
		for (; last_pg < pages_in_upl; last_pg++) {
			if (upl_page_present(pl, last_pg)) {
				break;
			}
		}
		if (ignore_valid_page_check == 1) {
			start_pg = last_pg;
		} else {
			/*
			 * skip over 'valid' pages... we don't want to issue I/O for these
			 */
			for (start_pg = last_pg; last_pg < pages_in_upl; last_pg++) {
				if (!upl_valid_page(pl, last_pg)) {
					break;
				}
			}
		}
		if (last_pg > start_pg) {
			/*
			 * we've found a range of valid pages
			 * if we've got COMMIT responsibility
			 * commit this range of pages back to the
			 * cache unchanged
			 */
			xsize = (last_pg - start_pg) * PAGE_SIZE;

			if (must_commit) {
				ubc_upl_abort_range(upl, start_pg * PAGE_SIZE, xsize, UPL_ABORT_FREE_ON_EMPTY);
			}
		}
		if (last_pg == pages_in_upl) {
			/*
			 * we're done... all pages that were present
			 * have either had I/O issued on them or
			 * were aborted unchanged...
			 */
			break;
		}
		if (!upl_page_present(pl, last_pg)) {
			/*
			 * we found a range of valid pages
			 * terminated by a missing page...
			 * bump index to the next page and continue on
			 */
			last_pg++;
			continue;
		}
		/*
		 * scan from the found invalid page looking for a valid
		 * or non-present page before the end of the upl is reached, if we
		 * find one, then it will be the last page of the request to
		 * 'cluster_io'
		 */
		for (start_pg = last_pg; last_pg < pages_in_upl; last_pg++) {
			if ((!ignore_valid_page_check && upl_valid_page(pl, last_pg)) || !upl_page_present(pl, last_pg)) {
				break;
			}
		}
		if (last_pg > start_pg) {
			int xoff;

			xsize = (last_pg - start_pg) * PAGE_SIZE;
			xoff  = start_pg * PAGE_SIZE;
			if ((error = VNOP_PAGEIN(vp, upl, (upl_offset_t) xoff,
			    (off_t)f_offset + xoff,
			    xsize, flags, vfs_context_current()))) {
				/*
				 * Usually this UPL will be aborted/committed by the lower cluster layer.
				 *
				 * a) In the case of decmpfs, however, we may return an error (EAGAIN) to avoid
				 *    a deadlock with another thread already inflating the file.
				 *
				 * b) In the case of content protection, EPERM is a valid error and we should respect it.
				 *
				 * In those cases, we must take care of our UPL at this layer itself.
				 */
				if (must_commit) {
					if (error == EAGAIN) {
						ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART);
					}
					if (error == EPERM) {
						ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR);
					}
				}
				set_thread_pagein_error(current_thread(), error);
				result = PAGER_ERROR;
				error  = PAGER_ERROR;
			}
		}
	}
out:
	vnode_put_from_pager(vp);

	if (errorp) {
		*errorp = result;
	}

	return result;
}
void *
upl_get_internal_page_list(upl_t upl)
{
	return UPL_GET_INTERNAL_PAGE_LIST(upl);
}