osfmk/vm/vm_pageout.c

   1 /*
   2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm/vm_pageout.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *      Date:   1985
  62  *
  63  *      The proverbial page-out daemon.
  64  */
  65
  66 #include <stdint.h>
  67 #include <ptrauth.h>
  68
  69 #include <debug.h>
  70 #include <mach_pagemap.h>
  71 #include <mach_cluster_stats.h>
  72
  73 #include <mach/mach_types.h>
  74 #include <mach/memory_object.h>
  75 #include <mach/memory_object_default.h>
  76 #include <mach/memory_object_control_server.h>
  77 #include <mach/mach_host_server.h>
  78 #include <mach/upl.h>
  79 #include <mach/vm_map.h>
  80 #include <mach/vm_param.h>
  81 #include <mach/vm_statistics.h>
  82 #include <mach/sdt.h>
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/counter.h>
  86 #include <kern/host_statistics.h>
  87 #include <kern/machine.h>
  88 #include <kern/misc_protos.h>
  89 #include <kern/sched.h>
  90 #include <kern/thread.h>
  91 #include <kern/kalloc.h>
  92 #include <kern/zalloc_internal.h>
  93 #include <kern/policy_internal.h>
  94 #include <kern/thread_group.h>
  95
  96 #include <machine/vm_tuning.h>
  97 #include <machine/commpage.h>
  98
  99 #include <vm/pmap.h>
 100 #include <vm/vm_compressor_pager.h>
 101 #include <vm/vm_fault.h>
 102 #include <vm/vm_map.h>
 103 #include <vm/vm_object.h>
 104 #include <vm/vm_page.h>
 105 #include <vm/vm_pageout.h>
 106 #include <vm/vm_protos.h> /* must be last */
 107 #include <vm/memory_object.h>
 108 #include <vm/vm_purgeable_internal.h>
 109 #include <vm/vm_shared_region.h>
 110 #include <vm/vm_compressor.h>
 111
 112 #include <san/kasan.h>
 113
 114 #if CONFIG_PHANTOM_CACHE
 115 #include <vm/vm_phantom_cache.h>
 116 #endif
 117
 118 #if UPL_DEBUG
 119 #include <libkern/OSDebug.h>
 120 #endif
 121
  122 extern int cs_debug;        /* defined outside this file */
  123
  124 extern void mbuf_drain(boolean_t);      /* defined outside this file */
  125
      /*
       * Memory-status state that lives outside this file.  The three
       * memorystatus_available_pages* variables are declared unsigned int
       * when CONFIG_JETSAM is set and uint64_t otherwise, so code reading
       * them must not assume a fixed width.
       */
  126 #if VM_PRESSURE_EVENTS
  127 #if CONFIG_JETSAM
  128 extern unsigned int memorystatus_available_pages;
  129 extern unsigned int memorystatus_available_pages_pressure;
  130 extern unsigned int memorystatus_available_pages_critical;
  131 #else /* CONFIG_JETSAM */
  132 extern uint64_t memorystatus_available_pages;
  133 extern uint64_t memorystatus_available_pages_pressure;
  134 extern uint64_t memorystatus_available_pages_critical;
  135 #endif /* CONFIG_JETSAM */
  136
  137 extern unsigned int memorystatus_frozen_count;
  138 extern unsigned int memorystatus_suspended_count;
  139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
  140
  141 extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
  142 extern uint32_t memorystatus_jetsam_fg_band_waiters;
  143
  144 void vm_pressure_response(void);
  145 extern void consider_vm_pressure_events(void);
  146
      /*
       * NOTE(review): no use of this threshold is visible in this part of
       * the file; presumably it is compared against
       * memorystatus_suspended_count -- confirm against the user.
       */
  147 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
  148 #endif /* VM_PRESSURE_EVENTS */
  149
      /*
       * Both start out "unset" (THREAD_NULL / FALSE).  Whatever assigns them
       * is not in the visible part of this file.
       */
  150 thread_t  vm_pageout_scan_thread = THREAD_NULL;
  151 boolean_t vps_dynamic_priority_enabled = FALSE;
 152
  153 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
  154 #if !XNU_TARGET_OS_OSX
  155 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
  156 #else /* !XNU_TARGET_OS_OSX */
  157 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
  158 #endif /* !XNU_TARGET_OS_OSX */
  159 #endif
  160
      /*
       * Every tunable in this group is wrapped in #ifndef, so a definition
       * that is already in effect when this point is reached takes
       * precedence over the default given here.
       */
  161 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
  162 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
  163 #endif
  164
  165 #ifndef VM_PAGE_LAUNDRY_MAX
  166 #define VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
  167 #endif  /* VM_PAGE_LAUNDRY_MAX */
  168
  169 #ifndef VM_PAGEOUT_BURST_WAIT
  170 #define VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
  171 #endif  /* VM_PAGEOUT_BURST_WAIT */
  172
  173 #ifndef VM_PAGEOUT_EMPTY_WAIT
  174 #define VM_PAGEOUT_EMPTY_WAIT   50      /* milliseconds */
  175 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
  176
  177 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
  178 #define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
  179 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
  180
  181 #ifndef VM_PAGEOUT_IDLE_WAIT
  182 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
  183 #endif  /* VM_PAGEOUT_IDLE_WAIT */
  184
  185 #ifndef VM_PAGEOUT_SWAP_WAIT
  186 #define VM_PAGEOUT_SWAP_WAIT    10      /* milliseconds */
  187 #endif  /* VM_PAGEOUT_SWAP_WAIT */
  188
  189
      /*
       * Scales "total" by vm_pageout_state.vm_page_speculative_percentage,
       * computed as total / (100 / percentage).
       *
       * NOTE(review): if the percentage field is an integer, the inner
       * division truncates, so the result is only an exact percentage when
       * the value divides 100 evenly, and a value of 0 or anything above 100
       * makes the divisor 0.  Confirm that every writer of the field keeps
       * it in range.
       */
  190 #ifndef VM_PAGE_SPECULATIVE_TARGET
  191 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
  192 #endif /* VM_PAGE_SPECULATIVE_TARGET */
 193
 194
  195 /*
  196  *      To obtain a reasonable LRU approximation, the inactive queue
  197  *      needs to be large enough to give pages on it a chance to be
  198  *      referenced a second time.  This macro defines the fraction
  199  *      of active+inactive pages that should be inactive.
  200  *      The pageout daemon uses it to update vm_page_inactive_target.
  201  *
  202  *      If vm_page_free_count falls below vm_page_free_target and
  203  *      vm_page_inactive_count is below vm_page_inactive_target,
  204  *      then the pageout daemon starts running.
  205  */
  206
  207 #ifndef VM_PAGE_INACTIVE_TARGET
  208 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
  209 #endif  /* VM_PAGE_INACTIVE_TARGET */
  210
  211 /*
  212  *      Once the pageout daemon starts running, it keeps going
  213  *      until vm_page_free_count meets or exceeds vm_page_free_target.
       *
       *      The default is 15 plus 1/100 of "free" when XNU_TARGET_OS_OSX
       *      is not set, and 15 plus 1/80 of "free" when it is.
  214  */
  215
  216 #ifndef VM_PAGE_FREE_TARGET
  217 #if !XNU_TARGET_OS_OSX
  218 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
  219 #else /* !XNU_TARGET_OS_OSX */
  220 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
  221 #endif /* !XNU_TARGET_OS_OSX */
  222 #endif  /* VM_PAGE_FREE_TARGET */
  223
  224
  225 /*
  226  *      The pageout daemon always starts running once vm_page_free_count
  227  *      falls below vm_page_free_min.
       *
       *      The default is 10 plus 1/200 of "free" when XNU_TARGET_OS_OSX
       *      is not set, and 10 plus 1/100 of "free" when it is.
  228  */
  229
  230 #ifndef VM_PAGE_FREE_MIN
  231 #if !XNU_TARGET_OS_OSX
  232 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
  233 #else /* !XNU_TARGET_OS_OSX */
  234 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
  235 #endif /* !XNU_TARGET_OS_OSX */
  236 #endif  /* VM_PAGE_FREE_MIN */
  237
      /*
       * Platform-specific limits, in pages by their names.
       * NOTE(review): the code that applies them is not in the visible part
       * of this file; presumably they cap the values computed by the
       * macros above -- confirm.
       */
  238 #if !XNU_TARGET_OS_OSX
  239 #define VM_PAGE_FREE_RESERVED_LIMIT     100
  240 #define VM_PAGE_FREE_MIN_LIMIT          1500
  241 #define VM_PAGE_FREE_TARGET_LIMIT       2000
  242 #else /* !XNU_TARGET_OS_OSX */
  243 #define VM_PAGE_FREE_RESERVED_LIMIT     1700
  244 #define VM_PAGE_FREE_MIN_LIMIT          3500
  245 #define VM_PAGE_FREE_TARGET_LIMIT       4000
  246 #endif /* !XNU_TARGET_OS_OSX */
  247
  248 /*
  249  *      When vm_page_free_count falls below vm_page_free_reserved,
  250  *      only vm-privileged threads can allocate pages.  vm-privilege
  251  *      allows the pageout daemon and default pager (and any other
  252  *      associated threads needed for default pageout) to continue
  253  *      operation by dipping into the reserved pool of pages.
       *
       *      Note that the (unsigned) cast in the macro below binds only to
       *      the (6 * VM_PAGE_LAUNDRY_MAX) term; "n" is added afterwards
       *      under the usual arithmetic conversions, and the expansion as
       *      a whole is not wrapped in parentheses.
  254  */
  255
  256 #ifndef VM_PAGE_FREE_RESERVED
  257 #define VM_PAGE_FREE_RESERVED(n)        \
  258         ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
  259 #endif  /* VM_PAGE_FREE_RESERVED */
  260
  261 /*
  262  *      When we dequeue pages from the inactive list, they are
  263  *      reactivated (ie, put back on the active queue) if referenced.
  264  *      However, it is possible to starve the free list if other
  265  *      processors are referencing pages faster than we can turn off
  266  *      the referenced bit.  So we limit the number of reactivations
  267  *      we will make per call of vm_pageout_scan().
       *
       *      NOTE(review): in the XNU_TARGET_OS_OSX variant the limit is
       *      MAX(avail / 20, VM_PAGE_REACTIVATE_LIMIT_MAX).  Assuming MAX
       *      yields the larger argument, the result is never below 20000,
       *      i.e. the "_MAX" constant acts as a floor rather than a
       *      ceiling.  Confirm whether that is intended.
  268  */
  269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
  270
  271 #ifndef VM_PAGE_REACTIVATE_LIMIT
  272 #if !XNU_TARGET_OS_OSX
  273 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
  274 #else /* !XNU_TARGET_OS_OSX */
  275 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
  276 #endif /* !XNU_TARGET_OS_OSX */
  277 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
      /* NOTE(review): not referenced in the visible part of this file. */
  278 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       1000
 279
  280 extern boolean_t hibernate_cleaning_in_progress;        /* defined outside this file */
  281
  282 /*
  283  * Forward declarations for internal routines.
  284  */
      /*
       * Argument block handed to vm_pageout_iothread_internal(), whose
       * prototype below takes a "struct cq *".
       *
       * NOTE(review): the field meanings below are inferred from their
       * names and types only; the code that fills them in is not in the
       * visible part of this file.
       *   q             - presumably the pageout queue this thread services
       *   current_chead - opaque pointer, presumably compressor state
       *   scratch_buf   - presumably a per-thread scratch buffer
       *   id            - presumably this entry's index in ciq[]
       */
  285 struct cq {
  286         struct vm_pageout_queue *q;
  287         void                    *current_chead;
  288         char                    *scratch_buf;
  289         int                     id;
  290 };
  291
      /* One entry per slot, sized by MAX_COMPRESSOR_THREAD_COUNT. */
  292 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
 293
 294
  295 #if VM_PRESSURE_EVENTS
  296 void vm_pressure_thread(void);
  297
      /*
       * Predicates named after pressure-level transitions.  Their bodies
       * are not in the visible part of this file.
       */
  298 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
  299 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
  300
  301 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
  302 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
  303 #endif
  304
  305 void vm_pageout_garbage_collect(int);
  306 static void vm_pageout_iothread_external(void);
  307 static void vm_pageout_iothread_internal(struct cq *cq);
  308 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
  309
  310 extern void vm_pageout_continue(void);
  311 extern void vm_pageout_scan(void);
  312
  313 boolean_t vm_pageout_running = FALSE;
  314
  315 uint32_t vm_page_upl_tainted = 0;
  316 uint32_t vm_page_iopl_tainted = 0;
  317
      /* Only exists when XNU_TARGET_OS_OSX is set. */
  318 #if XNU_TARGET_OS_OSX
  319 static boolean_t vm_pageout_waiter  = FALSE;
  320 #endif /* XNU_TARGET_OS_OSX */
  321
  322
      /* vm_pageout_debug is only compiled into DEVELOPMENT or DEBUG builds. */
  323 #if DEVELOPMENT || DEBUG
  324 struct vm_pageout_debug vm_pageout_debug;
  325 #endif
  326 struct vm_pageout_vminfo vm_pageout_vminfo;
  327 struct vm_pageout_state  vm_pageout_state;
  328 struct vm_config         vm_config;
  329
      /*
       * The two pageout queues.  vm_pageout_cluster() and
       * vm_pageout_throttle_up() pick the "internal" queue when the page's
       * object has ->internal == TRUE and the "external" queue otherwise.
       */
  330 struct  vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
  331 struct  vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
  332
  333 int         vm_upl_wait_for_pages = 0;
  334 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
  335
      /*
       * Volatile pointer to a callback taking an int and returning
       * boolean_t.  It is NULL until something outside the visible part of
       * this file installs a function.
       */
  336 boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
  337
  338 int     vm_debug_events = 0;
  339
  340 LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
  341
  342 #if CONFIG_MEMORYSTATUS
  343 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
  344
      /*
       * NOTE(review): "_nr"/"_dr" presumably stand for the numerator and
       * denominator of a scaling factor (5/2 by default) -- confirm
       * against the code that uses them.
       */
  345 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
  346 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
  347
  348 #endif
  349
      /* Only present when __AMP__ is set. */
  350 #if __AMP__
  351 int vm_compressor_ebound = 1;
  352 int vm_pgo_pbound = 0;
  353 extern void thread_bind_cluster_type(thread_t, char, bool);
  354 #endif /* __AMP__ */
 355
 356
  357 /*
  358  *      Routine:        vm_pageout_object_terminate
  359  *      Purpose:
  360  *              Destroy the pageout_object, and perform all of the
  361  *              required cleanup actions.
  362  *
  363  *      In/Out conditions:
  364  *              The object must be locked, and will be returned locked.
       *              The shadow object (object->shadow) is locked on entry
       *              to this routine's work loop and unlocked again before
       *              returning; the page queues lock is taken and released
       *              once per shadow page that is processed.
       *
       *      Parameters:
       *              object  the pageout object being torn down; the
       *                      routine asserts object->pageout.
  365  */
  366 void
  367 vm_pageout_object_terminate(
  368         vm_object_t     object)
  369 {
  370         vm_object_t     shadow_object;
  371
  372         /*
  373          * Deal with the deallocation (last reference) of a pageout object
  374          * (used for cleaning-in-place) by dropping the paging references/
  375          * freeing pages in the original object.
  376          */
  377
  378         assert(object->pageout);
  379         shadow_object = object->shadow;
  380         vm_object_lock(shadow_object);
  381
              /*
               * Each pass takes the first page on object->memq and frees it,
               * so the loop runs until the pageout object holds no pages.
               * This relies on VM_PAGE_FREE() taking the page off
               * object->memq.
               */
  382         while (!vm_page_queue_empty(&object->memq)) {
  383                 vm_page_t               p, m;
  384                 vm_object_offset_t      offset;
  385
  386                 p = (vm_page_t) vm_page_queue_first(&object->memq);
  387
  388                 assert(p->vmp_private);
  389                 assert(p->vmp_free_when_done);
  390                 p->vmp_free_when_done = FALSE;
  391                 assert(!p->vmp_cleaning);
  392                 assert(!p->vmp_laundry);
  393
                      /* Save the offset first: p is not usable after the free. */
  394                 offset = p->vmp_offset;
  395                 VM_PAGE_FREE(p);
  396                 p = VM_PAGE_NULL;
  397
                      /*
                       * Find the page in the shadow object at the matching
                       * offset; if there is none, nothing more to do for
                       * this entry.
                       */
  398                 m = vm_page_lookup(shadow_object,
  399                     offset + object->vo_shadow_offset);
  400
  401                 if (m == VM_PAGE_NULL) {
  402                         continue;
  403                 }
  404
  405                 assert((m->vmp_dirty) || (m->vmp_precious) ||
  406                     (m->vmp_busy && m->vmp_cleaning));
  407
  408                 /*
  409                  * Handle the trusted pager throttle.
  410                  * Also decrement the burst throttle (if external).
  411                  */
  412                 vm_page_lock_queues();
  413                 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
  414                         vm_pageout_throttle_up(m);
  415                 }
  416
  417                 /*
  418                  * Handle the "target" page(s). These pages are to be freed if
  419                  * successfully cleaned. Target pages are always busy, and are
  420                  * wired exactly once. The initial target pages are not mapped,
  421                  * (so cannot be referenced or modified) but converted target
  422                  * pages may have been modified between the selection as an
  423                  * adjacent page and conversion to a target.
  424                  */
  425                 if (m->vmp_free_when_done) {
  426                         assert(m->vmp_busy);
  427                         assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
  428                         assert(m->vmp_wire_count == 1);
  429                         m->vmp_cleaning = FALSE;
  430                         m->vmp_free_when_done = FALSE;
  431                         /*
  432                          * Revoke all access to the page. Since the object is
  433                          * locked, and the page is busy, this prevents the page
  434                          * from being dirtied after the pmap_disconnect() call
  435                          * returns.
  436                          *
  437                          * Since the page is left "dirty" but "not modified", we
  438                          * can detect whether the page was redirtied during
  439                          * pageout by checking the modify state.
  440                          */
  441                         if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
  442                                 SET_PAGE_DIRTY(m, FALSE);
  443                         } else {
  444                                 m->vmp_dirty = FALSE;
  445                         }
  446
                              /*
                               * A page that is still dirty is kept (unwired
                               * and woken up); otherwise it is freed.
                               */
  447                         if (m->vmp_dirty) {
  448                                 vm_page_unwire(m, TRUE);        /* reactivates */
  449                                 counter_inc(&vm_statistics_reactivations);
  450                                 PAGE_WAKEUP_DONE(m);
  451                         } else {
  452                                 vm_page_free(m);  /* clears busy, etc. */
  453                         }
  454                         vm_page_unlock_queues();
  455                         continue;
  456                 }
  457                 /*
  458                  * Handle the "adjacent" pages. These pages were cleaned in
  459                  * place, and should be left alone.
  460                  * If prep_pin_count is nonzero, then someone is using the
  461                  * page, so make it active.
                       *
                       * NOTE(review): no prep_pin_count is consulted below; the
                       * code activates the page when vmp_reference is set and
                       * deactivates it otherwise, and only for pages that are
                       * on no queue and not private.
  462                  */
  463                 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
  464                         if (m->vmp_reference) {
  465                                 vm_page_activate(m);
  466                         } else {
  467                                 vm_page_deactivate(m);
  468                         }
  469                 }
  470                 if (m->vmp_overwriting) {
  471                         /*
  472                          * the (COPY_OUT_FROM == FALSE) request_page_list case
  473                          */
  474                         if (m->vmp_busy) {
  475                                 /*
  476                                  * We do not re-set m->vmp_dirty !
  477                                  * The page was busy so no extraneous activity
  478                                  * could have occurred. COPY_INTO is a read into the
  479                                  * new pages. CLEAN_IN_PLACE does actually write
  480                                  * out the pages but handling outside of this code
  481                                  * will take care of resetting dirty. We clear the
  482                                  * modify however for the Programmed I/O case.
  483                                  */
  484                                 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
  485
  486                                 m->vmp_busy = FALSE;
  487                                 m->vmp_absent = FALSE;
  488                         } else {
  489                                 /*
  490                                  * alternate (COPY_OUT_FROM == FALSE) request_page_list case
  491                                  * Occurs when the original page was wired
  492                                  * at the time of the list request
  493                                  */
  494                                 assert(VM_PAGE_WIRED(m));
  495                                 vm_page_unwire(m, TRUE);        /* reactivates */
  496                         }
  497                         m->vmp_overwriting = FALSE;
  498                 } else {
                              /* Not an overwrite: the page is now clean. */
  499                         m->vmp_dirty = FALSE;
  500                 }
  501                 m->vmp_cleaning = FALSE;
  502
  503                 /*
  504                  * Wakeup any thread waiting for the page to be un-cleaning.
  505                  */
  506                 PAGE_WAKEUP(m);
  507                 vm_page_unlock_queues();
  508         }
  509         /*
  510          * Account for the paging reference taken in vm_paging_object_allocate.
  511          */
  512         vm_object_activity_end(shadow_object);
  513         vm_object_unlock(shadow_object);
  514
              /* By now the pageout object must be fully drained and idle. */
  515         assert(object->ref_count == 0);
  516         assert(object->paging_in_progress == 0);
  517         assert(object->activity_in_progress == 0);
  518         assert(object->resident_page_count == 0);
  519         return;
  520 }
 521
  522 /*
  523  * Routine:     vm_pageclean_setup
  524  *
  525  * Purpose:     setup a page to be cleaned (made non-dirty), but not
  526  *              necessarily flushed from the VM page cache.
  527  *              This is accomplished by cleaning in place.
  528  *
  529  *              The page must not be busy, and new_object
  530  *              must be locked.
  531  *
       * Parameters:
       *      m               the real page to clean; asserted not busy.
       *      new_m           a fictitious page (asserted) that is turned into
       *                      a private page sharing m's physical page.
       *      new_object      object that new_m is inserted into.
       *      new_offset      offset of new_m within new_object.
       *
       * On return m is marked vmp_cleaning, and new_m is private, wired,
       * marked vmp_free_when_done, inserted in new_object and not busy.
  532  */
  533 static void
  534 vm_pageclean_setup(
  535         vm_page_t               m,
  536         vm_page_t               new_m,
  537         vm_object_t             new_object,
  538         vm_object_offset_t      new_offset)
  539 {
  540         assert(!m->vmp_busy);
              /* This check is compiled out by the #if 0 below. */
  541 #if 0
  542         assert(!m->vmp_cleaning);
  543 #endif
  544
  545         pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
  546
  547         /*
  548          * Mark original page as cleaning in place.
  549          */
  550         m->vmp_cleaning = TRUE;
  551         SET_PAGE_DIRTY(m, FALSE);
  552         m->vmp_precious = FALSE;
  553
  554         /*
  555          * Convert the fictitious page to a private shadow of
  556          * the real page.
  557          */
  558         assert(new_m->vmp_fictitious);
  559         assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
  560         new_m->vmp_fictitious = FALSE;
  561         new_m->vmp_private = TRUE;
  562         new_m->vmp_free_when_done = TRUE;
  563         VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
  564
              /* The wire is done with the page queues lock held. */
  565         vm_page_lockspin_queues();
  566         vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
  567         vm_page_unlock_queues();
  568
  569         vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
  570         assert(!new_m->vmp_wanted);
  571         new_m->vmp_busy = FALSE;
  572 }
 573
  574 /*
  575  *      Routine:        vm_pageout_initialize_page
  576  *      Purpose:
  577  *              Causes the specified page to be initialized in
  578  *              the appropriate memory object. This routine is used to push
  579  *              pages into a copy-object when they are modified in the
  580  *              permanent object.
  581  *
  582  *              The page is moved to a temporary object and paged out.
  583  *
  584  *      In/out conditions:
  585  *              The page in question must not be on any pageout queues.
  586  *              The object to which it belongs must be locked.
  587  *              The page must be busy, but not hold a paging reference.
       *              The object lock is dropped around the call into the
       *              pager and re-taken before returning.
  588  *
  589  *      Implementation:
  590  *              Move this page to a completely new object.
       *
       *      NOTE(review): the two "moved to a ... object" statements above
       *      do not match the body below, which never moves the page.  It
       *      clears the page's modify/dirty state, takes a paging reference
       *      on the page's own object, and calls
       *      memory_object_data_initialize() on that object's pager for a
       *      PAGE_SIZE range at the page's paging offset.
       *
       *      Parameters:
       *              m       the page to push; asserted busy, dirty, not
       *                      absent, not in error, and belonging to an
       *                      internal object.
  591  */
  592 void
  593 vm_pageout_initialize_page(
  594         vm_page_t       m)
  595 {
  596         vm_object_t             object;
  597         vm_object_offset_t      paging_offset;
  598         memory_object_t         pager;
  599
  600         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
  601
  602         object = VM_PAGE_OBJECT(m);
  603
  604         assert(m->vmp_busy);
  605         assert(object->internal);
  606
  607         /*
  608          *      Verify that we really want to clean this page
  609          */
  610         assert(!m->vmp_absent);
  611         assert(!m->vmp_error);
  612         assert(m->vmp_dirty);
  613
  614         /*
  615          *      Create a paging reference to let us play with the object.
               *      (The reference itself is taken further down, by
               *      vm_object_paging_begin(); here only the offset to hand
               *      to the pager is computed.)
  616          */
  617         paging_offset = m->vmp_offset + object->paging_offset;
  618
              /*
               * NOTE(review): presumably panic() does not return, which would
               * make the statements after each panic() call in this function
               * unreachable -- confirm.  If they could run, note that this
               * path unlocks the object while the missing-pager path below
               * does not.
               */
  619         if (m->vmp_absent || m->vmp_error || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
  620                 panic("reservation without pageout?"); /* alan */
  621
  622                 VM_PAGE_FREE(m);
  623                 vm_object_unlock(object);
  624
  625                 return;
  626         }
  627
  628         /*
  629          * If there's no pager, then we can't clean the page.  This should
  630          * never happen since this should be a copy object and therefore not
  631          * an external object, so the pager should always be there.
  632          */
  633
  634         pager = object->pager;
  635
  636         if (pager == MEMORY_OBJECT_NULL) {
  637                 panic("missing pager for copy object");
  638
  639                 VM_PAGE_FREE(m);
  640                 return;
  641         }
  642
  643         /*
  644          * set the page for future call to vm_fault_list_request
  645          */
  646         pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
  647         SET_PAGE_DIRTY(m, FALSE);
  648
  649         /*
  650          * keep the object from collapsing or terminating
  651          */
  652         vm_object_paging_begin(object);
  653         vm_object_unlock(object);
  654
  655         /*
  656          *      Write the data to its pager.
  657          *      Note that the data is passed by naming the new object,
  658          *      not a virtual address; the pager interface has been
  659          *      manipulated to use the "internal memory" data type.
  660          *      [The object reference from its allocation is donated
  661          *      to the eventual recipient.]
  662          */
  663         memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
  664
              /* Re-take the object lock and drop the paging reference. */
  665         vm_object_lock(object);
  666         vm_object_paging_end(object);
  667 }
 668
 669
  670 /*
  671  * vm_pageout_cluster:
  672  *
  673  * Given a page, queue it to the appropriate I/O thread,
  674  * which will page it out and attempt to clean adjacent pages
  675  * in the same operation.
  676  *
  677  * The object and queues must be locked. We will take a
  678  * paging reference to prevent deallocation or collapse when we
  679  * release the object lock back at the call site.  The I/O thread
  680  * is responsible for consuming this reference
  681  *
  682  * The page must not be on any pageout queue.
       *
       * Parameters:
       *      m       page to queue.  Asserted to be dirty or precious, not
       *              wired, not cleaning, not in laundry, and on no queue.
       *
       * The reference taken here is an activity reference
       * (vm_object_activity_begin).  vm_pageout_throttle_up() drops it with
       * vm_object_activity_end() if it pulls the page back off the queue.
  683  */
      /*
       * Debug-only state (DEVELOPMENT or DEBUG builds).
       * NOTE(review): nothing in the visible part of this file reads or
       * writes these; the names suggest they track the compressor threads
       * ("vmct").
       */
  684 #if DEVELOPMENT || DEBUG
  685 vmct_stats_t vmct_stats;
  686
  687 int32_t vmct_active = 0;
  688 uint64_t vm_compressor_epoch_start = 0;
  689 uint64_t vm_compressor_epoch_stop = 0;
  690
  691 typedef enum vmct_state_t {
  692         VMCT_IDLE,
  693         VMCT_AWAKENED,
  694         VMCT_ACTIVE,
  695 } vmct_state_t;
  696 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
  697 #endif
  698
  699
  700 void
  701 vm_pageout_cluster(vm_page_t m)
  702 {
  703         vm_object_t     object = VM_PAGE_OBJECT(m);
  704         struct          vm_pageout_queue *q;
  705
  706         VM_PAGE_CHECK(m);
  707         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
  708         vm_object_lock_assert_exclusive(object);
  709
  710         /*
  711          * Only a certain kind of page is appreciated here.
  712          */
  713         assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
  714         assert(!m->vmp_cleaning && !m->vmp_laundry);
  715         assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
  716
  717         /*
  718          * protect the object from collapse or termination
  719          */
  720         vm_object_activity_begin(object);
  721
              /*
               * Pages of internal objects go to the internal queue and are
               * marked busy here; all other pages go to the external queue
               * and their busy bit is left untouched.
               */
  722         if (object->internal == TRUE) {
  723                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
  724
  725                 m->vmp_busy = TRUE;
  726
  727                 q = &vm_pageout_queue_internal;
  728         } else {
  729                 q = &vm_pageout_queue_external;
  730         }
  731
  732         /*
  733          * pgo_laundry count is tied to the laundry bit
  734          */
  735         m->vmp_laundry = TRUE;
  736         q->pgo_laundry++;
  737
  738         m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
  739         vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
  740
              /*
               * If the queue was flagged idle, clear the flag and issue a
               * wakeup on the address of its pending list.
               */
  741         if (q->pgo_idle == TRUE) {
  742                 q->pgo_idle = FALSE;
  743                 thread_wakeup((event_t) &q->pgo_pending);
  744         }
  745         VM_PAGE_CHECK(m);
  746 }
 747
 748
  749 /*
  750  * A page is back from laundry or we are stealing it back from
  751  * the laundering state.  See if there are some pages waiting to
  752  * go to laundry and if we can let some of them go now.
  753  *
  754  * Object and page queues must be locked.
       *
       * Parameters:
       *      m       the page; its object is asserted to be non-null and
       *              not kernel_object.
       *
       * Two independent steps are performed:
       *   1. If m is still on a pageout queue, it is removed from the
       *      queue's pending list and the object's activity reference is
       *      dropped.
       *   2. If m has its laundry bit set, the bit is cleared, the queue's
       *      pgo_laundry count is decremented, and waiters are woken (see
       *      the comments in the body).
  755  */
  756 void
  757 vm_pageout_throttle_up(
  758         vm_page_t       m)
  759 {
  760         struct vm_pageout_queue *q;
  761         vm_object_t      m_object;
  762
  763         m_object = VM_PAGE_OBJECT(m);
  764
  765         assert(m_object != VM_OBJECT_NULL);
  766         assert(m_object != kernel_object);
  767
  768         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
  769         vm_object_lock_assert_exclusive(m_object);
  770
              /* Same queue selection rule as vm_pageout_cluster(). */
  771         if (m_object->internal == TRUE) {
  772                 q = &vm_pageout_queue_internal;
  773         } else {
  774                 q = &vm_pageout_queue_external;
  775         }
  776
  777         if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
  778                 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
  779                 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
  780
  781                 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
  782
                      /* Pairs with vm_object_activity_begin() in vm_pageout_cluster(). */
  783                 vm_object_activity_end(m_object);
  784
  785                 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
  786         }
  787         if (m->vmp_laundry == TRUE) {
  788                 m->vmp_laundry = FALSE;
  789                 q->pgo_laundry--;
  790
                      /* Throttled waiters are woken on the address of pgo_laundry. */
  791                 if (q->pgo_throttled == TRUE) {
  792                         q->pgo_throttled = FALSE;
  793                         thread_wakeup((event_t) &q->pgo_laundry);
  794                 }
                      /*
                       * Drain waiters are woken only once the laundry count
                       * reaches zero.  (&q->pgo_laundry + 1) is not
                       * dereferenced here; it only serves as an event value
                       * distinct from the one used for throttling above.
                       */
  795                 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
  796                         q->pgo_draining = FALSE;
  797                         thread_wakeup((event_t) (&q->pgo_laundry + 1));
  798                 }
  799                 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
  800         }
  801 }
 802
 803
      /*
       * vm_pageout_throttle_up_batch:
       *
       * Queue-level counterpart of vm_pageout_throttle_up(): subtracts
       * batch_cnt from q->pgo_laundry in one step and performs the same
       * throttle/drain wakeups.  No per-page state is touched here, so the
       * pages' laundry bits are presumably cleared by the caller -- confirm.
       *
       * The page queues lock must be held (asserted).
       *
       * Parameters:
       *      q               the pageout queue to credit.
       *      batch_cnt       amount to subtract from q->pgo_laundry.
       */
  804 static void
  805 vm_pageout_throttle_up_batch(
  806         struct vm_pageout_queue *q,
  807         int             batch_cnt)
  808 {
  809         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
  810
  811         VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
  812
  813         q->pgo_laundry -= batch_cnt;
  814
              /* Wake throttled waiters (event: address of pgo_laundry). */
  815         if (q->pgo_throttled == TRUE) {
  816                 q->pgo_throttled = FALSE;
  817                 thread_wakeup((event_t) &q->pgo_laundry);
  818         }
              /* Wake drain waiters only once the laundry count is zero. */
  819         if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
  820                 q->pgo_draining = FALSE;
  821                 thread_wakeup((event_t) (&q->pgo_laundry + 1));
  822         }
  823 }
 824
 825
 826
 827 /*
 828  * VM memory pressure monitoring.
 829  *
 830  * vm_pageout_scan() keeps track of the number of pages it considers and
 831  * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
 832  *
 833  * compute_memory_pressure() is called every second from compute_averages()
 834  * and moves "vm_pageout_stat_now" forward, to start accumulating the number
  835  * of reclaimed pages in a new vm_pageout_stat[] bucket.
 836  *
 837  * mach_vm_pressure_monitor() collects past statistics about memory pressure.
 838  * The caller provides the number of seconds ("nsecs") worth of statistics
 839  * it wants, up to 30 seconds.
 840  * It computes the number of pages reclaimed in the past "nsecs" seconds and
 841  * also returns the number of pages the system still needs to reclaim at this
 842  * moment in time.
 843  */
 844 #if DEVELOPMENT || DEBUG
 845 #define VM_PAGEOUT_STAT_SIZE    (30 * 8) + 1
 846 #else
 847 #define VM_PAGEOUT_STAT_SIZE    (1 * 8) + 1
 848 #endif
/*
 * One sample slot of the vm_pageout_stats[] ring.  update_vm_info() fills
 * in the slot selected by vm_pageout_stat_now; record_memory_pressure()
 * zeroes the next slot before making it current.
 */
struct vm_pageout_stat {
        /*
         * Point-in-time copies of the same-named global page counters,
         * taken by update_vm_info().
         */
        unsigned long vm_page_active_count;
        unsigned long vm_page_speculative_count;
        unsigned long vm_page_inactive_count;
        unsigned long vm_page_anonymous_count;

        unsigned long vm_page_free_count;
        unsigned long vm_page_wire_count;
        unsigned long vm_page_compressor_count;         /* copied from VM_PAGE_COMPRESSOR_COUNT */

        unsigned long vm_page_pages_compressed;         /* copied from c_segment_pages_compressed */
        unsigned long vm_page_pageable_internal_count;
        unsigned long vm_page_pageable_external_count;
        unsigned long vm_page_xpmapped_external_count;

        /*
         * Activity counters.  The ones whose assignment is visible in
         * update_vm_info() are stored as the change in a cumulative
         * counter since the previous sample, truncated to unsigned int.
         * NOTE(review): the remaining ones are presumably computed the
         * same way -- confirm against the rest of update_vm_info().
         */
        unsigned int pages_grabbed;
        unsigned int pages_freed;

        unsigned int pages_compressed;
        unsigned int pages_grabbed_by_compressor;
        unsigned int failed_compressions;

        unsigned int pages_evicted;
        unsigned int pages_purged;

        unsigned int considered;
        unsigned int considered_bq_internal;
        unsigned int considered_bq_external;

        unsigned int skipped_external;
        unsigned int filecache_min_reactivations;

        /*
         * The four freed_* fields are what record_memory_pressure() and
         * mach_vm_pressure_monitor() add up as "pages reclaimed".
         */
        unsigned int freed_speculative;
        unsigned int freed_cleaned;
        unsigned int freed_internal;
        unsigned int freed_external;

        unsigned int cleaned_dirty_external;
        unsigned int cleaned_dirty_internal;

        unsigned int inactive_referenced;
        unsigned int inactive_nolock;
        unsigned int reactivation_limit_exceeded;
        unsigned int forced_inactive_reclaim;

        unsigned int throttled_internal_q;
        unsigned int throttled_external_q;

        unsigned int phantom_ghosts_found;
        unsigned int phantom_ghosts_added;
} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, };
 900
 901 unsigned int vm_pageout_stat_now = 0;
 902
 903 #define VM_PAGEOUT_STAT_BEFORE(i) \
 904         (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
 905 #define VM_PAGEOUT_STAT_AFTER(i) \
 906         (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
 907
 908 #if VM_PAGE_BUCKETS_CHECK
 909 int vm_page_buckets_check_interval = 80; /* in eighths of a second */
 910 #endif /* VM_PAGE_BUCKETS_CHECK */
 911
 912
 913 void
 914 record_memory_pressure(void);
 915 void
 916 record_memory_pressure(void)
 917 {
 918         unsigned int vm_pageout_next;
 919
 920 #if VM_PAGE_BUCKETS_CHECK
 921         /* check the consistency of VM page buckets at regular interval */
 922         static int counter = 0;
 923         if ((++counter % vm_page_buckets_check_interval) == 0) {
 924                 vm_page_buckets_check();
 925         }
 926 #endif /* VM_PAGE_BUCKETS_CHECK */
 927
 928         vm_pageout_state.vm_memory_pressure =
 929             vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
 930             vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
 931             vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
 932             vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
 933
 934         commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure );
 935
 936         /* move "now" forward */
 937         vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
 938
 939         bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
 940
 941         vm_pageout_stat_now = vm_pageout_next;
 942 }
 943
 944
 945 /*
 946  * IMPORTANT
 947  * mach_vm_ctl_page_free_wanted() is called indirectly, via
 948  * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
 949  * it must be safe in the restricted stackshot context. Locks and/or
 950  * blocking are not allowable.
 951  */
 952 unsigned int
 953 mach_vm_ctl_page_free_wanted(void)
 954 {
 955         unsigned int page_free_target, page_free_count, page_free_wanted;
 956
 957         page_free_target = vm_page_free_target;
 958         page_free_count = vm_page_free_count;
 959         if (page_free_target > page_free_count) {
 960                 page_free_wanted = page_free_target - page_free_count;
 961         } else {
 962                 page_free_wanted = 0;
 963         }
 964
 965         return page_free_wanted;
 966 }
 967
 968
 969 /*
 970  * IMPORTANT:
 971  * mach_vm_pressure_monitor() is called when taking a stackshot, with
 972  * wait_for_pressure FALSE, so that code path must remain safe in the
 973  * restricted stackshot context. No blocking or locks are allowable.
 974  * on that code path.
 975  */
 976
 977 kern_return_t
 978 mach_vm_pressure_monitor(
 979         boolean_t       wait_for_pressure,
 980         unsigned int    nsecs_monitored,
 981         unsigned int    *pages_reclaimed_p,
 982         unsigned int    *pages_wanted_p)
 983 {
 984         wait_result_t   wr;
 985         unsigned int    vm_pageout_then, vm_pageout_now;
 986         unsigned int    pages_reclaimed;
 987         unsigned int    units_of_monitor;
 988
 989         units_of_monitor = 8 * nsecs_monitored;
 990         /*
 991          * We don't take the vm_page_queue_lock here because we don't want
 992          * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
 993          * thread when it's trying to reclaim memory.  We don't need fully
 994          * accurate monitoring anyway...
 995          */
 996
 997         if (wait_for_pressure) {
 998                 /* wait until there's memory pressure */
 999                 while (vm_page_free_count >= vm_page_free_target) {
1000                         wr = assert_wait((event_t) &vm_page_free_wanted,
1001                             THREAD_INTERRUPTIBLE);
1002                         if (wr == THREAD_WAITING) {
1003                                 wr = thread_block(THREAD_CONTINUE_NULL);
1004                         }
1005                         if (wr == THREAD_INTERRUPTED) {
1006                                 return KERN_ABORTED;
1007                         }
1008                         if (wr == THREAD_AWAKENED) {
1009                                 /*
1010                                  * The memory pressure might have already
1011                                  * been relieved but let's not block again
1012                                  * and let's report that there was memory
1013                                  * pressure at some point.
1014                                  */
1015                                 break;
1016                         }
1017                 }
1018         }
1019
1020         /* provide the number of pages the system wants to reclaim */
1021         if (pages_wanted_p != NULL) {
1022                 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1023         }
1024
1025         if (pages_reclaimed_p == NULL) {
1026                 return KERN_SUCCESS;
1027         }
1028
1029         /* provide number of pages reclaimed in the last "nsecs_monitored" */
1030         vm_pageout_now = vm_pageout_stat_now;
1031         pages_reclaimed = 0;
1032         for (vm_pageout_then =
1033             VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1034             vm_pageout_then != vm_pageout_now &&
1035             units_of_monitor-- != 0;
1036             vm_pageout_then =
1037             VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1038                 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1039                 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1040                 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1041                 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1042         }
1043         *pages_reclaimed_p = pages_reclaimed;
1044
1045         return KERN_SUCCESS;
1046 }
1047
1048
1049
1050 #if DEVELOPMENT || DEBUG
1051
/* per-queue worker used by vm_pageout_disconnect_all_pages() */
static void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);

/*
 * Guard flag used to make sure there is only a single sweep going on
 * at a time.  Despite the historical "condition variable" wording,
 * nothing visible here sleeps on it: vm_pageout_disconnect_all_pages()
 * tests and sets it under the page queues lock and simply returns if
 * a sweep is already in progress.
 */
boolean_t       vm_pageout_disconnect_all_pages_active = FALSE;
1060
1061
1062 void
1063 vm_pageout_disconnect_all_pages()
1064 {
1065         vm_page_lock_queues();
1066
1067         if (vm_pageout_disconnect_all_pages_active == TRUE) {
1068                 vm_page_unlock_queues();
1069                 return;
1070         }
1071         vm_pageout_disconnect_all_pages_active = TRUE;
1072         vm_page_unlock_queues();
1073
1074         vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1075         vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1076         vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1077
1078         vm_pageout_disconnect_all_pages_active = FALSE;
1079 }
1080
1081
/*
 * vm_pageout_disconnect_all_pages_in_queue:
 *
 * Visit up to "qcount" pages of page queue "q" and call pmap_disconnect()
 * on each one that is marked vmp_pmapped and is not in one of the states
 * filtered out below.  Every visited page is removed from the queue and
 * re-entered with vm_page_queue_enter(), so each iteration looks at the
 * queue's current first page.
 *
 * The page queues lock is taken here and held for the walk; it is dropped
 * only around mutex_pause() after a failed object try-lock, and yielded
 * every so often.  Object locks are only ever try-locked while it is held.
 * A kdebug START/END event pair brackets the walk.
 */
void
vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
{
        vm_page_t       m;
        vm_object_t     t_object = NULL;        /* object whose try-lock most recently failed */
        vm_object_t     l_object = NULL;        /* object we currently hold locked, if any */
        vm_object_t     m_object = NULL;        /* object owning the page being examined */
        int             delayed_unlock = 0;     /* pages handled since the queues lock was last released */
        int             try_failed_count = 0;   /* consecutive try-lock failures on t_object */
        int             disconnected_count = 0; /* reported in the END trace event */
        int             paused_count = 0;       /* reported in the END trace event */
        int             object_locked_count = 0; /* reported in the END trace event */

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
            q, qcount, 0, 0, 0);

        vm_page_lock_queues();

        while (qcount && !vm_page_queue_empty(q)) {
                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

                m = (vm_page_t) vm_page_queue_first(q);
                m_object = VM_PAGE_OBJECT(m);

                /*
                 * check to see if we currently are working
                 * with the same object... if so, we've
                 * already got the lock
                 */
                if (m_object != l_object) {
                        /*
                         * the object associated with candidate page is
                         * different from the one we were just working
                         * with... dump the lock if we still own it
                         */
                        if (l_object != NULL) {
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
                        /* a different object than the one we last failed on: restart the failure count */
                        if (m_object != t_object) {
                                try_failed_count = 0;
                        }

                        /*
                         * Try to lock object; since we've already got the
                         * page queues lock, we can only 'try' for this one.
                         * if the 'try' fails, we need to do a mutex_pause
                         * to allow the owner of the object lock a chance to
                         * run...
                         */
                        if (!vm_object_lock_try_scan(m_object)) {
                                /*
                                 * too many failures on this object: give up
                                 * on the page and requeue it untouched
                                 * (no object lock is held at this point)
                                 */
                                if (try_failed_count > 20) {
                                        goto reenter_pg_on_q;
                                }
                                vm_page_unlock_queues();
                                mutex_pause(try_failed_count++);
                                vm_page_lock_queues();
                                delayed_unlock = 0;

                                paused_count++;

                                /* retry the (possibly different) first page; qcount is not consumed */
                                t_object = m_object;
                                continue;
                        }
                        object_locked_count++;

                        l_object = m_object;
                }
                if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
                        /*
                         * page is in a state we leave alone:
                         * put it back on its queue
                         * NOTE(review): the original comment said "head";
                         * where vm_page_queue_enter() inserts is not
                         * visible here -- confirm.
                         */
                        goto reenter_pg_on_q;
                }
                if (m->vmp_pmapped == TRUE) {
                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

                        disconnected_count++;
                }
reenter_pg_on_q:
                /* move the page off the front of the queue so the next iteration sees a new one */
                vm_page_queue_remove(q, m, vmp_pageq);
                vm_page_queue_enter(q, m, vmp_pageq);

                qcount--;
                try_failed_count = 0;

                /* periodically drop the object lock and yield the page queues lock */
                if (delayed_unlock++ > 128) {
                        if (l_object != NULL) {
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
                        lck_mtx_yield(&vm_page_queue_lock);
                        delayed_unlock = 0;
                }
        }
        if (l_object != NULL) {
                vm_object_unlock(l_object);
                l_object = NULL;
        }
        vm_page_unlock_queues();

        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
            q, disconnected_count, object_locked_count, paused_count, 0);
}
1186
1187 #endif
1188
1189
/* per-queue worker used by vm_pageout_anonymous_pages() */
static void
vm_pageout_page_queue(vm_page_queue_head_t *, int);

/*
 * Guard flag used to make sure there is only a single sweep going on
 * at a time.  Despite the historical "condition variable" wording,
 * nothing visible here sleeps on it: vm_pageout_anonymous_pages()
 * tests, sets and clears it under the page queues lock and simply
 * returns if a sweep is already in progress.
 */
boolean_t       vm_pageout_anonymous_pages_active = FALSE;
1198
1199
1200 void
1201 vm_pageout_anonymous_pages()
1202 {
1203         if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1204                 vm_page_lock_queues();
1205
1206                 if (vm_pageout_anonymous_pages_active == TRUE) {
1207                         vm_page_unlock_queues();
1208                         return;
1209                 }
1210                 vm_pageout_anonymous_pages_active = TRUE;
1211                 vm_page_unlock_queues();
1212
1213                 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1214                 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1215                 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1216
1217                 if (VM_CONFIG_SWAP_IS_PRESENT) {
1218                         vm_consider_swapping();
1219                 }
1220
1221                 vm_page_lock_queues();
1222                 vm_pageout_anonymous_pages_active = FALSE;
1223                 vm_page_unlock_queues();
1224         }
1225 }
1226
1227
/*
 * vm_pageout_page_queue:
 *
 * Visit up to "qcount" pages of page queue "q" and try to push the pages
 * that belong to internal objects out through the internal pageout queue:
 *  - pages of non-internal objects, pages in a transient state, and pages
 *    found referenced (their reference bit is cleared first) are requeued;
 *  - mapped pages are disconnected from all pmaps;
 *  - pages that end up neither dirty nor precious are freed;
 *  - the remaining pages are handed to vm_pageout_cluster(), after making
 *    sure the object has an initialized pager.
 *
 * While the internal pageout queue is throttled, the walk marks it as
 * draining and sleeps on the (&pgo_laundry + 1) event until woken.
 *
 * The page queues lock is taken here; it is dropped around blocking
 * operations (the throttle wait, mutex_pause(), VM_PAGE_FREE(), pager
 * creation) and yielded every so often.  Object locks are only ever
 * try-locked while it is held.
 */
void
vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
{
        vm_page_t       m;
        vm_object_t     t_object = NULL;        /* object whose try-lock most recently failed */
        vm_object_t     l_object = NULL;        /* object we currently hold locked, if any */
        vm_object_t     m_object = NULL;        /* object owning the page being examined */
        int             delayed_unlock = 0;     /* pages handled since the queues lock was last released */
        int             try_failed_count = 0;   /* consecutive try-lock failures on t_object */
        int             refmod_state;
        int             pmap_options;
        struct          vm_pageout_queue *iq;
        ppnum_t         phys_page;


        iq = &vm_pageout_queue_internal;

        vm_page_lock_queues();

        while (qcount && !vm_page_queue_empty(q)) {
                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

                if (VM_PAGE_Q_THROTTLED(iq)) {
                        /*
                         * the internal pageout queue is throttled: drop
                         * everything we hold and sleep until its laundry
                         * drains (the wakeup is posted on this same event
                         * once pgo_draining is set and pgo_laundry hits 0)
                         */
                        if (l_object != NULL) {
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
                        iq->pgo_draining = TRUE;

                        assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
                        vm_page_unlock_queues();

                        thread_block(THREAD_CONTINUE_NULL);

                        vm_page_lock_queues();
                        delayed_unlock = 0;
                        /* re-evaluate the loop and throttle conditions from scratch */
                        continue;
                }
                m = (vm_page_t) vm_page_queue_first(q);
                m_object = VM_PAGE_OBJECT(m);

                /*
                 * check to see if we currently are working
                 * with the same object... if so, we've
                 * already got the lock
                 */
                if (m_object != l_object) {
                        /* only pages of internal objects are candidates */
                        if (!m_object->internal) {
                                goto reenter_pg_on_q;
                        }

                        /*
                         * the object associated with candidate page is
                         * different from the one we were just working
                         * with... dump the lock if we still own it
                         */
                        if (l_object != NULL) {
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
                        /* a different object than the one we last failed on: restart the failure count */
                        if (m_object != t_object) {
                                try_failed_count = 0;
                        }

                        /*
                         * Try to lock object; since we've already got the
                         * page queues lock, we can only 'try' for this one.
                         * if the 'try' fails, we need to do a mutex_pause
                         * to allow the owner of the object lock a chance to
                         * run...
                         */
                        if (!vm_object_lock_try_scan(m_object)) {
                                /*
                                 * too many failures on this object: give up
                                 * on the page and requeue it untouched
                                 */
                                if (try_failed_count > 20) {
                                        goto reenter_pg_on_q;
                                }
                                vm_page_unlock_queues();
                                mutex_pause(try_failed_count++);
                                vm_page_lock_queues();
                                delayed_unlock = 0;

                                /* retry the (possibly different) first page; qcount is not consumed */
                                t_object = m_object;
                                continue;
                        }
                        l_object = m_object;
                }
                if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
                        /*
                         * page is not to be cleaned
                         * put it back on its queue
                         * NOTE(review): the original comment said "head";
                         * where vm_page_queue_enter() inserts is not
                         * visible here -- confirm.
                         */
                        goto reenter_pg_on_q;
                }
                phys_page = VM_PAGE_GET_PHYS_PAGE(m);

                /*
                 * if the page isn't already known to be referenced, pull
                 * the hardware reference/modify state into the vm_page
                 */
                if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
                        refmod_state = pmap_get_refmod(phys_page);

                        if (refmod_state & VM_MEM_REFERENCED) {
                                m->vmp_reference = TRUE;
                        }
                        if (refmod_state & VM_MEM_MODIFIED) {
                                SET_PAGE_DIRTY(m, FALSE);
                        }
                }
                /*
                 * referenced page: clear the reference (software and pmap)
                 * and requeue it instead of pushing it out on this visit
                 */
                if (m->vmp_reference == TRUE) {
                        m->vmp_reference = FALSE;
                        pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
                        goto reenter_pg_on_q;
                }
                /*
                 * remove all mappings of the page, picking the pmap option
                 * according to whether we already know it must be kept,
                 * and catch a modification reported by the disconnect
                 */
                if (m->vmp_pmapped == TRUE) {
                        if (m->vmp_dirty || m->vmp_precious) {
                                pmap_options = PMAP_OPTIONS_COMPRESSOR;
                        } else {
                                pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
                        }
                        refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
                        if (refmod_state & VM_MEM_MODIFIED) {
                                SET_PAGE_DIRTY(m, FALSE);
                        }
                }

                /* neither dirty nor precious: free the page, done outside the page queues lock */
                if (!m->vmp_dirty && !m->vmp_precious) {
                        vm_page_unlock_queues();
                        VM_PAGE_FREE(m);
                        vm_page_lock_queues();
                        delayed_unlock = 0;

                        goto next_pg;
                }
                /* the page must be pushed out, which requires an initialized pager on its object */
                if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
                        if (!m_object->pager_initialized) {
                                vm_page_unlock_queues();

                                vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);

                                /* re-check: the state may have changed across the collapse */
                                if (!m_object->pager_initialized) {
                                        vm_object_compressor_pager_create(m_object);
                                }

                                vm_page_lock_queues();
                                delayed_unlock = 0;
                        }
                        /* still no usable pager: leave the page alone */
                        if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
                                goto reenter_pg_on_q;
                        }
                        /*
                         * vm_object_compressor_pager_create will drop the object lock
                         * which means 'm' may no longer be valid to use
                         * ... so start over with the queue's current first
                         * page; qcount is not consumed
                         */
                        continue;
                }
                /*
                 * we've already factored out pages in the laundry which
                 * means this page can't be on the pageout queue so it's
                 * safe to do the vm_page_queues_remove
                 */
                vm_page_queues_remove(m, TRUE);

                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

                vm_pageout_cluster(m);

                goto next_pg;

reenter_pg_on_q:
                /* move the page off the front of the queue so the next iteration sees a new one */
                vm_page_queue_remove(q, m, vmp_pageq);
                vm_page_queue_enter(q, m, vmp_pageq);
next_pg:
                qcount--;
                try_failed_count = 0;

                /* periodically drop the object lock and yield the page queues lock */
                if (delayed_unlock++ > 128) {
                        if (l_object != NULL) {
                                vm_object_unlock(l_object);
                                l_object = NULL;
                        }
                        lck_mtx_yield(&vm_page_queue_lock);
                        delayed_unlock = 0;
                }
        }
        if (l_object != NULL) {
                vm_object_unlock(l_object);
                l_object = NULL;
        }
        vm_page_unlock_queues();
}
1414
1415
1416
1417 /*
1418  * function in BSD to apply I/O throttle to the pageout thread
1419  */
1420 extern void vm_pageout_io_throttle(void);
1421
/*
 * VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)
 *
 * Statement macro.  Asserts that page "m" belongs to object "obj"; then,
 * if the page is marked vmp_reusable or the object is marked
 * all_reusable, calls vm_object_reuse_pages() on the one-page range
 * [m->vmp_offset, m->vmp_offset + PAGE_SIZE_64) of "obj".
 * Both arguments are evaluated more than once.
 */
#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj)                    \
        MACRO_BEGIN                                                     \
        /* \
         * If a "reusable" page somehow made it back into \
         * the active queue, it's been re-used and is not \
         * quite re-usable. \
         * If the VM object was "all_reusable", consider it \
         * as "all re-used" instead of converting it to \
         * "partially re-used", which could be expensive. \
         */                                                             \
        assert(VM_PAGE_OBJECT((m)) == (obj));                           \
        if ((m)->vmp_reusable ||                                        \
            (obj)->all_reusable) {                                      \
                vm_object_reuse_pages((obj),                            \
                                      (m)->vmp_offset,                  \
                                      (m)->vmp_offset + PAGE_SIZE_64,   \
                                      FALSE);                           \
        }                                                               \
        MACRO_END
1441
1442
/*
 * NOTE(review): presumably bounds on how many pages are processed
 * between releases of the page queues lock; the users of these limits
 * are not visible in this part of the file -- confirm.
 */
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT         64
#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX     1024

/* NOTE(review): presumably the values taken by flow_control.state -- confirm */
#define FCS_IDLE                0
#define FCS_DELAYED             1
#define FCS_DEADLOCK_DETECTED   2

/* a state (see FCS_* above, presumably) paired with a timestamp */
struct flow_control {
        int             state;
        mach_timespec_t ts;
};


#if CONFIG_BACKGROUND_QUEUE
/* cumulative counters; their updaters are not visible in this part of the file */
uint64_t vm_pageout_rejected_bq_internal = 0;
uint64_t vm_pageout_rejected_bq_external = 0;
uint64_t vm_pageout_skipped_bq_internal = 0;
#endif

#define ANONS_GRABBED_LIMIT     2


#if 0
/* prototype of the compiled-out vm_pageout_delayed_unlock() */
static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
#endif
static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);

/* values accepted as the "action" argument of vm_pageout_prepare_to_block() */
#define VM_PAGEOUT_PB_NO_ACTION                         0
#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
#define VM_PAGEOUT_PB_THREAD_YIELD                      2
1473
1474
/*
 * vm_pageout_delayed_unlock is compiled out (#if 0) and kept for
 * reference only.
 *
 * As written: if the caller's local free list (*local_freeq) is not
 * empty, drop the page queues lock, release the list with
 * vm_page_free_list() (bracketed by VM_PAGEOUT_FREELIST trace events),
 * reset *local_freeq / *local_freed and retake the lock; otherwise just
 * yield the page queues lock.  Either way *delayed_unlock is reset to 1.
 */
#if 0
static void
vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
{
        if (*local_freeq) {
                vm_page_unlock_queues();

                VM_DEBUG_CONSTANT_EVENT(
                        vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
                        vm_page_free_count, 0, 0, 1);

                vm_page_free_list(*local_freeq, TRUE);

                VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
                    vm_page_free_count, *local_freed, 0, 1);

                *local_freeq = NULL;
                *local_freed = 0;

                vm_page_lock_queues();
        } else {
                lck_mtx_yield(&vm_page_queue_lock);
        }
        *delayed_unlock = 1;
}
#endif
1501
1502
1503 static void
1504 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1505     vm_page_t *local_freeq, int *local_freed, int action)
1506 {
1507         vm_page_unlock_queues();
1508
1509         if (*object != NULL) {
1510                 vm_object_unlock(*object);
1511                 *object = NULL;
1512         }
1513         if (*local_freeq) {
1514                 vm_page_free_list(*local_freeq, TRUE);
1515
1516                 *local_freeq = NULL;
1517                 *local_freed = 0;
1518         }
1519         *delayed_unlock = 1;
1520
1521         switch (action) {
1522         case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1523                 vm_consider_waking_compactor_swapper();
1524                 break;
1525         case VM_PAGEOUT_PB_THREAD_YIELD:
1526                 thread_yield_internal(1);
1527                 break;
1528         case VM_PAGEOUT_PB_NO_ACTION:
1529         default:
1530                 break;
1531         }
1532         vm_page_lock_queues();
1533 }
1534
1535
/*
 * Values of the cumulative vm_pageout_vminfo counters as of the previous
 * update_vm_info() call; update_vm_info() subtracts them from the current
 * values to obtain per-sample deltas.
 */
static struct vm_pageout_vminfo last;

/* vm_page_grab_count as of the previous update_vm_info() call */
uint64_t last_vm_page_pages_grabbed = 0;

extern  uint32_t c_segment_pages_compressed;

extern uint64_t shared_region_pager_reclaimed;
extern struct memory_object_pager_ops shared_region_pager_ops;
1544
1545 void
1546 update_vm_info(void)
1547 {
1548         unsigned long tmp;
1549         uint64_t tmp64;
1550
1551         vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1552         vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1553         vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1554         vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1555
1556         vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1557         vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1558         vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1559
1560         vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1561         vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1562         vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1563         vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1564
1565
1566         tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1567         vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1568         last.vm_pageout_considered_page = tmp;
1569
1570         tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1571         vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1572         last.vm_pageout_compressions = tmp64;
1573
1574         tmp = vm_pageout_vminfo.vm_compressor_failed;
1575         vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1576         last.vm_compressor_failed = tmp;
1577
1578         tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1579         vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1580         last.vm_compressor_pages_grabbed = tmp64;
1581
1582         tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1583         vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1584         last.vm_phantom_cache_found_ghost = tmp;
1585
1586         tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1587         vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1588         last.vm_phantom_cache_added_ghost = tmp;
1589
1590         tmp64 = counter_load(&vm_page_grab_count);
1591         vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1592         last_vm_page_pages_grabbed = tmp64;
1593
1594         tmp = vm_pageout_vminfo.vm_page_pages_freed;
1595         vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1596         last.vm_page_pages_freed = tmp;
1597
1598
1599         if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1600                 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1601                 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1602                 last.vm_pageout_pages_evicted = tmp;
1603
1604                 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1605                 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1606                 last.vm_pageout_pages_purged = tmp;
1607
1608                 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1609                 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1610                 last.vm_pageout_freed_speculative = tmp;
1611
1612                 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1613                 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1614                 last.vm_pageout_freed_external = tmp;
1615
1616                 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1617                 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1618                 last.vm_pageout_inactive_referenced = tmp;
1619
1620                 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1621                 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1622                 last.vm_pageout_scan_inactive_throttled_external = tmp;
1623
1624                 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1625                 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1626                 last.vm_pageout_inactive_dirty_external = tmp;
1627
1628                 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1629                 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1630                 last.vm_pageout_freed_cleaned = tmp;
1631
1632                 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1633                 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1634                 last.vm_pageout_inactive_nolock = tmp;
1635
1636                 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1637                 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1638                 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1639
1640                 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1641                 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1642                 last.vm_pageout_skipped_external = tmp;
1643
1644                 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1645                 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1646                 last.vm_pageout_reactivation_limit_exceeded = tmp;
1647
1648                 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1649                 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1650                 last.vm_pageout_inactive_force_reclaim = tmp;
1651
1652                 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1653                 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1654                 last.vm_pageout_freed_internal = tmp;
1655
1656                 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1657                 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1658                 last.vm_pageout_considered_bq_internal = tmp;
1659
1660                 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1661                 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1662                 last.vm_pageout_considered_bq_external = tmp;
1663
1664                 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1665                 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1666                 last.vm_pageout_filecache_min_reactivated = tmp;
1667
1668                 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1669                 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1670                 last.vm_pageout_inactive_dirty_internal = tmp;
1671         }
1672
1673         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1674             vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1675             vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1676             vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1677             vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1678             0);
1679
1680         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1681             vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1682             vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1683             vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1684             0,
1685             0);
1686
1687         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1688             vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1689             vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1690             vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1691             vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1692             0);
1693
1694         if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1695             vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1696             vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1697                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1698                     vm_pageout_stats[vm_pageout_stat_now].considered,
1699                     vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1700                     vm_pageout_stats[vm_pageout_stat_now].freed_external,
1701                     vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1702                     0);
1703
1704                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1705                     vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1706                     vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1707                     vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1708                     vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1709                     0);
1710
1711                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1712                     vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1713                     vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1714                     vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1715                     vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1716                     0);
1717
1718                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1719                     vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1720                     vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1721                     vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1722                     vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1723                     0);
1724
1725                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1726                     vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1727                     vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1728                     vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1729                     vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1730                     0);
1731         }
1732         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1733             vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1734             vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1735             vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1736             vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1737             0);
1738
1739         record_memory_pressure();
1740 }
1741
1742 extern boolean_t hibernation_vmqueues_inspection;
1743
1744 /*
1745  * Return values for functions called by vm_pageout_scan
1746  * that control its flow.
1747  *
1748  * PROCEED -- vm_pageout_scan will keep making forward progress.
1749  * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1750  * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1751  */
1752
1753 #define VM_PAGEOUT_SCAN_PROCEED                 (0)
1754 #define VM_PAGEOUT_SCAN_DONE_RETURN             (1)
1755 #define VM_PAGEOUT_SCAN_NEXT_ITERATION          (2)
1756
1757 /*
1758  * This function is called only from vm_pageout_scan and
1759  * it moves overflow secluded pages (one-at-a-time) to the
1760  * batched 'local' free Q or active Q.
1761  */
1762 static void
1763 vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1764 {
1765 #if CONFIG_SECLUDED_MEMORY
1766         /*
1767          * Deal with secluded_q overflow.
1768          */
1769         if (vm_page_secluded_count > vm_page_secluded_target) {
1770                 vm_page_t secluded_page;
1771
1772                 /*
1773                  * SECLUDED_AGING_BEFORE_ACTIVE:
1774                  * Excess secluded pages go to the active queue and
1775                  * will later go to the inactive queue.
1776                  */
1777                 assert((vm_page_secluded_count_free +
1778                     vm_page_secluded_count_inuse) ==
1779                     vm_page_secluded_count);
1780                 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1781                 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1782
1783                 vm_page_queues_remove(secluded_page, FALSE);
1784                 assert(!secluded_page->vmp_fictitious);
1785                 assert(!VM_PAGE_WIRED(secluded_page));
1786
1787                 if (secluded_page->vmp_object == 0) {
1788                         /* transfer to free queue */
1789                         assert(secluded_page->vmp_busy);
1790                         secluded_page->vmp_snext = *local_freeq;
1791                         *local_freeq = secluded_page;
1792                         *local_freed += 1;
1793                 } else {
1794                         /* transfer to head of active queue */
1795                         vm_page_enqueue_active(secluded_page, FALSE);
1796                         secluded_page = VM_PAGE_NULL;
1797                 }
1798         }
1799 #else /* CONFIG_SECLUDED_MEMORY */
1800
1801 #pragma unused(local_freeq)
1802 #pragma unused(local_freed)
1803
1804         return;
1805
1806 #endif /* CONFIG_SECLUDED_MEMORY */
1807 }
1808
1809 /*
1810  * This function is called only from vm_pageout_scan and
1811  * it initializes the loop targets for vm_pageout_scan().
1812  */
1813 static void
1814 vps_init_page_targets(void)
1815 {
1816         /*
1817          * LD TODO: Other page targets should be calculated here too.
1818          */
1819         vm_page_anonymous_min = vm_page_inactive_target / 20;
1820
1821         if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1822                 vm_pageout_state.vm_page_speculative_percentage = 50;
1823         } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1824                 vm_pageout_state.vm_page_speculative_percentage = 1;
1825         }
1826
1827         vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1828             vm_page_inactive_count);
1829 }
1830
1831 /*
1832  * This function is called only from vm_pageout_scan and
1833  * it purges a single VM object at-a-time and will either
1834  * make vm_pageout_scan() restart the loop or keeping moving forward.
1835  */
1836 static int
1837 vps_purge_object()
1838 {
1839         int             force_purge;
1840
1841         assert(available_for_purge >= 0);
1842         force_purge = 0; /* no force-purging */
1843
1844 #if VM_PRESSURE_EVENTS
1845         vm_pressure_level_t pressure_level;
1846
1847         pressure_level = memorystatus_vm_pressure_level;
1848
1849         if (pressure_level > kVMPressureNormal) {
1850                 if (pressure_level >= kVMPressureCritical) {
1851                         force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1852                 } else if (pressure_level >= kVMPressureUrgent) {
1853                         force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1854                 } else if (pressure_level >= kVMPressureWarning) {
1855                         force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1856                 }
1857         }
1858 #endif /* VM_PRESSURE_EVENTS */
1859
1860         if (available_for_purge || force_purge) {
1861                 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1862
1863                 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1864                 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1865                         VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1866                         VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1867                         memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1868
1869                         return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1870                 }
1871                 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
1872                 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1873         }
1874
1875         return VM_PAGEOUT_SCAN_PROCEED;
1876 }
1877
1878 /*
1879  * This function is called only from vm_pageout_scan and
1880  * it will try to age the next speculative Q if the oldest
1881  * one is empty.
1882  */
1883 static int
1884 vps_age_speculative_queue(boolean_t force_speculative_aging)
1885 {
1886 #define DELAY_SPECULATIVE_AGE   1000
1887
1888         /*
1889          * try to pull pages from the aging bins...
1890          * see vm_page.h for an explanation of how
1891          * this mechanism works
1892          */
1893         boolean_t                       can_steal = FALSE;
1894         int                             num_scanned_queues;
1895         static int                      delay_speculative_age = 0; /* depends the # of times we go through the main pageout_scan loop.*/
1896         mach_timespec_t                 ts;
1897         struct vm_speculative_age_q     *aq;
1898         struct vm_speculative_age_q     *sq;
1899
1900         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1901
1902         aq = &vm_page_queue_speculative[speculative_steal_index];
1903
1904         num_scanned_queues = 0;
1905         while (vm_page_queue_empty(&aq->age_q) &&
1906             num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1907                 speculative_steal_index++;
1908
1909                 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1910                         speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1911                 }
1912
1913                 aq = &vm_page_queue_speculative[speculative_steal_index];
1914         }
1915
1916         if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1917                 /*
1918                  * XXX We've scanned all the speculative
1919                  * queues but still haven't found one
1920                  * that is not empty, even though
1921                  * vm_page_speculative_count is not 0.
1922                  */
1923                 if (!vm_page_queue_empty(&sq->age_q)) {
1924                         return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1925                 }
1926 #if DEVELOPMENT || DEBUG
1927                 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
1928 #endif
1929                 /* readjust... */
1930                 vm_page_speculative_count = 0;
1931                 /* ... and continue */
1932                 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
1933         }
1934
1935         if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
1936                 can_steal = TRUE;
1937         } else {
1938                 if (!delay_speculative_age) {
1939                         mach_timespec_t ts_fully_aged;
1940
1941                         ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
1942                         ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
1943                             * 1000 * NSEC_PER_USEC;
1944
1945                         ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1946
1947                         clock_sec_t sec;
1948                         clock_nsec_t nsec;
1949                         clock_get_system_nanotime(&sec, &nsec);
1950                         ts.tv_sec = (unsigned int) sec;
1951                         ts.tv_nsec = nsec;
1952
1953                         if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
1954                                 can_steal = TRUE;
1955                         } else {
1956                                 delay_speculative_age++;
1957                         }
1958                 } else {
1959                         delay_speculative_age++;
1960                         if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
1961                                 delay_speculative_age = 0;
1962                         }
1963                 }
1964         }
1965         if (can_steal == TRUE) {
1966                 vm_page_speculate_ageit(aq);
1967         }
1968
1969         return VM_PAGEOUT_SCAN_PROCEED;
1970 }
1971
1972 /*
1973  * This function is called only from vm_pageout_scan and
1974  * it evicts a single VM object from the cache.
1975  */
1976 static int inline
1977 vps_object_cache_evict(vm_object_t *object_to_unlock)
1978 {
1979         static int                      cache_evict_throttle = 0;
1980         struct vm_speculative_age_q     *sq;
1981
1982         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1983
1984         if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
1985                 int     pages_evicted;
1986
1987                 if (*object_to_unlock != NULL) {
1988                         vm_object_unlock(*object_to_unlock);
1989                         *object_to_unlock = NULL;
1990                 }
1991                 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
1992
1993                 pages_evicted = vm_object_cache_evict(100, 10);
1994
1995                 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
1996
1997                 if (pages_evicted) {
1998                         vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
1999
2000                         VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2001                             vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2002                         memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2003
2004                         /*
2005                          * we just freed up to 100 pages,
2006                          * so go back to the top of the main loop
2007                          * and re-evaulate the memory situation
2008                          */
2009                         return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2010                 } else {
2011                         cache_evict_throttle = 1000;
2012                 }
2013         }
2014         if (cache_evict_throttle) {
2015                 cache_evict_throttle--;
2016         }
2017
2018         return VM_PAGEOUT_SCAN_PROCEED;
2019 }
2020
2021
2022 /*
2023  * This function is called only from vm_pageout_scan and
2024  * it calculates the filecache min. that needs to be maintained
2025  * as we start to steal pages.
2026  */
2027 static void
2028 vps_calculate_filecache_min(void)
2029 {
2030         int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2031
2032 #if CONFIG_JETSAM
2033         /*
2034          * don't let the filecache_min fall below 15% of available memory
2035          * on systems with an active compressor that isn't nearing its
2036          * limits w/r to accepting new data
2037          *
2038          * on systems w/o the compressor/swapper, the filecache is always
2039          * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2040          * since most (if not all) of the anonymous pages are in the
2041          * throttled queue (which isn't counted as available) which
2042          * effectively disables this filter
2043          */
2044         if (vm_compressor_low_on_space() || divisor == 0) {
2045                 vm_pageout_state.vm_page_filecache_min = 0;
2046         } else {
2047                 vm_pageout_state.vm_page_filecache_min =
2048                     ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2049         }
2050 #else
2051         if (vm_compressor_out_of_space() || divisor == 0) {
2052                 vm_pageout_state.vm_page_filecache_min = 0;
2053         } else {
2054                 /*
2055                  * don't let the filecache_min fall below the specified critical level
2056                  */
2057                 vm_pageout_state.vm_page_filecache_min =
2058                     ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2059         }
2060 #endif
2061         if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2062                 vm_pageout_state.vm_page_filecache_min = 0;
2063         }
2064 }
2065
2066 /*
2067  * This function is called only from vm_pageout_scan and
2068  * it updates the flow control time to detect if VM pageoutscan
2069  * isn't making progress.
2070  */
2071 static void
2072 vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2073 {
2074         mach_timespec_t ts;
2075         clock_sec_t sec;
2076         clock_nsec_t nsec;
2077
2078         ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2079         ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2080         clock_get_system_nanotime(&sec, &nsec);
2081         flow_control->ts.tv_sec = (unsigned int) sec;
2082         flow_control->ts.tv_nsec = nsec;
2083         ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2084
2085         flow_control->state = FCS_DELAYED;
2086
2087         vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2088 }
2089
2090 /*
2091  * This function is called only from vm_pageout_scan and
2092  * it is the flow control logic of VM pageout scan which
2093  * controls if it should block and for how long.
2094  * Any blocking of vm_pageout_scan happens ONLY in this function.
2095  */
2096 static int
2097 vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2098     vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2099 {
2100         boolean_t       exceeded_burst_throttle = FALSE;
2101         unsigned int    msecs = 0;
2102         uint32_t        inactive_external_count;
2103         mach_timespec_t ts;
2104         struct  vm_pageout_queue *iq;
2105         struct  vm_pageout_queue *eq;
2106         struct  vm_speculative_age_q *sq;
2107
2108         iq = &vm_pageout_queue_internal;
2109         eq = &vm_pageout_queue_external;
2110         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2111
2112         /*
2113          * Sometimes we have to pause:
2114          *      1) No inactive pages - nothing to do.
2115          *      2) Loop control - no acceptable pages found on the inactive queue
2116          *         within the last vm_pageout_burst_inactive_throttle iterations
2117          *      3) Flow control - default pageout queue is full
2118          */
2119         if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2120             vm_page_queue_empty(&vm_page_queue_anonymous) &&
2121             vm_page_queue_empty(&vm_page_queue_cleaned) &&
2122             vm_page_queue_empty(&sq->age_q)) {
2123                 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2124                 msecs = vm_pageout_state.vm_pageout_empty_wait;
2125         } else if (inactive_burst_count >=
2126             MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2127             (vm_page_inactive_count +
2128             vm_page_speculative_count))) {
2129                 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2130                 msecs = vm_pageout_state.vm_pageout_burst_wait;
2131
2132                 exceeded_burst_throttle = TRUE;
2133         } else if (VM_PAGE_Q_THROTTLED(iq) &&
2134             VM_DYNAMIC_PAGING_ENABLED()) {
2135                 clock_sec_t sec;
2136                 clock_nsec_t nsec;
2137
2138                 switch (flow_control->state) {
2139                 case FCS_IDLE:
2140                         if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2141                             vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2142                                 /*
2143                                  * since the compressor is running independently of vm_pageout_scan
2144                                  * let's not wait for it just yet... as long as we have a healthy supply
2145                                  * of filecache pages to work with, let's keep stealing those.
2146                                  */
2147                                 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2148
2149                                 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2150                                     (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2151                                         *anons_grabbed = ANONS_GRABBED_LIMIT;
2152                                         VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2153                                         return VM_PAGEOUT_SCAN_PROCEED;
2154                                 }
2155                         }
2156
2157                         vps_flow_control_reset_deadlock_timer(flow_control);
2158                         msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2159
2160                         break;
2161
2162                 case FCS_DELAYED:
2163                         clock_get_system_nanotime(&sec, &nsec);
2164                         ts.tv_sec = (unsigned int) sec;
2165                         ts.tv_nsec = nsec;
2166
2167                         if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2168                                 /*
2169                                  * the pageout thread for the default pager is potentially
2170                                  * deadlocked since the
2171                                  * default pager queue has been throttled for more than the
2172                                  * allowable time... we need to move some clean pages or dirty
2173                                  * pages belonging to the external pagers if they aren't throttled
2174                                  * vm_page_free_wanted represents the number of threads currently
2175                                  * blocked waiting for pages... we'll move one page for each of
2176                                  * these plus a fixed amount to break the logjam... once we're done
2177                                  * moving this number of pages, we'll re-enter the FCS_DELAYED state
2178                                  * with a new timeout target since we have no way of knowing
2179                                  * whether we've broken the deadlock except through observation
2180                                  * of the queue associated with the default pager... we need to
2181                                  * stop moving pages and allow the system to run to see what
2182                                  * state it settles into.
2183                                  */
2184
2185                                 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2186                                     vm_page_free_wanted + vm_page_free_wanted_privileged;
2187                                 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2188                                 flow_control->state = FCS_DEADLOCK_DETECTED;
2189                                 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2190                                 return VM_PAGEOUT_SCAN_PROCEED;
2191                         }
2192                         /*
2193                          * just resniff instead of trying
2194                          * to compute a new delay time... we're going to be
2195                          * awakened immediately upon a laundry completion,
2196                          * so we won't wait any longer than necessary
2197                          */
2198                         msecs = vm_pageout_state.vm_pageout_idle_wait;
2199                         break;
2200
2201                 case FCS_DEADLOCK_DETECTED:
2202                         if (*vm_pageout_deadlock_target) {
2203                                 return VM_PAGEOUT_SCAN_PROCEED;
2204                         }
2205
2206                         vps_flow_control_reset_deadlock_timer(flow_control);
2207                         msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2208
2209                         break;
2210                 }
2211         } else {
2212                 /*
2213                  * No need to pause...
2214                  */
2215                 return VM_PAGEOUT_SCAN_PROCEED;
2216         }
2217
2218         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2219
2220         vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2221             VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2222
2223         if (vm_page_free_count >= vm_page_free_target) {
2224                 /*
2225                  * we're here because
2226                  *  1) someone else freed up some pages while we had
2227                  *     the queues unlocked above
2228                  * and we've hit one of the 3 conditions that
2229                  * cause us to pause the pageout scan thread
2230                  *
2231                  * since we already have enough free pages,
2232                  * let's avoid stalling and return normally
2233                  *
2234                  * before we return, make sure the pageout I/O threads
2235                  * are running throttled in case there are still requests
2236                  * in the laundry... since we have enough free pages
2237                  * we don't need the laundry to be cleaned in a timely
2238                  * fashion... so let's avoid interfering with foreground
2239                  * activity
2240                  *
2241                  * we don't want to hold vm_page_queue_free_lock when
2242                  * calling vm_pageout_adjust_eq_iothrottle (since it
2243                  * may cause other locks to be taken), we do the initial
2244                  * check outside of the lock.  Once we take the lock,
2245                  * we recheck the condition since it may have changed.
2246                  * if it has, no problem, we will make the threads
2247                  * non-throttled before actually blocking
2248                  */
2249                 vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2250         }
2251         lck_mtx_lock(&vm_page_queue_free_lock);
2252
2253         if (vm_page_free_count >= vm_page_free_target &&
2254             (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2255                 return VM_PAGEOUT_SCAN_DONE_RETURN;
2256         }
2257         lck_mtx_unlock(&vm_page_queue_free_lock);
2258
2259         if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2260                 /*
2261                  * we're most likely about to block due to one of
2262                  * the 3 conditions that cause vm_pageout_scan to
2263                  * not be able to make forward progress w/r
2264                  * to providing new pages to the free queue,
2265                  * so unthrottle the I/O threads in case we
2266                  * have laundry to be cleaned... it needs
2267                  * to be completed ASAP.
2268                  *
2269                  * even if we don't block, we want the io threads
2270                  * running unthrottled since the sum of free +
2271                  * clean pages is still under our free target
2272                  */
2273                 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2274         }
2275         if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2276                 /*
2277                  * if we get here we're below our free target and
2278                  * we're stalling due to a full laundry queue or
2279                  * we don't have any inactive pages other than
2280                  * those in the clean queue...
2281                  * however, we have pages on the clean queue that
2282                  * can be moved to the free queue, so let's not
2283                  * stall the pageout scan
2284                  */
2285                 flow_control->state = FCS_IDLE;
2286                 return VM_PAGEOUT_SCAN_PROCEED;
2287         }
2288         if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2289                 flow_control->state = FCS_IDLE;
2290                 return VM_PAGEOUT_SCAN_PROCEED;
2291         }
2292
2293         VM_CHECK_MEMORYSTATUS;
2294
2295         if (flow_control->state != FCS_IDLE) {
2296                 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2297         }
2298
2299         iq->pgo_throttled = TRUE;
2300         assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2301
2302         vm_page_unlock_queues();
2303
2304         assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2305
2306         VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2307             iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2308         memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2309
2310         thread_block(THREAD_CONTINUE_NULL);
2311
2312         VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2313             iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2314         memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2315
2316         vm_page_lock_queues();
2317
2318         iq->pgo_throttled = FALSE;
2319
2320         vps_init_page_targets();
2321
2322         return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2323 }
2324
2325 /*
2326  * This function is called only from vm_pageout_scan and
2327  * it will find and return the most appropriate page to be
2328  * reclaimed.
2329  */
2330 static int
2331 vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2332     boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2333 {
2334         vm_page_t                       m = NULL;
2335         vm_object_t                     m_object = VM_OBJECT_NULL;
2336         uint32_t                        inactive_external_count;
2337         struct vm_speculative_age_q     *sq;
2338         struct vm_pageout_queue         *iq;
2339         int                             retval = VM_PAGEOUT_SCAN_PROCEED;
2340
2341         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2342         iq = &vm_pageout_queue_internal;
2343
2344         *is_page_from_bg_q = FALSE;
2345
2346         m = NULL;
2347         m_object = VM_OBJECT_NULL;
2348
2349         if (VM_DYNAMIC_PAGING_ENABLED()) {
2350                 assert(vm_page_throttled_count == 0);
2351                 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2352         }
2353
2354         /*
2355          * Try for a clean-queue inactive page.
2356          * These are pages that vm_pageout_scan tried to steal earlier, but
2357          * were dirty and had to be cleaned.  Pick them up now that they are clean.
2358          */
2359         if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2360                 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2361
2362                 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2363
2364                 goto found_page;
2365         }
2366
2367         /*
2368          * The next most eligible pages are ones we paged in speculatively,
2369          * but which have not yet been touched and have been aged out.
2370          */
2371         if (!vm_page_queue_empty(&sq->age_q)) {
2372                 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2373
2374                 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2375
2376                 if (!m->vmp_dirty || force_anonymous == FALSE) {
2377                         goto found_page;
2378                 } else {
2379                         m = NULL;
2380                 }
2381         }
2382
2383 #if CONFIG_BACKGROUND_QUEUE
2384         if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2385                 vm_object_t     bg_m_object = NULL;
2386
2387                 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2388
2389                 bg_m_object = VM_PAGE_OBJECT(m);
2390
2391                 if (!VM_PAGE_PAGEABLE(m)) {
2392                         /*
2393                          * This page is on the background queue
2394                          * but not on a pageable queue.  This is
2395                          * likely a transient state and whoever
2396                          * took it out of its pageable queue
2397                          * will likely put it back on a pageable
2398                          * queue soon but we can't deal with it
2399                          * at this point, so let's ignore this
2400                          * page.
2401                          */
2402                 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2403                         if (bg_m_object->internal &&
2404                             (VM_PAGE_Q_THROTTLED(iq) ||
2405                             vm_compressor_out_of_space() == TRUE ||
2406                             vm_page_free_count < (vm_page_free_reserved / 4))) {
2407                                 vm_pageout_skipped_bq_internal++;
2408                         } else {
2409                                 *is_page_from_bg_q = TRUE;
2410
2411                                 if (bg_m_object->internal) {
2412                                         vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2413                                 } else {
2414                                         vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2415                                 }
2416                                 goto found_page;
2417                         }
2418                 }
2419         }
2420 #endif /* CONFIG_BACKGROUND_QUEUE */
2421
2422         inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2423
2424         if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2425             (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2426                 *grab_anonymous = TRUE;
2427                 *anons_grabbed = 0;
2428
2429                 vm_pageout_vminfo.vm_pageout_skipped_external++;
2430                 goto want_anonymous;
2431         }
2432         *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2433
2434 #if CONFIG_JETSAM
2435         /* If the file-backed pool has accumulated
2436          * significantly more pages than the jetsam
2437          * threshold, prefer to reclaim those
2438          * inline to minimise compute overhead of reclaiming
2439          * anonymous pages.
2440          * This calculation does not account for the CPU local
2441          * external page queues, as those are expected to be
2442          * much smaller relative to the global pools.
2443          */
2444
2445         struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2446
2447         if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2448                 if (vm_page_pageable_external_count >
2449                     vm_pageout_state.vm_page_filecache_min) {
2450                         if ((vm_page_pageable_external_count *
2451                             vm_pageout_memorystatus_fb_factor_dr) >
2452                             (memorystatus_available_pages_critical *
2453                             vm_pageout_memorystatus_fb_factor_nr)) {
2454                                 *grab_anonymous = FALSE;
2455
2456                                 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2457                         }
2458                 }
2459                 if (*grab_anonymous) {
2460                         VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2461                 }
2462         }
2463 #endif /* CONFIG_JETSAM */
2464
2465 want_anonymous:
2466         if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2467                 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2468                         m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2469
2470                         assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2471                         *anons_grabbed = 0;
2472
2473                         if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2474                                 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2475                                         if ((++(*reactivated_this_call) % 100)) {
2476                                                 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2477
2478                                                 vm_page_activate(m);
2479                                                 counter_inc(&vm_statistics_reactivations);
2480 #if CONFIG_BACKGROUND_QUEUE
2481 #if DEVELOPMENT || DEBUG
2482                                                 if (*is_page_from_bg_q == TRUE) {
2483                                                         if (m_object->internal) {
2484                                                                 vm_pageout_rejected_bq_internal++;
2485                                                         } else {
2486                                                                 vm_pageout_rejected_bq_external++;
2487                                                         }
2488                                                 }
2489 #endif /* DEVELOPMENT || DEBUG */
2490 #endif /* CONFIG_BACKGROUND_QUEUE */
2491                                                 vm_pageout_state.vm_pageout_inactive_used++;
2492
2493                                                 m = NULL;
2494                                                 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2495
2496                                                 goto found_page;
2497                                         }
2498
2499                                         /*
2500                                          * steal 1 of the file backed pages even if
2501                                          * we are under the limit that has been set
2502                                          * for a healthy filecache
2503                                          */
2504                                 }
2505                         }
2506                         goto found_page;
2507                 }
2508         }
2509         if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2510                 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2511
2512                 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2513                 *anons_grabbed += 1;
2514
2515                 goto found_page;
2516         }
2517
2518         m = NULL;
2519
2520 found_page:
2521         *victim_page = m;
2522
2523         return retval;
2524 }
2525
2526 /*
2527  * This function is called only from vm_pageout_scan and
2528  * it will put a page back on the active/inactive queue
2529  * if we can't reclaim it for some reason.
2530  */
2531 static void
2532 vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2533 {
2534         if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2535                 vm_page_enqueue_inactive(m, FALSE);
2536         } else {
2537                 vm_page_activate(m);
2538         }
2539
2540 #if CONFIG_BACKGROUND_QUEUE
2541 #if DEVELOPMENT || DEBUG
2542         vm_object_t m_object = VM_PAGE_OBJECT(m);
2543
2544         if (page_from_bg_q == TRUE) {
2545                 if (m_object->internal) {
2546                         vm_pageout_rejected_bq_internal++;
2547                 } else {
2548                         vm_pageout_rejected_bq_external++;
2549                 }
2550         }
2551 #endif /* DEVELOPMENT || DEBUG */
2552 #endif /* CONFIG_BACKGROUND_QUEUE */
2553 }
2554
2555 /*
2556  * This function is called only from vm_pageout_scan and
2557  * it will try to grab the victim page's VM object (m_object)
2558  * which differs from the previous victim page's object (object).
2559  */
2560 static int
2561 vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2562 {
2563         struct vm_speculative_age_q *sq;
2564
2565         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2566
2567         /*
2568          * the object associated with candidate page is
2569          * different from the one we were just working
2570          * with... dump the lock if we still own it
2571          */
2572         if (*object != NULL) {
2573                 vm_object_unlock(*object);
2574                 *object = NULL;
2575         }
2576         /*
2577          * Try to lock object; since we've alread got the
2578          * page queues lock, we can only 'try' for this one.
2579          * if the 'try' fails, we need to do a mutex_pause
2580          * to allow the owner of the object lock a chance to
2581          * run... otherwise, we're likely to trip over this
2582          * object in the same state as we work our way through
2583          * the queue... clumps of pages associated with the same
2584          * object are fairly typical on the inactive and active queues
2585          */
2586         if (!vm_object_lock_try_scan(m_object)) {
2587                 vm_page_t m_want = NULL;
2588
2589                 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2590
2591                 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2592                         VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2593                 }
2594
2595                 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2596
2597                 m->vmp_reference = FALSE;
2598
2599                 if (!m_object->object_is_shared_cache) {
2600                         /*
2601                          * don't apply this optimization if this is the shared cache
2602                          * object, it's too easy to get rid of very hot and important
2603                          * pages...
2604                          * m->vmp_object must be stable since we hold the page queues lock...
2605                          * we can update the scan_collisions field sans the object lock
2606                          * since it is a separate field and this is the only spot that does
2607                          * a read-modify-write operation and it is never executed concurrently...
2608                          * we can asynchronously set this field to 0 when creating a UPL, so it
2609                          * is possible for the value to be a bit non-determistic, but that's ok
2610                          * since it's only used as a hint
2611                          */
2612                         m_object->scan_collisions = 1;
2613                 }
2614                 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2615                         m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2616                 } else if (!vm_page_queue_empty(&sq->age_q)) {
2617                         m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2618                 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2619                     !vm_page_queue_empty(&vm_page_queue_inactive)) {
2620                         m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2621                 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2622                         m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2623                 }
2624
2625                 /*
2626                  * this is the next object we're going to be interested in
2627                  * try to make sure its available after the mutex_pause
2628                  * returns control
2629                  */
2630                 if (m_want) {
2631                         vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2632                 }
2633
2634                 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2635
2636                 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2637         } else {
2638                 *object = m_object;
2639                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2640         }
2641
2642         return VM_PAGEOUT_SCAN_PROCEED;
2643 }
2644
2645 /*
2646  * This function is called only from vm_pageout_scan and
2647  * it notices that pageout scan may be rendered ineffective
2648  * due to a FS deadlock and will jetsam a process if possible.
2649  * If jetsam isn't supported, it'll move the page to the active
2650  * queue to try and get some different pages pushed onwards so
2651  * we can try to get out of this scenario.
2652  */
2653 static void
2654 vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2655     int *delayed_unlock, boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2656 {
2657         struct  vm_pageout_queue *eq;
2658         vm_object_t cur_object = VM_OBJECT_NULL;
2659
2660         cur_object = *object;
2661
2662         eq = &vm_pageout_queue_external;
2663
2664         if (cur_object->internal == FALSE) {
2665                 /*
2666                  * we need to break up the following potential deadlock case...
2667                  *  a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2668                  *  b) The thread doing the writing is waiting for pages while holding the truncate lock
2669                  *  c) Most of the pages in the inactive queue belong to this file.
2670                  *
2671                  * we are potentially in this deadlock because...
2672                  *  a) the external pageout queue is throttled
2673                  *  b) we're done with the active queue and moved on to the inactive queue
2674                  *  c) we've got a dirty external page
2675                  *
2676                  * since we don't know the reason for the external pageout queue being throttled we
2677                  * must suspect that we are deadlocked, so move the current page onto the active queue
2678                  * in an effort to cause a page from the active queue to 'age' to the inactive queue
2679                  *
2680                  * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2681                  * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2682                  * pool the next time we select a victim page... if we can make enough new free pages,
2683                  * the deadlock will break, the external pageout queue will empty and it will no longer
2684                  * be throttled
2685                  *
2686                  * if we have jetsam configured, keep a count of the pages reactivated this way so
2687                  * that we can try to find clean pages in the active/inactive queues before
2688                  * deciding to jetsam a process
2689                  */
2690                 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2691
2692                 vm_page_check_pageable_safe(m);
2693                 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2694                 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2695                 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2696                 vm_page_active_count++;
2697                 vm_page_pageable_external_count++;
2698
2699                 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2700
2701 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2702
2703 #pragma unused(force_anonymous)
2704
2705                 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2706
2707                 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2708                         *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2709                         /*
2710                          * Possible deadlock scenario so request jetsam action
2711                          */
2712
2713                         assert(cur_object);
2714                         vm_object_unlock(cur_object);
2715
2716                         cur_object = VM_OBJECT_NULL;
2717
2718                         /*
2719                          * VM pageout scan needs to know we have dropped this lock and so set the
2720                          * object variable we got passed in to NULL.
2721                          */
2722                         *object = VM_OBJECT_NULL;
2723
2724                         vm_page_unlock_queues();
2725
2726                         VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
2727                             vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2728
2729                         /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
2730                         if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) {
2731                                 VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
2732                         }
2733
2734                         VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
2735                             vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2736
2737                         vm_page_lock_queues();
2738                         *delayed_unlock = 1;
2739                 }
2740 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2741
2742 #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2743 #pragma unused(delayed_unlock)
2744
2745                 *force_anonymous = TRUE;
2746 #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2747         } else {
2748                 vm_page_activate(m);
2749                 counter_inc(&vm_statistics_reactivations);
2750
2751 #if CONFIG_BACKGROUND_QUEUE
2752 #if DEVELOPMENT || DEBUG
2753                 if (is_page_from_bg_q == TRUE) {
2754                         if (cur_object->internal) {
2755                                 vm_pageout_rejected_bq_internal++;
2756                         } else {
2757                                 vm_pageout_rejected_bq_external++;
2758                         }
2759                 }
2760 #endif /* DEVELOPMENT || DEBUG */
2761 #endif /* CONFIG_BACKGROUND_QUEUE */
2762
2763                 vm_pageout_state.vm_pageout_inactive_used++;
2764         }
2765 }
2766
2767
2768 void
2769 vm_page_balance_inactive(int max_to_move)
2770 {
2771         vm_page_t m;
2772
2773         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2774
2775         if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2776                 /*
2777                  * It is likely that the hibernation code path is
2778                  * dealing with these very queues as we are about
2779                  * to move pages around in/from them and completely
2780                  * change the linkage of the pages.
2781                  *
2782                  * And so we skip the rebalancing of these queues.
2783                  */
2784                 return;
2785         }
2786         vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2787             vm_page_inactive_count +
2788             vm_page_speculative_count);
2789
2790         while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2791                 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2792
2793                 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2794
2795                 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2796                 assert(!m->vmp_laundry);
2797                 assert(VM_PAGE_OBJECT(m) != kernel_object);
2798                 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2799
2800                 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2801
2802                 /*
2803                  * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2804                  *
2805                  * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2806                  * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2807                  * new reference happens. If no futher references happen on the page after that remote TLB flushes
2808                  * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2809                  * by pageout_scan, which is just fine since the last reference would have happened quite far
2810                  * in the past (TLB caches don't hang around for very long), and of course could just as easily
2811                  * have happened before we moved the page
2812                  */
2813                 if (m->vmp_pmapped == TRUE) {
2814                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2815                 }
2816
2817                 /*
2818                  * The page might be absent or busy,
2819                  * but vm_page_deactivate can handle that.
2820                  * FALSE indicates that we don't want a H/W clear reference
2821                  */
2822                 vm_page_deactivate_internal(m, FALSE);
2823         }
2824 }
2825
2826
2827 /*
2828  *      vm_pageout_scan does the dirty work for the pageout daemon.
2829  *      It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2830  *      held and vm_page_free_wanted == 0.
2831  */
2832 void
2833 vm_pageout_scan(void)
2834 {
2835         unsigned int loop_count = 0;
2836         unsigned int inactive_burst_count = 0;
2837         unsigned int reactivated_this_call;
2838         unsigned int reactivate_limit;
2839         vm_page_t   local_freeq = NULL;
2840         int         local_freed = 0;
2841         int         delayed_unlock;
2842         int         delayed_unlock_limit = 0;
2843         int         refmod_state = 0;
2844         int     vm_pageout_deadlock_target = 0;
2845         struct  vm_pageout_queue *iq;
2846         struct  vm_pageout_queue *eq;
2847         struct  vm_speculative_age_q *sq;
2848         struct  flow_control    flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2849         boolean_t inactive_throttled = FALSE;
2850         vm_object_t     object = NULL;
2851         uint32_t        inactive_reclaim_run;
2852         boolean_t       grab_anonymous = FALSE;
2853         boolean_t       force_anonymous = FALSE;
2854         boolean_t       force_speculative_aging = FALSE;
2855         int             anons_grabbed = 0;
2856         int             page_prev_q_state = 0;
2857         boolean_t       page_from_bg_q = FALSE;
2858         uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
2859         vm_object_t     m_object = VM_OBJECT_NULL;
2860         int             retval = 0;
2861         boolean_t       lock_yield_check = FALSE;
2862
2863
2864         VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
2865             vm_pageout_vminfo.vm_pageout_freed_speculative,
2866             vm_pageout_state.vm_pageout_inactive_clean,
2867             vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
2868             vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
2869
2870         flow_control.state = FCS_IDLE;
2871         iq = &vm_pageout_queue_internal;
2872         eq = &vm_pageout_queue_external;
2873         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2874
2875         /* Ask the pmap layer to return any pages it no longer needs. */
2876         uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
2877
2878         vm_page_lock_queues();
2879
2880         vm_page_wire_count -= pmap_wired_pages_freed;
2881
2882         delayed_unlock = 1;
2883
2884         /*
2885          *      Calculate the max number of referenced pages on the inactive
2886          *      queue that we will reactivate.
2887          */
2888         reactivated_this_call = 0;
2889         reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
2890             vm_page_inactive_count);
2891         inactive_reclaim_run = 0;
2892
2893         vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2894
2895         /*
2896          *      We must limit the rate at which we send pages to the pagers
2897          *      so that we don't tie up too many pages in the I/O queues.
2898          *      We implement a throttling mechanism using the laundry count
2899          *      to limit the number of pages outstanding to the default
2900          *      and external pagers.  We can bypass the throttles and look
2901          *      for clean pages if the pageout queues don't drain in a timely
2902          *      fashion since this may indicate that the pageout paths are
2903          *      stalled waiting for memory, which only we can provide.
2904          */
2905
2906         vps_init_page_targets();
2907         assert(object == NULL);
2908         assert(delayed_unlock != 0);
2909
2910         for (;;) {
2911                 vm_page_t m;
2912
2913                 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
2914
2915                 if (lock_yield_check) {
2916                         lock_yield_check = FALSE;
2917
2918                         if (delayed_unlock++ > delayed_unlock_limit) {
2919                                 int freed = local_freed;
2920
2921                                 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2922                                     VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2923                                 if (freed == 0) {
2924                                         lck_mtx_yield(&vm_page_queue_lock);
2925                                 }
2926                         } else if (vm_pageout_scan_wants_object) {
2927                                 vm_page_unlock_queues();
2928                                 mutex_pause(0);
2929                                 vm_page_lock_queues();
2930                         }
2931                 }
2932
2933                 if (vm_upl_wait_for_pages < 0) {
2934                         vm_upl_wait_for_pages = 0;
2935                 }
2936
2937                 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
2938
2939                 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
2940                         delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
2941                 }
2942
2943                 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
2944
2945                 assert(delayed_unlock);
2946
2947                 /*
2948                  * maintain our balance
2949                  */
2950                 vm_page_balance_inactive(1);
2951
2952
2953                 /**********************************************************************
2954                 * above this point we're playing with the active and secluded queues
2955                 * below this point we're playing with the throttling mechanisms
2956                 * and the inactive queue
2957                 **********************************************************************/
2958
2959                 if (vm_page_free_count + local_freed >= vm_page_free_target) {
2960                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2961
2962                         vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2963                             VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2964                         /*
2965                          * make sure the pageout I/O threads are running
2966                          * throttled in case there are still requests
2967                          * in the laundry... since we have met our targets
2968                          * we don't need the laundry to be cleaned in a timely
2969                          * fashion... so let's avoid interfering with foreground
2970                          * activity
2971                          */
2972                         vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2973
2974                         lck_mtx_lock(&vm_page_queue_free_lock);
2975
2976                         if ((vm_page_free_count >= vm_page_free_target) &&
2977                             (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2978                                 /*
2979                                  * done - we have met our target *and*
2980                                  * there is no one waiting for a page.
2981                                  */
2982 return_from_scan:
2983                                 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2984
2985                                 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
2986                                     vm_pageout_state.vm_pageout_inactive,
2987                                     vm_pageout_state.vm_pageout_inactive_used, 0, 0);
2988                                 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
2989                                     vm_pageout_vminfo.vm_pageout_freed_speculative,
2990                                     vm_pageout_state.vm_pageout_inactive_clean,
2991                                     vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
2992                                     vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
2993
2994                                 return;
2995                         }
2996                         lck_mtx_unlock(&vm_page_queue_free_lock);
2997                 }
2998
2999                 /*
3000                  * Before anything, we check if we have any ripe volatile
3001                  * objects around. If so, try to purge the first object.
3002                  * If the purge fails, fall through to reclaim a page instead.
3003                  * If the purge succeeds, go back to the top and reevaluate
3004                  * the new memory situation.
3005                  */
3006                 retval = vps_purge_object();
3007
3008                 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3009                         /*
3010                          * Success
3011                          */
3012                         if (object != NULL) {
3013                                 vm_object_unlock(object);
3014                                 object = NULL;
3015                         }
3016
3017                         lock_yield_check = FALSE;
3018                         continue;
3019                 }
3020
3021                 /*
3022                  * If our 'aged' queue is empty and we have some speculative pages
3023                  * in the other queues, let's go through and see if we need to age
3024                  * them.
3025                  *
3026                  * If we succeeded in aging a speculative Q or just that everything
3027                  * looks normal w.r.t queue age and queue counts, we keep going onward.
3028                  *
3029                  * If, for some reason, we seem to have a mismatch between the spec.
3030                  * page count and the page queues, we reset those variables and
3031                  * restart the loop (LD TODO: Track this better?).
3032                  */
3033                 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3034                         retval = vps_age_speculative_queue(force_speculative_aging);
3035
3036                         if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3037                                 lock_yield_check = FALSE;
3038                                 continue;
3039                         }
3040                 }
3041                 force_speculative_aging = FALSE;
3042
3043                 /*
3044                  * Check to see if we need to evict objects from the cache.
3045                  *
3046                  * Note: 'object' here doesn't have anything to do with
3047                  * the eviction part. We just need to make sure we have dropped
3048                  * any object lock we might be holding if we need to go down
3049                  * into the eviction logic.
3050                  */
3051                 retval = vps_object_cache_evict(&object);
3052
3053                 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3054                         lock_yield_check = FALSE;
3055                         continue;
3056                 }
3057
3058
3059                 /*
3060                  * Calculate our filecache_min that will affect the loop
3061                  * going forward.
3062                  */
3063                 vps_calculate_filecache_min();
3064
3065                 /*
3066                  * LD TODO: Use a structure to hold all state variables for a single
3067                  * vm_pageout_scan iteration and pass that structure to this function instead.
3068                  */
3069                 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3070                     &delayed_unlock, &local_freeq, &local_freed,
3071                     &vm_pageout_deadlock_target, inactive_burst_count);
3072
3073                 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3074                         if (loop_count >= vm_page_inactive_count) {
3075                                 loop_count = 0;
3076                         }
3077
3078                         inactive_burst_count = 0;
3079
3080                         assert(object == NULL);
3081                         assert(delayed_unlock != 0);
3082
3083                         lock_yield_check = FALSE;
3084                         continue;
3085                 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3086                         goto return_from_scan;
3087                 }
3088
3089                 flow_control.state = FCS_IDLE;
3090
3091                 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3092                     vm_pageout_inactive_external_forced_reactivate_limit);
3093                 loop_count++;
3094                 inactive_burst_count++;
3095                 vm_pageout_state.vm_pageout_inactive++;
3096
3097                 /*
3098                  * Choose a victim.
3099                  */
3100
3101                 m = NULL;
3102                 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3103
3104                 if (m == NULL) {
3105                         if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3106                                 inactive_burst_count = 0;
3107
3108                                 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3109                                         VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3110                                 }
3111
3112                                 lock_yield_check = TRUE;
3113                                 continue;
3114                         }
3115
3116                         /*
3117                          * if we've gotten here, we have no victim page.
3118                          * check to see if we've not finished balancing the queues
3119                          * or we have a page on the aged speculative queue that we
3120                          * skipped due to force_anonymous == TRUE.. or we have
3121                          * speculative  pages that we can prematurely age... if
3122                          * one of these cases we'll keep going, else panic
3123                          */
3124                         force_anonymous = FALSE;
3125                         VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3126
3127                         if (!vm_page_queue_empty(&sq->age_q)) {
3128                                 lock_yield_check = TRUE;
3129                                 continue;
3130                         }
3131
3132                         if (vm_page_speculative_count) {
3133                                 force_speculative_aging = TRUE;
3134                                 lock_yield_check = TRUE;
3135                                 continue;
3136                         }
3137                         panic("vm_pageout: no victim");
3138
3139                         /* NOTREACHED */
3140                 }
3141
3142                 assert(VM_PAGE_PAGEABLE(m));
3143                 m_object = VM_PAGE_OBJECT(m);
3144                 force_anonymous = FALSE;
3145
3146                 page_prev_q_state = m->vmp_q_state;
3147                 /*
3148                  * we just found this page on one of our queues...
3149                  * it can't also be on the pageout queue, so safe
3150                  * to call vm_page_queues_remove
3151                  */
3152                 vm_page_queues_remove(m, TRUE);
3153
3154                 assert(!m->vmp_laundry);
3155                 assert(!m->vmp_private);
3156                 assert(!m->vmp_fictitious);
3157                 assert(m_object != kernel_object);
3158                 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3159
3160                 vm_pageout_vminfo.vm_pageout_considered_page++;
3161
3162                 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3163
3164                 /*
3165                  * check to see if we currently are working
3166                  * with the same object... if so, we've
3167                  * already got the lock
3168                  */
3169                 if (m_object != object) {
3170                         boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3171
3172                         /*
3173                          * vps_switch_object() will always drop the 'object' lock first
3174                          * and then try to acquire the 'm_object' lock. So 'object' has to point to
3175                          * either 'm_object' or NULL.
3176                          */
3177                         retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3178
3179                         if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3180                                 lock_yield_check = TRUE;
3181                                 continue;
3182                         }
3183                 }
3184                 assert(m_object == object);
3185                 assert(VM_PAGE_OBJECT(m) == m_object);
3186
3187                 if (m->vmp_busy) {
3188                         /*
3189                          *      Somebody is already playing with this page.
3190                          *      Put it back on the appropriate queue
3191                          *
3192                          */
3193                         VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3194
3195                         if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3196                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3197                         }
3198
3199                         vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3200
3201                         lock_yield_check = TRUE;
3202                         continue;
3203                 }
3204
3205                 /*
3206                  *   if (m->vmp_cleaning && !m->vmp_free_when_done)
3207                  *      If already cleaning this page in place
3208                  *      just leave it off the paging queues.
3209                  *      We can leave the page mapped, and upl_commit_range
3210                  *      will put it on the clean queue.
3211                  *
3212                  *   if (m->vmp_free_when_done && !m->vmp_cleaning)
3213                  *      an msync INVALIDATE is in progress...
3214                  *      this page has been marked for destruction
3215                  *      after it has been cleaned,
3216                  *      but not yet gathered into a UPL
3217                  *      where 'cleaning' will be set...
3218                  *      just leave it off the paging queues
3219                  *
3220                  *   if (m->vmp_free_when_done && m->vmp_cleaning)
3221                  *      an msync INVALIDATE is in progress
3222                  *      and the UPL has already gathered this page...
3223                  *      just leave it off the paging queues
3224                  */
3225                 if (m->vmp_free_when_done || m->vmp_cleaning) {
3226                         lock_yield_check = TRUE;
3227                         continue;
3228                 }
3229
3230
3231                 /*
3232                  *      If it's absent, in error or the object is no longer alive,
3233                  *      we can reclaim the page... in the no longer alive case,
3234                  *      there are 2 states the page can be in that preclude us
3235                  *      from reclaiming it - busy or cleaning - that we've already
3236                  *      dealt with
3237                  */
3238                 if (m->vmp_absent || m->vmp_error || !object->alive) {
3239                         if (m->vmp_absent) {
3240                                 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3241                         } else if (!object->alive) {
3242                                 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3243                         } else {
3244                                 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3245                         }
3246 reclaim_page:
3247                         if (vm_pageout_deadlock_target) {
3248                                 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3249                                 vm_pageout_deadlock_target--;
3250                         }
3251
3252                         DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3253
3254                         if (object->internal) {
3255                                 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3256                         } else {
3257                                 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3258                         }
3259                         assert(!m->vmp_cleaning);
3260                         assert(!m->vmp_laundry);
3261
3262                         if (!object->internal &&
3263                             object->pager != NULL &&
3264                             object->pager->mo_pager_ops == &shared_region_pager_ops) {
3265                                 shared_region_pager_reclaimed++;
3266                         }
3267
3268                         m->vmp_busy = TRUE;
3269
3270                         /*
3271                          * remove page from object here since we're already
3272                          * behind the object lock... defer the rest of the work
3273                          * we'd normally do in vm_page_free_prepare_object
3274                          * until 'vm_page_free_list' is called
3275                          */
3276                         if (m->vmp_tabled) {
3277                                 vm_page_remove(m, TRUE);
3278                         }
3279
3280                         assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3281                         m->vmp_snext = local_freeq;
3282                         local_freeq = m;
3283                         local_freed++;
3284
3285                         if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3286                                 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3287                         } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3288                                 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3289                         } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3290                                 vm_pageout_vminfo.vm_pageout_freed_internal++;
3291                         } else {
3292                                 vm_pageout_vminfo.vm_pageout_freed_external++;
3293                         }
3294
3295                         inactive_burst_count = 0;
3296
3297                         lock_yield_check = TRUE;
3298                         continue;
3299                 }
3300                 if (object->copy == VM_OBJECT_NULL) {
3301                         /*
3302                          * No one else can have any interest in this page.
3303                          * If this is an empty purgable object, the page can be
3304                          * reclaimed even if dirty.
3305                          * If the page belongs to a volatile purgable object, we
3306                          * reactivate it if the compressor isn't active.
3307                          */
3308                         if (object->purgable == VM_PURGABLE_EMPTY) {
3309                                 if (m->vmp_pmapped == TRUE) {
3310                                         /* unmap the page */
3311                                         refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3312                                         if (refmod_state & VM_MEM_MODIFIED) {
3313                                                 SET_PAGE_DIRTY(m, FALSE);
3314                                         }
3315                                 }
3316                                 if (m->vmp_dirty || m->vmp_precious) {
3317                                         /* we saved the cost of cleaning this page ! */
3318                                         vm_page_purged_count++;
3319                                 }
3320                                 goto reclaim_page;
3321                         }
3322
3323                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3324                                 /*
3325                                  * With the VM compressor, the cost of
3326                                  * reclaiming a page is much lower (no I/O),
3327                                  * so if we find a "volatile" page, it's better
3328                                  * to let it get compressed rather than letting
3329                                  * it occupy a full page until it gets purged.
3330                                  * So no need to check for "volatile" here.
3331                                  */
3332                         } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3333                                 /*
3334                                  * Avoid cleaning a "volatile" page which might
3335                                  * be purged soon.
3336                                  */
3337
3338                                 /* if it's wired, we can't put it on our queue */
3339                                 assert(!VM_PAGE_WIRED(m));
3340
3341                                 /* just stick it back on! */
3342                                 reactivated_this_call++;
3343
3344                                 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3345                                         VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3346                                 }
3347
3348                                 goto reactivate_page;
3349                         }
3350                 }
3351                 /*
3352                  *      If it's being used, reactivate.
3353                  *      (Fictitious pages are either busy or absent.)
3354                  *      First, update the reference and dirty bits
3355                  *      to make sure the page is unreferenced.
3356                  */
3357                 refmod_state = -1;
3358
3359                 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3360                         refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3361
3362                         if (refmod_state & VM_MEM_REFERENCED) {
3363                                 m->vmp_reference = TRUE;
3364                         }
3365                         if (refmod_state & VM_MEM_MODIFIED) {
3366                                 SET_PAGE_DIRTY(m, FALSE);
3367                         }
3368                 }
3369
3370                 if (m->vmp_reference || m->vmp_dirty) {
3371                         /* deal with a rogue "reusable" page */
3372                         VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3373                 }
3374
3375                 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3376                         vm_pageout_state.vm_page_xpmapped_min = 0;
3377                 } else {
3378                         vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3379                 }
3380
3381                 if (!m->vmp_no_cache &&
3382                     page_from_bg_q == FALSE &&
3383                     (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3384                     (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3385                         /*
3386                          * The page we pulled off the inactive list has
3387                          * been referenced.  It is possible for other
3388                          * processors to be touching pages faster than we
3389                          * can clear the referenced bit and traverse the
3390                          * inactive queue, so we limit the number of
3391                          * reactivations.
3392                          */
3393                         if (++reactivated_this_call >= reactivate_limit) {
3394                                 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3395                         } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3396                                 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3397                         } else {
3398                                 uint32_t isinuse;
3399
3400                                 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3401                                         VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3402                                 }
3403
3404                                 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3405 reactivate_page:
3406                                 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3407                                     vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3408                                         /*
3409                                          * no explicit mappings of this object exist
3410                                          * and it's not open via the filesystem
3411                                          */
3412                                         vm_page_deactivate(m);
3413                                         VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3414                                 } else {
3415                                         /*
3416                                          * The page was/is being used, so put back on active list.
3417                                          */
3418                                         vm_page_activate(m);
3419                                         counter_inc(&vm_statistics_reactivations);
3420                                         inactive_burst_count = 0;
3421                                 }
3422 #if CONFIG_BACKGROUND_QUEUE
3423 #if DEVELOPMENT || DEBUG
3424                                 if (page_from_bg_q == TRUE) {
3425                                         if (m_object->internal) {
3426                                                 vm_pageout_rejected_bq_internal++;
3427                                         } else {
3428                                                 vm_pageout_rejected_bq_external++;
3429                                         }
3430                                 }
3431 #endif /* DEVELOPMENT || DEBUG */
3432 #endif /* CONFIG_BACKGROUND_QUEUE */
3433
3434                                 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3435                                         VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3436                                 }
3437                                 vm_pageout_state.vm_pageout_inactive_used++;
3438
3439                                 lock_yield_check = TRUE;
3440                                 continue;
3441                         }
3442                         /*
3443                          * Make sure we call pmap_get_refmod() if it
3444                          * wasn't already called just above, to update
3445                          * the dirty bit.
3446                          */
3447                         if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3448                                 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3449                                 if (refmod_state & VM_MEM_MODIFIED) {
3450                                         SET_PAGE_DIRTY(m, FALSE);
3451                                 }
3452                         }
3453                 }
3454
3455                 /*
3456                  * we've got a candidate page to steal...
3457                  *
3458                  * m->vmp_dirty is up to date courtesy of the
3459                  * preceding check for m->vmp_reference... if
3460                  * we get here, then m->vmp_reference had to be
3461                  * FALSE (or possibly "reactivate_limit" was
3462                  * exceeded), but in either case we called
3463                  * pmap_get_refmod() and updated both
3464                  * m->vmp_reference and m->vmp_dirty
3465                  *
3466                  * if it's dirty or precious we need to
3467                  * see if the target queue is throttled
3468                  * if it is, we need to skip over it by moving it back
3469                  * to the end of the inactive queue
3470                  */
3471
3472                 inactive_throttled = FALSE;
3473
3474                 if (m->vmp_dirty || m->vmp_precious) {
3475                         if (object->internal) {
3476                                 if (VM_PAGE_Q_THROTTLED(iq)) {
3477                                         inactive_throttled = TRUE;
3478                                 }
3479                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
3480                                 inactive_throttled = TRUE;
3481                         }
3482                 }
3483 throttle_inactive:
3484                 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3485                     object->internal && m->vmp_dirty &&
3486                     (object->purgable == VM_PURGABLE_DENY ||
3487                     object->purgable == VM_PURGABLE_NONVOLATILE ||
3488                     object->purgable == VM_PURGABLE_VOLATILE)) {
3489                         vm_page_check_pageable_safe(m);
3490                         assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3491                         vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3492                         m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3493                         vm_page_throttled_count++;
3494
3495                         VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3496
3497                         inactive_burst_count = 0;
3498
3499                         lock_yield_check = TRUE;
3500                         continue;
3501                 }
3502                 if (inactive_throttled == TRUE) {
3503                         vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
3504                             &delayed_unlock, &force_anonymous, page_from_bg_q);
3505
3506                         inactive_burst_count = 0;
3507
3508                         if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3509                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3510                         }
3511
3512                         lock_yield_check = TRUE;
3513                         continue;
3514                 }
3515
3516                 /*
3517                  * we've got a page that we can steal...
3518                  * eliminate all mappings and make sure
3519                  * we have the up-to-date modified state
3520                  *
3521                  * if we need to do a pmap_disconnect then we
3522                  * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3523                  * provides the true state atomically... the
3524                  * page was still mapped up to the pmap_disconnect
3525                  * and may have been dirtied at the last microsecond
3526                  *
3527                  * Note that if 'pmapped' is FALSE then the page is not
3528                  * and has not been in any map, so there is no point calling
3529                  * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
3530                  * of likely usage of the page.
3531                  */
3532                 if (m->vmp_pmapped == TRUE) {
3533                         int pmap_options;
3534
3535                         /*
3536                          * Don't count this page as going into the compressor
3537                          * if any of these are true:
3538                          * 1) compressed pager isn't enabled
3539                          * 2) Freezer enabled device with compressed pager
3540                          *    backend (exclusive use) i.e. most of the VM system
3541                          *    (including vm_pageout_scan) has no knowledge of
3542                          *    the compressor
3543                          * 3) This page belongs to a file and hence will not be
3544                          *    sent into the compressor
3545                          */
3546                         if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3547                             object->internal == FALSE) {
3548                                 pmap_options = 0;
3549                         } else if (m->vmp_dirty || m->vmp_precious) {
3550                                 /*
3551                                  * VM knows that this page is dirty (or
3552                                  * precious) and needs to be compressed
3553                                  * rather than freed.
3554                                  * Tell the pmap layer to count this page
3555                                  * as "compressed".
3556                                  */
3557                                 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3558                         } else {
3559                                 /*
3560                                  * VM does not know if the page needs to
3561                                  * be preserved but the pmap layer might tell
3562                                  * us if any mapping has "modified" it.
3563                                  * Let the pmap layer count this page
3564                                  * as compressed if and only if it has been
3565                                  * modified.
3566                                  */
3567                                 pmap_options =
3568                                     PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3569                         }
3570                         refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3571                             pmap_options,
3572                             NULL);
3573                         if (refmod_state & VM_MEM_MODIFIED) {
3574                                 SET_PAGE_DIRTY(m, FALSE);
3575                         }
3576                 }
3577
3578                 /*
3579                  * reset our count of pages that have been reclaimed
3580                  * since the last page was 'stolen'
3581                  */
3582                 inactive_reclaim_run = 0;
3583
3584                 /*
3585                  *      If it's clean and not precious, we can free the page.
3586                  */
3587                 if (!m->vmp_dirty && !m->vmp_precious) {
3588                         vm_pageout_state.vm_pageout_inactive_clean++;
3589
3590                         /*
3591                          * OK, at this point we have found a page we are going to free.
3592                          */
3593 #if CONFIG_PHANTOM_CACHE
3594                         if (!object->internal) {
3595                                 vm_phantom_cache_add_ghost(m);
3596                         }
3597 #endif
3598                         goto reclaim_page;
3599                 }
3600
3601                 /*
3602                  * The page may have been dirtied since the last check
3603                  * for a throttled target queue (which may have been skipped
3604                  * if the page was clean then).  With the dirty page
3605                  * disconnected here, we can make one final check.
3606                  */
3607                 if (object->internal) {
3608                         if (VM_PAGE_Q_THROTTLED(iq)) {
3609                                 inactive_throttled = TRUE;
3610                         }
3611                 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3612                         inactive_throttled = TRUE;
3613                 }
3614
3615                 if (inactive_throttled == TRUE) {
3616                         goto throttle_inactive;
3617                 }
3618
3619 #if VM_PRESSURE_EVENTS
3620 #if CONFIG_JETSAM
3621
3622                 /*
3623                  * If Jetsam is enabled, then the sending
3624                  * of memory pressure notifications is handled
3625                  * from the same thread that takes care of high-water
3626                  * and other jetsams i.e. the memorystatus_thread.
3627                  */
3628
3629 #else /* CONFIG_JETSAM */
3630
3631                 vm_pressure_response();
3632
3633 #endif /* CONFIG_JETSAM */
3634 #endif /* VM_PRESSURE_EVENTS */
3635
3636                 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3637                         VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3638                 }
3639
3640                 if (object->internal) {
3641                         vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3642                 } else {
3643                         vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3644                 }
3645
3646                 /*
3647                  * internal pages will go to the compressor...
3648                  * external pages will go to the appropriate pager to be cleaned
3649                  * and upon completion will end up on 'vm_page_queue_cleaned' which
3650                  * is a preferred queue to steal from
3651                  */
3652                 vm_pageout_cluster(m);
3653                 inactive_burst_count = 0;
3654
3655                 /*
3656                  * back to top of pageout scan loop
3657                  */
3658         }
3659 }
3660
3661
     /*
      *      vm_page_free_reserve:
      *
      *      Add "pages" to vm_page_free_reserved (clamped to its limit) and
      *      recompute vm_page_free_min, vm_page_free_target and
      *      vm_page_throttle_limit from the new reserve.
      *
      *      When VM_CONFIG_COMPRESSOR_IS_PRESENT, each call also adds
      *      COMPRESSOR_FREE_RESERVED_LIMIT to the reserve, and the clamp
      *      is raised by that same amount.
      */
3662 void
3663 vm_page_free_reserve(
3664         int pages)
3665 {
3666         int             free_after_reserve;
3667
3668         if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3669                 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3670                         vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3671                 } else {
3672                         vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3673                 }
3674         } else {
3675                 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3676                         vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3677                 } else {
3678                         vm_page_free_reserved += pages;
3679                 }
3680         }
             /*
              * what remains of the initial free count once the
              * reserve has been set aside
              */
3681         free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3682
3683         vm_page_free_min = vm_page_free_reserved +
3684             VM_PAGE_FREE_MIN(free_after_reserve);
3685
3686         if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3687                 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3688         }
3689
3690         vm_page_free_target = vm_page_free_reserved +
3691             VM_PAGE_FREE_TARGET(free_after_reserve);
3692
3693         if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3694                 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3695         }
3696
             /*
              * keep the target at least 5 pages above the minimum;
              * this is applied after the limit clamp above, so it
              * takes precedence over VM_PAGE_FREE_TARGET_LIMIT
              */
3697         if (vm_page_free_target < vm_page_free_min + 5) {
3698                 vm_page_free_target = vm_page_free_min + 5;
3699         }
3700
             /*
              * target minus half of it (integer division)... for a
              * non-negative target that is half the target, rounded up
              */
3701         vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3702 }
3703
3704 /*
3705  *      vm_pageout is the high level pageout daemon.
      *
      *      vm_pageout_continue runs one pass of vm_pageout_scan(),
      *      then waits on &vm_page_free_wanted and blocks with itself
      *      as the continuation (the thread_block() call below does
      *      not return).
      *
      *      vm_pageout_running is TRUE for the duration of the scan;
      *      it is only written while vm_page_queue_free_lock is held.
3706  */
3707
3708 void
3709 vm_pageout_continue(void)
3710 {
3711         DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3712         VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3713
3714         lck_mtx_lock(&vm_page_queue_free_lock);
3715         vm_pageout_running = TRUE;
3716         lck_mtx_unlock(&vm_page_queue_free_lock);
3717
3718         vm_pageout_scan();
3719         /*
3720          * we hold both the vm_page_queue_free_lock
3721          * and the vm_page_queues_lock at this point
3722          */
3723         assert(vm_page_free_wanted == 0);
3724         assert(vm_page_free_wanted_privileged == 0);
             /*
              * register the wait before either lock is dropped
              * below
              */
3725         assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3726
3727         vm_pageout_running = FALSE;
3728 #if XNU_TARGET_OS_OSX
             /*
              * vm_pageout_wait() sets vm_pageout_waiter and sleeps
              * on its address... clear the flag and wake the sleepers
              */
3729         if (vm_pageout_waiter) {
3730                 vm_pageout_waiter = FALSE;
3731                 thread_wakeup((event_t)&vm_pageout_waiter);
3732         }
3733 #endif /* XNU_TARGET_OS_OSX */
3734
3735         lck_mtx_unlock(&vm_page_queue_free_lock);
3736         vm_page_unlock_queues();
3737
3738         thread_block((thread_continue_t)vm_pageout_continue);
3739         /*NOTREACHED*/
3740 }
3741
3742 #if XNU_TARGET_OS_OSX
     /*
      *      vm_pageout_wait:
      *
      *      Sleep, with vm_page_queue_free_lock as the interlock, until
      *      vm_pageout_running is seen to be FALSE.
      *
      *      "deadline" is handed unchanged to lck_mtx_sleep_deadline().
      *
      *      Returns KERN_SUCCESS once vm_pageout_running is FALSE
      *      (immediately, if it already is on entry), or
      *      KERN_OPERATION_TIMED_OUT as soon as a sleep returns
      *      anything other than THREAD_AWAKENED.
      */
3743 kern_return_t
3744 vm_pageout_wait(uint64_t deadline)
3745 {
3746         kern_return_t kr;
3747
3748         lck_mtx_lock(&vm_page_queue_free_lock);
3749         for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
                     /*
                      * vm_pageout_continue() clears this flag before
                      * issuing its wakeup, so it is set again on
                      * every pass through the loop
                      */
3750                 vm_pageout_waiter = TRUE;
3751                 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3752                             &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3753                             (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3754                         kr = KERN_OPERATION_TIMED_OUT;
3755                 }
3756         }
3757         lck_mtx_unlock(&vm_page_queue_free_lock);
3758
3759         return kr;
3760 }
3761 #endif /* XNU_TARGET_OS_OSX */
3762
3763
     /*
      *      vm_pageout_iothread_external_continue:
      *
      *      Drain q->pgo_pending.  Each page is pulled off the queue
      *      under the page queues lock, looked up again under its
      *      object's lock, and (if it is still a laundry page and the
      *      object still has a pager) handed to that pager with
      *      memory_object_data_return().
      *
      *      Once the queue is empty, the queue is marked idle and the
      *      thread waits on &q->pgo_pending, blocking with this function
      *      as the continuation (thread_block_parameter() does not
      *      return).
      */
3764 static void
3765 vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3766 {
3767         vm_page_t       m = NULL;
3768         vm_object_t     object;
3769         vm_object_offset_t offset;
3770         memory_object_t pager;
3771
3772         /* On systems with a compressor, the external IO thread clears its
3773          * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3774          * creation)
3775          */
3776         if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL) {
3777                 current_thread()->options &= ~TH_OPT_VMPRIV;
3778         }
3779
3780         vm_page_lockspin_queues();
3781
             /*
              * the page queues lock is held each time the loop
              * condition is evaluated... every 'continue' below
              * retakes it first
              */
3782         while (!vm_page_queue_empty(&q->pgo_pending)) {
3783                 q->pgo_busy = TRUE;
3784                 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3785
3786                 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3787                 VM_PAGE_CHECK(m);
3788                 /*
3789                  * grab a snapshot of the object and offset this
3790                  * page is tabled in so that we can relookup this
3791                  * page after we've taken the object lock - these
3792                  * fields are stable while we hold the page queues lock
3793                  * but as soon as we drop it, there is nothing to keep
3794                  * this page in this object... we hold an activity_in_progress
3795                  * on this object which will keep it from terminating
3796                  */
3797                 object = VM_PAGE_OBJECT(m);
3798                 offset = m->vmp_offset;
3799
3800                 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3801                 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3802
3803                 vm_page_unlock_queues();
3804
3805                 vm_object_lock(object);
3806
3807                 m = vm_page_lookup(object, offset);
3808
3809                 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3810                     !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3811                         /*
3812                          * it's either the same page that someone else has
3813                          * started cleaning (or it's finished cleaning or
3814                          * been put back on the pageout queue), or
3815                          * the page has been freed or we have found a
3816                          * new page at this offset... in all of these cases
3817                          * we merely need to release the activity_in_progress
3818                          * we took when we put the page on the pageout queue
3819                          */
3820                         vm_object_activity_end(object);
3821                         vm_object_unlock(object);
3822
3823                         vm_page_lockspin_queues();
3824                         continue;
3825                 }
3826                 pager = object->pager;
3827
3828                 if (pager == MEMORY_OBJECT_NULL) {
3829                         /*
3830                          * This pager has been destroyed by either
3831                          * memory_object_destroy or vm_object_destroy, and
3832                          * so there is nowhere for the page to go.
3833                          */
3834                         if (m->vmp_free_when_done) {
3835                                 /*
3836                                  * Just free the page... VM_PAGE_FREE takes
3837                                  * care of cleaning up all the state...
3838                                  * including doing the vm_pageout_throttle_up
3839                                  */
3840                                 VM_PAGE_FREE(m);
3841                         } else {
3842                                 vm_page_lockspin_queues();
3843
3844                                 vm_pageout_throttle_up(m);
3845                                 vm_page_activate(m);
3846
3847                                 vm_page_unlock_queues();
3848
3849                                 /*
3850                                  *      And we are done with it.
3851                                  */
3852                         }
3853                         vm_object_activity_end(object);
3854                         vm_object_unlock(object);
3855
3856                         vm_page_lockspin_queues();
3857                         continue;
3858                 }
3859 #if 0
3860                 /*
3861                  * we don't hold the page queue lock
3862                  * so this check isn't safe to make
3863                  */
3864                 VM_PAGE_CHECK(m);
3865 #endif
3866                 /*
3867                  * give back the activity_in_progress reference we
3868                  * took when we queued up this page and replace it
3869                  * with a paging_in_progress reference that will
3870                  * also keep the paging offset from changing and
3871                  * prevent the object from terminating
3872                  */
3873                 vm_object_activity_end(object);
3874                 vm_object_paging_begin(object);
3875                 vm_object_unlock(object);
3876
3877                 /*
3878                  * Send the data to the pager.
3879                  * any pageout clustering happens there
3880                  */
3881                 memory_object_data_return(pager,
3882                     m->vmp_offset + object->paging_offset,
3883                     PAGE_SIZE,
3884                     NULL,
3885                     NULL,
3886                     FALSE,
3887                     FALSE,
3888                     0);
3889
                     /*
                      * drop the paging_in_progress reference taken
                      * above now that the pager call has returned
                      */
3890                 vm_object_lock(object);
3891                 vm_object_paging_end(object);
3892                 vm_object_unlock(object);
3893
3894                 vm_pageout_io_throttle();
3895
3896                 vm_page_lockspin_queues();
3897         }
3898         q->pgo_busy = FALSE;
3899         q->pgo_idle = TRUE;
3900
             /*
              * register the wait while the page queues lock
              * is still held, then drop it and block
              */
3901         assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3902         vm_page_unlock_queues();
3903
3904         thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3905         /*NOTREACHED*/
3906 }
3907
3908
     /*
      * Number of successfully compressed pages that
      * vm_pageout_iothread_internal_continue() accumulates on its
      * local free list before handing them to vm_page_free_list().
      */
3909 #define         MAX_FREE_BATCH          32
3910 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3911                                      * this thread.
3912                                      */
3913
3914
     /*
      *      vm_pageout_iothread_internal_continue:
      *
      *      Worker loop for the pageout queue cq->q.  Each pass takes up
      *      to local_batch_size pages off q->pgo_pending under the page
      *      queues lock, runs vm_pageout_compress_page() on each of them
      *      with the lock dropped, and frees the pages that compressed
      *      successfully in batches of MAX_FREE_BATCH.
      *
      *      When a pass finds nothing pending, the queue is marked idle
      *      and the thread waits on (&q->pgo_pending + cq->id), blocking
      *      with this function as the continuation
      *      (thread_block_parameter() does not return).
      */
3915 void
3916 vm_pageout_iothread_internal_continue(struct cq *);
3917 void
3918 vm_pageout_iothread_internal_continue(struct cq *cq)
3919 {
3920         struct vm_pageout_queue *q;
3921         vm_page_t       m = NULL;
3922         boolean_t       pgo_draining;
3923         vm_page_t   local_q;
3924         int         local_cnt;
3925         vm_page_t   local_freeq = NULL;
3926         int         local_freed = 0;
3927         int         local_batch_size;
3928 #if DEVELOPMENT || DEBUG
3929         int       ncomps = 0;
3930         boolean_t marked_active = FALSE;
3931 #endif
3932         KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3933
3934         q = cq->q;
3935 #if __AMP__
3936         if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
3937                 local_batch_size = (q->pgo_maxlaundry >> 3);
3938                 local_batch_size = MAX(local_batch_size, 16);
3939         } else {
3940                 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
3941         }
3942 #else
3943         local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
3944 #endif
3945
3946 #if RECORD_THE_COMPRESSED_DATA
3947         if (q->pgo_laundry) {
3948                 c_compressed_record_init();
3949         }
3950 #endif
3951         while (TRUE) {
3952                 int     pages_left_on_q = 0;
3953
3954                 local_cnt = 0;
3955                 local_q = NULL;
3956
3957                 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3958
3959                 vm_page_lock_queues();
3960 #if DEVELOPMENT || DEBUG
                     /*
                      * done once per invocation, under the page queues
                      * lock... the first thread to become active stamps
                      * the start of the epoch
                      */
3961                 if (marked_active == FALSE) {
3962                         vmct_active++;
3963                         vmct_state[cq->id] = VMCT_ACTIVE;
3964                         marked_active = TRUE;
3965                         if (vmct_active == 1) {
3966                                 vm_compressor_epoch_start = mach_absolute_time();
3967                         }
3968                 }
3969 #endif
3970                 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3971
3972                 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
3973
                     /*
                      * move up to local_batch_size pages from the
                      * pending queue onto a private list chained
                      * through vmp_snext
                      */
3974                 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
3975                         vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3976                         assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3977                         VM_PAGE_CHECK(m);
3978
3979                         m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3980                         VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3981                         m->vmp_laundry = FALSE;
3982
3983                         m->vmp_snext = local_q;
3984                         local_q = m;
3985                         local_cnt++;
3986                 }
                     /*
                      * nothing was taken... leave the loop with the
                      * page queues lock still held
                      */
3987                 if (local_q == NULL) {
3988                         break;
3989                 }
3990
3991                 q->pgo_busy = TRUE;
3992
                     /*
                      * q->pgo_draining is sampled once here.  If the
                      * queue is not draining, vm_pageout_throttle_up_batch()
                      * is called for this batch right away; otherwise that
                      * call is deferred until the batch has been processed
                      * (see the end of the outer loop).
                      */
3993                 if ((pgo_draining = q->pgo_draining) == FALSE) {
3994                         vm_pageout_throttle_up_batch(q, local_cnt);
3995                         pages_left_on_q = q->pgo_laundry;
3996                 } else {
3997                         pages_left_on_q = q->pgo_laundry - local_cnt;
3998                 }
3999
4000                 vm_page_unlock_queues();
4001
4002 #if !RECORD_THE_COMPRESSED_DATA
                     /*
                      * at least another full batch is left and we are
                      * not the highest numbered thread... wake the thread
                      * with the next id (each thread waits on
                      * &q->pgo_pending + its own id)
                      */
4003                 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4004                         thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
4005                 }
4006 #endif
4007                 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4008
4009                 while (local_q) {
4010                         KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4011
4012                         m = local_q;
4013                         local_q = m->vmp_snext;
4014                         m->vmp_snext = NULL;
4015
                             /*
                              * only pages that compressed successfully are
                              * collected for freeing here
                              */
4016                         if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4017 #if DEVELOPMENT || DEBUG
4018                                 ncomps++;
4019 #endif
4020                                 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4021
4022                                 m->vmp_snext = local_freeq;
4023                                 local_freeq = m;
4024                                 local_freed++;
4025
4026                                 if (local_freed >= MAX_FREE_BATCH) {
4027                                         OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4028
4029                                         vm_page_free_list(local_freeq, TRUE);
4030
4031                                         local_freeq = NULL;
4032                                         local_freed = 0;
4033                                 }
4034                         }
4035 #if !CONFIG_JETSAM
                             /*
                              * while the free count is below
                              * COMPRESSOR_FREE_RESERVED_LIMIT: first release
                              * any pages still held on local_freeq and re-test;
                              * failing that, wait on
                              * &vm_page_free_wanted_privileged
                              */
4036                         while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4037                                 kern_return_t   wait_result;
4038                                 int             need_wakeup = 0;
4039
4040                                 if (local_freeq) {
4041                                         OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4042
4043                                         vm_page_free_list(local_freeq, TRUE);
4044                                         local_freeq = NULL;
4045                                         local_freed = 0;
4046
4047                                         continue;
4048                                 }
4049                                 lck_mtx_lock_spin(&vm_page_queue_free_lock);
4050
                                     /*
                                      * re-check now that the free lock is held
                                      */
4051                                 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
                                             /*
                                              * the first privileged waiter issues a
                                              * wakeup on &vm_page_free_wanted (the event
                                              * vm_pageout_continue() waits on), after
                                              * the free lock has been dropped
                                              */
4052                                         if (vm_page_free_wanted_privileged++ == 0) {
4053                                                 need_wakeup = 1;
4054                                         }
4055                                         wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4056
4057                                         lck_mtx_unlock(&vm_page_queue_free_lock);
4058
4059                                         if (need_wakeup) {
4060                                                 thread_wakeup((event_t)&vm_page_free_wanted);
4061                                         }
4062
4063                                         if (wait_result == THREAD_WAITING) {
4064                                                 thread_block(THREAD_CONTINUE_NULL);
4065                                         }
4066                                 } else {
4067                                         lck_mtx_unlock(&vm_page_queue_free_lock);
4068                                 }
4069                         }
4070 #endif
4071                 }
                     /*
                      * free whatever is left over from this batch
                      */
4072                 if (local_freeq) {
4073                         OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4074
4075                         vm_page_free_list(local_freeq, TRUE);
4076                         local_freeq = NULL;
4077                         local_freed = 0;
4078                 }
                     /*
                      * the deferred vm_pageout_throttle_up_batch() call
                      * for a batch taken while the queue was draining
                      */
4079                 if (pgo_draining == TRUE) {
4080                         vm_page_lockspin_queues();
4081                         vm_pageout_throttle_up_batch(q, local_cnt);
4082                         vm_page_unlock_queues();
4083                 }
4084         }
4085         KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4086
4087         /*
4088          * queue lock is held and our q is empty
4089          */
4090         q->pgo_busy = FALSE;
4091         q->pgo_idle = TRUE;
4092
4093         assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
4094 #if DEVELOPMENT || DEBUG
4095         if (marked_active == TRUE) {
4096                 vmct_active--;
4097                 vmct_state[cq->id] = VMCT_IDLE;
4098
4099                 if (vmct_active == 0) {
4100                         vm_compressor_epoch_stop = mach_absolute_time();
4101                         assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4102                             "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4103                             vm_compressor_epoch_start, vm_compressor_epoch_stop);
4104                         /* This interval includes intervals where one or more
4105                          * compressor threads were pre-empted
4106                          */
4107                         vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4108                 }
4109         }
4110 #endif
4111         vm_page_unlock_queues();
4112 #if DEVELOPMENT || DEBUG
             /*
              * per-thread statistics, indexed by cq->id, only
              * recorded when vm_compressor_time_thread is set
              */
4113         if (__improbable(vm_compressor_time_thread)) {
4114                 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4115                 vmct_stats.vmct_pages[cq->id] += ncomps;
4116                 vmct_stats.vmct_iterations[cq->id]++;
4117                 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4118                         vmct_stats.vmct_maxpages[cq->id] = ncomps;
4119                 }
4120                 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4121                         vmct_stats.vmct_minpages[cq->id] = ncomps;
4122                 }
4123         }
4124 #endif
4125
4126         KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4127
4128         thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4129         /*NOTREACHED*/
4130 }
4131
4132
     /*
      *      vm_pageout_compress_page:
      *
      *      Hand page "m" to its object's pager with
      *      vm_compressor_pager_put(), first creating a compressor pager
      *      for the object if it has no initialized pager.
      *
      *      "current_chead" and "scratch_buf" are passed through to
      *      vm_compressor_pager_put() unchanged.
      *
      *      The page must not be marked vmp_free_when_done or
      *      vmp_laundry, and the object is expected to carry an
      *      activity_in_progress reference (all asserted).  That
      *      reference is released on every return path.
      *
      *      Returns KERN_FAILURE if the object still has no usable pager,
      *      otherwise whatever vm_compressor_pager_put() returned.  On
      *      KERN_SUCCESS the page has been removed from its object if it
      *      was tabled, but it is not freed here.  On any failure the
      *      page is reactivated via vm_page_activate().
      */
4133 kern_return_t
4134 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4135 {
4136         vm_object_t     object;
4137         memory_object_t pager;
4138         int             compressed_count_delta;
4139         kern_return_t   retval;
4140
4141         object = VM_PAGE_OBJECT(m);
4142
4143         assert(!m->vmp_free_when_done);
4144         assert(!m->vmp_laundry);
4145
             /*
              * first look is made without the object lock... if
              * the pager doesn't look ready, everything is checked
              * again below with the lock held
              */
4146         pager = object->pager;
4147
4148         if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4149                 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4150
4151                 vm_object_lock(object);
4152
4153                 /*
4154                  * If there is no memory object for the page, create
4155                  * one and hand it to the compression pager.
4156                  */
4157
4158                 if (!object->pager_initialized) {
4159                         vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4160                 }
4161                 if (!object->pager_initialized) {
4162                         vm_object_compressor_pager_create(object);
4163                 }
4164
4165                 pager = object->pager;
4166
4167                 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4168                         /*
4169                          * Still no pager for the object,
4170                          * or the pager has been destroyed.
4171                          * Reactivate the page.
4172                          *
4173                          * Should only happen if there is no
4174                          * compression pager
4175                          */
4176                         PAGE_WAKEUP_DONE(m);
4177
4178                         vm_page_lockspin_queues();
4179                         vm_page_activate(m);
4180                         VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4181                         vm_page_unlock_queues();
4182
4183                         /*
4184                          *      And we are done with it.
4185                          */
4186                         vm_object_activity_end(object);
4187                         vm_object_unlock(object);
4188
                             /*
                              * NOTE(review): this return emits no
                              * DBG_FUNC_END to pair with the 0xe0400010
                              * DBG_FUNC_START above
                              */
4189                         return KERN_FAILURE;
4190                 }
4191                 vm_object_unlock(object);
4192
4193                 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4194         }
4195         assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4196         assert(object->activity_in_progress > 0);
4197
             /*
              * the object lock is not held across this call
              */
4198         retval = vm_compressor_pager_put(
4199                 pager,
4200                 m->vmp_offset + object->paging_offset,
4201                 VM_PAGE_GET_PHYS_PAGE(m),
4202                 current_chead,
4203                 scratch_buf,
4204                 &compressed_count_delta);
4205
4206         vm_object_lock(object);
4207
4208         assert(object->activity_in_progress > 0);
4209         assert(VM_PAGE_OBJECT(m) == object);
4210         assert( !VM_PAGE_WIRED(m));
4211
             /*
              * compressed_count_delta is reported whether or
              * not the put succeeded
              */
4212         vm_compressor_pager_count(pager,
4213             compressed_count_delta,
4214             FALSE,                       /* shared_lock */
4215             object);
4216
4217         if (retval == KERN_SUCCESS) {
4218                 /*
4219                  * If the object is purgeable, its owner's
4220                  * purgeable ledgers will be updated in
4221                  * vm_page_remove() but the page still
4222                  * contributes to the owner's memory footprint,
4223                  * so account for it as such.
4224                  */
4225                 if ((object->purgable != VM_PURGABLE_DENY ||
4226                     object->vo_ledger_tag) &&
4227                     object->vo_owner != NULL) {
4228                         /* one more compressed purgeable/tagged page */
4229                         vm_object_owner_compressed_update(object,
4230                             +1);
4231                 }
4232                 counter_inc(&vm_statistics_compressions);
4233
4234                 if (m->vmp_tabled) {
4235                         vm_page_remove(m, TRUE);
4236                 }
4237         } else {
                     /*
                      * the put failed... wake anyone waiting on the
                      * page and put it back on the active queue
                      */
4238                 PAGE_WAKEUP_DONE(m);
4239
4240                 vm_page_lockspin_queues();
4241
4242                 vm_page_activate(m);
4243                 vm_pageout_vminfo.vm_compressor_failed++;
4244
4245                 vm_page_unlock_queues();
4246         }
4247         vm_object_activity_end(object);
4248         vm_object_unlock(object);
4249
4250         return retval;
4251 }
4252
4253
4254 static void
4255 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4256 {
4257         uint32_t        policy;
4258
4259         if (hibernate_cleaning_in_progress == TRUE) {
4260                 req_lowpriority = FALSE;
4261         }
4262
4263         if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
4264                 vm_page_unlock_queues();
4265
4266                 if (req_lowpriority == TRUE) {
4267                         policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4268                         DTRACE_VM(laundrythrottle);
4269                 } else {
4270                         policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4271                         DTRACE_VM(laundryunthrottle);
4272                 }
4273                 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4274                     TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4275
4276                 vm_page_lock_queues();
4277                 eq->pgo_lowpriority = req_lowpriority;
4278         }
4279 }
4280
4281
4282 static void
4283 vm_pageout_iothread_external(void)
4284 {
4285         thread_t        self = current_thread();
4286
4287         self->options |= TH_OPT_VMPRIV;
4288
4289         DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4290
4291         proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4292             TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4293
4294         vm_page_lock_queues();
4295
4296         vm_pageout_queue_external.pgo_tid = self->thread_id;
4297         vm_pageout_queue_external.pgo_lowpriority = TRUE;
4298         vm_pageout_queue_external.pgo_inited = TRUE;
4299
4300         vm_page_unlock_queues();
4301
4302         vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
4303
4304         /*NOTREACHED*/
4305 }
4306
4307
4308 static void
4309 vm_pageout_iothread_internal(struct cq *cq)
4310 {
4311         thread_t        self = current_thread();
4312
4313         self->options |= TH_OPT_VMPRIV;
4314
4315         vm_page_lock_queues();
4316
4317         vm_pageout_queue_internal.pgo_tid = self->thread_id;
4318         vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4319         vm_pageout_queue_internal.pgo_inited = TRUE;
4320
4321         vm_page_unlock_queues();
4322
4323         if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4324                 thread_vm_bind_group_add();
4325         }
4326
4327 #if CONFIG_THREAD_GROUPS
4328         thread_group_vm_add();
4329 #endif /* CONFIG_THREAD_GROUPS */
4330
4331 #if __AMP__
4332         if (vm_compressor_ebound) {
4333                 /*
4334                  * Use the soft bound option for vm_compressor to allow it to run on
4335                  * P-cores if E-cluster is unavailable.
4336                  */
4337                 thread_bind_cluster_type(self, 'E', true);
4338         }
4339 #endif /* __AMP__ */
4340
4341         thread_set_thread_name(current_thread(), "VM_compressor");
4342 #if DEVELOPMENT || DEBUG
4343         vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
4344 #endif
4345         vm_pageout_iothread_internal_continue(cq);
4346
4347         /*NOTREACHED*/
4348 }
4349
4350 kern_return_t
4351 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4352 {
4353         if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4354                 return KERN_SUCCESS;
4355         } else {
4356                 return KERN_FAILURE; /* Already set */
4357         }
4358 }
4359
/*
 * Defined outside this file.  vm_pressure_response() stores the percentage
 * of available pages into memorystatus_level, and stops before evaluating
 * pressure-level transitions while memorystatus_manual_testing_on is set.
 */
extern boolean_t        memorystatus_manual_testing_on;
extern unsigned int     memorystatus_level;
4362
4363
4364 #if VM_PRESSURE_EVENTS
4365
4366 boolean_t vm_pressure_events_enabled = FALSE;
4367
4368 void
4369 vm_pressure_response(void)
4370 {
4371         vm_pressure_level_t     old_level = kVMPressureNormal;
4372         int                     new_level = -1;
4373         unsigned int            total_pages;
4374         uint64_t                available_memory = 0;
4375
4376         if (vm_pressure_events_enabled == FALSE) {
4377                 return;
4378         }
4379
4380 #if !XNU_TARGET_OS_OSX
4381
4382         available_memory = (uint64_t) memorystatus_available_pages;
4383
4384 #else /* !XNU_TARGET_OS_OSX */
4385
4386         available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4387         memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4388
4389 #endif /* !XNU_TARGET_OS_OSX */
4390
4391         total_pages = (unsigned int) atop_64(max_mem);
4392 #if CONFIG_SECLUDED_MEMORY
4393         total_pages -= vm_page_secluded_count;
4394 #endif /* CONFIG_SECLUDED_MEMORY */
4395         memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4396
4397         if (memorystatus_manual_testing_on) {
4398                 return;
4399         }
4400
4401         old_level = memorystatus_vm_pressure_level;
4402
4403         switch (memorystatus_vm_pressure_level) {
4404         case kVMPressureNormal:
4405         {
4406                 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4407                         new_level = kVMPressureCritical;
4408                 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4409                         new_level = kVMPressureWarning;
4410                 }
4411                 break;
4412         }
4413
4414         case kVMPressureWarning:
4415         case kVMPressureUrgent:
4416         {
4417                 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4418                         new_level = kVMPressureNormal;
4419                 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4420                         new_level = kVMPressureCritical;
4421                 }
4422                 break;
4423         }
4424
4425         case kVMPressureCritical:
4426         {
4427                 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4428                         new_level = kVMPressureNormal;
4429                 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4430                         new_level = kVMPressureWarning;
4431                 }
4432                 break;
4433         }
4434
4435         default:
4436                 return;
4437         }
4438
4439         if (new_level != -1) {
4440                 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4441
4442                 if (new_level != (int) old_level) {
4443                         VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4444                             new_level, old_level, 0, 0);
4445                 }
4446
4447                 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level)) {
4448                         if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4449                                 thread_wakeup(&vm_pressure_thread);
4450                         }
4451
4452                         if (old_level != memorystatus_vm_pressure_level) {
4453                                 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4454                         }
4455                 }
4456         }
4457 }
4458 #endif /* VM_PRESSURE_EVENTS */
4459
4460 /*
4461  * Function called by a kernel thread to either get the current pressure level or
4462  * wait until memory pressure changes from a given level.
4463  */
4464 kern_return_t
4465 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4466 {
4467 #if !VM_PRESSURE_EVENTS
4468
4469         return KERN_FAILURE;
4470
4471 #else /* VM_PRESSURE_EVENTS */
4472
4473         wait_result_t       wr = 0;
4474         vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4475
4476         if (pressure_level == NULL) {
4477                 return KERN_INVALID_ARGUMENT;
4478         }
4479
4480         if (*pressure_level == kVMPressureJetsam) {
4481                 if (!wait_for_pressure) {
4482                         return KERN_INVALID_ARGUMENT;
4483                 }
4484
4485                 lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
4486                 wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
4487                     THREAD_INTERRUPTIBLE);
4488                 if (wr == THREAD_WAITING) {
4489                         ++memorystatus_jetsam_fg_band_waiters;
4490                         lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4491                         wr = thread_block(THREAD_CONTINUE_NULL);
4492                 } else {
4493                         lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4494                 }
4495                 if (wr != THREAD_AWAKENED) {
4496                         return KERN_ABORTED;
4497                 }
4498                 *pressure_level = kVMPressureJetsam;
4499                 return KERN_SUCCESS;
4500         }
4501
4502         if (wait_for_pressure == TRUE) {
4503                 while (old_level == *pressure_level) {
4504                         wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4505                             THREAD_INTERRUPTIBLE);
4506                         if (wr == THREAD_WAITING) {
4507                                 wr = thread_block(THREAD_CONTINUE_NULL);
4508                         }
4509                         if (wr == THREAD_INTERRUPTED) {
4510                                 return KERN_ABORTED;
4511                         }
4512
4513                         if (wr == THREAD_AWAKENED) {
4514                                 old_level = memorystatus_vm_pressure_level;
4515                         }
4516                 }
4517         }
4518
4519         *pressure_level = old_level;
4520         return KERN_SUCCESS;
4521 #endif /* VM_PRESSURE_EVENTS */
4522 }
4523
4524 #if VM_PRESSURE_EVENTS
4525 void
4526 vm_pressure_thread(void)
4527 {
4528         static boolean_t thread_initialized = FALSE;
4529
4530         if (thread_initialized == TRUE) {
4531                 vm_pageout_state.vm_pressure_thread_running = TRUE;
4532                 consider_vm_pressure_events();
4533                 vm_pageout_state.vm_pressure_thread_running = FALSE;
4534         }
4535
4536         thread_set_thread_name(current_thread(), "VM_pressure");
4537         thread_initialized = TRUE;
4538         assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4539         thread_block((thread_continue_t)vm_pressure_thread);
4540 }
4541 #endif /* VM_PRESSURE_EVENTS */
4542
4543
4544 /*
4545  * called once per-second via "compute_averages"
4546  */
4547 void
4548 compute_pageout_gc_throttle(__unused void *arg)
4549 {
4550         if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4551                 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4552
4553                 thread_wakeup((event_t) &vm_pageout_garbage_collect);
4554         }
4555 }
4556
4557 /*
4558  * vm_pageout_garbage_collect can also be called when the zone allocator needs
4559  * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4560  * jetsams. We need to check if the zone map size is above its jetsam limit to
4561  * decide if this was indeed the case.
4562  *
4563  * We need to do this on a different thread because of the following reasons:
4564  *
4565  * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4566  * itself causing the system to hang. We perform synchronous jetsams if we're
4567  * leaking in the VM map entries zone, so the leaking process could be doing a
4568  * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4569  * jetsam itself. We also need the vm_map lock on the process termination path,
4570  * which would now lead the dying process to deadlock against itself.
4571  *
4572  * 2. The jetsam path might need to allocate zone memory itself. We could try
4573  * using the non-blocking variant of zalloc for this path, but we can still
4574  * end up trying to do a kernel_memory_allocate when the zone maps are almost
4575  * full.
4576  */
4577
4578 void
4579 vm_pageout_garbage_collect(int collect)
4580 {
4581         if (collect) {
4582                 if (zone_map_nearing_exhaustion()) {
4583                         /*
4584                          * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4585                          *
4586                          * Bail out after calling zone_gc (which triggers the
4587                          * zone-map-exhaustion jetsams). If we fall through, the subsequent
4588                          * operations that clear out a bunch of caches might allocate zone
4589                          * memory themselves (for eg. vm_map operations would need VM map
4590                          * entries). Since the zone map is almost full at this point, we
4591                          * could end up with a panic. We just need to quickly jetsam a
4592                          * process and exit here.
4593                          *
4594                          * It could so happen that we were woken up to relieve memory
4595                          * pressure and the zone map also happened to be near its limit at
4596                          * the time, in which case we'll skip out early. But that should be
4597                          * ok; if memory pressure persists, the thread will simply be woken
4598                          * up again.
4599                          */
4600                         zone_gc(ZONE_GC_JETSAM);
4601                 } else {
4602                         /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4603                         boolean_t buf_large_zfree = FALSE;
4604                         boolean_t first_try = TRUE;
4605
4606                         stack_collect();
4607
4608                         consider_machine_collect();
4609                         mbuf_drain(FALSE);
4610
4611                         do {
4612                                 if (consider_buffer_cache_collect != NULL) {
4613                                         buf_large_zfree = (*consider_buffer_cache_collect)(0);
4614                                 }
4615                                 if (first_try == TRUE || buf_large_zfree == TRUE) {
4616                                         /*
4617                                          * zone_gc should be last, because the other operations
4618                                          * might return memory to zones.
4619                                          */
4620                                         zone_gc(ZONE_GC_TRIM);
4621                                 }
4622                                 first_try = FALSE;
4623                         } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4624
4625                         consider_machine_adjust();
4626                 }
4627         }
4628
4629         assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
4630
4631         thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
4632         /*NOTREACHED*/
4633 }
4634
4635
/*
 * Bounds of the debug-only "fake buckets" range, defined outside this file.
 * vm_pageout() write-protects [start, end) when both options are enabled.
 */
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */
4641
4642
4643
4644 void
4645 vm_set_restrictions(unsigned int num_cpus)
4646 {
4647         int vm_restricted_to_single_processor = 0;
4648
4649         if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
4650                 kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4651                 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4652         } else {
4653                 assert(num_cpus > 0);
4654
4655                 if (num_cpus <= 3) {
4656                         /*
4657                          * on systems with a limited number of CPUS, bind the
4658                          * 4 major threads that can free memory and that tend to use
4659                          * a fair bit of CPU under pressured conditions to a single processor.
4660                          * This insures that these threads don't hog all of the available CPUs
4661                          * (important for camera launch), while allowing them to run independently
4662                          * w/r to locks... the 4 threads are
4663                          * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
4664                          * vm_compressor_swap_trigger_thread (minor and major compactions),
4665                          * memorystatus_thread (jetsams).
4666                          *
4667                          * the first time the thread is run, it is responsible for checking the
4668                          * state of vm_restricted_to_single_processor, and if TRUE it calls
4669                          * thread_bind_master...  someday this should be replaced with a group
4670                          * scheduling mechanism and KPI.
4671                          */
4672                         vm_pageout_state.vm_restricted_to_single_processor = TRUE;
4673                 } else {
4674                         vm_pageout_state.vm_restricted_to_single_processor = FALSE;
4675                 }
4676         }
4677 }
4678
/*
 * Entry point of the pageout daemon thread ("VM_pageout_scan").
 *
 * In order, this routine:
 *   - records itself as vm_pageout_scan_thread and sets its own scheduling
 *     priority, VM privilege, reserved stack and optional processor /
 *     thread-group / cluster bindings (at splsched);
 *   - initializes the tunables in vm_pageout_state and recomputes the free
 *     page reserve;
 *   - initializes the external and internal pageout queues;
 *   - starts the external I/O thread, the garbage-collect thread and, with
 *     VM_PRESSURE_EVENTS, the pressure thread;
 *   - fills in vm_config from vm_compressor_mode and initializes the
 *     compressor pager if one is present;
 *   - calls vm_pageout_continue(), which does not return (see the comment
 *     at the bottom of the function).
 */
void
vm_pageout(void)
{
        thread_t        self = current_thread();
        thread_t        thread;
        kern_return_t   result;
        spl_t           s;

        /*
         * Set thread privileges.
         */
        s = splsched();

        vm_pageout_scan_thread = self;

#if CONFIG_VPS_DYNAMIC_PRIO

        int             vps_dynprio_bootarg = 0;

        /*
         * The boot-arg, when present, decides; otherwise dynamic priority
         * simply follows vm_restricted_to_single_processor.
         */
        if (PE_parse_boot_argn("vps_dynamic_priority_enabled", &vps_dynprio_bootarg, sizeof(vps_dynprio_bootarg))) {
                vps_dynamic_priority_enabled = (vps_dynprio_bootarg ? TRUE : FALSE);
                kprintf("Overriding vps_dynamic_priority_enabled to %d\n", vps_dynamic_priority_enabled);
        } else {
                if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
                        vps_dynamic_priority_enabled = TRUE;
                } else {
                        vps_dynamic_priority_enabled = FALSE;
                }
        }

        if (vps_dynamic_priority_enabled) {
                sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
                thread_set_eager_preempt(self);
        } else {
                sched_set_kernel_thread_priority(self, BASEPRI_VM);
        }

#else /* CONFIG_VPS_DYNAMIC_PRIO */

        vps_dynamic_priority_enabled = FALSE;
        sched_set_kernel_thread_priority(self, BASEPRI_VM);

#endif /* CONFIG_VPS_DYNAMIC_PRIO */

        thread_lock(self);
        self->options |= TH_OPT_VMPRIV;
        thread_unlock(self);

        if (!self->reserved_stack) {
                self->reserved_stack = self->kernel_stack;
        }

        /* bind to the VM group only when dynamic priority is not in use */
        if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
            vps_dynamic_priority_enabled == FALSE) {
                thread_vm_bind_group_add();
        }


#if CONFIG_THREAD_GROUPS
        thread_group_vm_add();
#endif /* CONFIG_THREAD_GROUPS */

#if __AMP__
        PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
        if (vm_pgo_pbound) {
                /*
                 * Use the soft bound option for vm pageout to allow it to run on
                 * E-cores if P-cluster is unavailable.
                 */
                thread_bind_cluster_type(self, 'P', true);
        }
#endif /* __AMP__ */

        splx(s);

        thread_set_thread_name(current_thread(), "VM_pageout_scan");

        /*
         *      Initialize some paging parameters.
         */

        vm_pageout_state.vm_pressure_thread_running = FALSE;
        vm_pageout_state.vm_pressure_changed = FALSE;
        vm_pageout_state.memorystatus_purge_on_warning = 2;
        vm_pageout_state.memorystatus_purge_on_urgent = 5;
        vm_pageout_state.memorystatus_purge_on_critical = 8;
        vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
        vm_pageout_state.vm_page_speculative_percentage = 5;
        vm_pageout_state.vm_page_speculative_target = 0;

        vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
        vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;

        vm_pageout_state.vm_pageout_swap_wait = 0;
        vm_pageout_state.vm_pageout_idle_wait = 0;
        vm_pageout_state.vm_pageout_empty_wait = 0;
        vm_pageout_state.vm_pageout_burst_wait = 0;
        vm_pageout_state.vm_pageout_deadlock_wait = 0;
        vm_pageout_state.vm_pageout_deadlock_relief = 0;
        vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;

        vm_pageout_state.vm_pageout_inactive = 0;
        vm_pageout_state.vm_pageout_inactive_used = 0;
        vm_pageout_state.vm_pageout_inactive_clean = 0;

        vm_pageout_state.vm_memory_pressure = 0;
        vm_pageout_state.vm_page_filecache_min = 0;
#if CONFIG_JETSAM
        vm_pageout_state.vm_page_filecache_min_divisor = 70;
        vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
#else
        vm_pageout_state.vm_page_filecache_min_divisor = 27;
        vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
#endif
        vm_pageout_state.vm_page_free_count_init = vm_page_free_count;

        vm_pageout_state.vm_pageout_considered_page_last = 0;

        /*
         * Each of the wait/relief/throttle tunables below was set to 0 just
         * above, so every one of these checks installs the compile-time
         * default.
         */
        if (vm_pageout_state.vm_pageout_swap_wait == 0) {
                vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
        }

        if (vm_pageout_state.vm_pageout_idle_wait == 0) {
                vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
        }

        if (vm_pageout_state.vm_pageout_burst_wait == 0) {
                vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
        }

        if (vm_pageout_state.vm_pageout_empty_wait == 0) {
                vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
        }

        if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
                vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
        }

        if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
                vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
        }

        if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
                vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
        }
        /*
         * even if we've already called vm_page_free_reserve
         * call it again here to ensure that the targets are
         * accurately calculated (it uses vm_page_free_count_init)
         * calling it with an arg of 0 will not change the reserve
         * but will re-calculate free_min and free_target
         */
        if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
                vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
        } else {
                vm_page_free_reserve(0);
        }


        /*
         * Both queues start out uninitialized (pgo_inited FALSE, pgo_tid -1);
         * vm_pageout_iothread_external() / vm_pageout_iothread_internal()
         * fill in pgo_tid and set pgo_inited once they run.
         */
        vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
        vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
        vm_pageout_queue_external.pgo_laundry = 0;
        vm_pageout_queue_external.pgo_idle = FALSE;
        vm_pageout_queue_external.pgo_busy = FALSE;
        vm_pageout_queue_external.pgo_throttled = FALSE;
        vm_pageout_queue_external.pgo_draining = FALSE;
        vm_pageout_queue_external.pgo_lowpriority = FALSE;
        vm_pageout_queue_external.pgo_tid = -1;
        vm_pageout_queue_external.pgo_inited = FALSE;

        vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
        vm_pageout_queue_internal.pgo_maxlaundry = 0;
        vm_pageout_queue_internal.pgo_laundry = 0;
        vm_pageout_queue_internal.pgo_idle = FALSE;
        vm_pageout_queue_internal.pgo_busy = FALSE;
        vm_pageout_queue_internal.pgo_throttled = FALSE;
        vm_pageout_queue_internal.pgo_draining = FALSE;
        vm_pageout_queue_internal.pgo_lowpriority = FALSE;
        vm_pageout_queue_internal.pgo_tid = -1;
        vm_pageout_queue_internal.pgo_inited = FALSE;

        /* internal pageout thread started when default pager registered first time */
        /* external pageout and garbage collection threads started here */

        result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
            BASEPRI_VM,
            &vm_pageout_state.vm_pageout_external_iothread);
        if (result != KERN_SUCCESS) {
                panic("vm_pageout_iothread_external: create failed");
        }
        thread_set_thread_name(vm_pageout_state.vm_pageout_external_iothread, "VM_pageout_external_iothread");
        thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);

        /*
         * The garbage-collect thread is created and only started further
         * down, after its reserved_stack has been filled in.  It is started
         * with a NULL parameter, i.e. vm_pageout_garbage_collect(0).
         */
        result = kernel_thread_create((thread_continue_t)vm_pageout_garbage_collect, NULL,
            BASEPRI_DEFAULT,
            &thread);
        if (result != KERN_SUCCESS) {
                panic("vm_pageout_garbage_collect: create failed");
        }
        thread_set_thread_name(thread, "VM_pageout_garbage_collect");
        if (thread->reserved_stack == 0) {
                assert(thread->kernel_stack);
                thread->reserved_stack = thread->kernel_stack;
        }

        thread_mtx_lock(thread);
        thread_start(thread);
        thread_mtx_unlock(thread);

        thread_deallocate(thread);

#if VM_PRESSURE_EVENTS
        result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
            BASEPRI_DEFAULT,
            &thread);

        if (result != KERN_SUCCESS) {
                panic("vm_pressure_thread: create failed");
        }

        thread_deallocate(thread);
#endif

        vm_object_reaper_init();


        /*
         * Translate vm_compressor_mode into the vm_config flags.  Every flag
         * starts out cleared; each mode sets only the ones listed in its
         * case.  Note that the compressor modes all set swap_is_present,
         * including the *_NO_SWAP ones; they differ in the *_is_active flags.
         */
        bzero(&vm_config, sizeof(vm_config));

        switch (vm_compressor_mode) {
        case VM_PAGER_DEFAULT:
                printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
                OS_FALLTHROUGH;

        case VM_PAGER_COMPRESSOR_WITH_SWAP:
                vm_config.compressor_is_present = TRUE;
                vm_config.swap_is_present = TRUE;
                vm_config.compressor_is_active = TRUE;
                vm_config.swap_is_active = TRUE;
                break;

        case VM_PAGER_COMPRESSOR_NO_SWAP:
                vm_config.compressor_is_present = TRUE;
                vm_config.swap_is_present = TRUE;
                vm_config.compressor_is_active = TRUE;
                break;

        case VM_PAGER_FREEZER_DEFAULT:
                printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
                OS_FALLTHROUGH;

        case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
                vm_config.compressor_is_present = TRUE;
                vm_config.swap_is_present = TRUE;
                break;

        case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
                vm_config.compressor_is_present = TRUE;
                vm_config.swap_is_present = TRUE;
                vm_config.compressor_is_active = TRUE;
                vm_config.freezer_swap_is_active = TRUE;
                break;

        case VM_PAGER_NOT_CONFIGURED:
                break;

        default:
                printf("unknown compressor mode - %x\n", vm_compressor_mode);
                break;
        }
        if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
                vm_compressor_pager_init();
        }

#if VM_PRESSURE_EVENTS
        /* from here on vm_pressure_response() is allowed to do its work */
        vm_pressure_events_enabled = TRUE;
#endif /* VM_PRESSURE_EVENTS */

#if CONFIG_PHANTOM_CACHE
        vm_phantom_cache_init();
#endif
#if VM_PAGE_BUCKETS_CHECK
#if VM_PAGE_FAKE_BUCKETS
        printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
            (uint64_t) vm_page_fake_buckets_start,
            (uint64_t) vm_page_fake_buckets_end);
        pmap_protect(kernel_pmap,
            vm_page_fake_buckets_start,
            vm_page_fake_buckets_end,
            VM_PROT_READ);
//      *(char *) vm_page_fake_buckets_start = 'x';     /* panic! */
#endif /* VM_PAGE_FAKE_BUCKETS */
#endif /* VM_PAGE_BUCKETS_CHECK */

#if VM_OBJECT_TRACKING
        vm_object_tracking_init();
#endif /* VM_OBJECT_TRACKING */

        vm_pageout_continue();

        /*
         * Unreached code!
         *
         * The vm_pageout_continue() call above never returns, so the code below is never
         * executed.  We take advantage of this to declare several DTrace VM related probe
         * points that our kernel doesn't have an analog for.  These are probe points that
         * exist in Solaris and are in the DTrace documentation, so people may have written
         * scripts that use them.  Declaring the probe points here means their scripts will
         * compile and execute which we want for portability of the scripts, but since this
         * section of code is never reached, the probe points will simply never fire.  Yes,
         * this is basically a hack.  The problem is the DTrace probe points were chosen with
         * Solaris specific VM events in mind, not portability to different VM implementations.
         */

        DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
        DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
        DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
        DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
        DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
        DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
        DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
        /*NOTREACHED*/
}
5001
5002
5003
5004 kern_return_t
5005 vm_pageout_internal_start(void)
5006 {
5007         kern_return_t   result;
5008         host_basic_info_data_t hinfo;
5009         vm_offset_t     buf, bufsize;
5010
5011         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5012
5013         mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5014 #define BSD_HOST 1
5015         host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5016
5017         assert(hinfo.max_cpus > 0);
5018
5019 #if !XNU_TARGET_OS_OSX
5020         vm_pageout_state.vm_compressor_thread_count = 1;
5021 #else /* !XNU_TARGET_OS_OSX */
5022         if (hinfo.max_cpus > 4) {
5023                 vm_pageout_state.vm_compressor_thread_count = 2;
5024         } else {
5025                 vm_pageout_state.vm_compressor_thread_count = 1;
5026         }
5027 #endif /* !XNU_TARGET_OS_OSX */
5028         PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5029             sizeof(vm_pageout_state.vm_compressor_thread_count));
5030
5031 #if     __AMP__
5032         PE_parse_boot_argn("vmcomp_ecluster", &vm_compressor_ebound, sizeof(vm_compressor_ebound));
5033         if (vm_compressor_ebound) {
5034                 vm_pageout_state.vm_compressor_thread_count = 2;
5035         }
5036 #endif
5037         if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5038                 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5039         }
5040         if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5041                 vm_pageout_state.vm_compressor_thread_count = 1;
5042         } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5043                 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5044         }
5045
5046         vm_pageout_queue_internal.pgo_maxlaundry =
5047             (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5048
5049         PE_parse_boot_argn("vmpgoi_maxlaundry",
5050             &vm_pageout_queue_internal.pgo_maxlaundry,
5051             sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5052
5053         bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5054         if (kernel_memory_allocate(kernel_map, &buf,
5055             bufsize * vm_pageout_state.vm_compressor_thread_count,
5056             0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_COMPRESSOR)) {
5057                 panic("vm_pageout_internal_start: Unable to allocate %zd bytes",
5058                     (size_t)(bufsize * vm_pageout_state.vm_compressor_thread_count));
5059         }
5060
5061         for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5062                 ciq[i].id = i;
5063                 ciq[i].q = &vm_pageout_queue_internal;
5064                 ciq[i].current_chead = NULL;
5065                 ciq[i].scratch_buf = (char *)(buf + i * bufsize);
5066
5067                 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5068                     (void *)&ciq[i], BASEPRI_VM,
5069                     &vm_pageout_state.vm_pageout_internal_iothread);
5070
5071                 if (result == KERN_SUCCESS) {
5072                         thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
5073                 } else {
5074                         break;
5075                 }
5076         }
5077         return result;
5078 }
5079
5080 #if CONFIG_IOSCHED
5081 /*
5082  * To support I/O Expedite for compressed files we mark the upls with special flags.
5083  * The way decmpfs works is that we create a big upl which marks all the pages needed to
5084  * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5085  * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5086  * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5087  * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5088  * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5089  * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5090  * unless the real I/O upl is being destroyed).
5091  */
5092
5093
5094 static void
5095 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5096 {
5097         assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5098
5099         upl_lock(src_upl);
5100         if (src_upl->decmp_io_upl) {
5101                 /*
5102                  * If there is already an alive real I/O UPL, ignore this new UPL.
5103                  * This case should rarely happen and even if it does, it just means
5104                  * that we might issue a spurious expedite which the driver is expected
5105                  * to handle.
5106                  */
5107                 upl_unlock(src_upl);
5108                 return;
5109         }
5110         src_upl->decmp_io_upl = (void *)upl;
5111         src_upl->ref_count++;
5112
5113         upl->flags |= UPL_DECMP_REAL_IO;
5114         upl->decmp_io_upl = (void *)src_upl;
5115         upl_unlock(src_upl);
5116 }
5117 #endif /* CONFIG_IOSCHED */
5118
/*
 * Non-zero when UPL debugging is compiled in (UPL_DEBUG); consulted
 * at run time when creating and destroying UPLs.
 */
#if !UPL_DEBUG
int     upl_debug_enabled = 0;
#else /* !UPL_DEBUG */
int     upl_debug_enabled = 1;
#endif /* !UPL_DEBUG */
5124
5125 static upl_t
5126 upl_create(int type, int flags, upl_size_t size)
5127 {
5128         upl_t   upl;
5129         vm_size_t       page_field_size = 0;
5130         int     upl_flags = 0;
5131         vm_size_t       upl_size  = sizeof(struct upl);
5132
5133         assert(page_aligned(size));
5134
5135         size = round_page_32(size);
5136
5137         if (type & UPL_CREATE_LITE) {
5138                 page_field_size = (atop(size) + 7) >> 3;
5139                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5140
5141                 upl_flags |= UPL_LITE;
5142         }
5143         if (type & UPL_CREATE_INTERNAL) {
5144                 upl_size += sizeof(struct upl_page_info) * atop(size);
5145
5146                 upl_flags |= UPL_INTERNAL;
5147         }
5148         upl = (upl_t)kalloc(upl_size + page_field_size);
5149
5150         if (page_field_size) {
5151                 bzero((char *)upl + upl_size, page_field_size);
5152         }
5153
5154         upl->flags = upl_flags | flags;
5155         upl->kaddr = (vm_offset_t)0;
5156         upl->u_offset = 0;
5157         upl->u_size = 0;
5158         upl->map_object = NULL;
5159         upl->ref_count = 1;
5160         upl->ext_ref_count = 0;
5161         upl->highest_page = 0;
5162         upl_lock_init(upl);
5163         upl->vector_upl = NULL;
5164         upl->associated_upl = NULL;
5165         upl->upl_iodone = NULL;
5166 #if CONFIG_IOSCHED
5167         if (type & UPL_CREATE_IO_TRACKING) {
5168                 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5169         }
5170
5171         upl->upl_reprio_info = 0;
5172         upl->decmp_io_upl = 0;
5173         if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5174                 /* Only support expedite on internal UPLs */
5175                 thread_t        curthread = current_thread();
5176                 upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size));
5177                 bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size)));
5178                 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5179                 if (curthread->decmp_upl != NULL) {
5180                         upl_set_decmp_info(upl, curthread->decmp_upl);
5181                 }
5182         }
5183 #endif
5184 #if CONFIG_IOSCHED || UPL_DEBUG
5185         if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5186                 upl->upl_creator = current_thread();
5187                 upl->uplq.next = 0;
5188                 upl->uplq.prev = 0;
5189                 upl->flags |= UPL_TRACKED_BY_OBJECT;
5190         }
5191 #endif
5192
5193 #if UPL_DEBUG
5194         upl->ubc_alias1 = 0;
5195         upl->ubc_alias2 = 0;
5196
5197         upl->upl_state = 0;
5198         upl->upl_commit_index = 0;
5199         bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
5200
5201         (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5202 #endif /* UPL_DEBUG */
5203
5204         return upl;
5205 }
5206
5207 static void
5208 upl_destroy(upl_t upl)
5209 {
5210         int     page_field_size;  /* bit field in word size buf */
5211         int     size;
5212
5213 //      DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5214
5215         if (upl->ext_ref_count) {
5216                 panic("upl(%p) ext_ref_count", upl);
5217         }
5218
5219 #if CONFIG_IOSCHED
5220         if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5221                 upl_t src_upl;
5222                 src_upl = upl->decmp_io_upl;
5223                 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5224                 upl_lock(src_upl);
5225                 src_upl->decmp_io_upl = NULL;
5226                 upl_unlock(src_upl);
5227                 upl_deallocate(src_upl);
5228         }
5229 #endif /* CONFIG_IOSCHED */
5230
5231 #if CONFIG_IOSCHED || UPL_DEBUG
5232         if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5233             !(upl->flags & UPL_VECTOR)) {
5234                 vm_object_t     object;
5235
5236                 if (upl->flags & UPL_SHADOWED) {
5237                         object = upl->map_object->shadow;
5238                 } else {
5239                         object = upl->map_object;
5240                 }
5241
5242                 vm_object_lock(object);
5243                 queue_remove(&object->uplq, upl, upl_t, uplq);
5244                 vm_object_activity_end(object);
5245                 vm_object_collapse(object, 0, TRUE);
5246                 vm_object_unlock(object);
5247         }
5248 #endif
5249         /*
5250          * drop a reference on the map_object whether or
5251          * not a pageout object is inserted
5252          */
5253         if (upl->flags & UPL_SHADOWED) {
5254                 vm_object_deallocate(upl->map_object);
5255         }
5256
5257         if (upl->flags & UPL_DEVICE_MEMORY) {
5258                 size = PAGE_SIZE;
5259         } else {
5260                 size = upl_adjusted_size(upl, PAGE_MASK);
5261         }
5262         page_field_size = 0;
5263
5264         if (upl->flags & UPL_LITE) {
5265                 page_field_size = ((size / PAGE_SIZE) + 7) >> 3;
5266                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5267         }
5268         upl_lock_destroy(upl);
5269         upl->vector_upl = (vector_upl_t) 0xfeedbeef;
5270
5271 #if CONFIG_IOSCHED
5272         if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5273                 kfree(upl->upl_reprio_info, sizeof(uint64_t) * (size / PAGE_SIZE));
5274         }
5275 #endif
5276
5277         if (upl->flags & UPL_INTERNAL) {
5278                 kfree(upl,
5279                     sizeof(struct upl) +
5280                     (sizeof(struct upl_page_info) * (size / PAGE_SIZE))
5281                     + page_field_size);
5282         } else {
5283                 kfree(upl, sizeof(struct upl) + page_field_size);
5284         }
5285 }
5286
5287 void
5288 upl_deallocate(upl_t upl)
5289 {
5290         upl_lock(upl);
5291
5292         if (--upl->ref_count == 0) {
5293                 if (vector_upl_is_valid(upl)) {
5294                         vector_upl_deallocate(upl);
5295                 }
5296                 upl_unlock(upl);
5297
5298                 if (upl->upl_iodone) {
5299                         upl_callout_iodone(upl);
5300                 }
5301
5302                 upl_destroy(upl);
5303         } else {
5304                 upl_unlock(upl);
5305         }
5306 }
5307
5308 #if CONFIG_IOSCHED
5309 void
5310 upl_mark_decmp(upl_t upl)
5311 {
5312         if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5313                 upl->flags |= UPL_DECMP_REQ;
5314                 upl->upl_creator->decmp_upl = (void *)upl;
5315         }
5316 }
5317
5318 void
5319 upl_unmark_decmp(upl_t upl)
5320 {
5321         if (upl && (upl->flags & UPL_DECMP_REQ)) {
5322                 upl->upl_creator->decmp_upl = NULL;
5323         }
5324 }
5325
5326 #endif /* CONFIG_IOSCHED */
5327
5328 #define VM_PAGE_Q_BACKING_UP(q)         \
5329         ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5330
5331 boolean_t must_throttle_writes(void);
5332
5333 boolean_t
5334 must_throttle_writes()
5335 {
5336         if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5337             vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5338                 return TRUE;
5339         }
5340
5341         return FALSE;
5342 }
5343
5344 #define MIN_DELAYED_WORK_CTX_ALLOCATED  (16)
5345 #define MAX_DELAYED_WORK_CTX_ALLOCATED  (512)
5346
5347 int vm_page_delayed_work_ctx_needed = 0;
5348 SECURITY_READ_ONLY_LATE(zone_t) dw_ctx_zone;
5349
5350 void
5351 vm_page_delayed_work_init_ctx(void)
5352 {
5353         size_t elem_size = sizeof(struct vm_page_delayed_work_ctx);
5354
5355         dw_ctx_zone = zone_create_ext("delayed-work-ctx", elem_size,
5356             ZC_NOGC, ZONE_ID_ANY, ^(zone_t z) {
5357                 zone_set_exhaustible(z, MAX_DELAYED_WORK_CTX_ALLOCATED);
5358         });
5359
5360         zone_fill_initially(dw_ctx_zone, MIN_DELAYED_WORK_CTX_ALLOCATED);
5361 }
5362
5363 struct vm_page_delayed_work*
5364 vm_page_delayed_work_get_ctx(void)
5365 {
5366         struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5367
5368         dw_ctx = (struct vm_page_delayed_work_ctx*) zalloc_noblock(dw_ctx_zone);
5369
5370         if (dw_ctx) {
5371                 dw_ctx->delayed_owner = current_thread();
5372         } else {
5373                 vm_page_delayed_work_ctx_needed++;
5374         }
5375         return dw_ctx ? dw_ctx->dwp : NULL;
5376 }
5377
5378 void
5379 vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5380 {
5381         struct  vm_page_delayed_work_ctx *ldw_ctx;
5382
5383         ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5384         ldw_ctx->delayed_owner = NULL;
5385
5386         zfree(dw_ctx_zone, ldw_ctx);
5387 }
5388
5389 /*
5390  *      Routine:        vm_object_upl_request
5391  *      Purpose:
5392  *              Cause the population of a portion of a vm_object.
5393  *              Depending on the nature of the request, the pages
5394  *              returned may contain valid data or be uninitialized.
5395  *              A page list structure, listing the physical pages
5396  *              will be returned upon request.
5397  *              This function is called by the file system or any other
5398  *              supplier of backing store to a pager.
5399  *              IMPORTANT NOTE: The caller must still respect the relationship
5400  *              between the vm_object and its backing memory object.  The
5401  *              caller MUST NOT substitute changes in the backing file
5402  *              without first doing a memory_object_lock_request on the
5403  *              target range unless it is known that the pages are not
5404  *              shared with another entity at the pager level.
5405  *              Copy_in_to:
5406  *                      if a page list structure is present
5407  *                      return the mapped physical pages, where a
5408  *                      page is not present, return a non-initialized
5409  *                      one.  If the no_sync bit is turned on, don't
5410  *                      call the pager unlock to synchronize with other
5411  *                      possible copies of the page. Leave pages busy
5412  *                      in the original object, if a page list structure
5413  *                      was specified.  When a commit of the page list
5414  *                      pages is done, the dirty bit will be set for each one.
5415  *              Copy_out_from:
5416  *                      If a page list structure is present, return
5417  *                      all mapped pages.  Where a page does not exist
5418  *                      map a zero filled one. Leave pages busy in
5419  *                      the original object.  If a page list structure
5420  *                      is not specified, this call is a no-op.
5421  *
5422  *              Note:  access of default pager objects has a rather interesting
5423  *              twist.  The caller of this routine, presumably the file system
5424  *              page cache handling code, will never actually make a request
5425  *              against a default pager backed object.  Only the default
5426  *              pager will make requests on backing store related vm_objects
5427  *              In this way the default pager can maintain the relationship
5428  *              between backing store files (abstract memory objects) and
5429  *              the vm_objects (cache objects), they support.
5430  *
5431  */
5432
5433 __private_extern__ kern_return_t
5434 vm_object_upl_request(
5435         vm_object_t             object,
5436         vm_object_offset_t      offset,
5437         upl_size_t              size,
5438         upl_t                   *upl_ptr,
5439         upl_page_info_array_t   user_page_list,
5440         unsigned int            *page_list_count,
5441         upl_control_flags_t     cntrl_flags,
5442         vm_tag_t                tag)
5443 {
5444         vm_page_t               dst_page = VM_PAGE_NULL;
5445         vm_object_offset_t      dst_offset;
5446         upl_size_t              xfer_size;
5447         unsigned int            size_in_pages;
5448         boolean_t               dirty;
5449         boolean_t               hw_dirty;
5450         upl_t                   upl = NULL;
5451         unsigned int            entry;
5452         vm_page_t               alias_page = NULL;
5453         int                     refmod_state = 0;
5454         wpl_array_t             lite_list = NULL;
5455         vm_object_t             last_copy_object;
5456         struct  vm_page_delayed_work    dw_array;
5457         struct  vm_page_delayed_work    *dwp, *dwp_start;
5458         bool                    dwp_finish_ctx = TRUE;
5459         int                     dw_count;
5460         int                     dw_limit;
5461         int                     io_tracking_flag = 0;
5462         int                     grab_options;
5463         int                     page_grab_count = 0;
5464         ppnum_t                 phys_page;
5465         pmap_flush_context      pmap_flush_context_storage;
5466         boolean_t               pmap_flushes_delayed = FALSE;
5467 #if DEVELOPMENT || DEBUG
5468         task_t                  task = current_task();
5469 #endif /* DEVELOPMENT || DEBUG */
5470
5471         dwp_start = dwp = NULL;
5472
5473         if (cntrl_flags & ~UPL_VALID_FLAGS) {
5474                 /*
5475                  * For forward compatibility's sake,
5476                  * reject any unknown flag.
5477                  */
5478                 return KERN_INVALID_VALUE;
5479         }
5480         if ((!object->internal) && (object->paging_offset != 0)) {
5481                 panic("vm_object_upl_request: external object with non-zero paging offset\n");
5482         }
5483         if (object->phys_contiguous) {
5484                 panic("vm_object_upl_request: contiguous object specified\n");
5485         }
5486
5487         assertf(page_aligned(offset) && page_aligned(size),
5488             "offset 0x%llx size 0x%x",
5489             offset, size);
5490
5491         VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5492
5493         dw_count = 0;
5494         dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5495         dwp_start = vm_page_delayed_work_get_ctx();
5496         if (dwp_start == NULL) {
5497                 dwp_start = &dw_array;
5498                 dw_limit = 1;
5499                 dwp_finish_ctx = FALSE;
5500         }
5501
5502         dwp = dwp_start;
5503
5504         if (size > MAX_UPL_SIZE_BYTES) {
5505                 size = MAX_UPL_SIZE_BYTES;
5506         }
5507
5508         if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5509                 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5510         }
5511
5512 #if CONFIG_IOSCHED || UPL_DEBUG
5513         if (object->io_tracking || upl_debug_enabled) {
5514                 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5515         }
5516 #endif
5517 #if CONFIG_IOSCHED
5518         if (object->io_tracking) {
5519                 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5520         }
5521 #endif
5522
5523         if (cntrl_flags & UPL_SET_INTERNAL) {
5524                 if (cntrl_flags & UPL_SET_LITE) {
5525                         upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5526
5527                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5528                         lite_list = (wpl_array_t)
5529                             (((uintptr_t)user_page_list) +
5530                             ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
5531                         if (size == 0) {
5532                                 user_page_list = NULL;
5533                                 lite_list = NULL;
5534                         }
5535                 } else {
5536                         upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5537
5538                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5539                         if (size == 0) {
5540                                 user_page_list = NULL;
5541                         }
5542                 }
5543         } else {
5544                 if (cntrl_flags & UPL_SET_LITE) {
5545                         upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5546
5547                         lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5548                         if (size == 0) {
5549                                 lite_list = NULL;
5550                         }
5551                 } else {
5552                         upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5553                 }
5554         }
5555         *upl_ptr = upl;
5556
5557         if (user_page_list) {
5558                 user_page_list[0].device = FALSE;
5559         }
5560
5561         if (cntrl_flags & UPL_SET_LITE) {
5562                 upl->map_object = object;
5563         } else {
5564                 upl->map_object = vm_object_allocate(size);
5565                 /*
5566                  * No neeed to lock the new object: nobody else knows
5567                  * about it yet, so it's all ours so far.
5568                  */
5569                 upl->map_object->shadow = object;
5570                 upl->map_object->pageout = TRUE;
5571                 upl->map_object->can_persist = FALSE;
5572                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5573                 upl->map_object->vo_shadow_offset = offset;
5574                 upl->map_object->wimg_bits = object->wimg_bits;
5575                 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5576                     "object %p shadow_offset 0x%llx",
5577                     upl->map_object, upl->map_object->vo_shadow_offset);
5578
5579                 alias_page = vm_page_grab_fictitious(TRUE);
5580
5581                 upl->flags |= UPL_SHADOWED;
5582         }
5583         if (cntrl_flags & UPL_FOR_PAGEOUT) {
5584                 upl->flags |= UPL_PAGEOUT;
5585         }
5586
5587         vm_object_lock(object);
5588         vm_object_activity_begin(object);
5589
5590         grab_options = 0;
5591 #if CONFIG_SECLUDED_MEMORY
5592         if (object->can_grab_secluded) {
5593                 grab_options |= VM_PAGE_GRAB_SECLUDED;
5594         }
5595 #endif /* CONFIG_SECLUDED_MEMORY */
5596
5597         /*
5598          * we can lock in the paging_offset once paging_in_progress is set
5599          */
5600         upl->u_size = size;
5601         upl->u_offset = offset + object->paging_offset;
5602
5603 #if CONFIG_IOSCHED || UPL_DEBUG
5604         if (object->io_tracking || upl_debug_enabled) {
5605                 vm_object_activity_begin(object);
5606                 queue_enter(&object->uplq, upl, upl_t, uplq);
5607         }
5608 #endif
5609         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5610                 /*
5611                  * Honor copy-on-write obligations
5612                  *
5613                  * The caller is gathering these pages and
5614                  * might modify their contents.  We need to
5615                  * make sure that the copy object has its own
5616                  * private copies of these pages before we let
5617                  * the caller modify them.
5618                  */
5619                 vm_object_update(object,
5620                     offset,
5621                     size,
5622                     NULL,
5623                     NULL,
5624                     FALSE,              /* should_return */
5625                     MEMORY_OBJECT_COPY_SYNC,
5626                     VM_PROT_NO_CHANGE);
5627
5628                 VM_PAGEOUT_DEBUG(upl_cow, 1);
5629                 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5630         }
5631         /*
5632          * remember which copy object we synchronized with
5633          */
5634         last_copy_object = object->copy;
5635         entry = 0;
5636
5637         xfer_size = size;
5638         dst_offset = offset;
5639         size_in_pages = size / PAGE_SIZE;
5640
5641         if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5642             object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5643                 object->scan_collisions = 0;
5644         }
5645
5646         if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5647                 boolean_t       isSSD = FALSE;
5648
5649 #if !XNU_TARGET_OS_OSX
5650                 isSSD = TRUE;
5651 #else /* !XNU_TARGET_OS_OSX */
5652                 vnode_pager_get_isSSD(object->pager, &isSSD);
5653 #endif /* !XNU_TARGET_OS_OSX */
5654                 vm_object_unlock(object);
5655
5656                 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5657
5658                 if (isSSD == TRUE) {
5659                         delay(1000 * size_in_pages);
5660                 } else {
5661                         delay(5000 * size_in_pages);
5662                 }
5663                 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5664
5665                 vm_object_lock(object);
5666         }
5667
5668         while (xfer_size) {
5669                 dwp->dw_mask = 0;
5670
5671                 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5672                         vm_object_unlock(object);
5673                         alias_page = vm_page_grab_fictitious(TRUE);
5674                         vm_object_lock(object);
5675                 }
5676                 if (cntrl_flags & UPL_COPYOUT_FROM) {
5677                         upl->flags |= UPL_PAGE_SYNC_DONE;
5678
5679                         if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5680                             dst_page->vmp_fictitious ||
5681                             dst_page->vmp_absent ||
5682                             dst_page->vmp_error ||
5683                             dst_page->vmp_cleaning ||
5684                             (VM_PAGE_WIRED(dst_page))) {
5685                                 if (user_page_list) {
5686                                         user_page_list[entry].phys_addr = 0;
5687                                 }
5688
5689                                 goto try_next_page;
5690                         }
5691                         phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5692
5693                         /*
5694                          * grab this up front...
5695                          * a high percentange of the time we're going to
5696                          * need the hardware modification state a bit later
5697                          * anyway... so we can eliminate an extra call into
5698                          * the pmap layer by grabbing it here and recording it
5699                          */
5700                         if (dst_page->vmp_pmapped) {
5701                                 refmod_state = pmap_get_refmod(phys_page);
5702                         } else {
5703                                 refmod_state = 0;
5704                         }
5705
5706                         if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5707                                 /*
5708                                  * page is on inactive list and referenced...
5709                                  * reactivate it now... this gets it out of the
5710                                  * way of vm_pageout_scan which would have to
5711                                  * reactivate it upon tripping over it
5712                                  */
5713                                 dwp->dw_mask |= DW_vm_page_activate;
5714                         }
5715                         if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5716                                 /*
5717                                  * we're only asking for DIRTY pages to be returned
5718                                  */
5719                                 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5720                                         /*
5721                                          * if we were the page stolen by vm_pageout_scan to be
5722                                          * cleaned (as opposed to a buddy being clustered in
5723                                          * or this request is not being driven by a PAGEOUT cluster
5724                                          * then we only need to check for the page being dirty or
5725                                          * precious to decide whether to return it
5726                                          */
5727                                         if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
5728                                                 goto check_busy;
5729                                         }
5730                                         goto dont_return;
5731                                 }
5732                                 /*
5733                                  * this is a request for a PAGEOUT cluster and this page
5734                                  * is merely along for the ride as a 'buddy'... not only
5735                                  * does it have to be dirty to be returned, but it also
5736                                  * can't have been referenced recently...
5737                                  */
5738                                 if ((hibernate_cleaning_in_progress == TRUE ||
5739                                     (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
5740                                     (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5741                                     ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
5742                                         goto check_busy;
5743                                 }
5744 dont_return:
5745                                 /*
5746                                  * if we reach here, we're not to return
5747                                  * the page... go on to the next one
5748                                  */
5749                                 if (dst_page->vmp_laundry == TRUE) {
5750                                         /*
5751                                          * if we get here, the page is not 'cleaning' (filtered out above).
5752                                          * since it has been referenced, remove it from the laundry
5753                                          * so we don't pay the cost of an I/O to clean a page
5754                                          * we're just going to take back
5755                                          */
5756                                         vm_page_lockspin_queues();
5757
5758                                         vm_pageout_steal_laundry(dst_page, TRUE);
5759                                         vm_page_activate(dst_page);
5760
5761                                         vm_page_unlock_queues();
5762                                 }
5763                                 if (user_page_list) {
5764                                         user_page_list[entry].phys_addr = 0;
5765                                 }
5766
5767                                 goto try_next_page;
5768                         }
5769 check_busy:
5770                         if (dst_page->vmp_busy) {
5771                                 if (cntrl_flags & UPL_NOBLOCK) {
5772                                         if (user_page_list) {
5773                                                 user_page_list[entry].phys_addr = 0;
5774                                         }
5775                                         dwp->dw_mask = 0;
5776
5777                                         goto try_next_page;
5778                                 }
5779                                 /*
5780                                  * someone else is playing with the
5781                                  * page.  We will have to wait.
5782                                  */
5783                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5784
5785                                 continue;
5786                         }
5787                         if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5788                                 vm_page_lockspin_queues();
5789
5790                                 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5791                                         /*
5792                                          * we've buddied up a page for a clustered pageout
5793                                          * that has already been moved to the pageout
5794                                          * queue by pageout_scan... we need to remove
5795                                          * it from the queue and drop the laundry count
5796                                          * on that queue
5797                                          */
5798                                         vm_pageout_throttle_up(dst_page);
5799                                 }
5800                                 vm_page_unlock_queues();
5801                         }
5802                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
5803                         dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
5804
5805                         if (phys_page > upl->highest_page) {
5806                                 upl->highest_page = phys_page;
5807                         }
5808
5809                         assert(!pmap_is_noencrypt(phys_page));
5810
5811                         if (cntrl_flags & UPL_SET_LITE) {
5812                                 unsigned int    pg_num;
5813
5814                                 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
5815                                 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
5816                                 lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
5817
5818                                 if (hw_dirty) {
5819                                         if (pmap_flushes_delayed == FALSE) {
5820                                                 pmap_flush_context_init(&pmap_flush_context_storage);
5821                                                 pmap_flushes_delayed = TRUE;
5822                                         }
5823                                         pmap_clear_refmod_options(phys_page,
5824                                             VM_MEM_MODIFIED,
5825                                             PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
5826                                             &pmap_flush_context_storage);
5827                                 }
5828
5829                                 /*
5830                                  * Mark original page as cleaning
5831                                  * in place.
5832                                  */
5833                                 dst_page->vmp_cleaning = TRUE;
5834                                 dst_page->vmp_precious = FALSE;
5835                         } else {
5836                                 /*
5837                                  * use pageclean setup, it is more
5838                                  * convenient even for the pageout
5839                                  * cases here
5840                                  */
5841                                 vm_object_lock(upl->map_object);
5842                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5843                                 vm_object_unlock(upl->map_object);
5844
5845                                 alias_page->vmp_absent = FALSE;
5846                                 alias_page = NULL;
5847                         }
5848                         if (dirty) {
5849                                 SET_PAGE_DIRTY(dst_page, FALSE);
5850                         } else {
5851                                 dst_page->vmp_dirty = FALSE;
5852                         }
5853
5854                         if (!dirty) {
5855                                 dst_page->vmp_precious = TRUE;
5856                         }
5857
5858                         if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
5859                                 if (!VM_PAGE_WIRED(dst_page)) {
5860                                         dst_page->vmp_free_when_done = TRUE;
5861                                 }
5862                         }
5863                 } else {
5864                         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
5865                                 /*
5866                                  * Honor copy-on-write obligations
5867                                  *
5868                                  * The copy object has changed since we
5869                                  * last synchronized for copy-on-write.
5870                                  * Another copy object might have been
5871                                  * inserted while we released the object's
5872                                  * lock.  Since someone could have seen the
5873                                  * original contents of the remaining pages
5874                                  * through that new object, we have to
5875                                  * synchronize with it again for the remaining
5876                                  * pages only.  The previous pages are "busy"
5877                                  * so they can not be seen through the new
5878                                  * mapping.  The new mapping will see our
5879                                  * upcoming changes for those previous pages,
5880                                  * but that's OK since they couldn't see what
5881                                  * was there before.  It's just a race anyway
5882                                  * and there's no guarantee of consistency or
5883                                  * atomicity.  We just don't want new mappings
5884                                  * to see both the *before* and *after* pages.
5885                                  */
5886                                 if (object->copy != VM_OBJECT_NULL) {
5887                                         vm_object_update(
5888                                                 object,
5889                                                 dst_offset,/* current offset */
5890                                                 xfer_size, /* remaining size */
5891                                                 NULL,
5892                                                 NULL,
5893                                                 FALSE,     /* should_return */
5894                                                 MEMORY_OBJECT_COPY_SYNC,
5895                                                 VM_PROT_NO_CHANGE);
5896
5897                                         VM_PAGEOUT_DEBUG(upl_cow_again, 1);
5898                                         VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
5899                                 }
5900                                 /*
5901                                  * remember the copy object we synced with
5902                                  */
5903                                 last_copy_object = object->copy;
5904                         }
5905                         dst_page = vm_page_lookup(object, dst_offset);
5906
5907                         if (dst_page != VM_PAGE_NULL) {
5908                                 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5909                                         /*
5910                                          * skip over pages already present in the cache
5911                                          */
5912                                         if (user_page_list) {
5913                                                 user_page_list[entry].phys_addr = 0;
5914                                         }
5915
5916                                         goto try_next_page;
5917                                 }
5918                                 if (dst_page->vmp_fictitious) {
5919                                         panic("need corner case for fictitious page");
5920                                 }
5921
5922                                 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
5923                                         /*
5924                                          * someone else is playing with the
5925                                          * page.  We will have to wait.
5926                                          */
5927                                         PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5928
5929                                         continue;
5930                                 }
5931                                 if (dst_page->vmp_laundry) {
5932                                         vm_pageout_steal_laundry(dst_page, FALSE);
5933                                 }
5934                         } else {
5935                                 if (object->private) {
5936                                         /*
5937                                          * This is a nasty wrinkle for users
5938                                          * of upl who encounter device or
5939                                          * private memory however, it is
5940                                          * unavoidable, only a fault can
5941                                          * resolve the actual backing
5942                                          * physical page by asking the
5943                                          * backing device.
5944                                          */
5945                                         if (user_page_list) {
5946                                                 user_page_list[entry].phys_addr = 0;
5947                                         }
5948
5949                                         goto try_next_page;
5950                                 }
5951                                 if (object->scan_collisions) {
5952                                         /*
5953                                          * the pageout_scan thread is trying to steal
5954                                          * pages from this object, but has run into our
5955                                          * lock... grab 2 pages from the head of the object...
5956                                          * the first is freed on behalf of pageout_scan, the
5957                                          * 2nd is for our own use... we use vm_object_page_grab
5958                                          * in both cases to avoid taking pages from the free
5959                                          * list since we are under memory pressure and our
5960                                          * lock on this object is getting in the way of
5961                                          * relieving it
5962                                          */
5963                                         dst_page = vm_object_page_grab(object);
5964
5965                                         if (dst_page != VM_PAGE_NULL) {
5966                                                 vm_page_release(dst_page,
5967                                                     FALSE);
5968                                         }
5969
5970                                         dst_page = vm_object_page_grab(object);
5971                                 }
5972                                 if (dst_page == VM_PAGE_NULL) {
5973                                         /*
5974                                          * need to allocate a page
5975                                          */
5976                                         dst_page = vm_page_grab_options(grab_options);
5977                                         if (dst_page != VM_PAGE_NULL) {
5978                                                 page_grab_count++;
5979                                         }
5980                                 }
5981                                 if (dst_page == VM_PAGE_NULL) {
5982                                         if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
5983                                                 /*
5984                                                  * we don't want to stall waiting for pages to come onto the free list
5985                                                  * while we're already holding absent pages in this UPL
5986                                                  * the caller will deal with the empty slots
5987                                                  */
5988                                                 if (user_page_list) {
5989                                                         user_page_list[entry].phys_addr = 0;
5990                                                 }
5991
5992                                                 goto try_next_page;
5993                                         }
5994                                         /*
5995                                          * no pages available... wait
5996                                          * then try again for the same
5997                                          * offset...
5998                                          */
5999                                         vm_object_unlock(object);
6000
6001                                         OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6002
6003                                         VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6004
6005                                         VM_PAGE_WAIT();
6006                                         OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6007
6008                                         VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6009
6010                                         vm_object_lock(object);
6011
6012                                         continue;
6013                                 }
6014                                 vm_page_insert(dst_page, object, dst_offset);
6015
6016                                 dst_page->vmp_absent = TRUE;
6017                                 dst_page->vmp_busy = FALSE;
6018
6019                                 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6020                                         /*
6021                                          * if UPL_RET_ONLY_ABSENT was specified,
6022                                          * then we're definitely setting up a
6023                                          * upl for a clustered read/pagein
6024                                          * operation... mark the pages as clustered
6025                                          * so upl_commit_range can put them on the
6026                                          * speculative list
6027                                          */
6028                                         dst_page->vmp_clustered = TRUE;
6029
6030                                         if (!(cntrl_flags & UPL_FILE_IO)) {
6031                                                 counter_inc(&vm_statistics_pageins);
6032                                         }
6033                                 }
6034                         }
6035                         phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6036
6037                         dst_page->vmp_overwriting = TRUE;
6038
6039                         if (dst_page->vmp_pmapped) {
6040                                 if (!(cntrl_flags & UPL_FILE_IO)) {
6041                                         /*
6042                                          * eliminate all mappings from the
6043                                          * original object and its progeny
6044                                          */
6045                                         refmod_state = pmap_disconnect(phys_page);
6046                                 } else {
6047                                         refmod_state = pmap_get_refmod(phys_page);
6048                                 }
6049                         } else {
6050                                 refmod_state = 0;
6051                         }
6052
6053                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
6054                         dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6055
6056                         if (cntrl_flags & UPL_SET_LITE) {
6057                                 unsigned int    pg_num;
6058
6059                                 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6060                                 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6061                                 lite_list[pg_num >> 5] |= 1U << (pg_num & 31);
6062
6063                                 if (hw_dirty) {
6064                                         pmap_clear_modify(phys_page);
6065                                 }
6066
6067                                 /*
6068                                  * Mark original page as cleaning
6069                                  * in place.
6070                                  */
6071                                 dst_page->vmp_cleaning = TRUE;
6072                                 dst_page->vmp_precious = FALSE;
6073                         } else {
6074                                 /*
6075                                  * use pageclean setup, it is more
6076                                  * convenient even for the pageout
6077                                  * cases here
6078                                  */
6079                                 vm_object_lock(upl->map_object);
6080                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6081                                 vm_object_unlock(upl->map_object);
6082
6083                                 alias_page->vmp_absent = FALSE;
6084                                 alias_page = NULL;
6085                         }
6086
6087                         if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6088                                 upl->flags &= ~UPL_CLEAR_DIRTY;
6089                                 upl->flags |= UPL_SET_DIRTY;
6090                                 dirty = TRUE;
6091                                 /*
6092                                  * Page belonging to a code-signed object is about to
6093                                  * be written. Mark it tainted and disconnect it from
6094                                  * all pmaps so processes have to fault it back in and
6095                                  * deal with the tainted bit.
6096                                  */
6097                                 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6098                                         dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6099                                         vm_page_upl_tainted++;
6100                                         if (dst_page->vmp_pmapped) {
6101                                                 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6102                                                 if (refmod_state & VM_MEM_REFERENCED) {
6103                                                         dst_page->vmp_reference = TRUE;
6104                                                 }
6105                                         }
6106                                 }
6107                         } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6108                                 /*
6109                                  * clean in place for read implies
6110                                  * that a write will be done on all
6111                                  * the pages that are dirty before
6112                                  * a upl commit is done.  The caller
6113                                  * is obligated to preserve the
6114                                  * contents of all pages marked dirty
6115                                  */
6116                                 upl->flags |= UPL_CLEAR_DIRTY;
6117                         }
6118                         dst_page->vmp_dirty = dirty;
6119
6120                         if (!dirty) {
6121                                 dst_page->vmp_precious = TRUE;
6122                         }
6123
6124                         if (!VM_PAGE_WIRED(dst_page)) {
6125                                 /*
6126                                  * deny access to the target page while
6127                                  * it is being worked on
6128                                  */
6129                                 dst_page->vmp_busy = TRUE;
6130                         } else {
6131                                 dwp->dw_mask |= DW_vm_page_wire;
6132                         }
6133
6134                         /*
6135                          * We might be about to satisfy a fault which has been
6136                          * requested. So no need for the "restart" bit.
6137                          */
6138                         dst_page->vmp_restart = FALSE;
6139                         if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6140                                 /*
6141                                  * expect the page to be used
6142                                  */
6143                                 dwp->dw_mask |= DW_set_reference;
6144                         }
6145                         if (cntrl_flags & UPL_PRECIOUS) {
6146                                 if (object->internal) {
6147                                         SET_PAGE_DIRTY(dst_page, FALSE);
6148                                         dst_page->vmp_precious = FALSE;
6149                                 } else {
6150                                         dst_page->vmp_precious = TRUE;
6151                                 }
6152                         } else {
6153                                 dst_page->vmp_precious = FALSE;
6154                         }
6155                 }
6156                 if (dst_page->vmp_busy) {
6157                         upl->flags |= UPL_HAS_BUSY;
6158                 }
6159
6160                 if (phys_page > upl->highest_page) {
6161                         upl->highest_page = phys_page;
6162                 }
6163                 assert(!pmap_is_noencrypt(phys_page));
6164                 if (user_page_list) {
6165                         user_page_list[entry].phys_addr = phys_page;
6166                         user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
6167                         user_page_list[entry].absent    = dst_page->vmp_absent;
6168                         user_page_list[entry].dirty     = dst_page->vmp_dirty;
6169                         user_page_list[entry].precious  = dst_page->vmp_precious;
6170                         user_page_list[entry].device    = FALSE;
6171                         user_page_list[entry].needed    = FALSE;
6172                         if (dst_page->vmp_clustered == TRUE) {
6173                                 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6174                         } else {
6175                                 user_page_list[entry].speculative = FALSE;
6176                         }
6177                         user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6178                         user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6179                         user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6180                         user_page_list[entry].mark      = FALSE;
6181                 }
6182                 /*
6183                  * if UPL_RET_ONLY_ABSENT is set, then
6184                  * we are working with a fresh page and we've
6185                  * just set the clustered flag on it to
6186                  * indicate that it was dragged in as part of a
6187                  * speculative cluster... so leave it alone
6188                  */
6189                 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6190                         /*
6191                          * someone is explicitly grabbing this page...
6192                          * update clustered and speculative state
6193                          *
6194                          */
6195                         if (dst_page->vmp_clustered) {
6196                                 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6197                         }
6198                 }
6199 try_next_page:
6200                 if (dwp->dw_mask) {
6201                         if (dwp->dw_mask & DW_vm_page_activate) {
6202                                 counter_inc(&vm_statistics_reactivations);
6203                         }
6204
6205                         VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6206
6207                         if (dw_count >= dw_limit) {
6208                                 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6209
6210                                 dwp = dwp_start;
6211                                 dw_count = 0;
6212                         }
6213                 }
6214                 entry++;
6215                 dst_offset += PAGE_SIZE_64;
6216                 xfer_size -= PAGE_SIZE;
6217         }
6218         if (dw_count) {
6219                 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6220                 dwp = dwp_start;
6221                 dw_count = 0;
6222         }
6223
6224         if (alias_page != NULL) {
6225                 VM_PAGE_FREE(alias_page);
6226         }
6227         if (pmap_flushes_delayed == TRUE) {
6228                 pmap_flush(&pmap_flush_context_storage);
6229         }
6230
6231         if (page_list_count != NULL) {
6232                 if (upl->flags & UPL_INTERNAL) {
6233                         *page_list_count = 0;
6234                 } else if (*page_list_count > entry) {
6235                         *page_list_count = entry;
6236                 }
6237         }
6238 #if UPL_DEBUG
6239         upl->upl_state = 1;
6240 #endif
6241         vm_object_unlock(object);
6242
6243         VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6244 #if DEVELOPMENT || DEBUG
6245         if (task != NULL) {
6246                 ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6247         }
6248 #endif /* DEVELOPMENT || DEBUG */
6249
6250         if (dwp_start && dwp_finish_ctx) {
6251                 vm_page_delayed_work_finish_ctx(dwp_start);
6252                 dwp_start = dwp = NULL;
6253         }
6254
6255         return KERN_SUCCESS;
6256 }
6257
6258 /*
6259  *      Routine:        vm_object_super_upl_request
6260  *      Purpose:
6261  *              Cause the population of a portion of a vm_object
6262  *              in much the same way as memory_object_upl_request.
6263  *              Depending on the nature of the request, the pages
6264  *              returned may contain valid data or be uninitialized.
6265  *              However, the region may be expanded up to the super
6266  *              cluster size provided.
6267  */
6268
6269 __private_extern__ kern_return_t
6270 vm_object_super_upl_request(
6271         vm_object_t object,
6272         vm_object_offset_t      offset,
6273         upl_size_t              size,
6274         upl_size_t              super_cluster,
6275         upl_t                   *upl,
6276         upl_page_info_t         *user_page_list,
6277         unsigned int            *page_list_count,
6278         upl_control_flags_t     cntrl_flags,
6279         vm_tag_t                tag)
6280 {
6281         vm_object_offset_t      cluster_base, request_end;
6282         vm_object_size_t        span_64;
6283         upl_size_t              span = super_cluster;
6284
6285         /* refuse offsets that precede the paging offset, and UPL_VECTOR requests */
6286         if (offset < object->paging_offset || (cntrl_flags & UPL_VECTOR) == UPL_VECTOR) {
6287                 return KERN_FAILURE;
6288         }
6289         assert(object->paging_in_progress);
6290         offset -= object->paging_offset;
6291
6292         if (super_cluster <= size) {
6293                 /* already at least one super cluster long: no widening needed */
6294                 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6295         }
6296         /* round down to a super_cluster multiple (mask presumes a power of two - confirm) */
6297         request_end = offset + size;
6298         cluster_base = offset & ~((vm_object_offset_t) super_cluster - 1);
6299
6300         /* cover two clusters if the request crosses the first, then clip to the object size */
6301         if (request_end > (cluster_base + super_cluster)) {
6302                 span = super_cluster << 1;
6303         }
6304         span_64 = ((cluster_base + span) > object->vo_size) ? (object->vo_size - cluster_base) : span;
6305         span = (upl_size_t) span_64;
6306         assert(span == span_64);
6307         if (offset > (cluster_base + span)) {
6308                 panic("vm_object_super_upl_request: Missed target pageout"
6309                     " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6310                     offset, cluster_base, span, super_cluster,
6311                     size, object->paging_offset);
6312         }
6313         /* the vm may ask for a page whose offset is beyond the object size: keep it covered */
6314         if (request_end > (cluster_base + span)) {
6315                 span_64 = request_end - cluster_base;
6316                 span = (upl_size_t) span_64;
6317                 assert(span == span_64);
6318         }
6319         return vm_object_upl_request(object, cluster_base, span, upl, user_page_list, page_list_count, cntrl_flags, tag);
6320 }
6321
/*
 * Number of times vm_map_create_upl() reached the point of building a UPL
 * over a map entry whose protection includes VM_PROT_EXECUTE.
 */
int cs_executable_create_upl = 0;
/*
 * Used by vm_map_create_upl() to identify the current process in the
 * diagnostic message it prints (MACH_ASSERT builds only).
 */
extern int proc_selfpid(void);
extern char *proc_name_address(void *p);
6325
/*
 *      Routine:        vm_map_create_upl
 *      Purpose:
 *              Create a UPL for the memory mapped at "offset" in "map".
 *              The map entry containing "offset" is looked up, the
 *              request is trimmed to that entry, the backing VM object
 *              is prepared (allocated if missing, copy-on-write
 *              resolved, marked "true_share") and the UPL itself is
 *              created by vm_object_iopl_request() on that object.
 *              Submap entries are followed: the lookup restarts in the
 *              submap with the translated offset.
 *
 *      Parameters:
 *              map             map in which "offset" is looked up
 *              offset          address in "map" where the UPL starts
 *              upl_size        in: requested size in bytes
 *                              out: size actually used; it can be
 *                              reduced to fit the map entry and, for
 *                              objects that are not physically
 *                              contiguous, to MAX_UPL_SIZE_BYTES
 *              upl             receives the UPL; must not be NULL
 *              page_list       passed through to vm_object_iopl_request()
 *              count           passed through to vm_object_iopl_request()
 *              flags           in: UPL control flags; anything outside
 *                              UPL_VALID_FLAGS is rejected
 *                              out: UPL_DEV_MEMORY and/or UPL_PHYS_CONTIG
 *                              describing the backing object (or 0)
 *              tag             VM tag used for the UPL (and for the
 *                              temporary kernel buffer, when one is used)
 *
 *      Returns:
 *              KERN_INVALID_VALUE      unknown bit set in "*flags"
 *              KERN_INVALID_ARGUMENT   "upl" is NULL
 *              KERN_FAILURE            no map entry at "offset", or the
 *                                      copy-on-write lookup failed
 *              KERN_PROTECTION_FAILURE UPL_COPYOUT_FROM not set and the
 *                                      entry is not writable
 *              KERN_SUCCESS            UPL_QUERY_OBJECT_TYPE request
 *                                      (no UPL is created)
 *              otherwise the result of vm_object_iopl_request(), or of
 *              the kernel-buffer path used for executable mappings.
 */
kern_return_t
vm_map_create_upl(
        vm_map_t                map,
        vm_map_address_t        offset,
        upl_size_t              *upl_size,
        upl_t                   *upl,
        upl_page_info_array_t   page_list,
        unsigned int            *count,
        upl_control_flags_t     *flags,
        vm_tag_t                tag)
{
        vm_map_entry_t          entry;
        upl_control_flags_t     caller_flags;
        int                     force_data_sync;
        int                     sync_cow_data;
        vm_object_t             local_object;
        vm_map_offset_t         local_offset;
        vm_map_offset_t         local_start;
        kern_return_t           ret;
        vm_map_address_t        original_offset;
        vm_map_size_t           original_size, adjusted_size;
        vm_map_offset_t         local_entry_start;
        vm_object_offset_t      local_entry_offset;
        vm_object_offset_t      offset_in_mapped_page;
        /* TRUE once "map" is a submap we hold a reference on (dropped at "done") */
        boolean_t               release_map = FALSE;

        /*
         * Entry point for each map we operate on: the caller's map first,
         * then any submap we descend into (with "map" and "offset"
         * already translated).
         */
start_with_map:

        original_offset = offset;
        original_size = *upl_size;
        adjusted_size = original_size;

        caller_flags = *flags;

        if (caller_flags & ~UPL_VALID_FLAGS) {
                /*
                 * For forward compatibility's sake,
                 * reject any unknown flag.
                 */
                ret = KERN_INVALID_VALUE;
                goto done;
        }
        /*
         * Both of these are one-shot: each is cleared once its sync has
         * been issued further down, so the re-lookup that follows does
         * not repeat it.
         */
        force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
        sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);

        if (upl == NULL) {
                ret = KERN_INVALID_ARGUMENT;
                goto done;
        }

        /*
         * (Re)start point whenever the map lock had to be dropped or could
         * not be upgraded: "entry" is looked up again from scratch.
         */
REDISCOVER_ENTRY:
        vm_map_lock_read(map);

        if (!vm_map_lookup_entry(map, offset, &entry)) {
                vm_map_unlock_read(map);
                ret = KERN_FAILURE;
                goto done;
        }

        /*
         * Snapshot these now: the executable-mapping path below still
         * needs them after it has unlocked the map and cleared "entry".
         */
        local_entry_start = entry->vme_start;
        local_entry_offset = VME_OFFSET(entry);

        if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
                DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
        }

        /*
         * A UPL never spans map entries: trim the request to the end of
         * this entry and report the trimmed size back to the caller.
         */
        if (entry->vme_end - original_offset < adjusted_size) {
                adjusted_size = entry->vme_end - original_offset;
                assert(adjusted_size > 0);
                *upl_size = (upl_size_t) adjusted_size;
                assert(*upl_size == adjusted_size);
        }

        /*
         * Query only: describe the backing object through "*flags" and
         * return without creating a UPL.
         */
        if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
                *flags = 0;

                if (!entry->is_sub_map &&
                    VME_OBJECT(entry) != VM_OBJECT_NULL) {
                        if (VME_OBJECT(entry)->private) {
                                *flags = UPL_DEV_MEMORY;
                        }

                        if (VME_OBJECT(entry)->phys_contiguous) {
                                *flags |= UPL_PHYS_CONTIG;
                        }
                }
                vm_map_unlock_read(map);
                ret = KERN_SUCCESS;
                goto done;
        }

        /*
         * When the map's page size is smaller than PAGE_SIZE, widen the
         * range to map-page boundaries for the processing below and
         * remember how far the caller's address lies into its map page;
         * that amount is added back to "offset" (and removed from
         * "*upl_size") right before the UPL is created.
         */
        offset_in_mapped_page = 0;
        if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
                offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
                *upl_size = (upl_size_t)
                    (vm_map_round_page(original_offset + adjusted_size,
                    VM_MAP_PAGE_MASK(map))
                    - offset);

                offset_in_mapped_page = original_offset - offset;
                assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));

                DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
        }

        /*
         * Cap the size at MAX_UPL_SIZE_BYTES, except for physically
         * contiguous objects.
         */
        if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
            !VME_OBJECT(entry)->phys_contiguous) {
                if (*upl_size > MAX_UPL_SIZE_BYTES) {
                        *upl_size = MAX_UPL_SIZE_BYTES;
                }
        }

        /*
         *      Create an object if necessary.
         */
        if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
                /*
                 * The entry is about to be modified, so the map lock is
                 * upgraded first.  A non-zero return restarts from the
                 * lookup, which takes the read lock again (presumably the
                 * lock is dropped when the upgrade fails -- confirm in
                 * vm_map_lock_read_to_write).
                 */
                if (vm_map_lock_read_to_write(map)) {
                        goto REDISCOVER_ENTRY;
                }

                VME_OBJECT_SET(entry,
                    vm_object_allocate((vm_size_t)
                    vm_object_round_page((entry->vme_end - entry->vme_start))));
                VME_OFFSET_SET(entry, 0);
                assert(entry->use_pmap);

                vm_map_lock_write_to_read(map);
        }

        /*
         * Without UPL_COPYOUT_FROM the mapping itself must be writable.
         */
        if (!(caller_flags & UPL_COPYOUT_FROM) &&
            !entry->is_sub_map &&
            !(entry->protection & VM_PROT_WRITE)) {
                vm_map_unlock_read(map);
                ret = KERN_PROTECTION_FAILURE;
                goto done;
        }

#if !XNU_TARGET_OS_OSX
        if (map->pmap != kernel_pmap &&
            (caller_flags & UPL_COPYOUT_FROM) &&
            (entry->protection & VM_PROT_EXECUTE) &&
            !(entry->protection & VM_PROT_WRITE)) {
                vm_offset_t     kaddr;
                vm_size_t       ksize;

                /*
                 * We're about to create a read-only UPL backed by
                 * memory from an executable mapping.
                 * Wiring the pages would result in the pages being copied
                 * (due to the "MAP_PRIVATE" mapping) and no longer
                 * code-signed, so no longer eligible for execution.
                 * Instead, let's copy the data into a kernel buffer and
                 * create the UPL from this kernel buffer.
                 * The kernel buffer is then freed, leaving the UPL holding
                 * the last reference on the VM object, so the memory will
                 * be released when the UPL is committed.
                 */

                /* "entry" must not be used once the map is unlocked */
                vm_map_unlock_read(map);
                entry = VM_MAP_ENTRY_NULL;
                /* allocate kernel buffer */
                ksize = round_page(*upl_size);
                kaddr = 0;
                ret = kmem_alloc_pageable(kernel_map,
                    &kaddr,
                    ksize,
                    tag);
                if (ret == KERN_SUCCESS) {
                        /* copyin the user data */
                        ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
                }
                if (ret == KERN_SUCCESS) {
                        if (ksize > *upl_size) {
                                /* zero out the extra space in kernel buffer */
                                memset((void *)(kaddr + *upl_size),
                                    0,
                                    ksize - *upl_size);
                        }
                        /* create the UPL from the kernel buffer */
                        vm_object_offset_t      offset_in_object;
                        vm_object_offset_t      offset_in_object_page;

                        /*
                         * Start the UPL at the same offset within its
                         * page as the original data had in its object,
                         * using the values snapshotted from the entry
                         * while the map was still locked.
                         */
                        offset_in_object = offset - local_entry_start + local_entry_offset;
                        offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
                        assert(offset_in_object_page < PAGE_SIZE);
                        assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
                        *upl_size -= offset_in_object_page + offset_in_mapped_page;
                        /* recurse, this time on the kernel buffer in kernel_map */
                        ret = vm_map_create_upl(kernel_map,
                            (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
                            upl_size, upl, page_list, count, flags, tag);
                }
                if (kaddr != 0) {
                        /* free the kernel buffer */
                        kmem_free(kernel_map, kaddr, ksize);
                        kaddr = 0;
                        ksize = 0;
                }
#if DEVELOPMENT || DEBUG
                DTRACE_VM4(create_upl_from_executable,
                    vm_map_t, map,
                    vm_map_address_t, offset,
                    upl_size_t, *upl_size,
                    kern_return_t, ret);
#endif /* DEVELOPMENT || DEBUG */
                goto done;
        }
#endif /* !XNU_TARGET_OS_OSX */

        local_object = VME_OBJECT(entry);
        assert(local_object != VM_OBJECT_NULL);

        /*
         * Decide whether the targeted range must first be set up for
         * copy-on-write (cases 1 and 2 are described inside the block).
         */
        if (!entry->is_sub_map &&
            !entry->needs_copy &&
            *upl_size != 0 &&
            local_object->vo_size > *upl_size && /* partial UPL */
            entry->wired_count == 0 && /* No COW for entries that are wired */
            (map->pmap != kernel_pmap) && /* alias checks */
            (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
            ||
            ( /* case 2 */
                    local_object->internal &&
                    (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
                    local_object->ref_count > 1))) {
                vm_prot_t       prot;

                /*
                 * Case 1:
                 * Set up the targeted range for copy-on-write to avoid
                 * applying true_share/copy_delay to the entire object.
                 *
                 * Case 2:
                 * This map entry covers only part of an internal
                 * object.  There could be other map entries covering
                 * other areas of this object and some of these map
                 * entries could be marked as "needs_copy", which
                 * assumes that the object is COPY_SYMMETRIC.
                 * To avoid marking this object as COPY_DELAY and
                 * "true_share", let's shadow it and mark the new
                 * (smaller) object as "true_share" and COPY_DELAY.
                 */

                /* the entry is clipped and modified below: need the write lock */
                if (vm_map_lock_read_to_write(map)) {
                        goto REDISCOVER_ENTRY;
                }
                vm_map_lock_assert_exclusive(map);
                assert(VME_OBJECT(entry) == local_object);

                /*
                 * Clip the entry to the (map-page aligned) target range so
                 * that only that range gets marked "needs_copy".
                 */
                vm_map_clip_start(map,
                    entry,
                    vm_map_trunc_page(offset,
                    VM_MAP_PAGE_MASK(map)));
                vm_map_clip_end(map,
                    entry,
                    vm_map_round_page(offset + *upl_size,
                    VM_MAP_PAGE_MASK(map)));
                if ((entry->vme_end - offset) < *upl_size) {
                        *upl_size = (upl_size_t) (entry->vme_end - offset);
                        assert(*upl_size == entry->vme_end - offset);
                }

                /*
                 * Apply the entry's protection minus VM_PROT_WRITE to the
                 * range (presumably so that the next write faults and
                 * triggers the copy -- confirm in vm_object_pmap_protect).
                 */
                prot = entry->protection & ~VM_PROT_WRITE;
                if (override_nx(map, VME_ALIAS(entry)) && prot) {
                        prot |= VM_PROT_EXECUTE;
                }
                vm_object_pmap_protect(local_object,
                    VME_OFFSET(entry),
                    entry->vme_end - entry->vme_start,
                    ((entry->is_shared ||
                    map->mapped_in_other_pmaps)
                    ? PMAP_NULL
                    : map->pmap),
                    VM_MAP_PAGE_SIZE(map),
                    entry->vme_start,
                    prot);

                assert(entry->wired_count == 0);

                /*
                 * Lock the VM object and re-check its status: if it's mapped
                 * in another address space, we could still be racing with
                 * another thread holding that other VM map exclusively.
                 */
                vm_object_lock(local_object);
                if (local_object->true_share) {
                        /* object is already in proper state: no COW needed */
                        assert(local_object->copy_strategy !=
                            MEMORY_OBJECT_COPY_SYMMETRIC);
                } else {
                        /* not true_share: ask for copy-on-write below */
                        assert(local_object->copy_strategy ==
                            MEMORY_OBJECT_COPY_SYMMETRIC);
                        entry->needs_copy = TRUE;
                }
                vm_object_unlock(local_object);

                vm_map_lock_write_to_read(map);
        }

        if (entry->needs_copy) {
                /*
                 * Honor copy-on-write for COPY_SYMMETRIC
                 * strategy.
                 */
                vm_map_t                local_map;
                vm_object_t             object;
                vm_object_offset_t      new_offset;
                vm_prot_t               prot;
                boolean_t               wired;
                vm_map_version_t        version;
                vm_map_t                real_map;
                vm_prot_t               fault_type;

                local_map = map;

                if (caller_flags & UPL_COPYOUT_FROM) {
                        fault_type = VM_PROT_READ | VM_PROT_COPY;
                        vm_counters.create_upl_extra_cow++;
                        vm_counters.create_upl_extra_cow_pages +=
                            (entry->vme_end - entry->vme_start) / PAGE_SIZE;
                } else {
                        fault_type = VM_PROT_WRITE;
                }
                /*
                 * NOTE(review): the lookup appears to be done only for its
                 * side effect on the map entry -- everything it returns is
                 * unlocked and discarded, and the entry is looked up again.
                 */
                if (vm_map_lookup_locked(&local_map,
                    offset, fault_type,
                    OBJECT_LOCK_EXCLUSIVE,
                    &version, &object,
                    &new_offset, &prot, &wired,
                    NULL,
                    &real_map, NULL) != KERN_SUCCESS) {
                        if (fault_type == VM_PROT_WRITE) {
                                vm_counters.create_upl_lookup_failure_write++;
                        } else {
                                vm_counters.create_upl_lookup_failure_copy++;
                        }
                        vm_map_unlock_read(local_map);
                        ret = KERN_FAILURE;
                        goto done;
                }
                if (real_map != local_map) {
                        vm_map_unlock(real_map);
                }
                vm_map_unlock_read(local_map);

                vm_object_unlock(object);

                goto REDISCOVER_ENTRY;
        }

        if (entry->is_sub_map) {
                vm_map_t        submap;

                /*
                 * Descend into the submap: reference it before unlocking
                 * the parent, translate "offset" into the submap's address
                 * space and start over with the submap as "map".
                 */
                submap = VME_SUBMAP(entry);
                local_start = entry->vme_start;
                local_offset = (vm_map_offset_t)VME_OFFSET(entry);

                vm_map_reference(submap);
                vm_map_unlock_read(map);

                DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
                /* undo the map-page widening before translating */
                offset += offset_in_mapped_page;
                *upl_size -= offset_in_mapped_page;

                /* "map" is itself a submap we referenced earlier: drop it */
                if (release_map) {
                        vm_map_deallocate(map);
                }
                map = submap;
                release_map = TRUE;
                offset = local_offset + (offset - local_start);
                goto start_with_map;
        }

        /*
         * Not UPL_COPYOUT_FROM and the object has a shadow or a copy
         * object: issue a data sync once, then look the entry up again
         * since the map gets unlocked.
         */
        if (sync_cow_data &&
            (VME_OBJECT(entry)->shadow ||
            VME_OBJECT(entry)->copy)) {
                local_object = VME_OBJECT(entry);
                local_start = entry->vme_start;
                local_offset = (vm_map_offset_t)VME_OFFSET(entry);

                /* keep the object alive while the map is unlocked */
                vm_object_reference(local_object);
                vm_map_unlock_read(map);

                /*
                 * NOTE(review): the outer test is "shadow || copy" while
                 * this one is "shadow && copy" -- confirm that is intended.
                 */
                if (local_object->shadow && local_object->copy) {
                        vm_object_lock_request(local_object->shadow,
                            ((vm_object_offset_t)
                            ((offset - local_start) +
                            local_offset) +
                            local_object->vo_shadow_offset),
                            *upl_size, FALSE,
                            MEMORY_OBJECT_DATA_SYNC,
                            VM_PROT_NO_CHANGE);
                }
                sync_cow_data = FALSE;
                vm_object_deallocate(local_object);

                goto REDISCOVER_ENTRY;
        }
        /*
         * UPL_FORCE_DATA_SYNC: same pattern, applied once to the object
         * itself over the targeted range.
         */
        if (force_data_sync) {
                local_object = VME_OBJECT(entry);
                local_start = entry->vme_start;
                local_offset = (vm_map_offset_t)VME_OFFSET(entry);

                vm_object_reference(local_object);
                vm_map_unlock_read(map);

                vm_object_lock_request(local_object,
                    ((vm_object_offset_t)
                    ((offset - local_start) +
                    local_offset)),
                    (vm_object_size_t)*upl_size,
                    FALSE,
                    MEMORY_OBJECT_DATA_SYNC,
                    VM_PROT_NO_CHANGE);

                force_data_sync = FALSE;
                vm_object_deallocate(local_object);

                goto REDISCOVER_ENTRY;
        }
        /*
         * From here on "*flags" is an output: it describes the backing
         * object.  The caller's input flags live on in "caller_flags".
         */
        if (VME_OBJECT(entry)->private) {
                *flags = UPL_DEV_MEMORY;
        } else {
                *flags = 0;
        }

        if (VME_OBJECT(entry)->phys_contiguous) {
                *flags |= UPL_PHYS_CONTIG;
        }

        /* capture what is needed from the entry before unlocking the map */
        local_object = VME_OBJECT(entry);
        local_offset = (vm_map_offset_t)VME_OFFSET(entry);
        local_start = entry->vme_start;

        /*
         * Wiring will copy the pages to the shadow object.
         * The shadow object will not be code-signed so
         * attempting to execute code from these copied pages
         * would trigger a code-signing violation.
         */
        if (entry->protection & VM_PROT_EXECUTE) {
#if MACH_ASSERT
                printf("pid %d[%s] create_upl out of executable range from "
                    "0x%llx to 0x%llx: side effects may include "
                    "code-signing violations later on\n",
                    proc_selfpid(),
                    (current_task()->bsd_info
                    ? proc_name_address(current_task()->bsd_info)
                    : "?"),
                    (uint64_t) entry->vme_start,
                    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
                DTRACE_VM2(cs_executable_create_upl,
                    uint64_t, (uint64_t)entry->vme_start,
                    uint64_t, (uint64_t)entry->vme_end);
                cs_executable_create_upl++;
        }

        vm_object_lock(local_object);

        /*
         * Ensure that this object is "true_share" and "copy_delay" now,
         * while we're still holding the VM map lock.  After we unlock the map,
         * anything could happen to that mapping, including some copy-on-write
         * activity.  We need to make sure that the IOPL will point at the
         * same memory as the mapping.
         */
        if (local_object->true_share) {
                assert(local_object->copy_strategy !=
                    MEMORY_OBJECT_COPY_SYMMETRIC);
        } else if (local_object != kernel_object &&
            local_object != compressor_object &&
            !local_object->phys_contiguous) {
#if VM_OBJECT_TRACKING_OP_TRUESHARE
                /* record a backtrace of who turned this object "true_share" */
                if (!local_object->true_share &&
                    vm_object_tracking_inited) {
                        void *bt[VM_OBJECT_TRACKING_BTDEPTH];
                        int num = 0;
                        num = OSBacktrace(bt,
                            VM_OBJECT_TRACKING_BTDEPTH);
                        btlog_add_entry(vm_object_tracking_btlog,
                            local_object,
                            VM_OBJECT_TRACKING_OP_TRUESHARE,
                            bt,
                            num);
                }
#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
                local_object->true_share = TRUE;
                if (local_object->copy_strategy ==
                    MEMORY_OBJECT_COPY_SYMMETRIC) {
                        local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
                }
        }

        /*
         * Take our own reference on the object so that it stays valid
         * once the map is unlocked; it is dropped after the IOPL request.
         */
        vm_object_reference_locked(local_object);
        vm_object_unlock(local_object);

        vm_map_unlock_read(map);

        /*
         * Undo the map-page widening: the UPL starts at the caller's
         * original address.
         */
        offset += offset_in_mapped_page;
        assert(*upl_size > offset_in_mapped_page);
        *upl_size -= offset_in_mapped_page;

        /* translate the map address into an offset within the object */
        ret = vm_object_iopl_request(local_object,
            ((vm_object_offset_t)
            ((offset - local_start) + local_offset)),
            *upl_size,
            upl,
            page_list,
            count,
            caller_flags,
            tag);
        vm_object_deallocate(local_object);

done:
        /* drop the reference taken on the submap we descended into, if any */
        if (release_map) {
                vm_map_deallocate(map);
        }

        return ret;
}
6845
6846 /*
6847  * Internal routine to enter a UPL into a VM map.
6848  *
6849  * JMM - This should just be doable through the standard
6850  * vm_map_enter() API.
6851  */
6852 kern_return_t
6853 vm_map_enter_upl(
6854         vm_map_t                map,
6855         upl_t                   upl,
6856         vm_map_offset_t         *dst_addr)
6857 {
6858         vm_map_size_t           size;
6859         vm_object_offset_t      offset;
6860         vm_map_offset_t         addr;
6861         vm_page_t               m;
6862         kern_return_t           kr;
6863         int                     isVectorUPL = 0, curr_upl = 0;
6864         upl_t                   vector_upl = NULL;
6865         vm_offset_t             vector_upl_dst_addr = 0;
6866         vm_map_t                vector_upl_submap = NULL;
6867         upl_offset_t            subupl_offset = 0;
6868         upl_size_t              subupl_size = 0;
6869
6870         if (upl == UPL_NULL) {
6871                 return KERN_INVALID_ARGUMENT;
6872         }
6873
6874         DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx size 0x%x \n", map, upl, upl->flags, upl->map_object, upl->u_offset, upl->u_size);
6875         assert(map == kernel_map);
6876
6877         if ((isVectorUPL = vector_upl_is_valid(upl))) {
6878                 int mapped = 0, valid_upls = 0;
6879                 vector_upl = upl;
6880
6881                 upl_lock(vector_upl);
6882                 for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6883                         upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
6884                         if (upl == NULL) {
6885                                 continue;
6886                         }
6887                         valid_upls++;
6888                         if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6889                                 mapped++;
6890                         }
6891                 }
6892
6893                 if (mapped) {
6894                         if (mapped != valid_upls) {
6895                                 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped\n", mapped, valid_upls);
6896                         } else {
6897                                 upl_unlock(vector_upl);
6898                                 return KERN_FAILURE;
6899                         }
6900                 }
6901
6902                 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
6903                         panic("TODO4K: vector UPL not implemented");
6904                 }
6905
6906                 kr = kmem_suballoc(map, &vector_upl_dst_addr,
6907                     vector_upl->u_size,
6908                     FALSE,
6909                     VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
6910                     &vector_upl_submap);
6911                 if (kr != KERN_SUCCESS) {
6912                         panic("Vector UPL submap allocation failed\n");
6913                 }
6914                 map = vector_upl_submap;
6915                 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
6916                 curr_upl = 0;
6917         } else {
6918                 upl_lock(upl);
6919         }
6920
6921 process_upl_to_enter:
6922         if (isVectorUPL) {
6923                 if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6924                         *dst_addr = vector_upl_dst_addr;
6925                         upl_unlock(vector_upl);
6926                         return KERN_SUCCESS;
6927                 }
6928                 upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
6929                 if (upl == NULL) {
6930                         goto process_upl_to_enter;
6931                 }
6932
6933                 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
6934                 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
6935         } else {
6936                 /*
6937                  * check to see if already mapped
6938                  */
6939                 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6940                         upl_unlock(upl);
6941                         return KERN_FAILURE;
6942                 }
6943         }
6944
6945         size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
6946
6947         if ((!(upl->flags & UPL_SHADOWED)) &&
6948             ((upl->flags & UPL_HAS_BUSY) ||
6949             !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
6950                 vm_object_t             object;
6951                 vm_page_t               alias_page;
6952                 vm_object_offset_t      new_offset;
6953                 unsigned int            pg_num;
6954                 wpl_array_t             lite_list;
6955
6956                 if (upl->flags & UPL_INTERNAL) {
6957                         lite_list = (wpl_array_t)
6958                             ((((uintptr_t)upl) + sizeof(struct upl))
6959                             + ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));
6960                 } else {
6961                         lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
6962                 }
6963                 object = upl->map_object;
6964                 upl->map_object = vm_object_allocate(vm_object_round_page(size));
6965
6966                 vm_object_lock(upl->map_object);
6967
6968                 upl->map_object->shadow = object;
6969                 upl->map_object->pageout = TRUE;
6970                 upl->map_object->can_persist = FALSE;
6971                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6972                 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
6973                 assertf(page_aligned(upl->map_object->vo_shadow_offset),
6974                     "object %p shadow_offset 0x%llx",
6975                     upl->map_object,
6976                     (uint64_t)upl->map_object->vo_shadow_offset);
6977                 upl->map_object->wimg_bits = object->wimg_bits;
6978                 assertf(page_aligned(upl->map_object->vo_shadow_offset),
6979                     "object %p shadow_offset 0x%llx",
6980                     upl->map_object, upl->map_object->vo_shadow_offset);
6981                 offset = upl->map_object->vo_shadow_offset;
6982                 new_offset = 0;
6983                 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
6984
6985                 upl->flags |= UPL_SHADOWED;
6986
6987                 while (size) {
6988                         pg_num = (unsigned int) (new_offset / PAGE_SIZE);
6989                         assert(pg_num == new_offset / PAGE_SIZE);
6990
6991                         if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
6992                                 alias_page = vm_page_grab_fictitious(TRUE);
6993
6994                                 vm_object_lock(object);
6995
6996                                 m = vm_page_lookup(object, offset);
6997                                 if (m == VM_PAGE_NULL) {
6998                                         panic("vm_upl_map: page missing\n");
6999                                 }
7000
7001                                 /*
7002                                  * Convert the fictitious page to a private
7003                                  * shadow of the real page.
7004                                  */
7005                                 assert(alias_page->vmp_fictitious);
7006                                 alias_page->vmp_fictitious = FALSE;
7007                                 alias_page->vmp_private = TRUE;
7008                                 alias_page->vmp_free_when_done = TRUE;
7009                                 /*
7010                                  * since m is a page in the upl it must
7011                                  * already be wired or BUSY, so it's
7012                                  * safe to assign the underlying physical
7013                                  * page to the alias
7014                                  */
7015                                 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7016
7017                                 vm_object_unlock(object);
7018
7019                                 vm_page_lockspin_queues();
7020                                 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
7021                                 vm_page_unlock_queues();
7022
7023                                 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7024
7025                                 assert(!alias_page->vmp_wanted);
7026                                 alias_page->vmp_busy = FALSE;
7027                                 alias_page->vmp_absent = FALSE;
7028                         }
7029                         size -= PAGE_SIZE;
7030                         offset += PAGE_SIZE_64;
7031                         new_offset += PAGE_SIZE_64;
7032                 }
7033                 vm_object_unlock(upl->map_object);
7034         }
7035         if (upl->flags & UPL_SHADOWED) {
7036                 offset = 0;
7037         } else {
7038                 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7039         }
7040
7041         size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7042
7043         vm_object_reference(upl->map_object);
7044
7045         if (!isVectorUPL) {
7046                 *dst_addr = 0;
7047                 /*
7048                  * NEED A UPL_MAP ALIAS
7049                  */
7050                 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7051                     VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7052                     upl->map_object, offset, FALSE,
7053                     VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7054
7055                 if (kr != KERN_SUCCESS) {
7056                         vm_object_deallocate(upl->map_object);
7057                         upl_unlock(upl);
7058                         return kr;
7059                 }
7060         } else {
7061                 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
7062                     VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
7063                     upl->map_object, offset, FALSE,
7064                     VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7065                 if (kr) {
7066                         panic("vm_map_enter failed for a Vector UPL\n");
7067                 }
7068         }
7069         vm_object_lock(upl->map_object);
7070
7071         for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7072                 m = vm_page_lookup(upl->map_object, offset);
7073
7074                 if (m) {
7075                         m->vmp_pmapped = TRUE;
7076
7077                         /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
7078                          * but only in kernel space. If this was on a user map,
7079                          * we'd have to set the wpmapped bit. */
7080                         /* m->vmp_wpmapped = TRUE; */
7081                         assert(map->pmap == kernel_pmap);
7082
7083                         PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE, kr);
7084
7085                         assert(kr == KERN_SUCCESS);
7086 #if KASAN
7087                         kasan_notify_address(addr, PAGE_SIZE_64);
7088 #endif
7089                 }
7090                 offset += PAGE_SIZE_64;
7091         }
7092         vm_object_unlock(upl->map_object);
7093
7094         /*
7095          * hold a reference for the mapping
7096          */
7097         upl->ref_count++;
7098         upl->flags |= UPL_PAGE_LIST_MAPPED;
7099         upl->kaddr = (vm_offset_t) *dst_addr;
7100         assert(upl->kaddr == *dst_addr);
7101
7102         if (isVectorUPL) {
7103                 goto process_upl_to_enter;
7104         }
7105
7106         if (!isVectorUPL) {
7107                 vm_map_offset_t addr_adjustment;
7108
7109                 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7110                 if (addr_adjustment) {
7111                         assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7112                         DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7113                         *dst_addr += addr_adjustment;
7114                 }
7115         }
7116
7117         upl_unlock(upl);
7118
7119         return KERN_SUCCESS;
7120 }
7121
7122 /*
7123  * Internal routine to remove a UPL mapping from a VM map.
7124  *
7125  * XXX - This should just be doable through a standard
7126  * vm_map_remove() operation.  Otherwise, implicit clean-up
7127  * of the target map won't be able to correctly remove
7128  * these (and release the reference on the UPL).  Having
7129  * to do this means we can't map these into user-space
7130  * maps yet.
      *
      * Parameters:
      *   map - the VM map the mapping is removed from.
      *   upl - a plain UPL, or a vector UPL (as reported by
      *         vector_upl_is_valid()).
      *
      * Plain UPL: with the UPL locked, if UPL_PAGE_LIST_MAPPED is set the
      * mapping reference is dropped, the flag and kaddr are cleared, the
      * lock is released and vm_map_remove() is called on
      * [kaddr, kaddr + adjusted size), truncated/rounded with
      * vm_map_trunc_page()/vm_map_round_page() using the map's page mask.
      *
      * Vector UPL: with the vector UPL locked, the sub-UPLs are first
      * counted (none mapped => KERN_FAILURE, only some mapped => panic).
      * Each sub-UPL then has its mapping reference dropped and its mapped
      * state cleared; finally vm_map_remove() is called on
      * vector_upl->u_size bytes starting at the address reported by
      * vector_upl_get_submap(), and vm_map_deallocate() is called on the
      * submap.
      *
      * Returns:
      *   KERN_INVALID_ARGUMENT - upl is UPL_NULL.
      *   KERN_FAILURE          - the UPL (or every sub-UPL) is not mapped.
      *   KERN_SUCCESS          - the mapped state was cleared and
      *                           vm_map_remove() was issued; its result
      *                           is not examined here.
7131  */
7132 kern_return_t
7133 vm_map_remove_upl(
7134         vm_map_t        map,
7135         upl_t           upl)
7136 {
7137         vm_address_t    addr;
7138         upl_size_t      size;
7139         int             isVectorUPL = 0, curr_upl = 0;
7140         upl_t           vector_upl = NULL;
7141
7142         if (upl == UPL_NULL) {
7143                 return KERN_INVALID_ARGUMENT;
7144         }
7145
7146         if ((isVectorUPL = vector_upl_is_valid(upl))) {
7147                 int     unmapped = 0, valid_upls = 0;
7148                 vector_upl = upl;
7149                 upl_lock(vector_upl);
                     /*
                      * First pass: count the sub-UPLs that are present and
                      * how many of those are not currently mapped.
                      */
7150                 for (curr_upl = 0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
7151                         upl =  vector_upl_subupl_byindex(vector_upl, curr_upl );
7152                         if (upl == NULL) {
7153                                 continue;
7154                         }
7155                         valid_upls++;
7156                         if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7157                                 unmapped++;
7158                         }
7159                 }
7160
                     /*
                      * A partially mapped vector UPL is treated as fatal;
                      * a completely unmapped one is reported as a failure.
                      */
7161                 if (unmapped) {
7162                         if (unmapped != valid_upls) {
7163                                 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
7164                         } else {
7165                                 upl_unlock(vector_upl);
7166                                 return KERN_FAILURE;
7167                         }
7168                 }
                     /* restart the index for the second pass below */
7169                 curr_upl = 0;
7170         } else {
7171                 upl_lock(upl);
7172         }
7173
             /*
              * Loop head: for a vector UPL every jump back here moves on to
              * the next sub-UPL; a plain UPL passes through exactly once.
              */
7174 process_upl_to_remove:
7175         if (isVectorUPL) {
                     /*
                      * Every slot has been visited: remove the whole
                      * submap range from "map" in a single call.
                      */
7176                 if (curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7177                         vm_map_t v_upl_submap;
7178                         vm_offset_t v_upl_submap_dst_addr;
7179                         vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7180
7181                         vm_map_remove(map, v_upl_submap_dst_addr,
7182                             v_upl_submap_dst_addr + vector_upl->u_size,
7183                             VM_MAP_REMOVE_NO_FLAGS);
7184                         vm_map_deallocate(v_upl_submap);
7185                         upl_unlock(vector_upl);
7186                         return KERN_SUCCESS;
7187                 }
7188
7189                 upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
                     /* empty slot: move on to the next index */
7190                 if (upl == NULL) {
7191                         goto process_upl_to_remove;
7192                 }
7193         }
7194
7195         if (upl->flags & UPL_PAGE_LIST_MAPPED) {
                     /*
                      * Capture the mapping's address and size before the
                      * mapped state is cleared below.
                      */
7196                 addr = upl->kaddr;
7197                 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7198
7199                 assert(upl->ref_count > 1);
7200                 upl->ref_count--;               /* removing mapping ref */
7201
7202                 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7203                 upl->kaddr = (vm_offset_t) 0;
7204
7205                 if (!isVectorUPL) {
                             /* the UPL lock is released before vm_map_remove() runs */
7206                         upl_unlock(upl);
7207
7208                         vm_map_remove(
7209                                 map,
7210                                 vm_map_trunc_page(addr,
7211                                 VM_MAP_PAGE_MASK(map)),
7212                                 vm_map_round_page(addr + size,
7213                                 VM_MAP_PAGE_MASK(map)),
7214                                 VM_MAP_REMOVE_NO_FLAGS);
7215                         return KERN_SUCCESS;
7216                 } else {
7217                         /*
7218                          * If it's a Vectored UPL, we'll be removing the entire
7219                          * submap anyways, so no need to remove individual UPL
7220                          * element mappings from within the submap
7221                          */
7222                         goto process_upl_to_remove;
7223                 }
7224         }
             /*
              * The UPL is not mapped: nothing to remove.
              * NOTE(review): for a vector UPL, `upl` is a sub-UPL at this
              * point while the lock taken above was on vector_upl; the
              * first-pass check appears to keep a vector UPL from getting
              * here -- confirm.
              */
7225         upl_unlock(upl);
7226
7227         return KERN_FAILURE;
7228 }
7229
7230
7231 kern_return_t
7232 upl_commit_range(
7233         upl_t                   upl,
7234         upl_offset_t            offset,
7235         upl_size_t              size,
7236         int                     flags,
7237         upl_page_info_t         *page_list,
7238         mach_msg_type_number_t  count,
7239         boolean_t               *empty)
7240 {
7241         upl_size_t              xfer_size, subupl_size;
7242         vm_object_t             shadow_object;
7243         vm_object_t             object;
7244         vm_object_t             m_object;
7245         vm_object_offset_t      target_offset;
7246         upl_offset_t            subupl_offset = offset;
7247         int                     entry;
7248         wpl_array_t             lite_list;
7249         int                     occupied;
7250         int                     clear_refmod = 0;
7251         int                     pgpgout_count = 0;
7252         struct  vm_page_delayed_work    dw_array;
7253         struct  vm_page_delayed_work    *dwp, *dwp_start;
7254         bool                    dwp_finish_ctx = TRUE;
7255         int                     dw_count;
7256         int                     dw_limit;
7257         int                     isVectorUPL = 0;
7258         upl_t                   vector_upl = NULL;
7259         boolean_t               should_be_throttled = FALSE;
7260
7261         vm_page_t               nxt_page = VM_PAGE_NULL;
7262         int                     fast_path_possible = 0;
7263         int                     fast_path_full_commit = 0;
7264         int                     throttle_page = 0;
7265         int                     unwired_count = 0;
7266         int                     local_queue_count = 0;
7267         vm_page_t               first_local, last_local;
7268         vm_object_offset_t      obj_start, obj_end, obj_offset;
7269         kern_return_t           kr = KERN_SUCCESS;
7270
7271 //      DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7272
7273         dwp_start = dwp = NULL;
7274
7275         subupl_size = size;
7276         *empty = FALSE;
7277
7278         if (upl == UPL_NULL) {
7279                 return KERN_INVALID_ARGUMENT;
7280         }
7281
7282         dw_count = 0;
7283         dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7284         dwp_start = vm_page_delayed_work_get_ctx();
7285         if (dwp_start == NULL) {
7286                 dwp_start = &dw_array;
7287                 dw_limit = 1;
7288                 dwp_finish_ctx = FALSE;
7289         }
7290
7291         dwp = dwp_start;
7292
7293         if (count == 0) {
7294                 page_list = NULL;
7295         }
7296
7297         if ((isVectorUPL = vector_upl_is_valid(upl))) {
7298                 vector_upl = upl;
7299                 upl_lock(vector_upl);
7300         } else {
7301                 upl_lock(upl);
7302         }
7303
7304 process_upl_to_commit:
7305
7306         if (isVectorUPL) {
7307                 size = subupl_size;
7308                 offset = subupl_offset;
7309                 if (size == 0) {
7310                         upl_unlock(vector_upl);
7311                         kr = KERN_SUCCESS;
7312                         goto done;
7313                 }
7314                 upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7315                 if (upl == NULL) {
7316                         upl_unlock(vector_upl);
7317                         kr = KERN_FAILURE;
7318                         goto done;
7319                 }
7320                 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7321                 subupl_size -= size;
7322                 subupl_offset += size;
7323         }
7324
7325 #if UPL_DEBUG
7326         if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7327                 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7328
7329                 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7330                 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7331
7332                 upl->upl_commit_index++;
7333         }
7334 #endif
7335         if (upl->flags & UPL_DEVICE_MEMORY) {
7336                 xfer_size = 0;
7337         } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7338                 xfer_size = size;
7339         } else {
7340                 if (!isVectorUPL) {
7341                         upl_unlock(upl);
7342                 } else {
7343                         upl_unlock(vector_upl);
7344                 }
7345                 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7346                 kr = KERN_FAILURE;
7347                 goto done;
7348         }
7349         if (upl->flags & UPL_SET_DIRTY) {
7350                 flags |= UPL_COMMIT_SET_DIRTY;
7351         }
7352         if (upl->flags & UPL_CLEAR_DIRTY) {
7353                 flags |= UPL_COMMIT_CLEAR_DIRTY;
7354         }
7355
7356         if (upl->flags & UPL_INTERNAL) {
7357                 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7358                     + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
7359         } else {
7360                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7361         }
7362
7363         object = upl->map_object;
7364
7365         if (upl->flags & UPL_SHADOWED) {
7366                 vm_object_lock(object);
7367                 shadow_object = object->shadow;
7368         } else {
7369                 shadow_object = object;
7370         }
7371         entry = offset / PAGE_SIZE;
7372         target_offset = (vm_object_offset_t)offset;
7373
7374         if (upl->flags & UPL_KERNEL_OBJECT) {
7375                 vm_object_lock_shared(shadow_object);
7376         } else {
7377                 vm_object_lock(shadow_object);
7378         }
7379
7380         VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7381
7382         if (upl->flags & UPL_ACCESS_BLOCKED) {
7383                 assert(shadow_object->blocked_access);
7384                 shadow_object->blocked_access = FALSE;
7385                 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7386         }
7387
7388         if (shadow_object->code_signed) {
7389                 /*
7390                  * CODE SIGNING:
7391                  * If the object is code-signed, do not let this UPL tell
7392                  * us if the pages are valid or not.  Let the pages be
7393                  * validated by VM the normal way (when they get mapped or
7394                  * copied).
7395                  */
7396                 flags &= ~UPL_COMMIT_CS_VALIDATED;
7397         }
7398         if (!page_list) {
7399                 /*
7400                  * No page list to get the code-signing info from !?
7401                  */
7402                 flags &= ~UPL_COMMIT_CS_VALIDATED;
7403         }
7404         if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7405                 should_be_throttled = TRUE;
7406         }
7407
7408         if ((upl->flags & UPL_IO_WIRE) &&
7409             !(flags & UPL_COMMIT_FREE_ABSENT) &&
7410             !isVectorUPL &&
7411             shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7412             shadow_object->purgable != VM_PURGABLE_EMPTY) {
7413                 if (!vm_page_queue_empty(&shadow_object->memq)) {
7414                         if (size == shadow_object->vo_size) {
7415                                 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7416                                 fast_path_full_commit = 1;
7417                         }
7418                         fast_path_possible = 1;
7419
7420                         if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7421                             (shadow_object->purgable == VM_PURGABLE_DENY ||
7422                             shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7423                             shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7424                                 throttle_page = 1;
7425                         }
7426                 }
7427         }
7428         first_local = VM_PAGE_NULL;
7429         last_local = VM_PAGE_NULL;
7430
7431         obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7432         obj_end = obj_start + xfer_size;
7433         obj_start = vm_object_trunc_page(obj_start);
7434         obj_end = vm_object_round_page(obj_end);
7435         for (obj_offset = obj_start;
7436             obj_offset < obj_end;
7437             obj_offset += PAGE_SIZE) {
7438                 vm_page_t       t, m;
7439
7440                 dwp->dw_mask = 0;
7441                 clear_refmod = 0;
7442
7443                 m = VM_PAGE_NULL;
7444
7445                 if (upl->flags & UPL_LITE) {
7446                         unsigned int    pg_num;
7447
7448                         if (nxt_page != VM_PAGE_NULL) {
7449                                 m = nxt_page;
7450                                 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7451                                 target_offset = m->vmp_offset;
7452                         }
7453                         pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7454                         assert(pg_num == target_offset / PAGE_SIZE);
7455
7456                         if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
7457                                 lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
7458
7459                                 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7460                                         m = vm_page_lookup(shadow_object, obj_offset);
7461                                 }
7462                         } else {
7463                                 m = NULL;
7464                         }
7465                 }
7466                 if (upl->flags & UPL_SHADOWED) {
7467                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7468                                 t->vmp_free_when_done = FALSE;
7469
7470                                 VM_PAGE_FREE(t);
7471
7472                                 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7473                                         m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7474                                 }
7475                         }
7476                 }
7477                 if (m == VM_PAGE_NULL) {
7478                         goto commit_next_page;
7479                 }
7480
7481                 m_object = VM_PAGE_OBJECT(m);
7482
7483                 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7484                         assert(m->vmp_busy);
7485
7486                         dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7487                         goto commit_next_page;
7488                 }
7489
7490                 if (flags & UPL_COMMIT_CS_VALIDATED) {
7491                         /*
7492                          * CODE SIGNING:
7493                          * Set the code signing bits according to
7494                          * what the UPL says they should be.
7495                          */
7496                         m->vmp_cs_validated |= page_list[entry].cs_validated;
7497                         m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7498                         m->vmp_cs_nx |= page_list[entry].cs_nx;
7499                 }
7500                 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7501                         m->vmp_written_by_kernel = TRUE;
7502                 }
7503
7504                 if (upl->flags & UPL_IO_WIRE) {
7505                         if (page_list) {
7506                                 page_list[entry].phys_addr = 0;
7507                         }
7508
7509                         if (flags & UPL_COMMIT_SET_DIRTY) {
7510                                 SET_PAGE_DIRTY(m, FALSE);
7511                         } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7512                                 m->vmp_dirty = FALSE;
7513
7514                                 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7515                                     m->vmp_cs_validated &&
7516                                     m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7517                                         /*
7518                                          * CODE SIGNING:
7519                                          * This page is no longer dirty
7520                                          * but could have been modified,
7521                                          * so it will need to be
7522                                          * re-validated.
7523                                          */
7524                                         m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7525
7526                                         VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7527
7528                                         pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7529                                 }
7530                                 clear_refmod |= VM_MEM_MODIFIED;
7531                         }
7532                         if (upl->flags & UPL_ACCESS_BLOCKED) {
7533                                 /*
7534                                  * We blocked access to the pages in this UPL.
7535                                  * Clear the "busy" bit and wake up any waiter
7536                                  * for this page.
7537                                  */
7538                                 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7539                         }
7540                         if (fast_path_possible) {
7541                                 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7542                                 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7543                                 if (m->vmp_absent) {
7544                                         assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7545                                         assert(m->vmp_wire_count == 0);
7546                                         assert(m->vmp_busy);
7547
7548                                         m->vmp_absent = FALSE;
7549                                         dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7550                                 } else {
7551                                         if (m->vmp_wire_count == 0) {
7552                                                 panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object);
7553                                         }
7554                                         assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7555
7556                                         /*
7557                                          * XXX FBDP need to update some other
7558                                          * counters here (purgeable_wired_count)
7559                                          * (ledgers), ...
7560                                          */
7561                                         assert(m->vmp_wire_count > 0);
7562                                         m->vmp_wire_count--;
7563
7564                                         if (m->vmp_wire_count == 0) {
7565                                                 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7566                                                 unwired_count++;
7567                                         }
7568                                 }
7569                                 if (m->vmp_wire_count == 0) {
7570                                         assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7571
7572                                         if (last_local == VM_PAGE_NULL) {
7573                                                 assert(first_local == VM_PAGE_NULL);
7574
7575                                                 last_local = m;
7576                                                 first_local = m;
7577                                         } else {
7578                                                 assert(first_local != VM_PAGE_NULL);
7579
7580                                                 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7581                                                 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7582                                                 first_local = m;
7583                                         }
7584                                         local_queue_count++;
7585
7586                                         if (throttle_page) {
7587                                                 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7588                                         } else {
7589                                                 if (flags & UPL_COMMIT_INACTIVATE) {
7590                                                         if (shadow_object->internal) {
7591                                                                 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7592                                                         } else {
7593                                                                 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7594                                                         }
7595                                                 } else {
7596                                                         m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7597                                                 }
7598                                         }
7599                                 }
7600                         } else {
7601                                 if (flags & UPL_COMMIT_INACTIVATE) {
7602                                         dwp->dw_mask |= DW_vm_page_deactivate_internal;
7603                                         clear_refmod |= VM_MEM_REFERENCED;
7604                                 }
7605                                 if (m->vmp_absent) {
7606                                         if (flags & UPL_COMMIT_FREE_ABSENT) {
7607                                                 dwp->dw_mask |= DW_vm_page_free;
7608                                         } else {
7609                                                 m->vmp_absent = FALSE;
7610                                                 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7611
7612                                                 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7613                                                         dwp->dw_mask |= DW_vm_page_activate;
7614                                                 }
7615                                         }
7616                                 } else {
7617                                         dwp->dw_mask |= DW_vm_page_unwire;
7618                                 }
7619                         }
7620                         goto commit_next_page;
7621                 }
7622                 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7623
7624                 if (page_list) {
7625                         page_list[entry].phys_addr = 0;
7626                 }
7627
7628                 /*
7629                  * make sure to clear the hardware
7630                  * modify or reference bits before
7631                  * releasing the BUSY bit on this page
7632                  * otherwise we risk losing a legitimate
7633                  * change of state
7634                  */
7635                 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7636                         m->vmp_dirty = FALSE;
7637
7638                         clear_refmod |= VM_MEM_MODIFIED;
7639                 }
7640                 if (m->vmp_laundry) {
7641                         dwp->dw_mask |= DW_vm_pageout_throttle_up;
7642                 }
7643
7644                 if (VM_PAGE_WIRED(m)) {
7645                         m->vmp_free_when_done = FALSE;
7646                 }
7647
7648                 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7649                     m->vmp_cs_validated &&
7650                     m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7651                         /*
7652                          * CODE SIGNING:
7653                          * This page is no longer dirty
7654                          * but could have been modified,
7655                          * so it will need to be
7656                          * re-validated.
7657                          */
7658                         m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7659
7660                         VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7661
7662                         pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7663                 }
7664                 if (m->vmp_overwriting) {
7665                         /*
7666                          * the (COPY_OUT_FROM == FALSE) request_page_list case
7667                          */
7668                         if (m->vmp_busy) {
7669 #if CONFIG_PHANTOM_CACHE
7670                                 if (m->vmp_absent && !m_object->internal) {
7671                                         dwp->dw_mask |= DW_vm_phantom_cache_update;
7672                                 }
7673 #endif
7674                                 m->vmp_absent = FALSE;
7675
7676                                 dwp->dw_mask |= DW_clear_busy;
7677                         } else {
7678                                 /*
7679                                  * alternate (COPY_OUT_FROM == FALSE) page_list case
7680                                  * Occurs when the original page was wired
7681                                  * at the time of the list request
7682                                  */
7683                                 assert(VM_PAGE_WIRED(m));
7684
7685                                 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7686                         }
7687                         m->vmp_overwriting = FALSE;
7688                 }
7689                 m->vmp_cleaning = FALSE;
7690
7691                 if (m->vmp_free_when_done) {
7692                         /*
7693                          * With the clean queue enabled, UPL_PAGEOUT should
7694                          * no longer set the pageout bit. Its pages now go
7695                          * to the clean queue.
7696                          *
7697                          * We don't use the cleaned Q anymore and so this
7698                          * assert isn't correct. The code for the clean Q
7699                          * still exists and might be used in the future. If we
7700                          * go back to the cleaned Q, we will re-enable this
7701                          * assert.
7702                          *
7703                          * assert(!(upl->flags & UPL_PAGEOUT));
7704                          */
7705                         assert(!m_object->internal);
7706
7707                         m->vmp_free_when_done = FALSE;
7708
7709                         if ((flags & UPL_COMMIT_SET_DIRTY) ||
7710                             (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7711                                 /*
7712                                  * page was re-dirtied after we started
7713                                  * the pageout... reactivate it since
7714                                  * we don't know whether the on-disk
7715                                  * copy matches what is now in memory
7716                                  */
7717                                 SET_PAGE_DIRTY(m, FALSE);
7718
7719                                 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7720
7721                                 if (upl->flags & UPL_PAGEOUT) {
7722                                         counter_inc(&vm_statistics_reactivations);
7723                                         DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7724                                 }
7725                         } else {
7726                                 /*
7727                                  * page has been successfully cleaned
7728                                  * go ahead and free it for other use
7729                                  */
7730                                 if (m_object->internal) {
7731                                         DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7732                                 } else {
7733                                         DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7734                                 }
7735                                 m->vmp_dirty = FALSE;
7736                                 m->vmp_busy = TRUE;
7737
7738                                 dwp->dw_mask |= DW_vm_page_free;
7739                         }
7740                         goto commit_next_page;
7741                 }
7742                 /*
7743                  * It is a part of the semantic of COPYOUT_FROM
7744                  * UPLs that a commit implies cache sync
7745                  * between the vm page and the backing store
7746                  * this can be used to strip the precious bit
7747                  * as well as clean
7748                  */
7749                 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
7750                         m->vmp_precious = FALSE;
7751                 }
7752
7753                 if (flags & UPL_COMMIT_SET_DIRTY) {
7754                         SET_PAGE_DIRTY(m, FALSE);
7755                 } else {
7756                         m->vmp_dirty = FALSE;
7757                 }
7758
7759                 /* with the clean queue on, move *all* cleaned pages to the clean queue */
7760                 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
7761                         pgpgout_count++;
7762
7763                         counter_inc(&vm_statistics_pageouts);
7764                         DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
7765
7766                         dwp->dw_mask |= DW_enqueue_cleaned;
7767                 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
7768                         /*
7769                          * page coming back in from being 'frozen'...
7770                          * it was dirty before it was frozen, so keep it so
7771                          * the vm_page_activate will notice that it really belongs
7772                          * on the throttle queue and put it there
7773                          */
7774                         SET_PAGE_DIRTY(m, FALSE);
7775                         dwp->dw_mask |= DW_vm_page_activate;
7776                 } else {
7777                         if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
7778                                 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7779                                 clear_refmod |= VM_MEM_REFERENCED;
7780                         } else if (!VM_PAGE_PAGEABLE(m)) {
7781                                 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
7782                                         dwp->dw_mask |= DW_vm_page_speculate;
7783                                 } else if (m->vmp_reference) {
7784                                         dwp->dw_mask |= DW_vm_page_activate;
7785                                 } else {
7786                                         dwp->dw_mask |= DW_vm_page_deactivate_internal;
7787                                         clear_refmod |= VM_MEM_REFERENCED;
7788                                 }
7789                         }
7790                 }
7791                 if (upl->flags & UPL_ACCESS_BLOCKED) {
7792                         /*
7793                          * We blocked access to the pages in this UPL.
7794                          * Clear the "busy" bit on this page before we
7795                          * wake up any waiter.
7796                          */
7797                         dwp->dw_mask |= DW_clear_busy;
7798                 }
7799                 /*
7800                  * Wakeup any thread waiting for the page to be un-cleaning.
7801                  */
7802                 dwp->dw_mask |= DW_PAGE_WAKEUP;
7803
7804 commit_next_page:
7805                 if (clear_refmod) {
7806                         pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
7807                 }
7808
7809                 target_offset += PAGE_SIZE_64;
7810                 xfer_size -= PAGE_SIZE;
7811                 entry++;
7812
7813                 if (dwp->dw_mask) {
7814                         if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7815                                 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7816
7817                                 if (dw_count >= dw_limit) {
7818                                         vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
7819
7820                                         dwp = dwp_start;
7821                                         dw_count = 0;
7822                                 }
7823                         } else {
7824                                 if (dwp->dw_mask & DW_clear_busy) {
7825                                         m->vmp_busy = FALSE;
7826                                 }
7827
7828                                 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
7829                                         PAGE_WAKEUP(m);
7830                                 }
7831                         }
7832                 }
7833         }
7834         if (dw_count) {
7835                 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
7836                 dwp = dwp_start;
7837                 dw_count = 0;
7838         }
7839
7840         if (fast_path_possible) {
7841                 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7842                 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7843
7844                 if (local_queue_count || unwired_count) {
7845                         if (local_queue_count) {
7846                                 vm_page_t       first_target;
7847                                 vm_page_queue_head_t    *target_queue;
7848
7849                                 if (throttle_page) {
7850                                         target_queue = &vm_page_queue_throttled;
7851                                 } else {
7852                                         if (flags & UPL_COMMIT_INACTIVATE) {
7853                                                 if (shadow_object->internal) {
7854                                                         target_queue = &vm_page_queue_anonymous;
7855                                                 } else {
7856                                                         target_queue = &vm_page_queue_inactive;
7857                                                 }
7858                                         } else {
7859                                                 target_queue = &vm_page_queue_active;
7860                                         }
7861                                 }
7862                                 /*
7863                                  * Transfer the entire local queue to one of the regular LRU page queues.
7864                                  */
7865                                 vm_page_lockspin_queues();
7866
7867                                 first_target = (vm_page_t) vm_page_queue_first(target_queue);
7868
7869                                 if (vm_page_queue_empty(target_queue)) {
7870                                         target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7871                                 } else {
7872                                         first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7873                                 }
7874
7875                                 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7876                                 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
7877                                 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
7878
7879                                 /*
7880                                  * Adjust the global page counts.
7881                                  */
7882                                 if (throttle_page) {
7883                                         vm_page_throttled_count += local_queue_count;
7884                                 } else {
7885                                         if (flags & UPL_COMMIT_INACTIVATE) {
7886                                                 if (shadow_object->internal) {
7887                                                         vm_page_anonymous_count += local_queue_count;
7888                                                 }
7889                                                 vm_page_inactive_count += local_queue_count;
7890
7891                                                 token_new_pagecount += local_queue_count;
7892                                         } else {
7893                                                 vm_page_active_count += local_queue_count;
7894                                         }
7895
7896                                         if (shadow_object->internal) {
7897                                                 vm_page_pageable_internal_count += local_queue_count;
7898                                         } else {
7899                                                 vm_page_pageable_external_count += local_queue_count;
7900                                         }
7901                                 }
7902                         } else {
7903                                 vm_page_lockspin_queues();
7904                         }
7905                         if (unwired_count) {
7906                                 vm_page_wire_count -= unwired_count;
7907                                 VM_CHECK_MEMORYSTATUS;
7908                         }
7909                         vm_page_unlock_queues();
7910
7911                         VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
7912                 }
7913         }
7914         occupied = 1;
7915
7916         if (upl->flags & UPL_DEVICE_MEMORY) {
7917                 occupied = 0;
7918         } else if (upl->flags & UPL_LITE) {
7919                 int     pg_num;
7920                 int     i;
7921
7922                 occupied = 0;
7923
7924                 if (!fast_path_full_commit) {
7925                         pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
7926                         pg_num = (pg_num + 31) >> 5;
7927
7928                         for (i = 0; i < pg_num; i++) {
7929                                 if (lite_list[i] != 0) {
7930                                         occupied = 1;
7931                                         break;
7932                                 }
7933                         }
7934                 }
7935         } else {
7936                 if (vm_page_queue_empty(&upl->map_object->memq)) {
7937                         occupied = 0;
7938                 }
7939         }
7940         if (occupied == 0) {
7941                 /*
7942                  * If this UPL element belongs to a Vector UPL and is
7943                  * empty, then this is the right function to deallocate
7944                  * it. So go ahead set the *empty variable. The flag
7945                  * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
7946                  * should be considered relevant for the Vector UPL and not
7947                  * the internal UPLs.
7948                  */
7949                 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
7950                         *empty = TRUE;
7951                 }
7952
7953                 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
7954                         /*
7955                          * this is not a paging object
7956                          * so we need to drop the paging reference
7957                          * that was taken when we created the UPL
7958                          * against this object
7959                          */
7960                         vm_object_activity_end(shadow_object);
7961                         vm_object_collapse(shadow_object, 0, TRUE);
7962                 } else {
7963                         /*
7964                          * we donated the paging reference to
7965                          * the map object... vm_pageout_object_terminate
7966                          * will drop this reference
7967                          */
7968                 }
7969         }
7970         VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
7971         vm_object_unlock(shadow_object);
7972         if (object != shadow_object) {
7973                 vm_object_unlock(object);
7974         }
7975
7976         if (!isVectorUPL) {
7977                 upl_unlock(upl);
7978         } else {
7979                 /*
7980                  * If we completed our operations on an UPL that is
7981                  * part of a Vectored UPL and if empty is TRUE, then
7982                  * we should go ahead and deallocate this UPL element.
7983                  * Then we check if this was the last of the UPL elements
7984                  * within that Vectored UPL. If so, set empty to TRUE
7985                  * so that in ubc_upl_commit_range or ubc_upl_commit, we
7986                  * can go ahead and deallocate the Vector UPL too.
7987                  */
7988                 if (*empty == TRUE) {
7989                         *empty = vector_upl_set_subupl(vector_upl, upl, 0);
7990                         upl_deallocate(upl);
7991                 }
7992                 goto process_upl_to_commit;
7993         }
7994         if (pgpgout_count) {
7995                 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
7996         }
7997
7998         kr = KERN_SUCCESS;
7999 done:
8000         if (dwp_start && dwp_finish_ctx) {
8001                 vm_page_delayed_work_finish_ctx(dwp_start);
8002                 dwp_start = dwp = NULL;
8003         }
8004
8005         return kr;
8006 }
8007
8008 kern_return_t
8009 upl_abort_range(
8010         upl_t                   upl,
8011         upl_offset_t            offset,
8012         upl_size_t              size,
8013         int                     error,
8014         boolean_t               *empty)
8015 {
8016         upl_page_info_t         *user_page_list = NULL;
8017         upl_size_t              xfer_size, subupl_size;
8018         vm_object_t             shadow_object;
8019         vm_object_t             object;
8020         vm_object_offset_t      target_offset;
8021         upl_offset_t            subupl_offset = offset;
8022         int                     entry;
8023         wpl_array_t             lite_list;
8024         int                     occupied;
8025         struct  vm_page_delayed_work    dw_array;
8026         struct  vm_page_delayed_work    *dwp, *dwp_start;
8027         bool                    dwp_finish_ctx = TRUE;
8028         int                     dw_count;
8029         int                     dw_limit;
8030         int                     isVectorUPL = 0;
8031         upl_t                   vector_upl = NULL;
8032         vm_object_offset_t      obj_start, obj_end, obj_offset;
8033         kern_return_t           kr = KERN_SUCCESS;
8034
8035 //      DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8036
8037         dwp_start = dwp = NULL;
8038
8039         subupl_size = size;
8040         *empty = FALSE;
8041
8042         if (upl == UPL_NULL) {
8043                 return KERN_INVALID_ARGUMENT;
8044         }
8045
8046         if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8047                 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8048         }
8049
8050         dw_count = 0;
8051         dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8052         dwp_start = vm_page_delayed_work_get_ctx();
8053         if (dwp_start == NULL) {
8054                 dwp_start = &dw_array;
8055                 dw_limit = 1;
8056                 dwp_finish_ctx = FALSE;
8057         }
8058
8059         dwp = dwp_start;
8060
8061         if ((isVectorUPL = vector_upl_is_valid(upl))) {
8062                 vector_upl = upl;
8063                 upl_lock(vector_upl);
8064         } else {
8065                 upl_lock(upl);
8066         }
8067
8068 process_upl_to_abort:
8069         if (isVectorUPL) {
8070                 size = subupl_size;
8071                 offset = subupl_offset;
8072                 if (size == 0) {
8073                         upl_unlock(vector_upl);
8074                         kr = KERN_SUCCESS;
8075                         goto done;
8076                 }
8077                 upl =  vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8078                 if (upl == NULL) {
8079                         upl_unlock(vector_upl);
8080                         kr = KERN_FAILURE;
8081                         goto done;
8082                 }
8083                 subupl_size -= size;
8084                 subupl_offset += size;
8085         }
8086
8087         *empty = FALSE;
8088
8089 #if UPL_DEBUG
8090         if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8091                 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
8092
8093                 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8094                 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8095                 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8096
8097                 upl->upl_commit_index++;
8098         }
8099 #endif
8100         if (upl->flags & UPL_DEVICE_MEMORY) {
8101                 xfer_size = 0;
8102         } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8103                 xfer_size = size;
8104         } else {
8105                 if (!isVectorUPL) {
8106                         upl_unlock(upl);
8107                 } else {
8108                         upl_unlock(vector_upl);
8109                 }
8110                 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8111                 kr = KERN_FAILURE;
8112                 goto done;
8113         }
8114         if (upl->flags & UPL_INTERNAL) {
8115                 lite_list = (wpl_array_t)
8116                     ((((uintptr_t)upl) + sizeof(struct upl))
8117                     + ((upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE) * sizeof(upl_page_info_t)));
8118
8119                 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8120         } else {
8121                 lite_list = (wpl_array_t)
8122                     (((uintptr_t)upl) + sizeof(struct upl));
8123         }
8124         object = upl->map_object;
8125
8126         if (upl->flags & UPL_SHADOWED) {
8127                 vm_object_lock(object);
8128                 shadow_object = object->shadow;
8129         } else {
8130                 shadow_object = object;
8131         }
8132
8133         entry = offset / PAGE_SIZE;
8134         target_offset = (vm_object_offset_t)offset;
8135
8136         if (upl->flags & UPL_KERNEL_OBJECT) {
8137                 vm_object_lock_shared(shadow_object);
8138         } else {
8139                 vm_object_lock(shadow_object);
8140         }
8141
8142         if (upl->flags & UPL_ACCESS_BLOCKED) {
8143                 assert(shadow_object->blocked_access);
8144                 shadow_object->blocked_access = FALSE;
8145                 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8146         }
8147
8148         if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8149                 panic("upl_abort_range: kernel_object being DUMPED");
8150         }
8151
8152         obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8153         obj_end = obj_start + xfer_size;
8154         obj_start = vm_object_trunc_page(obj_start);
8155         obj_end = vm_object_round_page(obj_end);
8156         for (obj_offset = obj_start;
8157             obj_offset < obj_end;
8158             obj_offset += PAGE_SIZE) {
8159                 vm_page_t       t, m;
8160                 unsigned int    pg_num;
8161                 boolean_t       needed;
8162
8163                 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8164                 assert(pg_num == target_offset / PAGE_SIZE);
8165
8166                 needed = FALSE;
8167
8168                 if (user_page_list) {
8169                         needed = user_page_list[pg_num].needed;
8170                 }
8171
8172                 dwp->dw_mask = 0;
8173                 m = VM_PAGE_NULL;
8174
8175                 if (upl->flags & UPL_LITE) {
8176                         if (lite_list[pg_num >> 5] & (1U << (pg_num & 31))) {
8177                                 lite_list[pg_num >> 5] &= ~(1U << (pg_num & 31));
8178
8179                                 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8180                                         m = vm_page_lookup(shadow_object, obj_offset);
8181                                 }
8182                         }
8183                 }
8184                 if (upl->flags & UPL_SHADOWED) {
8185                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8186                                 t->vmp_free_when_done = FALSE;
8187
8188                                 VM_PAGE_FREE(t);
8189
8190                                 if (m == VM_PAGE_NULL) {
8191                                         m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8192                                 }
8193                         }
8194                 }
8195                 if ((upl->flags & UPL_KERNEL_OBJECT)) {
8196                         goto abort_next_page;
8197                 }
8198
8199                 if (m != VM_PAGE_NULL) {
8200                         assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8201
8202                         if (m->vmp_absent) {
8203                                 boolean_t must_free = TRUE;
8204
8205                                 /*
8206                                  * COPYOUT = FALSE case
8207                                  * check for error conditions which must
8208                                  * be passed back to the pages customer
8209                                  */
8210                                 if (error & UPL_ABORT_RESTART) {
8211                                         m->vmp_restart = TRUE;
8212                                         m->vmp_absent = FALSE;
8213                                         m->vmp_unusual = TRUE;
8214                                         must_free = FALSE;
8215                                 } else if (error & UPL_ABORT_UNAVAILABLE) {
8216                                         m->vmp_restart = FALSE;
8217                                         m->vmp_unusual = TRUE;
8218                                         must_free = FALSE;
8219                                 } else if (error & UPL_ABORT_ERROR) {
8220                                         m->vmp_restart = FALSE;
8221                                         m->vmp_absent = FALSE;
8222                                         m->vmp_error = TRUE;
8223                                         m->vmp_unusual = TRUE;
8224                                         must_free = FALSE;
8225                                 }
8226                                 if (m->vmp_clustered && needed == FALSE) {
8227                                         /*
8228                                          * This page was a part of a speculative
8229                                          * read-ahead initiated by the kernel
8230                                          * itself.  No one is expecting this
8231                                          * page and no one will clean up its
8232                                          * error state if it ever becomes valid
8233                                          * in the future.
8234                                          * We have to free it here.
8235                                          */
8236                                         must_free = TRUE;
8237                                 }
8238                                 m->vmp_cleaning = FALSE;
8239
8240                                 if (m->vmp_overwriting && !m->vmp_busy) {
8241                                         /*
8242                                          * this shouldn't happen since
8243                                          * this is an 'absent' page, but
8244                                          * it doesn't hurt to check for
8245                                          * the 'alternate' method of
8246                                          * stabilizing the page...
8247                                          * we will mark 'busy' to be cleared
8248                                          * in the following code which will
8249                                          * take care of the primary stabilization
8250                                          * method (i.e. setting 'busy' to TRUE)
8251                                          */
8252                                         dwp->dw_mask |= DW_vm_page_unwire;
8253                                 }
8254                                 m->vmp_overwriting = FALSE;
8255
8256                                 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8257
8258                                 if (must_free == TRUE) {
8259                                         dwp->dw_mask |= DW_vm_page_free;
8260                                 } else {
8261                                         dwp->dw_mask |= DW_vm_page_activate;
8262                                 }
8263                         } else {
8264                                 /*
8265                                  * Handle the trusted pager throttle.
8266                                  */
8267                                 if (m->vmp_laundry) {
8268                                         dwp->dw_mask |= DW_vm_pageout_throttle_up;
8269                                 }
8270
8271                                 if (upl->flags & UPL_ACCESS_BLOCKED) {
8272                                         /*
8273                                          * We blocked access to the pages in this UPL.
8274                                          * Clear the "busy" bit and wake up any waiter
8275                                          * for this page.
8276                                          */
8277                                         dwp->dw_mask |= DW_clear_busy;
8278                                 }
8279                                 if (m->vmp_overwriting) {
8280                                         if (m->vmp_busy) {
8281                                                 dwp->dw_mask |= DW_clear_busy;
8282                                         } else {
8283                                                 /*
8284                                                  * deal with the 'alternate' method
8285                                                  * of stabilizing the page...
8286                                                  * we will either free the page
8287                                                  * or mark 'busy' to be cleared
8288                                                  * in the following code which will
8289                                                  * take care of the primary stabilization
8290                                                  * method (i.e. setting 'busy' to TRUE)
8291                                                  */
8292                                                 dwp->dw_mask |= DW_vm_page_unwire;
8293                                         }
8294                                         m->vmp_overwriting = FALSE;
8295                                 }
8296                                 m->vmp_free_when_done = FALSE;
8297                                 m->vmp_cleaning = FALSE;
8298
8299                                 if (error & UPL_ABORT_DUMP_PAGES) {
8300                                         pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8301
8302                                         dwp->dw_mask |= DW_vm_page_free;
8303                                 } else {
8304                                         if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8305                                                 if (error & UPL_ABORT_REFERENCE) {
8306                                                         /*
8307                                                          * we've been told to explicitly
8308                                                          * reference this page... for
8309                                                          * file I/O, this is done by
8310                                                          * implementing an LRU on the inactive q
8311                                                          */
8312                                                         dwp->dw_mask |= DW_vm_page_lru;
8313                                                 } else if (!VM_PAGE_PAGEABLE(m)) {
8314                                                         dwp->dw_mask |= DW_vm_page_deactivate_internal;
8315                                                 }
8316                                         }
8317                                         dwp->dw_mask |= DW_PAGE_WAKEUP;
8318                                 }
8319                         }
8320                 }
8321 abort_next_page:
8322                 target_offset += PAGE_SIZE_64;
8323                 xfer_size -= PAGE_SIZE;
8324                 entry++;
8325
8326                 if (dwp->dw_mask) {
8327                         if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8328                                 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8329
8330                                 if (dw_count >= dw_limit) {
8331                                         vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8332
8333                                         dwp = dwp_start;
8334                                         dw_count = 0;
8335                                 }
8336                         } else {
8337                                 if (dwp->dw_mask & DW_clear_busy) {
8338                                         m->vmp_busy = FALSE;
8339                                 }
8340
8341                                 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8342                                         PAGE_WAKEUP(m);
8343                                 }
8344                         }
8345                 }
8346         }
8347         if (dw_count) {
8348                 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8349                 dwp = dwp_start;
8350                 dw_count = 0;
8351         }
8352
8353         occupied = 1;
8354
8355         if (upl->flags & UPL_DEVICE_MEMORY) {
8356                 occupied = 0;
8357         } else if (upl->flags & UPL_LITE) {
8358                 int     pg_num;
8359                 int     i;
8360
8361                 pg_num = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
8362                 pg_num = (pg_num + 31) >> 5;
8363                 occupied = 0;
8364
8365                 for (i = 0; i < pg_num; i++) {
8366                         if (lite_list[i] != 0) {
8367                                 occupied = 1;
8368                                 break;
8369                         }
8370                 }
8371         } else {
8372                 if (vm_page_queue_empty(&upl->map_object->memq)) {
8373                         occupied = 0;
8374                 }
8375         }
8376         if (occupied == 0) {
8377                 /*
8378                  * If this UPL element belongs to a Vector UPL and is
8379                  * empty, then this is the right function to deallocate
8380                  * it. So go ahead set the *empty variable. The flag
8381                  * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8382                  * should be considered relevant for the Vector UPL and
8383                  * not the internal UPLs.
8384                  */
8385                 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8386                         *empty = TRUE;
8387                 }
8388
8389                 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8390                         /*
8391                          * this is not a paging object
8392                          * so we need to drop the paging reference
8393                          * that was taken when we created the UPL
8394                          * against this object
8395                          */
8396                         vm_object_activity_end(shadow_object);
8397                         vm_object_collapse(shadow_object, 0, TRUE);
8398                 } else {
8399                         /*
8400                          * we donated the paging reference to
8401                          * the map object... vm_pageout_object_terminate
8402                          * will drop this reference
8403                          */
8404                 }
8405         }
8406         vm_object_unlock(shadow_object);
8407         if (object != shadow_object) {
8408                 vm_object_unlock(object);
8409         }
8410
8411         if (!isVectorUPL) {
8412                 upl_unlock(upl);
8413         } else {
8414                 /*
8415                  * If we completed our operations on an UPL that is
8416                  * part of a Vectored UPL and if empty is TRUE, then
8417                  * we should go ahead and deallocate this UPL element.
8418                  * Then we check if this was the last of the UPL elements
8419                  * within that Vectored UPL. If so, set empty to TRUE
8420                  * so that in ubc_upl_abort_range or ubc_upl_abort, we
8421                  * can go ahead and deallocate the Vector UPL too.
8422                  */
8423                 if (*empty == TRUE) {
8424                         *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8425                         upl_deallocate(upl);
8426                 }
8427                 goto process_upl_to_abort;
8428         }
8429
8430         kr = KERN_SUCCESS;
8431
8432 done:
8433         if (dwp_start && dwp_finish_ctx) {
8434                 vm_page_delayed_work_finish_ctx(dwp_start);
8435                 dwp_start = dwp = NULL;
8436         }
8437
8438         return kr;
8439 }
8440
8441
8442 kern_return_t
8443 upl_abort(
8444         upl_t   upl,
8445         int     error)
8446 {
8447         boolean_t       empty;
8448
8449         if (upl == UPL_NULL) {
8450                 return KERN_INVALID_ARGUMENT;
8451         }
8452
8453         return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8454 }
8455
8456
8457 /* an option on commit should be wire */
8458 kern_return_t
8459 upl_commit(
8460         upl_t                   upl,
8461         upl_page_info_t         *page_list,
8462         mach_msg_type_number_t  count)
8463 {
8464         boolean_t       empty;
8465
8466         if (upl == UPL_NULL) {
8467                 return KERN_INVALID_ARGUMENT;
8468         }
8469
8470         return upl_commit_range(upl, 0, upl->u_size, 0,
8471                    page_list, count, &empty);
8472 }
8473
8474
8475 void
8476 iopl_valid_data(
8477         upl_t    upl,
8478         vm_tag_t tag)
8479 {
8480         vm_object_t     object;
8481         vm_offset_t     offset;
8482         vm_page_t       m, nxt_page = VM_PAGE_NULL;
8483         upl_size_t      size;
8484         int             wired_count = 0;
8485
8486         if (upl == NULL) {
8487                 panic("iopl_valid_data: NULL upl");
8488         }
8489         if (vector_upl_is_valid(upl)) {
8490                 panic("iopl_valid_data: vector upl");
8491         }
8492         if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8493                 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8494         }
8495
8496         object = upl->map_object;
8497
8498         if (object == kernel_object || object == compressor_object) {
8499                 panic("iopl_valid_data: object == kernel or compressor");
8500         }
8501
8502         if (object->purgable == VM_PURGABLE_VOLATILE ||
8503             object->purgable == VM_PURGABLE_EMPTY) {
8504                 panic("iopl_valid_data: object %p purgable %d",
8505                     object, object->purgable);
8506         }
8507
8508         size = upl_adjusted_size(upl, PAGE_MASK);
8509
8510         vm_object_lock(object);
8511         VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8512
8513         if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8514                 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8515         } else {
8516                 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8517         }
8518
8519         while (size) {
8520                 if (nxt_page != VM_PAGE_NULL) {
8521                         m = nxt_page;
8522                         nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8523                 } else {
8524                         m = vm_page_lookup(object, offset);
8525                         offset += PAGE_SIZE;
8526
8527                         if (m == VM_PAGE_NULL) {
8528                                 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8529                         }
8530                 }
8531                 if (m->vmp_busy) {
8532                         if (!m->vmp_absent) {
8533                                 panic("iopl_valid_data: busy page w/o absent");
8534                         }
8535
8536                         if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8537                                 panic("iopl_valid_data: busy+absent page on page queue");
8538                         }
8539                         if (m->vmp_reusable) {
8540                                 panic("iopl_valid_data: %p is reusable", m);
8541                         }
8542
8543                         m->vmp_absent = FALSE;
8544                         m->vmp_dirty = TRUE;
8545                         assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8546                         assert(m->vmp_wire_count == 0);
8547                         m->vmp_wire_count++;
8548                         assert(m->vmp_wire_count);
8549                         if (m->vmp_wire_count == 1) {
8550                                 m->vmp_q_state = VM_PAGE_IS_WIRED;
8551                                 wired_count++;
8552                         } else {
8553                                 panic("iopl_valid_data: %p already wired\n", m);
8554                         }
8555
8556                         PAGE_WAKEUP_DONE(m);
8557                 }
8558                 size -= PAGE_SIZE;
8559         }
8560         if (wired_count) {
8561                 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8562                 assert(object->resident_page_count >= object->wired_page_count);
8563
8564                 /* no need to adjust purgeable accounting for this object: */
8565                 assert(object->purgable != VM_PURGABLE_VOLATILE);
8566                 assert(object->purgable != VM_PURGABLE_EMPTY);
8567
8568                 vm_page_lockspin_queues();
8569                 vm_page_wire_count += wired_count;
8570                 vm_page_unlock_queues();
8571         }
8572         VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8573         vm_object_unlock(object);
8574 }
8575
8576
8577 void
8578 vm_object_set_pmap_cache_attr(
8579         vm_object_t             object,
8580         upl_page_info_array_t   user_page_list,
8581         unsigned int            num_pages,
8582         boolean_t               batch_pmap_op)
8583 {
8584         unsigned int    cache_attr = 0;
8585
8586         cache_attr = object->wimg_bits & VM_WIMG_MASK;
8587         assert(user_page_list);
8588         if (cache_attr != VM_WIMG_USE_DEFAULT) {
8589                 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8590         }
8591 }
8592
8593
8594 boolean_t       vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
8595 kern_return_t   vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
8596
8597
8598
8599 boolean_t
8600 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8601     wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
8602 {
8603         vm_page_t       dst_page;
8604         unsigned int    entry;
8605         int             page_count;
8606         int             delayed_unlock = 0;
8607         boolean_t       retval = TRUE;
8608         ppnum_t         phys_page;
8609
8610         vm_object_lock_assert_exclusive(object);
8611         assert(object->purgable != VM_PURGABLE_VOLATILE);
8612         assert(object->purgable != VM_PURGABLE_EMPTY);
8613         assert(object->pager == NULL);
8614         assert(object->copy == NULL);
8615         assert(object->shadow == NULL);
8616
8617         page_count = object->resident_page_count;
8618         dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8619
8620         vm_page_lock_queues();
8621
8622         while (page_count--) {
8623                 if (dst_page->vmp_busy ||
8624                     dst_page->vmp_fictitious ||
8625                     dst_page->vmp_absent ||
8626                     dst_page->vmp_error ||
8627                     dst_page->vmp_cleaning ||
8628                     dst_page->vmp_restart ||
8629                     dst_page->vmp_laundry) {
8630                         retval = FALSE;
8631                         goto done;
8632                 }
8633                 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8634                         retval = FALSE;
8635                         goto done;
8636                 }
8637                 dst_page->vmp_reference = TRUE;
8638
8639                 vm_page_wire(dst_page, tag, FALSE);
8640
8641                 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8642                         SET_PAGE_DIRTY(dst_page, FALSE);
8643                 }
8644                 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8645                 assert(entry >= 0 && entry < object->resident_page_count);
8646                 lite_list[entry >> 5] |= 1U << (entry & 31);
8647
8648                 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8649
8650                 if (phys_page > upl->highest_page) {
8651                         upl->highest_page = phys_page;
8652                 }
8653
8654                 if (user_page_list) {
8655                         user_page_list[entry].phys_addr = phys_page;
8656                         user_page_list[entry].absent    = dst_page->vmp_absent;
8657                         user_page_list[entry].dirty     = dst_page->vmp_dirty;
8658                         user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
8659                         user_page_list[entry].precious  = dst_page->vmp_precious;
8660                         user_page_list[entry].device    = FALSE;
8661                         user_page_list[entry].speculative = FALSE;
8662                         user_page_list[entry].cs_validated = FALSE;
8663                         user_page_list[entry].cs_tainted = FALSE;
8664                         user_page_list[entry].cs_nx     = FALSE;
8665                         user_page_list[entry].needed    = FALSE;
8666                         user_page_list[entry].mark      = FALSE;
8667                 }
8668                 if (delayed_unlock++ > 256) {
8669                         delayed_unlock = 0;
8670                         lck_mtx_yield(&vm_page_queue_lock);
8671
8672                         VM_CHECK_MEMORYSTATUS;
8673                 }
8674                 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8675         }
8676 done:
8677         vm_page_unlock_queues();
8678
8679         VM_CHECK_MEMORYSTATUS;
8680
8681         return retval;
8682 }
8683
8684
8685 kern_return_t
8686 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8687     wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
8688     int page_count, int* page_grab_count)
8689 {
8690         vm_page_t       dst_page;
8691         boolean_t       no_zero_fill = FALSE;
8692         int             interruptible;
8693         int             pages_wired = 0;
8694         int             pages_inserted = 0;
8695         int             entry = 0;
8696         uint64_t        delayed_ledger_update = 0;
8697         kern_return_t   ret = KERN_SUCCESS;
8698         int             grab_options;
8699         ppnum_t         phys_page;
8700
8701         vm_object_lock_assert_exclusive(object);
8702         assert(object->purgable != VM_PURGABLE_VOLATILE);
8703         assert(object->purgable != VM_PURGABLE_EMPTY);
8704         assert(object->pager == NULL);
8705         assert(object->copy == NULL);
8706         assert(object->shadow == NULL);
8707
8708         if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
8709                 interruptible = THREAD_ABORTSAFE;
8710         } else {
8711                 interruptible = THREAD_UNINT;
8712         }
8713
8714         if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8715                 no_zero_fill = TRUE;
8716         }
8717
8718         grab_options = 0;
8719 #if CONFIG_SECLUDED_MEMORY
8720         if (object->can_grab_secluded) {
8721                 grab_options |= VM_PAGE_GRAB_SECLUDED;
8722         }
8723 #endif /* CONFIG_SECLUDED_MEMORY */
8724
8725         while (page_count--) {
8726                 while ((dst_page = vm_page_grab_options(grab_options))
8727                     == VM_PAGE_NULL) {
8728                         OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8729
8730                         VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8731
8732                         if (vm_page_wait(interruptible) == FALSE) {
8733                                 /*
8734                                  * interrupted case
8735                                  */
8736                                 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8737
8738                                 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8739
8740                                 ret = MACH_SEND_INTERRUPTED;
8741                                 goto done;
8742                         }
8743                         OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8744
8745                         VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8746                 }
8747                 if (no_zero_fill == FALSE) {
8748                         vm_page_zero_fill(dst_page);
8749                 } else {
8750                         dst_page->vmp_absent = TRUE;
8751                 }
8752
8753                 dst_page->vmp_reference = TRUE;
8754
8755                 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8756                         SET_PAGE_DIRTY(dst_page, FALSE);
8757                 }
8758                 if (dst_page->vmp_absent == FALSE) {
8759                         assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
8760                         assert(dst_page->vmp_wire_count == 0);
8761                         dst_page->vmp_wire_count++;
8762                         dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
8763                         assert(dst_page->vmp_wire_count);
8764                         pages_wired++;
8765                         PAGE_WAKEUP_DONE(dst_page);
8766                 }
8767                 pages_inserted++;
8768
8769                 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8770
8771                 lite_list[entry >> 5] |= 1U << (entry & 31);
8772
8773                 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8774
8775                 if (phys_page > upl->highest_page) {
8776                         upl->highest_page = phys_page;
8777                 }
8778
8779                 if (user_page_list) {
8780                         user_page_list[entry].phys_addr = phys_page;
8781                         user_page_list[entry].absent    = dst_page->vmp_absent;
8782                         user_page_list[entry].dirty     = dst_page->vmp_dirty;
8783                         user_page_list[entry].free_when_done    = FALSE;
8784                         user_page_list[entry].precious  = FALSE;
8785                         user_page_list[entry].device    = FALSE;
8786                         user_page_list[entry].speculative = FALSE;
8787                         user_page_list[entry].cs_validated = FALSE;
8788                         user_page_list[entry].cs_tainted = FALSE;
8789                         user_page_list[entry].cs_nx     = FALSE;
8790                         user_page_list[entry].needed    = FALSE;
8791                         user_page_list[entry].mark      = FALSE;
8792                 }
8793                 entry++;
8794                 *dst_offset += PAGE_SIZE_64;
8795         }
8796 done:
8797         if (pages_wired) {
8798                 vm_page_lockspin_queues();
8799                 vm_page_wire_count += pages_wired;
8800                 vm_page_unlock_queues();
8801         }
8802         if (pages_inserted) {
8803                 if (object->internal) {
8804                         OSAddAtomic(pages_inserted, &vm_page_internal_count);
8805                 } else {
8806                         OSAddAtomic(pages_inserted, &vm_page_external_count);
8807                 }
8808         }
8809         if (delayed_ledger_update) {
8810                 task_t          owner;
8811                 int             ledger_idx_volatile;
8812                 int             ledger_idx_nonvolatile;
8813                 int             ledger_idx_volatile_compressed;
8814                 int             ledger_idx_nonvolatile_compressed;
8815                 boolean_t       do_footprint;
8816
8817                 owner = VM_OBJECT_OWNER(object);
8818                 assert(owner);
8819
8820                 vm_object_ledger_tag_ledgers(object,
8821                     &ledger_idx_volatile,
8822                     &ledger_idx_nonvolatile,
8823                     &ledger_idx_volatile_compressed,
8824                     &ledger_idx_nonvolatile_compressed,
8825                     &do_footprint);
8826
8827                 /* more non-volatile bytes */
8828                 ledger_credit(owner->ledger,
8829                     ledger_idx_nonvolatile,
8830                     delayed_ledger_update);
8831                 if (do_footprint) {
8832                         /* more footprint */
8833                         ledger_credit(owner->ledger,
8834                             task_ledgers.phys_footprint,
8835                             delayed_ledger_update);
8836                 }
8837         }
8838
8839         assert(page_grab_count);
8840         *page_grab_count = pages_inserted;
8841
8842         return ret;
8843 }
8844
8845
8846
8847 kern_return_t
8848 vm_object_iopl_request(
8849         vm_object_t             object,
8850         vm_object_offset_t      offset,
8851         upl_size_t              size,
8852         upl_t                   *upl_ptr,
8853         upl_page_info_array_t   user_page_list,
8854         unsigned int            *page_list_count,
8855         upl_control_flags_t     cntrl_flags,
8856         vm_tag_t                tag)
8857 {
8858         vm_page_t               dst_page;
8859         vm_object_offset_t      dst_offset;
8860         upl_size_t              xfer_size;
8861         upl_t                   upl = NULL;
8862         unsigned int            entry;
8863         wpl_array_t             lite_list = NULL;
8864         int                     no_zero_fill = FALSE;
8865         unsigned int            size_in_pages;
8866         int                     page_grab_count = 0;
8867         u_int32_t               psize;
8868         kern_return_t           ret;
8869         vm_prot_t               prot;
8870         struct vm_object_fault_info fault_info = {};
8871         struct  vm_page_delayed_work    dw_array;
8872         struct  vm_page_delayed_work    *dwp, *dwp_start;
8873         bool                    dwp_finish_ctx = TRUE;
8874         int                     dw_count;
8875         int                     dw_limit;
8876         int                     dw_index;
8877         boolean_t               caller_lookup;
8878         int                     io_tracking_flag = 0;
8879         int                     interruptible;
8880         ppnum_t                 phys_page;
8881
8882         boolean_t               set_cache_attr_needed = FALSE;
8883         boolean_t               free_wired_pages = FALSE;
8884         boolean_t               fast_path_empty_req = FALSE;
8885         boolean_t               fast_path_full_req = FALSE;
8886
8887 #if DEVELOPMENT || DEBUG
8888         task_t                  task = current_task();
8889 #endif /* DEVELOPMENT || DEBUG */
8890
8891         dwp_start = dwp = NULL;
8892
8893         vm_object_offset_t original_offset = offset;
8894         upl_size_t original_size = size;
8895
8896 //      DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
8897
8898         size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
8899         offset = vm_object_trunc_page(offset);
8900         if (size != original_size || offset != original_offset) {
8901                 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
8902         }
8903
8904         if (cntrl_flags & ~UPL_VALID_FLAGS) {
8905                 /*
8906                  * For forward compatibility's sake,
8907                  * reject any unknown flag.
8908                  */
8909                 return KERN_INVALID_VALUE;
8910         }
8911         if (vm_lopage_needed == FALSE) {
8912                 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8913         }
8914
8915         if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8916                 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
8917                         return KERN_INVALID_VALUE;
8918                 }
8919
8920                 if (object->phys_contiguous) {
8921                         if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
8922                                 return KERN_INVALID_ADDRESS;
8923                         }
8924
8925                         if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
8926                                 return KERN_INVALID_ADDRESS;
8927                         }
8928                 }
8929         }
8930         if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
8931                 no_zero_fill = TRUE;
8932         }
8933
8934         if (cntrl_flags & UPL_COPYOUT_FROM) {
8935                 prot = VM_PROT_READ;
8936         } else {
8937                 prot = VM_PROT_READ | VM_PROT_WRITE;
8938         }
8939
8940         if ((!object->internal) && (object->paging_offset != 0)) {
8941                 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
8942         }
8943
8944
8945         VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
8946
8947 #if CONFIG_IOSCHED || UPL_DEBUG
8948         if ((object->io_tracking && object != kernel_object) || upl_debug_enabled) {
8949                 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8950         }
8951 #endif
8952
8953 #if CONFIG_IOSCHED
8954         if (object->io_tracking) {
8955                 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8956                 if (object != kernel_object) {
8957                         io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8958                 }
8959         }
8960 #endif
8961
8962         if (object->phys_contiguous) {
8963                 psize = PAGE_SIZE;
8964         } else {
8965                 psize = size;
8966
8967                 dw_count = 0;
8968                 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8969                 dwp_start = vm_page_delayed_work_get_ctx();
8970                 if (dwp_start == NULL) {
8971                         dwp_start = &dw_array;
8972                         dw_limit = 1;
8973                         dwp_finish_ctx = FALSE;
8974                 }
8975
8976                 dwp = dwp_start;
8977         }
8978
8979         if (cntrl_flags & UPL_SET_INTERNAL) {
8980                 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8981
8982                 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8983                 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
8984                     ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
8985                 if (size == 0) {
8986                         user_page_list = NULL;
8987                         lite_list = NULL;
8988                 }
8989         } else {
8990                 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8991
8992                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
8993                 if (size == 0) {
8994                         lite_list = NULL;
8995                 }
8996         }
8997         if (user_page_list) {
8998                 user_page_list[0].device = FALSE;
8999         }
9000         *upl_ptr = upl;
9001
9002         if (cntrl_flags & UPL_NOZEROFILLIO) {
9003                 DTRACE_VM4(upl_nozerofillio,
9004                     vm_object_t, object,
9005                     vm_object_offset_t, offset,
9006                     upl_size_t, size,
9007                     upl_t, upl);
9008         }
9009
9010         upl->map_object = object;
9011         upl->u_offset = original_offset;
9012         upl->u_size = original_size;
9013
9014         size_in_pages = size / PAGE_SIZE;
9015
9016         if (object == kernel_object &&
9017             !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9018                 upl->flags |= UPL_KERNEL_OBJECT;
9019 #if UPL_DEBUG
9020                 vm_object_lock(object);
9021 #else
9022                 vm_object_lock_shared(object);
9023 #endif
9024         } else {
9025                 vm_object_lock(object);
9026                 vm_object_activity_begin(object);
9027         }
9028         /*
9029          * paging in progress also protects the paging_offset
9030          */
9031         upl->u_offset = original_offset + object->paging_offset;
9032
9033         if (cntrl_flags & UPL_BLOCK_ACCESS) {
9034                 /*
9035                  * The user requested that access to the pages in this UPL
9036                  * be blocked until the UPL is committed or aborted.
9037                  */
9038                 upl->flags |= UPL_ACCESS_BLOCKED;
9039         }
9040
9041 #if CONFIG_IOSCHED || UPL_DEBUG
9042         if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9043                 vm_object_activity_begin(object);
9044                 queue_enter(&object->uplq, upl, upl_t, uplq);
9045         }
9046 #endif
9047
9048         if (object->phys_contiguous) {
9049                 if (upl->flags & UPL_ACCESS_BLOCKED) {
9050                         assert(!object->blocked_access);
9051                         object->blocked_access = TRUE;
9052                 }
9053
9054                 vm_object_unlock(object);
9055
9056                 /*
9057                  * don't need any shadow mappings for this one
9058                  * since it is already I/O memory
9059                  */
9060                 upl->flags |= UPL_DEVICE_MEMORY;
9061
9062                 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9063
9064                 if (user_page_list) {
9065                         user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9066                         user_page_list[0].device = TRUE;
9067                 }
9068                 if (page_list_count != NULL) {
9069                         if (upl->flags & UPL_INTERNAL) {
9070                                 *page_list_count = 0;
9071                         } else {
9072                                 *page_list_count = 1;
9073                         }
9074                 }
9075
9076                 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9077 #if DEVELOPMENT || DEBUG
9078                 if (task != NULL) {
9079                         ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9080                 }
9081 #endif /* DEVELOPMENT || DEBUG */
9082                 return KERN_SUCCESS;
9083         }
9084         if (object != kernel_object && object != compressor_object) {
9085                 /*
9086                  * Protect user space from future COW operations
9087                  */
9088 #if VM_OBJECT_TRACKING_OP_TRUESHARE
9089                 if (!object->true_share &&
9090                     vm_object_tracking_inited) {
9091                         void *bt[VM_OBJECT_TRACKING_BTDEPTH];
9092                         int num = 0;
9093
9094                         num = OSBacktrace(bt,
9095                             VM_OBJECT_TRACKING_BTDEPTH);
9096                         btlog_add_entry(vm_object_tracking_btlog,
9097                             object,
9098                             VM_OBJECT_TRACKING_OP_TRUESHARE,
9099                             bt,
9100                             num);
9101                 }
9102 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9103
9104                 vm_object_lock_assert_exclusive(object);
9105                 object->true_share = TRUE;
9106
9107                 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9108                         object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9109                 }
9110         }
9111
9112         if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9113             object->copy != VM_OBJECT_NULL) {
9114                 /*
9115                  * Honor copy-on-write obligations
9116                  *
9117                  * The caller is gathering these pages and
9118                  * might modify their contents.  We need to
9119                  * make sure that the copy object has its own
9120                  * private copies of these pages before we let
9121                  * the caller modify them.
9122                  *
9123                  * NOTE: someone else could map the original object
9124                  * after we've done this copy-on-write here, and they
9125                  * could then see an inconsistent picture of the memory
9126                  * while it's being modified via the UPL.  To prevent this,
9127                  * we would have to block access to these pages until the
9128                  * UPL is released.  We could use the UPL_BLOCK_ACCESS
9129                  * code path for that...
9130                  */
9131                 vm_object_update(object,
9132                     offset,
9133                     size,
9134                     NULL,
9135                     NULL,
9136                     FALSE,              /* should_return */
9137                     MEMORY_OBJECT_COPY_SYNC,
9138                     VM_PROT_NO_CHANGE);
9139                 VM_PAGEOUT_DEBUG(iopl_cow, 1);
9140                 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9141         }
9142         if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9143             object->purgable != VM_PURGABLE_VOLATILE &&
9144             object->purgable != VM_PURGABLE_EMPTY &&
9145             object->copy == NULL &&
9146             size == object->vo_size &&
9147             offset == 0 &&
9148             object->shadow == NULL &&
9149             object->pager == NULL) {
9150                 if (object->resident_page_count == size_in_pages) {
9151                         assert(object != compressor_object);
9152                         assert(object != kernel_object);
9153                         fast_path_full_req = TRUE;
9154                 } else if (object->resident_page_count == 0) {
9155                         assert(object != compressor_object);
9156                         assert(object != kernel_object);
9157                         fast_path_empty_req = TRUE;
9158                         set_cache_attr_needed = TRUE;
9159                 }
9160         }
9161
9162         if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9163                 interruptible = THREAD_ABORTSAFE;
9164         } else {
9165                 interruptible = THREAD_UNINT;
9166         }
9167
9168         entry = 0;
9169
9170         xfer_size = size;
9171         dst_offset = offset;
9172
9173         if (fast_path_full_req) {
9174                 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE) {
9175                         goto finish;
9176                 }
9177                 /*
9178                  * we couldn't complete the processing of this request on the fast path
9179                  * so fall through to the slow path and finish up
9180                  */
9181         } else if (fast_path_empty_req) {
9182                 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9183                         ret = KERN_MEMORY_ERROR;
9184                         goto return_err;
9185                 }
9186                 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9187
9188                 if (ret) {
9189                         free_wired_pages = TRUE;
9190                         goto return_err;
9191                 }
9192                 goto finish;
9193         }
9194
9195         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9196         fault_info.lo_offset = offset;
9197         fault_info.hi_offset = offset + xfer_size;
9198         fault_info.mark_zf_absent = TRUE;
9199         fault_info.interruptible = interruptible;
9200         fault_info.batch_pmap_op = TRUE;
9201
9202         while (xfer_size) {
9203                 vm_fault_return_t       result;
9204
9205                 dwp->dw_mask = 0;
9206
9207                 if (fast_path_full_req) {
9208                         /*
9209                          * if we get here, it means that we ran into a page
9210                          * state we couldn't handle in the fast path and
9211                          * bailed out to the slow path... since the order
9212                          * we look at pages is different between the 2 paths,
9213                          * the following check is needed to determine whether
9214                          * this page was already processed in the fast path
9215                          */
9216                         if (lite_list[entry >> 5] & (1 << (entry & 31))) {
9217                                 goto skip_page;
9218                         }
9219                 }
9220                 dst_page = vm_page_lookup(object, dst_offset);
9221
9222                 if (dst_page == VM_PAGE_NULL ||
9223                     dst_page->vmp_busy ||
9224                     dst_page->vmp_error ||
9225                     dst_page->vmp_restart ||
9226                     dst_page->vmp_absent ||
9227                     dst_page->vmp_fictitious) {
9228                         if (object == kernel_object) {
9229                                 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
9230                         }
9231                         if (object == compressor_object) {
9232                                 panic("vm_object_iopl_request: missing/bad page in compressor object\n");
9233                         }
9234
9235                         if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9236                                 ret = KERN_MEMORY_ERROR;
9237                                 goto return_err;
9238                         }
9239                         set_cache_attr_needed = TRUE;
9240
9241                         /*
9242                          * We just looked up the page and the result remains valid
9243                          * until the object lock is released, so send it to
9244                          * vm_fault_page() (as "dst_page"), to avoid having to
9245                          * look it up again there.
9246                          */
9247                         caller_lookup = TRUE;
9248
9249                         do {
9250                                 vm_page_t       top_page;
9251                                 kern_return_t   error_code;
9252
9253                                 fault_info.cluster_size = xfer_size;
9254
9255                                 vm_object_paging_begin(object);
9256
9257                                 result = vm_fault_page(object, dst_offset,
9258                                     prot | VM_PROT_WRITE, FALSE,
9259                                     caller_lookup,
9260                                     &prot, &dst_page, &top_page,
9261                                     (int *)0,
9262                                     &error_code, no_zero_fill,
9263                                     FALSE, &fault_info);
9264
9265                                 /* our lookup is no longer valid at this point */
9266                                 caller_lookup = FALSE;
9267
9268                                 switch (result) {
9269                                 case VM_FAULT_SUCCESS:
9270                                         page_grab_count++;
9271
9272                                         if (!dst_page->vmp_absent) {
9273                                                 PAGE_WAKEUP_DONE(dst_page);
9274                                         } else {
9275                                                 /*
9276                                                  * we only get back an absent page if we
9277                                                  * requested that it not be zero-filled
9278                                                  * because we are about to fill it via I/O
9279                                                  *
9280                                                  * absent pages should be left BUSY
9281                                                  * to prevent them from being faulted
9282                                                  * into an address space before we've
9283                                                  * had a chance to complete the I/O on
9284                                                  * them since they may contain info that
9285                                                  * shouldn't be seen by the faulting task
9286                                                  */
9287                                         }
9288                                         /*
9289                                          *      Release paging references and
9290                                          *      top-level placeholder page, if any.
9291                                          */
9292                                         if (top_page != VM_PAGE_NULL) {
9293                                                 vm_object_t local_object;
9294
9295                                                 local_object = VM_PAGE_OBJECT(top_page);
9296
9297                                                 /*
9298                                                  * comparing 2 packed pointers
9299                                                  */
9300                                                 if (top_page->vmp_object != dst_page->vmp_object) {
9301                                                         vm_object_lock(local_object);
9302                                                         VM_PAGE_FREE(top_page);
9303                                                         vm_object_paging_end(local_object);
9304                                                         vm_object_unlock(local_object);
9305                                                 } else {
9306                                                         VM_PAGE_FREE(top_page);
9307                                                         vm_object_paging_end(local_object);
9308                                                 }
9309                                         }
9310                                         vm_object_paging_end(object);
9311                                         break;
9312
9313                                 case VM_FAULT_RETRY:
9314                                         vm_object_lock(object);
9315                                         break;
9316
9317                                 case VM_FAULT_MEMORY_SHORTAGE:
9318                                         OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9319
9320                                         VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9321
9322                                         if (vm_page_wait(interruptible)) {
9323                                                 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9324
9325                                                 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9326                                                 vm_object_lock(object);
9327
9328                                                 break;
9329                                         }
9330                                         OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9331
9332                                         VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9333
9334                                         OS_FALLTHROUGH;
9335
9336                                 case VM_FAULT_INTERRUPTED:
9337                                         error_code = MACH_SEND_INTERRUPTED;
9338                                         OS_FALLTHROUGH;
9339                                 case VM_FAULT_MEMORY_ERROR:
9340 memory_error:
9341                                         ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9342
9343                                         vm_object_lock(object);
9344                                         goto return_err;
9345
9346                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9347                                         /* success but no page: fail */
9348                                         vm_object_paging_end(object);
9349                                         vm_object_unlock(object);
9350                                         goto memory_error;
9351
9352                                 default:
9353                                         panic("vm_object_iopl_request: unexpected error"
9354                                             " 0x%x from vm_fault_page()\n", result);
9355                                 }
9356                         } while (result != VM_FAULT_SUCCESS);
9357                 }
9358                 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9359
9360                 if (upl->flags & UPL_KERNEL_OBJECT) {
9361                         goto record_phys_addr;
9362                 }
9363
9364                 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9365                         dst_page->vmp_busy = TRUE;
9366                         goto record_phys_addr;
9367                 }
9368
9369                 if (dst_page->vmp_cleaning) {
9370                         /*
9371                          * Someone else is cleaning this page in place.
9372                          * In theory, we should be able to  proceed and use this
9373                          * page but they'll probably end up clearing the "busy"
9374                          * bit on it in upl_commit_range() but they didn't set
9375                          * it, so they would clear our "busy" bit and open
9376                          * us to race conditions.
9377                          * We'd better wait for the cleaning to complete and
9378                          * then try again.
9379                          */
9380                         VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9381                         PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9382                         continue;
9383                 }
9384                 if (dst_page->vmp_laundry) {
9385                         vm_pageout_steal_laundry(dst_page, FALSE);
9386                 }
9387
9388                 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9389                     phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9390                         vm_page_t       low_page;
9391                         int             refmod;
9392
9393                         /*
9394                          * support devices that can't DMA above 32 bits
9395                          * by substituting pages from a pool of low address
9396                          * memory for any pages we find above the 4G mark
9397                          * can't substitute if the page is already wired because
9398                          * we don't know whether that physical address has been
9399                          * handed out to some other 64 bit capable DMA device to use
9400                          */
9401                         if (VM_PAGE_WIRED(dst_page)) {
9402                                 ret = KERN_PROTECTION_FAILURE;
9403                                 goto return_err;
9404                         }
9405                         low_page = vm_page_grablo();
9406
9407                         if (low_page == VM_PAGE_NULL) {
9408                                 ret = KERN_RESOURCE_SHORTAGE;
9409                                 goto return_err;
9410                         }
9411                         /*
9412                          * from here until the vm_page_replace completes
9413                          * we mustn't drop the object lock... we don't
9414                          * want anyone refaulting this page in and using
9415                          * it after we disconnect it... we want the fault
9416                          * to find the new page being substituted.
9417                          */
9418                         if (dst_page->vmp_pmapped) {
9419                                 refmod = pmap_disconnect(phys_page);
9420                         } else {
9421                                 refmod = 0;
9422                         }
9423
9424                         if (!dst_page->vmp_absent) {
9425                                 vm_page_copy(dst_page, low_page);
9426                         }
9427
9428                         low_page->vmp_reference = dst_page->vmp_reference;
9429                         low_page->vmp_dirty     = dst_page->vmp_dirty;
9430                         low_page->vmp_absent    = dst_page->vmp_absent;
9431
9432                         if (refmod & VM_MEM_REFERENCED) {
9433                                 low_page->vmp_reference = TRUE;
9434                         }
9435                         if (refmod & VM_MEM_MODIFIED) {
9436                                 SET_PAGE_DIRTY(low_page, FALSE);
9437                         }
9438
9439                         vm_page_replace(low_page, object, dst_offset);
9440
9441                         dst_page = low_page;
9442                         /*
9443                          * vm_page_grablo returned the page marked
9444                          * BUSY... we don't need a PAGE_WAKEUP_DONE
9445                          * here, because we've never dropped the object lock
9446                          */
9447                         if (!dst_page->vmp_absent) {
9448                                 dst_page->vmp_busy = FALSE;
9449                         }
9450
9451                         phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9452                 }
9453                 if (!dst_page->vmp_busy) {
9454                         dwp->dw_mask |= DW_vm_page_wire;
9455                 }
9456
9457                 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9458                         /*
9459                          * Mark the page "busy" to block any future page fault
9460                          * on this page in addition to wiring it.
9461                          * We'll also remove the mapping
9462                          * of all these pages before leaving this routine.
9463                          */
9464                         assert(!dst_page->vmp_fictitious);
9465                         dst_page->vmp_busy = TRUE;
9466                 }
9467                 /*
9468                  * expect the page to be used
9469                  * page queues lock must be held to set 'reference'
9470                  */
9471                 dwp->dw_mask |= DW_set_reference;
9472
9473                 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9474                         SET_PAGE_DIRTY(dst_page, TRUE);
9475                         /*
9476                          * Page belonging to a code-signed object is about to
9477                          * be written. Mark it tainted and disconnect it from
9478                          * all pmaps so processes have to fault it back in and
9479                          * deal with the tainted bit.
9480                          */
9481                         if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9482                                 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9483                                 vm_page_iopl_tainted++;
9484                                 if (dst_page->vmp_pmapped) {
9485                                         int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9486                                         if (refmod & VM_MEM_REFERENCED) {
9487                                                 dst_page->vmp_reference = TRUE;
9488                                         }
9489                                 }
9490                         }
9491                 }
9492                 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9493                         pmap_sync_page_attributes_phys(phys_page);
9494                         dst_page->vmp_written_by_kernel = FALSE;
9495                 }
9496
9497 record_phys_addr:
9498                 if (dst_page->vmp_busy) {
9499                         upl->flags |= UPL_HAS_BUSY;
9500                 }
9501
9502                 lite_list[entry >> 5] |= 1U << (entry & 31);
9503
9504                 if (phys_page > upl->highest_page) {
9505                         upl->highest_page = phys_page;
9506                 }
9507
9508                 if (user_page_list) {
9509                         user_page_list[entry].phys_addr = phys_page;
9510                         user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
9511                         user_page_list[entry].absent    = dst_page->vmp_absent;
9512                         user_page_list[entry].dirty     = dst_page->vmp_dirty;
9513                         user_page_list[entry].precious  = dst_page->vmp_precious;
9514                         user_page_list[entry].device    = FALSE;
9515                         user_page_list[entry].needed    = FALSE;
9516                         if (dst_page->vmp_clustered == TRUE) {
9517                                 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9518                         } else {
9519                                 user_page_list[entry].speculative = FALSE;
9520                         }
9521                         user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9522                         user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9523                         user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9524                         user_page_list[entry].mark      = FALSE;
9525                 }
9526                 if (object != kernel_object && object != compressor_object) {
9527                         /*
9528                          * someone is explicitly grabbing this page...
9529                          * update clustered and speculative state
9530                          *
9531                          */
9532                         if (dst_page->vmp_clustered) {
9533                                 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9534                         }
9535                 }
9536 skip_page:
9537                 entry++;
9538                 dst_offset += PAGE_SIZE_64;
9539                 xfer_size -= PAGE_SIZE;
9540
9541                 if (dwp->dw_mask) {
9542                         VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9543
9544                         if (dw_count >= dw_limit) {
9545                                 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9546
9547                                 dwp = dwp_start;
9548                                 dw_count = 0;
9549                         }
9550                 }
9551         }
9552         assert(entry == size_in_pages);
9553
9554         if (dw_count) {
9555                 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9556                 dwp = dwp_start;
9557                 dw_count = 0;
9558         }
9559 finish:
9560         if (user_page_list && set_cache_attr_needed == TRUE) {
9561                 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9562         }
9563
9564         if (page_list_count != NULL) {
9565                 if (upl->flags & UPL_INTERNAL) {
9566                         *page_list_count = 0;
9567                 } else if (*page_list_count > size_in_pages) {
9568                         *page_list_count = size_in_pages;
9569                 }
9570         }
9571         vm_object_unlock(object);
9572
9573         if (cntrl_flags & UPL_BLOCK_ACCESS) {
9574                 /*
9575                  * We've marked all the pages "busy" so that future
9576                  * page faults will block.
9577                  * Now remove the mapping for these pages, so that they
9578                  * can't be accessed without causing a page fault.
9579                  */
9580                 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9581                     PMAP_NULL,
9582                     PAGE_SIZE,
9583                     0, VM_PROT_NONE);
9584                 assert(!object->blocked_access);
9585                 object->blocked_access = TRUE;
9586         }
9587
9588         VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9589 #if DEVELOPMENT || DEBUG
9590         if (task != NULL) {
9591                 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9592         }
9593 #endif /* DEVELOPMENT || DEBUG */
9594
9595         if (dwp_start && dwp_finish_ctx) {
9596                 vm_page_delayed_work_finish_ctx(dwp_start);
9597                 dwp_start = dwp = NULL;
9598         }
9599
9600         return KERN_SUCCESS;
9601
9602 return_err:
9603         dw_index = 0;
9604
9605         for (; offset < dst_offset; offset += PAGE_SIZE) {
9606                 boolean_t need_unwire;
9607
9608                 dst_page = vm_page_lookup(object, offset);
9609
9610                 if (dst_page == VM_PAGE_NULL) {
9611                         panic("vm_object_iopl_request: Wired page missing. \n");
9612                 }
9613
9614                 /*
9615                  * if we've already processed this page in an earlier
9616                  * dw_do_work, we need to undo the wiring... we will
9617                  * leave the dirty and reference bits on if they
9618                  * were set, since we don't have a good way of knowing
9619                  * what the previous state was and we won't get here
9620                  * under any normal circumstances...  we will always
9621                  * clear BUSY and wakeup any waiters via vm_page_free
9622                  * or PAGE_WAKEUP_DONE
9623                  */
9624                 need_unwire = TRUE;
9625
9626                 if (dw_count) {
9627                         if ((dwp_start)[dw_index].dw_m == dst_page) {
9628                                 /*
9629                                  * still in the deferred work list
9630                                  * which means we haven't yet called
9631                                  * vm_page_wire on this page
9632                                  */
9633                                 need_unwire = FALSE;
9634
9635                                 dw_index++;
9636                                 dw_count--;
9637                         }
9638                 }
9639                 vm_page_lock_queues();
9640
9641                 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9642                         vm_page_free(dst_page);
9643
9644                         need_unwire = FALSE;
9645                 } else {
9646                         if (need_unwire == TRUE) {
9647                                 vm_page_unwire(dst_page, TRUE);
9648                         }
9649
9650                         PAGE_WAKEUP_DONE(dst_page);
9651                 }
9652                 vm_page_unlock_queues();
9653
9654                 if (need_unwire == TRUE) {
9655                         counter_inc(&vm_statistics_reactivations);
9656                 }
9657         }
9658 #if UPL_DEBUG
9659         upl->upl_state = 2;
9660 #endif
9661         if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9662                 vm_object_activity_end(object);
9663                 vm_object_collapse(object, 0, TRUE);
9664         }
9665         vm_object_unlock(object);
9666         upl_destroy(upl);
9667
9668         VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9669 #if DEVELOPMENT || DEBUG
9670         if (task != NULL) {
9671                 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9672         }
9673 #endif /* DEVELOPMENT || DEBUG */
9674
9675         if (dwp_start && dwp_finish_ctx) {
9676                 vm_page_delayed_work_finish_ctx(dwp_start);
9677                 dwp_start = dwp = NULL;
9678         }
9679         return ret;
9680 }
9681
9682 kern_return_t
9683 upl_transpose(
9684         upl_t           upl1,
9685         upl_t           upl2)
9686 {
9687         kern_return_t           retval;
9688         boolean_t               upls_locked;
9689         vm_object_t             object1, object2;
9690
9691         if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9692                 return KERN_INVALID_ARGUMENT;
9693         }
9694
9695         upls_locked = FALSE;
9696
9697         /*
9698          * Since we need to lock both UPLs at the same time,
9699          * avoid deadlocks by always taking locks in the same order.
9700          */
9701         if (upl1 < upl2) {
9702                 upl_lock(upl1);
9703                 upl_lock(upl2);
9704         } else {
9705                 upl_lock(upl2);
9706                 upl_lock(upl1);
9707         }
9708         upls_locked = TRUE;     /* the UPLs will need to be unlocked */
9709
9710         object1 = upl1->map_object;
9711         object2 = upl2->map_object;
9712
9713         if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9714             upl1->u_size != upl2->u_size) {
9715                 /*
9716                  * We deal only with full objects, not subsets.
9717                  * That's because we exchange the entire backing store info
9718                  * for the objects: pager, resident pages, etc...  We can't do
9719                  * only part of it.
9720                  */
9721                 retval = KERN_INVALID_VALUE;
9722                 goto done;
9723         }
9724
9725         /*
9726          * Tranpose the VM objects' backing store.
9727          */
9728         retval = vm_object_transpose(object1, object2,
9729             upl_adjusted_size(upl1, PAGE_MASK));
9730
9731         if (retval == KERN_SUCCESS) {
9732                 /*
9733                  * Make each UPL point to the correct VM object, i.e. the
9734                  * object holding the pages that the UPL refers to...
9735                  */
9736 #if CONFIG_IOSCHED || UPL_DEBUG
9737                 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9738                         vm_object_lock(object1);
9739                         vm_object_lock(object2);
9740                 }
9741                 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9742                         queue_remove(&object1->uplq, upl1, upl_t, uplq);
9743                 }
9744                 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9745                         queue_remove(&object2->uplq, upl2, upl_t, uplq);
9746                 }
9747 #endif
9748                 upl1->map_object = object2;
9749                 upl2->map_object = object1;
9750
9751 #if CONFIG_IOSCHED || UPL_DEBUG
9752                 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9753                         queue_enter(&object2->uplq, upl1, upl_t, uplq);
9754                 }
9755                 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9756                         queue_enter(&object1->uplq, upl2, upl_t, uplq);
9757                 }
9758                 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9759                         vm_object_unlock(object2);
9760                         vm_object_unlock(object1);
9761                 }
9762 #endif
9763         }
9764
9765 done:
9766         /*
9767          * Cleanup.
9768          */
9769         if (upls_locked) {
9770                 upl_unlock(upl1);
9771                 upl_unlock(upl2);
9772                 upls_locked = FALSE;
9773         }
9774
9775         return retval;
9776 }
9777
9778 void
9779 upl_range_needed(
9780         upl_t           upl,
9781         int             index,
9782         int             count)
9783 {
9784         upl_page_info_t *user_page_list;
9785         int             size_in_pages;
9786
9787         if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
9788                 return;
9789         }
9790
9791         size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
9792
9793         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9794
9795         while (count-- && index < size_in_pages) {
9796                 user_page_list[index++].needed = TRUE;
9797         }
9798 }
9799
9800
9801 /*
9802  * Reserve of virtual addresses in the kernel address space.
9803  * We need to map the physical pages in the kernel, so that we
9804  * can call the code-signing or slide routines with a kernel
9805  * virtual address.  We keep this pool of pre-allocated kernel
9806  * virtual addresses so that we don't have to scan the kernel's
9807  * virtaul address space each time we need to work with
9808  * a physical page.
9809  */
9810 SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
9811 #define VM_PAGING_NUM_PAGES     64
9812 vm_map_offset_t vm_paging_base_address = 0;
9813 boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
9814 int             vm_paging_max_index = 0;
9815 int             vm_paging_page_waiter = 0;
9816 int             vm_paging_page_waiter_total = 0;
9817
9818 unsigned long   vm_paging_no_kernel_page = 0;
9819 unsigned long   vm_paging_objects_mapped = 0;
9820 unsigned long   vm_paging_pages_mapped = 0;
9821 unsigned long   vm_paging_objects_mapped_slow = 0;
9822 unsigned long   vm_paging_pages_mapped_slow = 0;
9823
9824 __startup_func
9825 void
9826 vm_paging_map_init(void)
9827 {
9828         kern_return_t   kr;
9829         vm_map_offset_t page_map_offset;
9830         vm_map_entry_t  map_entry;
9831
9832         assert(vm_paging_base_address == 0);
9833
9834         /*
9835          * Initialize our pool of pre-allocated kernel
9836          * virtual addresses.
9837          */
9838         page_map_offset = 0;
9839         kr = vm_map_find_space(kernel_map,
9840             &page_map_offset,
9841             VM_PAGING_NUM_PAGES * PAGE_SIZE,
9842             0,
9843             0,
9844             VM_MAP_KERNEL_FLAGS_NONE,
9845             VM_KERN_MEMORY_NONE,
9846             &map_entry);
9847         if (kr != KERN_SUCCESS) {
9848                 panic("vm_paging_map_init: kernel_map full\n");
9849         }
9850         VME_OBJECT_SET(map_entry, kernel_object);
9851         VME_OFFSET_SET(map_entry, page_map_offset);
9852         map_entry->protection = VM_PROT_NONE;
9853         map_entry->max_protection = VM_PROT_NONE;
9854         map_entry->permanent = TRUE;
9855         vm_object_reference(kernel_object);
9856         vm_map_unlock(kernel_map);
9857
9858         assert(vm_paging_base_address == 0);
9859         vm_paging_base_address = page_map_offset;
9860 }
9861
9862 /*
9863  * vm_paging_map_object:
9864  *      Maps part of a VM object's pages in the kernel
9865  *      virtual address space, using the pre-allocated
9866  *      kernel virtual addresses, if possible.
9867  * Context:
9868  *      The VM object is locked.  This lock will get
9869  *      dropped and re-acquired though, so the caller
9870  *      must make sure the VM object is kept alive
9871  *      (by holding a VM map that has a reference
9872  *      on it, for example, or taking an extra reference).
9873  *      The page should also be kept busy to prevent
9874  *      it from being reclaimed.
9875  */
9876 kern_return_t
9877 vm_paging_map_object(
9878         vm_page_t               page,
9879         vm_object_t             object,
9880         vm_object_offset_t      offset,
9881         vm_prot_t               protection,
9882         boolean_t               can_unlock_object,
9883         vm_map_size_t           *size,          /* IN/OUT */
9884         vm_map_offset_t         *address,       /* OUT */
9885         boolean_t               *need_unmap)    /* OUT */
9886 {
9887         kern_return_t           kr;
9888         vm_map_offset_t         page_map_offset;
9889         vm_map_size_t           map_size;
9890         vm_object_offset_t      object_offset;
9891         int                     i;
9892
9893         if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9894                 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9895                 *address = (vm_map_offset_t)
9896                     phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9897                 *need_unmap = FALSE;
9898                 return KERN_SUCCESS;
9899
9900                 assert(page->vmp_busy);
9901                 /*
9902                  * Use one of the pre-allocated kernel virtual addresses
9903                  * and just enter the VM page in the kernel address space
9904                  * at that virtual address.
9905                  */
9906                 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9907
9908                 /*
9909                  * Try and find an available kernel virtual address
9910                  * from our pre-allocated pool.
9911                  */
9912                 page_map_offset = 0;
9913                 for (;;) {
9914                         for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9915                                 if (vm_paging_page_inuse[i] == FALSE) {
9916                                         page_map_offset =
9917                                             vm_paging_base_address +
9918                                             (i * PAGE_SIZE);
9919                                         break;
9920                                 }
9921                         }
9922                         if (page_map_offset != 0) {
9923                                 /* found a space to map our page ! */
9924                                 break;
9925                         }
9926
9927                         if (can_unlock_object) {
9928                                 /*
9929                                  * If we can afford to unlock the VM object,
9930                                  * let's take the slow path now...
9931                                  */
9932                                 break;
9933                         }
9934                         /*
9935                          * We can't afford to unlock the VM object, so
9936                          * let's wait for a space to become available...
9937                          */
9938                         vm_paging_page_waiter_total++;
9939                         vm_paging_page_waiter++;
9940                         kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9941                         if (kr == THREAD_WAITING) {
9942                                 simple_unlock(&vm_paging_lock);
9943                                 kr = thread_block(THREAD_CONTINUE_NULL);
9944                                 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
9945                         }
9946                         vm_paging_page_waiter--;
9947                         /* ... and try again */
9948                 }
9949
9950                 if (page_map_offset != 0) {
9951                         /*
9952                          * We found a kernel virtual address;
9953                          * map the physical page to that virtual address.
9954                          */
9955                         if (i > vm_paging_max_index) {
9956                                 vm_paging_max_index = i;
9957                         }
9958                         vm_paging_page_inuse[i] = TRUE;
9959                         simple_unlock(&vm_paging_lock);
9960
9961                         page->vmp_pmapped = TRUE;
9962
9963                         /*
9964                          * Keep the VM object locked over the PMAP_ENTER
9965                          * and the actual use of the page by the kernel,
9966                          * or this pmap mapping might get undone by a
9967                          * vm_object_pmap_protect() call...
9968                          */
9969                         PMAP_ENTER(kernel_pmap,
9970                             page_map_offset,
9971                             page,
9972                             protection,
9973                             VM_PROT_NONE,
9974                             0,
9975                             TRUE,
9976                             kr);
9977                         assert(kr == KERN_SUCCESS);
9978                         vm_paging_objects_mapped++;
9979                         vm_paging_pages_mapped++;
9980                         *address = page_map_offset;
9981                         *need_unmap = TRUE;
9982
9983 #if KASAN
9984                         kasan_notify_address(page_map_offset, PAGE_SIZE);
9985 #endif
9986
9987                         /* all done and mapped, ready to use ! */
9988                         return KERN_SUCCESS;
9989                 }
9990
9991                 /*
9992                  * We ran out of pre-allocated kernel virtual
9993                  * addresses.  Just map the page in the kernel
9994                  * the slow and regular way.
9995                  */
9996                 vm_paging_no_kernel_page++;
9997                 simple_unlock(&vm_paging_lock);
9998         }
9999
10000         if (!can_unlock_object) {
10001                 *address = 0;
10002                 *size = 0;
10003                 *need_unmap = FALSE;
10004                 return KERN_NOT_SUPPORTED;
10005         }
10006
10007         object_offset = vm_object_trunc_page(offset);
10008         map_size = vm_map_round_page(*size,
10009             VM_MAP_PAGE_MASK(kernel_map));
10010
10011         /*
10012          * Try and map the required range of the object
10013          * in the kernel_map
10014          */
10015
10016         vm_object_reference_locked(object);     /* for the map entry */
10017         vm_object_unlock(object);
10018
10019         kr = vm_map_enter(kernel_map,
10020             address,
10021             map_size,
10022             0,
10023             VM_FLAGS_ANYWHERE,
10024             VM_MAP_KERNEL_FLAGS_NONE,
10025             VM_KERN_MEMORY_NONE,
10026             object,
10027             object_offset,
10028             FALSE,
10029             protection,
10030             VM_PROT_ALL,
10031             VM_INHERIT_NONE);
10032         if (kr != KERN_SUCCESS) {
10033                 *address = 0;
10034                 *size = 0;
10035                 *need_unmap = FALSE;
10036                 vm_object_deallocate(object);   /* for the map entry */
10037                 vm_object_lock(object);
10038                 return kr;
10039         }
10040
10041         *size = map_size;
10042
10043         /*
10044          * Enter the mapped pages in the page table now.
10045          */
10046         vm_object_lock(object);
10047         /*
10048          * VM object must be kept locked from before PMAP_ENTER()
10049          * until after the kernel is done accessing the page(s).
10050          * Otherwise, the pmap mappings in the kernel could be
10051          * undone by a call to vm_object_pmap_protect().
10052          */
10053
10054         for (page_map_offset = 0;
10055             map_size != 0;
10056             map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10057                 page = vm_page_lookup(object, offset + page_map_offset);
10058                 if (page == VM_PAGE_NULL) {
10059                         printf("vm_paging_map_object: no page !?");
10060                         vm_object_unlock(object);
10061                         kr = vm_map_remove(kernel_map, *address, *size,
10062                             VM_MAP_REMOVE_NO_FLAGS);
10063                         assert(kr == KERN_SUCCESS);
10064                         *address = 0;
10065                         *size = 0;
10066                         *need_unmap = FALSE;
10067                         vm_object_lock(object);
10068                         return KERN_MEMORY_ERROR;
10069                 }
10070                 page->vmp_pmapped = TRUE;
10071
10072                 //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
10073                 PMAP_ENTER(kernel_pmap,
10074                     *address + page_map_offset,
10075                     page,
10076                     protection,
10077                     VM_PROT_NONE,
10078                     0,
10079                     TRUE,
10080                     kr);
10081                 assert(kr == KERN_SUCCESS);
10082 #if KASAN
10083                 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10084 #endif
10085         }
10086
10087         vm_paging_objects_mapped_slow++;
10088         vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10089
10090         *need_unmap = TRUE;
10091
10092         return KERN_SUCCESS;
10093 }
10094
10095 /*
10096  * vm_paging_unmap_object:
10097  *      Unmaps part of a VM object's pages from the kernel
10098  *      virtual address space.
10099  * Context:
10100  *      The VM object is locked.  This lock will get
10101  *      dropped and re-acquired though.
10102  */
10103 void
10104 vm_paging_unmap_object(
10105         vm_object_t     object,
10106         vm_map_offset_t start,
10107         vm_map_offset_t end)
10108 {
10109         kern_return_t   kr;
10110         int             i;
10111
10112         if ((vm_paging_base_address == 0) ||
10113             (start < vm_paging_base_address) ||
10114             (end > (vm_paging_base_address
10115             + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10116                 /*
10117                  * We didn't use our pre-allocated pool of
10118                  * kernel virtual address.  Deallocate the
10119                  * virtual memory.
10120                  */
10121                 if (object != VM_OBJECT_NULL) {
10122                         vm_object_unlock(object);
10123                 }
10124                 kr = vm_map_remove(kernel_map, start, end,
10125                     VM_MAP_REMOVE_NO_FLAGS);
10126                 if (object != VM_OBJECT_NULL) {
10127                         vm_object_lock(object);
10128                 }
10129                 assert(kr == KERN_SUCCESS);
10130         } else {
10131                 /*
10132                  * We used a kernel virtual address from our
10133                  * pre-allocated pool.  Put it back in the pool
10134                  * for next time.
10135                  */
10136                 assert(end - start == PAGE_SIZE);
10137                 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10138                 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10139
10140                 /* undo the pmap mapping */
10141                 pmap_remove(kernel_pmap, start, end);
10142
10143                 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10144                 vm_paging_page_inuse[i] = FALSE;
10145                 if (vm_paging_page_waiter) {
10146                         thread_wakeup(&vm_paging_page_waiter);
10147                 }
10148                 simple_unlock(&vm_paging_lock);
10149         }
10150 }
10151
10152
10153 /*
10154  * page->vmp_object must be locked
10155  */
10156 void
10157 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10158 {
10159         if (!queues_locked) {
10160                 vm_page_lockspin_queues();
10161         }
10162
10163         page->vmp_free_when_done = FALSE;
10164         /*
10165          * need to drop the laundry count...
10166          * we may also need to remove it
10167          * from the I/O paging queue...
10168          * vm_pageout_throttle_up handles both cases
10169          *
10170          * the laundry and pageout_queue flags are cleared...
10171          */
10172         vm_pageout_throttle_up(page);
10173
10174         if (!queues_locked) {
10175                 vm_page_unlock_queues();
10176         }
10177 }
10178
10179 upl_t
10180 vector_upl_create(vm_offset_t upl_offset)
10181 {
10182         int     vector_upl_size  = sizeof(struct _vector_upl);
10183         int i = 0;
10184         upl_t   upl;
10185         vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
10186
10187         upl = upl_create(0, UPL_VECTOR, 0);
10188         upl->vector_upl = vector_upl;
10189         upl->u_offset = upl_offset;
10190         vector_upl->size = 0;
10191         vector_upl->offset = upl_offset;
10192         vector_upl->invalid_upls = 0;
10193         vector_upl->num_upls = 0;
10194         vector_upl->pagelist = NULL;
10195
10196         for (i = 0; i < MAX_VECTOR_UPL_ELEMENTS; i++) {
10197                 vector_upl->upl_iostates[i].size = 0;
10198                 vector_upl->upl_iostates[i].offset = 0;
10199         }
10200         return upl;
10201 }
10202
10203 void
10204 vector_upl_deallocate(upl_t upl)
10205 {
10206         if (upl) {
10207                 vector_upl_t vector_upl = upl->vector_upl;
10208                 if (vector_upl) {
10209                         if (vector_upl->invalid_upls != vector_upl->num_upls) {
10210                                 panic("Deallocating non-empty Vectored UPL\n");
10211                         }
10212                         kfree(vector_upl->pagelist, (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10213                         vector_upl->invalid_upls = 0;
10214                         vector_upl->num_upls = 0;
10215                         vector_upl->pagelist = NULL;
10216                         vector_upl->size = 0;
10217                         vector_upl->offset = 0;
10218                         kfree(vector_upl, sizeof(struct _vector_upl));
10219                         vector_upl = (vector_upl_t)0xfeedfeed;
10220                 } else {
10221                         panic("vector_upl_deallocate was passed a non-vectored upl\n");
10222                 }
10223         } else {
10224                 panic("vector_upl_deallocate was passed a NULL upl\n");
10225         }
10226 }
10227
10228 boolean_t
10229 vector_upl_is_valid(upl_t upl)
10230 {
10231         if (upl && ((upl->flags & UPL_VECTOR) == UPL_VECTOR)) {
10232                 vector_upl_t vector_upl = upl->vector_upl;
10233                 if (vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef) {
10234                         return FALSE;
10235                 } else {
10236                         return TRUE;
10237                 }
10238         }
10239         return FALSE;
10240 }
10241
10242 boolean_t
10243 vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10244 {
10245         if (vector_upl_is_valid(upl)) {
10246                 vector_upl_t vector_upl = upl->vector_upl;
10247
10248                 if (vector_upl) {
10249                         if (subupl) {
10250                                 if (io_size) {
10251                                         if (io_size < PAGE_SIZE) {
10252                                                 io_size = PAGE_SIZE;
10253                                         }
10254                                         subupl->vector_upl = (void*)vector_upl;
10255                                         vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10256                                         vector_upl->size += io_size;
10257                                         upl->u_size += io_size;
10258                                 } else {
10259                                         uint32_t i = 0, invalid_upls = 0;
10260                                         for (i = 0; i < vector_upl->num_upls; i++) {
10261                                                 if (vector_upl->upl_elems[i] == subupl) {
10262                                                         break;
10263                                                 }
10264                                         }
10265                                         if (i == vector_upl->num_upls) {
10266                                                 panic("Trying to remove sub-upl when none exists");
10267                                         }
10268
10269                                         vector_upl->upl_elems[i] = NULL;
10270                                         invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10271                                             relaxed);
10272                                         if (invalid_upls == vector_upl->num_upls) {
10273                                                 return TRUE;
10274                                         } else {
10275                                                 return FALSE;
10276                                         }
10277                                 }
10278                         } else {
10279                                 panic("vector_upl_set_subupl was passed a NULL upl element\n");
10280                         }
10281                 } else {
10282                         panic("vector_upl_set_subupl was passed a non-vectored upl\n");
10283                 }
10284         } else {
10285                 panic("vector_upl_set_subupl was passed a NULL upl\n");
10286         }
10287
10288         return FALSE;
10289 }
10290
10291 void
10292 vector_upl_set_pagelist(upl_t upl)
10293 {
10294         if (vector_upl_is_valid(upl)) {
10295                 uint32_t i = 0;
10296                 vector_upl_t vector_upl = upl->vector_upl;
10297
10298                 if (vector_upl) {
10299                         vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10300
10301                         vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE));
10302
10303                         for (i = 0; i < vector_upl->num_upls; i++) {
10304                                 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upl_elems[i], PAGE_MASK) / PAGE_SIZE;
10305                                 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10306                                 pagelist_size += cur_upl_pagelist_size;
10307                                 if (vector_upl->upl_elems[i]->highest_page > upl->highest_page) {
10308                                         upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10309                                 }
10310                         }
10311                         assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10312                 } else {
10313                         panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
10314                 }
10315         } else {
10316                 panic("vector_upl_set_pagelist was passed a NULL upl\n");
10317         }
10318 }
10319
10320 upl_t
10321 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10322 {
10323         if (vector_upl_is_valid(upl)) {
10324                 vector_upl_t vector_upl = upl->vector_upl;
10325                 if (vector_upl) {
10326                         if (index < vector_upl->num_upls) {
10327                                 return vector_upl->upl_elems[index];
10328                         }
10329                 } else {
10330                         panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
10331                 }
10332         }
10333         return NULL;
10334 }
10335
10336 upl_t
10337 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10338 {
10339         if (vector_upl_is_valid(upl)) {
10340                 uint32_t i = 0;
10341                 vector_upl_t vector_upl = upl->vector_upl;
10342
10343                 if (vector_upl) {
10344                         upl_t subupl = NULL;
10345                         vector_upl_iostates_t subupl_state;
10346
10347                         for (i = 0; i < vector_upl->num_upls; i++) {
10348                                 subupl = vector_upl->upl_elems[i];
10349                                 subupl_state = vector_upl->upl_iostates[i];
10350                                 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10351                                         /* We could have been passed an offset/size pair that belongs
10352                                          * to an UPL element that has already been committed/aborted.
10353                                          * If so, return NULL.
10354                                          */
10355                                         if (subupl == NULL) {
10356                                                 return NULL;
10357                                         }
10358                                         if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10359                                                 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10360                                                 if (*upl_size > subupl_state.size) {
10361                                                         *upl_size = subupl_state.size;
10362                                                 }
10363                                         }
10364                                         if (*upl_offset >= subupl_state.offset) {
10365                                                 *upl_offset -= subupl_state.offset;
10366                                         } else if (i) {
10367                                                 panic("Vector UPL offset miscalculation\n");
10368                                         }
10369                                         return subupl;
10370                                 }
10371                         }
10372                 } else {
10373                         panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
10374                 }
10375         }
10376         return NULL;
10377 }
10378
10379 void
10380 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10381 {
10382         *v_upl_submap = NULL;
10383
10384         if (vector_upl_is_valid(upl)) {
10385                 vector_upl_t vector_upl = upl->vector_upl;
10386                 if (vector_upl) {
10387                         *v_upl_submap = vector_upl->submap;
10388                         *submap_dst_addr = vector_upl->submap_dst_addr;
10389                 } else {
10390                         panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10391                 }
10392         } else {
10393                 panic("vector_upl_get_submap was passed a null UPL\n");
10394         }
10395 }
10396
10397 void
10398 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10399 {
10400         if (vector_upl_is_valid(upl)) {
10401                 vector_upl_t vector_upl = upl->vector_upl;
10402                 if (vector_upl) {
10403                         vector_upl->submap = submap;
10404                         vector_upl->submap_dst_addr = submap_dst_addr;
10405                 } else {
10406                         panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10407                 }
10408         } else {
10409                 panic("vector_upl_get_submap was passed a NULL UPL\n");
10410         }
10411 }
10412
10413 void
10414 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10415 {
10416         if (vector_upl_is_valid(upl)) {
10417                 uint32_t i = 0;
10418                 vector_upl_t vector_upl = upl->vector_upl;
10419
10420                 if (vector_upl) {
10421                         for (i = 0; i < vector_upl->num_upls; i++) {
10422                                 if (vector_upl->upl_elems[i] == subupl) {
10423                                         break;
10424                                 }
10425                         }
10426
10427                         if (i == vector_upl->num_upls) {
10428                                 panic("setting sub-upl iostate when none exists");
10429                         }
10430
10431                         vector_upl->upl_iostates[i].offset = offset;
10432                         if (size < PAGE_SIZE) {
10433                                 size = PAGE_SIZE;
10434                         }
10435                         vector_upl->upl_iostates[i].size = size;
10436                 } else {
10437                         panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
10438                 }
10439         } else {
10440                 panic("vector_upl_set_iostate was passed a NULL UPL\n");
10441         }
10442 }
10443
10444 void
10445 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10446 {
10447         if (vector_upl_is_valid(upl)) {
10448                 uint32_t i = 0;
10449                 vector_upl_t vector_upl = upl->vector_upl;
10450
10451                 if (vector_upl) {
10452                         for (i = 0; i < vector_upl->num_upls; i++) {
10453                                 if (vector_upl->upl_elems[i] == subupl) {
10454                                         break;
10455                                 }
10456                         }
10457
10458                         if (i == vector_upl->num_upls) {
10459                                 panic("getting sub-upl iostate when none exists");
10460                         }
10461
10462                         *offset = vector_upl->upl_iostates[i].offset;
10463                         *size = vector_upl->upl_iostates[i].size;
10464                 } else {
10465                         panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
10466                 }
10467         } else {
10468                 panic("vector_upl_get_iostate was passed a NULL UPL\n");
10469         }
10470 }
10471
10472 void
10473 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10474 {
10475         if (vector_upl_is_valid(upl)) {
10476                 vector_upl_t vector_upl = upl->vector_upl;
10477                 if (vector_upl) {
10478                         if (index < vector_upl->num_upls) {
10479                                 *offset = vector_upl->upl_iostates[index].offset;
10480                                 *size = vector_upl->upl_iostates[index].size;
10481                         } else {
10482                                 *offset = *size = 0;
10483                         }
10484                 } else {
10485                         panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
10486                 }
10487         } else {
10488                 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
10489         }
10490 }
10491
10492 upl_page_info_t *
10493 upl_get_internal_vectorupl_pagelist(upl_t upl)
10494 {
10495         return ((vector_upl_t)(upl->vector_upl))->pagelist;
10496 }
10497
10498 void *
10499 upl_get_internal_vectorupl(upl_t upl)
10500 {
10501         return upl->vector_upl;
10502 }
10503
10504 vm_size_t
10505 upl_get_internal_pagelist_offset(void)
10506 {
10507         return sizeof(struct upl);
10508 }
10509
10510 void
10511 upl_clear_dirty(
10512         upl_t           upl,
10513         boolean_t       value)
10514 {
10515         if (value) {
10516                 upl->flags |= UPL_CLEAR_DIRTY;
10517         } else {
10518                 upl->flags &= ~UPL_CLEAR_DIRTY;
10519         }
10520 }
10521
10522 void
10523 upl_set_referenced(
10524         upl_t           upl,
10525         boolean_t       value)
10526 {
10527         upl_lock(upl);
10528         if (value) {
10529                 upl->ext_ref_count++;
10530         } else {
10531                 if (!upl->ext_ref_count) {
10532                         panic("upl_set_referenced not %p\n", upl);
10533                 }
10534                 upl->ext_ref_count--;
10535         }
10536         upl_unlock(upl);
10537 }
10538
10539 #if CONFIG_IOSCHED
10540 void
10541 upl_set_blkno(
10542         upl_t           upl,
10543         vm_offset_t     upl_offset,
10544         int             io_size,
10545         int64_t         blkno)
10546 {
10547         int i, j;
10548         if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10549                 return;
10550         }
10551
10552         assert(upl->upl_reprio_info != 0);
10553         for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10554                 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10555         }
10556 }
10557 #endif
10558
10559 void inline
10560 memoryshot(unsigned int event, unsigned int control)
10561 {
10562         if (vm_debug_events) {
10563                 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10564                     vm_page_active_count, vm_page_inactive_count,
10565                     vm_page_free_count, vm_page_speculative_count,
10566                     vm_page_throttled_count);
10567         } else {
10568                 (void) event;
10569                 (void) control;
10570         }
10571 }
10572
10573 #ifdef MACH_BSD
10574
10575 boolean_t
10576 upl_device_page(upl_page_info_t *upl)
10577 {
10578         return UPL_DEVICE_PAGE(upl);
10579 }
10580 boolean_t
10581 upl_page_present(upl_page_info_t *upl, int index)
10582 {
10583         return UPL_PAGE_PRESENT(upl, index);
10584 }
10585 boolean_t
10586 upl_speculative_page(upl_page_info_t *upl, int index)
10587 {
10588         return UPL_SPECULATIVE_PAGE(upl, index);
10589 }
10590 boolean_t
10591 upl_dirty_page(upl_page_info_t *upl, int index)
10592 {
10593         return UPL_DIRTY_PAGE(upl, index);
10594 }
10595 boolean_t
10596 upl_valid_page(upl_page_info_t *upl, int index)
10597 {
10598         return UPL_VALID_PAGE(upl, index);
10599 }
10600 ppnum_t
10601 upl_phys_page(upl_page_info_t *upl, int index)
10602 {
10603         return UPL_PHYS_PAGE(upl, index);
10604 }
10605
10606 void
10607 upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10608 {
10609         upl[index].mark = v;
10610 }
10611
10612 boolean_t
10613 upl_page_get_mark(upl_page_info_t *upl, int index)
10614 {
10615         return upl[index].mark;
10616 }
10617
10618 void
10619 vm_countdirtypages(void)
10620 {
10621         vm_page_t m;
10622         int dpages;
10623         int pgopages;
10624         int precpages;
10625
10626
10627         dpages = 0;
10628         pgopages = 0;
10629         precpages = 0;
10630
10631         vm_page_lock_queues();
10632         m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10633         do {
10634                 if (m == (vm_page_t)0) {
10635                         break;
10636                 }
10637
10638                 if (m->vmp_dirty) {
10639                         dpages++;
10640                 }
10641                 if (m->vmp_free_when_done) {
10642                         pgopages++;
10643                 }
10644                 if (m->vmp_precious) {
10645                         precpages++;
10646                 }
10647
10648                 assert(VM_PAGE_OBJECT(m) != kernel_object);
10649                 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10650                 if (m == (vm_page_t)0) {
10651                         break;
10652                 }
10653         } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10654         vm_page_unlock_queues();
10655
10656         vm_page_lock_queues();
10657         m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10658         do {
10659                 if (m == (vm_page_t)0) {
10660                         break;
10661                 }
10662
10663                 dpages++;
10664                 assert(m->vmp_dirty);
10665                 assert(!m->vmp_free_when_done);
10666                 assert(VM_PAGE_OBJECT(m) != kernel_object);
10667                 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10668                 if (m == (vm_page_t)0) {
10669                         break;
10670                 }
10671         } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10672         vm_page_unlock_queues();
10673
10674         vm_page_lock_queues();
10675         m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10676         do {
10677                 if (m == (vm_page_t)0) {
10678                         break;
10679                 }
10680
10681                 if (m->vmp_dirty) {
10682                         dpages++;
10683                 }
10684                 if (m->vmp_free_when_done) {
10685                         pgopages++;
10686                 }
10687                 if (m->vmp_precious) {
10688                         precpages++;
10689                 }
10690
10691                 assert(VM_PAGE_OBJECT(m) != kernel_object);
10692                 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10693                 if (m == (vm_page_t)0) {
10694                         break;
10695                 }
10696         } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10697         vm_page_unlock_queues();
10698
10699         printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10700
10701         dpages = 0;
10702         pgopages = 0;
10703         precpages = 0;
10704
10705         vm_page_lock_queues();
10706         m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10707
10708         do {
10709                 if (m == (vm_page_t)0) {
10710                         break;
10711                 }
10712                 if (m->vmp_dirty) {
10713                         dpages++;
10714                 }
10715                 if (m->vmp_free_when_done) {
10716                         pgopages++;
10717                 }
10718                 if (m->vmp_precious) {
10719                         precpages++;
10720                 }
10721
10722                 assert(VM_PAGE_OBJECT(m) != kernel_object);
10723                 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10724                 if (m == (vm_page_t)0) {
10725                         break;
10726                 }
10727         } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10728         vm_page_unlock_queues();
10729
10730         printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10731 }
10732 #endif /* MACH_BSD */
10733
10734
10735 #if CONFIG_IOSCHED
10736 int
10737 upl_get_cached_tier(upl_t  upl)
10738 {
10739         assert(upl);
10740         if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10741                 return upl->upl_priority;
10742         }
10743         return -1;
10744 }
10745 #endif /* CONFIG_IOSCHED */
10746
10747
10748 void
10749 upl_callout_iodone(upl_t upl)
10750 {
10751         struct upl_io_completion *upl_ctx = upl->upl_iodone;
10752
10753         if (upl_ctx) {
10754                 void    (*iodone_func)(void *, int) = upl_ctx->io_done;
10755
10756                 assert(upl_ctx->io_done);
10757
10758                 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
10759         }
10760 }
10761
10762 void
10763 upl_set_iodone(upl_t upl, void *upl_iodone)
10764 {
10765         upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
10766 }
10767
10768 void
10769 upl_set_iodone_error(upl_t upl, int error)
10770 {
10771         struct upl_io_completion *upl_ctx = upl->upl_iodone;
10772
10773         if (upl_ctx) {
10774                 upl_ctx->io_error = error;
10775         }
10776 }
10777
10778
10779 ppnum_t
10780 upl_get_highest_page(
10781         upl_t                      upl)
10782 {
10783         return upl->highest_page;
10784 }
10785
10786 upl_size_t
10787 upl_get_size(
10788         upl_t                      upl)
10789 {
10790         return upl_adjusted_size(upl, PAGE_MASK);
10791 }
10792
10793 upl_size_t
10794 upl_adjusted_size(
10795         upl_t upl,
10796         vm_map_offset_t pgmask)
10797 {
10798         vm_object_offset_t start_offset, end_offset;
10799
10800         start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
10801         end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
10802
10803         return (upl_size_t)(end_offset - start_offset);
10804 }
10805
10806 vm_object_offset_t
10807 upl_adjusted_offset(
10808         upl_t upl,
10809         vm_map_offset_t pgmask)
10810 {
10811         return trunc_page_mask_64(upl->u_offset, pgmask);
10812 }
10813
10814 vm_object_offset_t
10815 upl_get_data_offset(
10816         upl_t upl)
10817 {
10818         return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
10819 }
10820
10821 upl_t
10822 upl_associated_upl(upl_t upl)
10823 {
10824         return upl->associated_upl;
10825 }
10826
10827 void
10828 upl_set_associated_upl(upl_t upl, upl_t associated_upl)
10829 {
10830         upl->associated_upl = associated_upl;
10831 }
10832
10833 struct vnode *
10834 upl_lookup_vnode(upl_t upl)
10835 {
10836         if (!upl->map_object->internal) {
10837                 return vnode_pager_lookup_vnode(upl->map_object->pager);
10838         } else {
10839                 return NULL;
10840         }
10841 }
10842
10843 #if UPL_DEBUG
10844 kern_return_t
10845 upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
10846 {
10847         upl->ubc_alias1 = alias1;
10848         upl->ubc_alias2 = alias2;
10849         return KERN_SUCCESS;
10850 }
10851 int
10852 upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10853 {
10854         if (al) {
10855                 *al = upl->ubc_alias1;
10856         }
10857         if (al2) {
10858                 *al2 = upl->ubc_alias2;
10859         }
10860         return KERN_SUCCESS;
10861 }
10862 #endif /* UPL_DEBUG */
10863
10864 #if VM_PRESSURE_EVENTS
10865 /*
10866  * Upward trajectory.
10867  */
10868 extern boolean_t vm_compressor_low_on_space(void);
10869
10870 boolean_t
10871 VM_PRESSURE_NORMAL_TO_WARNING(void)
10872 {
10873         if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10874                 /* Available pages below our threshold */
10875                 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
10876                         /* No frozen processes to kill */
10877                         if (memorystatus_frozen_count == 0) {
10878                                 /* Not enough suspended processes available. */
10879                                 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10880                                         return TRUE;
10881                                 }
10882                         }
10883                 }
10884                 return FALSE;
10885         } else {
10886                 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
10887         }
10888 }
10889
10890 boolean_t
10891 VM_PRESSURE_WARNING_TO_CRITICAL(void)
10892 {
10893         if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10894                 /* Available pages below our threshold */
10895                 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
10896                         return TRUE;
10897                 }
10898                 return FALSE;
10899         } else {
10900                 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10901         }
10902 }
10903
10904 /*
10905  * Downward trajectory.
10906  */
10907 boolean_t
10908 VM_PRESSURE_WARNING_TO_NORMAL(void)
10909 {
10910         if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10911                 /* Available pages above our threshold */
10912                 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
10913                 if (memorystatus_available_pages > target_threshold) {
10914                         return TRUE;
10915                 }
10916                 return FALSE;
10917         } else {
10918                 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
10919         }
10920 }
10921
10922 boolean_t
10923 VM_PRESSURE_CRITICAL_TO_WARNING(void)
10924 {
10925         if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10926                 /* Available pages above our threshold */
10927                 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
10928                 if (memorystatus_available_pages > target_threshold) {
10929                         return TRUE;
10930                 }
10931                 return FALSE;
10932         } else {
10933                 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
10934         }
10935 }
10936 #endif /* VM_PRESSURE_EVENTS */