/* apple/xnu xnu-4570.1.46 - osfmk/vm/vm_pageout.c */
1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/memory_object_default.h>
75 #include <mach/memory_object_control_server.h>
76 #include <mach/mach_host_server.h>
77 #include <mach/upl.h>
78 #include <mach/vm_map.h>
79 #include <mach/vm_param.h>
80 #include <mach/vm_statistics.h>
81 #include <mach/sdt.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/counters.h>
85 #include <kern/host_statistics.h>
86 #include <kern/machine.h>
87 #include <kern/misc_protos.h>
88 #include <kern/sched.h>
89 #include <kern/thread.h>
90 #include <kern/xpr.h>
91 #include <kern/kalloc.h>
92 #include <kern/policy_internal.h>
93 #include <kern/thread_group.h>
94
95 #include <machine/vm_tuning.h>
96 #include <machine/commpage.h>
97
98 #include <vm/pmap.h>
99 #include <vm/vm_compressor_pager.h>
100 #include <vm/vm_fault.h>
101 #include <vm/vm_map.h>
102 #include <vm/vm_object.h>
103 #include <vm/vm_page.h>
104 #include <vm/vm_pageout.h>
105 #include <vm/vm_protos.h> /* must be last */
106 #include <vm/memory_object.h>
107 #include <vm/vm_purgeable_internal.h>
108 #include <vm/vm_shared_region.h>
109 #include <vm/vm_compressor.h>
110
111 #include <san/kasan.h>
112
113 #if CONFIG_PHANTOM_CACHE
114 #include <vm/vm_phantom_cache.h>
115 #endif
116
117 extern int cs_debug;
118
119 #if UPL_DEBUG
120 #include <libkern/OSDebug.h>
121 #endif
122
123 extern void m_drain(void);
124
125 #if VM_PRESSURE_EVENTS
126 #if CONFIG_JETSAM
127 extern unsigned int memorystatus_available_pages;
128 extern unsigned int memorystatus_available_pages_pressure;
129 extern unsigned int memorystatus_available_pages_critical;
130 #else /* CONFIG_JETSAM */
131 extern uint64_t memorystatus_available_pages;
132 extern uint64_t memorystatus_available_pages_pressure;
133 extern uint64_t memorystatus_available_pages_critical;
134 #endif /* CONFIG_JETSAM */
135
136 extern unsigned int memorystatus_frozen_count;
137 extern unsigned int memorystatus_suspended_count;
138
139 extern vm_pressure_level_t memorystatus_vm_pressure_level;
140 int memorystatus_purge_on_warning = 2;
141 int memorystatus_purge_on_urgent = 5;
142 int memorystatus_purge_on_critical = 8;
143
144 void vm_pressure_response(void);
145 boolean_t vm_pressure_thread_running = FALSE;
146 extern void consider_vm_pressure_events(void);
147
148 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
149 #endif /* VM_PRESSURE_EVENTS */
150
151 boolean_t vm_pressure_changed = FALSE;
152
153 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
154 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
155 #endif
156
157 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
158 #ifdef CONFIG_EMBEDDED
159 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
160 #else
161 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
162 #endif
163 #endif
164
165 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
166 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
167 #endif
168
169 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
170 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
171 #endif
172
173 #ifndef VM_PAGE_LAUNDRY_MAX
174 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
175 #endif /* VM_PAGE_LAUNDRY_MAX */
176
177 #ifndef VM_PAGEOUT_BURST_WAIT
178 #define VM_PAGEOUT_BURST_WAIT 10 /* milliseconds */
179 #endif /* VM_PAGEOUT_BURST_WAIT */
180
181 #ifndef VM_PAGEOUT_EMPTY_WAIT
182 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
183 #endif /* VM_PAGEOUT_EMPTY_WAIT */
184
185 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
186 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
187 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
188
189 #ifndef VM_PAGEOUT_IDLE_WAIT
190 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
191 #endif /* VM_PAGEOUT_IDLE_WAIT */
192
193 #ifndef VM_PAGEOUT_SWAP_WAIT
194 #define VM_PAGEOUT_SWAP_WAIT 50 /* milliseconds */
195 #endif /* VM_PAGEOUT_SWAP_WAIT */
196
197 #ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
198 #define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED 1000 /* maximum pages considered before we issue a pressure event */
199 #endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
200
201 #ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
202 #define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS 5 /* seconds */
203 #endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
204
205 unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
206 unsigned int vm_page_speculative_percentage = 5;
207
208 #ifndef VM_PAGE_SPECULATIVE_TARGET
209 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
210 #endif /* VM_PAGE_SPECULATIVE_TARGET */
211
212
213 /*
214 * To obtain a reasonable LRU approximation, the inactive queue
215 * needs to be large enough to give pages on it a chance to be
216 * referenced a second time. This macro defines the fraction
217 * of active+inactive pages that should be inactive.
218 * The pageout daemon uses it to update vm_page_inactive_target.
219 *
220 * If vm_page_free_count falls below vm_page_free_target and
221 * vm_page_inactive_count is below vm_page_inactive_target,
222 * then the pageout daemon starts running.
223 */
224
225 #ifndef VM_PAGE_INACTIVE_TARGET
226 #ifdef CONFIG_EMBEDDED
227 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
228 #else
229 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
230 #endif
231 #endif /* VM_PAGE_INACTIVE_TARGET */
232
233 /*
234 * Once the pageout daemon starts running, it keeps going
235 * until vm_page_free_count meets or exceeds vm_page_free_target.
236 */
237
238 #ifndef VM_PAGE_FREE_TARGET
239 #ifdef CONFIG_EMBEDDED
240 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
241 #else
242 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
243 #endif
244 #endif /* VM_PAGE_FREE_TARGET */
245
246
247 /*
248 * The pageout daemon always starts running once vm_page_free_count
249 * falls below vm_page_free_min.
250 */
251
252 #ifndef VM_PAGE_FREE_MIN
253 #ifdef CONFIG_EMBEDDED
254 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
255 #else
256 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
257 #endif
258 #endif /* VM_PAGE_FREE_MIN */
259
260 #ifdef CONFIG_EMBEDDED
261 #define VM_PAGE_FREE_RESERVED_LIMIT 100
262 #define VM_PAGE_FREE_MIN_LIMIT 1500
263 #define VM_PAGE_FREE_TARGET_LIMIT 2000
264 #else
265 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
266 #define VM_PAGE_FREE_MIN_LIMIT 3500
267 #define VM_PAGE_FREE_TARGET_LIMIT 4000
268 #endif
269
270 /*
271 * When vm_page_free_count falls below vm_page_free_reserved,
272 * only vm-privileged threads can allocate pages. vm-privilege
273 * allows the pageout daemon and default pager (and any other
274 * associated threads needed for default pageout) to continue
275 * operation by dipping into the reserved pool of pages.
276 */
277
278 #ifndef VM_PAGE_FREE_RESERVED
279 #define VM_PAGE_FREE_RESERVED(n) \
280 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
281 #endif /* VM_PAGE_FREE_RESERVED */
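/*
 * Illustrative sketch (not compiled): plug a made-up page count into
 * the watermark macros above to show how the thresholds relate on a
 * non-CONFIG_EMBEDDED configuration.  The function name and the page
 * count are hypothetical, the real arguments fed to each macro differ,
 * and the VM_PAGE_FREE_*_LIMIT values above are not applied here.
 */
#if 0
static void
vm_pageout_watermark_sketch(void)
{
	unsigned int pages = 1000000;	/* hypothetical page count */

	unsigned int inactive_target = VM_PAGE_INACTIVE_TARGET(pages);	/* pages / 2      = 500000 */
	unsigned int free_target     = VM_PAGE_FREE_TARGET(pages);	/* 15 + pages/80  = 12515  */
	unsigned int free_min        = VM_PAGE_FREE_MIN(pages);	/* 10 + pages/100 = 10010  */
	unsigned int free_reserved   = VM_PAGE_FREE_RESERVED(100);	/* 6*128 + 100    = 868    */

	/*
	 * Per the comments above: the pageout daemon starts when
	 * vm_page_free_count drops below free_target while
	 * vm_page_inactive_count is below inactive_target, always starts
	 * once free count drops below free_min, and keeps running until
	 * vm_page_free_count is back at or above free_target.  Below
	 * free_reserved, only vm-privileged threads may allocate pages.
	 */
	(void)inactive_target; (void)free_target;
	(void)free_min; (void)free_reserved;
}
#endif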
282
283 /*
284 * When we dequeue pages from the inactive list, they are
285 * reactivated (ie, put back on the active queue) if referenced.
286 * However, it is possible to starve the free list if other
287 * processors are referencing pages faster than we can turn off
288 * the referenced bit. So we limit the number of reactivations
289 * we will make per call of vm_pageout_scan().
290 */
291 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
292 #ifndef VM_PAGE_REACTIVATE_LIMIT
293 #ifdef CONFIG_EMBEDDED
294 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
295 #else
296 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
297 #endif
298 #endif /* VM_PAGE_REACTIVATE_LIMIT */
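/*
 * Illustrative arithmetic (hypothetical count): with the
 * non-CONFIG_EMBEDDED definition above and avail = 1,000,000 pages,
 * VM_PAGE_REACTIVATE_LIMIT(avail) = MAX(1000000 / 20, 20000) = 50000,
 * i.e. a single call of vm_pageout_scan() will reactivate at most
 * 50,000 referenced inactive pages.
 */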
299 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
300
301
302 extern boolean_t hibernate_cleaning_in_progress;
303
304 /*
305 * Exported variable used to broadcast the activation of the pageout scan.
306 * Working Set uses this to throttle its use of pmap removes. In this
307 * way, code which runs within memory in an uncontested context does
308 * not keep encountering soft faults.
309 */
310
311 unsigned int vm_pageout_scan_event_counter = 0;
312
313 /*
314 * Forward declarations for internal routines.
315 */
316 struct cq {
317 struct vm_pageout_queue *q;
318 void *current_chead;
319 char *scratch_buf;
320 int id;
321 };
322
323 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
324
325
326 #if VM_PRESSURE_EVENTS
327 void vm_pressure_thread(void);
328
329 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
330 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
331
332 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
333 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
334 #endif
335 void vm_pageout_garbage_collect(int);
336 static void vm_pageout_iothread_external(void);
337 static void vm_pageout_iothread_internal(struct cq *cq);
338 static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t);
339
340 extern void vm_pageout_continue(void);
341 extern void vm_pageout_scan(void);
342 void vm_tests(void); /* forward */
343
344 boolean_t vm_restricted_to_single_processor = FALSE;
345 #if !CONFIG_EMBEDDED
346 static boolean_t vm_pageout_waiter = FALSE;
347 static boolean_t vm_pageout_running = FALSE;
348 #endif /* !CONFIG_EMBEDDED */
349
350
351 static thread_t vm_pageout_external_iothread = THREAD_NULL;
352 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
353
354 unsigned int vm_pageout_reserved_internal = 0;
355 unsigned int vm_pageout_reserved_really = 0;
356
357 unsigned int vm_pageout_swap_wait = 0;
358 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
359 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
360 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
361 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
362 unsigned int vm_pageout_deadlock_relief = 0;
363 unsigned int vm_pageout_inactive_relief = 0;
364 unsigned int vm_pageout_burst_active_throttle = 0;
365 unsigned int vm_pageout_burst_inactive_throttle = 0;
366
367 int vm_upl_wait_for_pages = 0;
368
369
370 /*
371 * These variables record the pageout daemon's actions:
372 * how many pages it looks at and what happens to those pages.
373 * No locking needed because only one thread modifies the variables.
374 */
375
376 unsigned int vm_pageout_active = 0; /* debugging */
377 unsigned int vm_pageout_inactive = 0; /* debugging */
378 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
379 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
380 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
381 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
382 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
383 unsigned int vm_pageout_inactive_error = 0; /* debugging */
384 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
385 unsigned int vm_pageout_inactive_notalive = 0; /* debugging */
386 unsigned int vm_pageout_inactive_used = 0; /* debugging */
387 unsigned int vm_pageout_cache_evicted = 0; /* debugging */
388 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
389 unsigned int vm_pageout_speculative_clean = 0; /* debugging */
390 unsigned int vm_pageout_speculative_dirty = 0; /* debugging */
391
392 unsigned int vm_pageout_freed_from_cleaned = 0;
393 unsigned int vm_pageout_freed_from_speculative = 0;
394 unsigned int vm_pageout_freed_from_inactive_clean = 0;
395 unsigned int vm_pageout_freed_after_compression = 0;
396
397 extern uint32_t vm_compressor_pages_grabbed;
398 extern uint32_t c_segment_pages_compressed;
399
400 unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
401
402 unsigned int vm_pageout_cleaned_reclaimed = 0; /* debugging; how many cleaned pages are reclaimed by the pageout scan */
403 unsigned int vm_pageout_cleaned_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
404 unsigned int vm_pageout_cleaned_reference_reactivated = 0;
405 unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
406 unsigned int vm_pageout_cleaned_fault_reactivated = 0;
407 unsigned int vm_pageout_cleaned_commit_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
408 unsigned int vm_pageout_cleaned_busy = 0;
409 unsigned int vm_pageout_cleaned_nolock = 0;
410
411 unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */
412 unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */
413 unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
414 unsigned int vm_pageout_inactive_anonymous = 0; /* debugging */
415 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
416 unsigned int vm_pageout_purged_objects = 0; /* used for sysctl vm stats */
417 unsigned int vm_stat_discard = 0; /* debugging */
418 unsigned int vm_stat_discard_sent = 0; /* debugging */
419 unsigned int vm_stat_discard_failure = 0; /* debugging */
420 unsigned int vm_stat_discard_throttle = 0; /* debugging */
421 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
422 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
423
424 unsigned int vm_pageout_scan_reclaimed_throttled = 0;
425 unsigned int vm_pageout_scan_active_throttled = 0;
426 unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
427 unsigned int vm_pageout_scan_inactive_throttled_external = 0;
428 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
429 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
430 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
431 unsigned int vm_pageout_scan_swap_throttle = 0; /* debugging */
432 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
433 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
434 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
435 unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0; /* debugging */
436 unsigned int vm_pageout_scan_throttle_deferred = 0; /* debugging */
437 unsigned int vm_pageout_scan_yield_unthrottled = 0; /* debugging */
438 unsigned int vm_page_speculative_count_drifts = 0;
439 unsigned int vm_page_speculative_count_drift_max = 0;
440
441 uint32_t vm_compressor_failed;
442
443 /*
444 * Backing store throttle when BS is exhausted
445 */
446 unsigned int vm_backing_store_low = 0;
447
448 unsigned int vm_pageout_out_of_line = 0;
449 unsigned int vm_pageout_in_place = 0;
450
451 unsigned int vm_page_steal_pageout_page = 0;
452
453 struct vm_config vm_config;
454
455 struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
456 struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
457
458 unsigned int vm_page_speculative_target = 0;
459
460 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
461
462 boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
463
464 #if DEVELOPMENT || DEBUG
465 unsigned long vm_cs_validated_resets = 0;
466 #endif
467
468 int vm_debug_events = 0;
469
470 #if CONFIG_MEMORYSTATUS
471 #if !CONFIG_JETSAM
472 extern boolean_t memorystatus_idle_exit_from_VM(void);
473 #endif
474 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
475 extern void memorystatus_on_pageout_scan_end(void);
476
477 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
478 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
479 #if DEVELOPMENT || DEBUG
480 uint32_t vm_grab_anon_overrides = 0;
481 uint32_t vm_grab_anon_nops = 0;
482 #endif
483
484 #endif
485
486 #if MACH_CLUSTER_STATS
487 unsigned long vm_pageout_cluster_dirtied = 0;
488 unsigned long vm_pageout_cluster_cleaned = 0;
489 unsigned long vm_pageout_cluster_collisions = 0;
490 unsigned long vm_pageout_cluster_clusters = 0;
491 unsigned long vm_pageout_cluster_conversions = 0;
492 unsigned long vm_pageout_target_collisions = 0;
493 unsigned long vm_pageout_target_page_dirtied = 0;
494 unsigned long vm_pageout_target_page_freed = 0;
495 #define CLUSTER_STAT(clause) clause
496 #else /* MACH_CLUSTER_STATS */
497 #define CLUSTER_STAT(clause)
498 #endif /* MACH_CLUSTER_STATS */
499
500
501 #if DEVELOPMENT || DEBUG
502 vmct_stats_t vmct_stats;
503 #endif
504
505 /*
506 * Routine: vm_pageout_object_terminate
507 * Purpose:
508 * Destroy the pageout_object, and perform all of the
509 * required cleanup actions.
510 *
511 * In/Out conditions:
512 * The object must be locked, and will be returned locked.
513 */
514 void
515 vm_pageout_object_terminate(
516 vm_object_t object)
517 {
518 vm_object_t shadow_object;
519
520 /*
521 * Deal with the deallocation (last reference) of a pageout object
522 * (used for cleaning-in-place) by dropping the paging references/
523 * freeing pages in the original object.
524 */
525
526 assert(object->pageout);
527 shadow_object = object->shadow;
528 vm_object_lock(shadow_object);
529
530 while (!vm_page_queue_empty(&object->memq)) {
531 vm_page_t p, m;
532 vm_object_offset_t offset;
533
534 p = (vm_page_t) vm_page_queue_first(&object->memq);
535
536 assert(p->private);
537 assert(p->free_when_done);
538 p->free_when_done = FALSE;
539 assert(!p->cleaning);
540 assert(!p->laundry);
541
542 offset = p->offset;
543 VM_PAGE_FREE(p);
544 p = VM_PAGE_NULL;
545
546 m = vm_page_lookup(shadow_object,
547 offset + object->vo_shadow_offset);
548
549 if(m == VM_PAGE_NULL)
550 continue;
551
552 assert((m->dirty) || (m->precious) ||
553 (m->busy && m->cleaning));
554
555 /*
556 * Handle the trusted pager throttle.
557 * Also decrement the burst throttle (if external).
558 */
559 vm_page_lock_queues();
560 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)
561 vm_pageout_throttle_up(m);
562
563 /*
564 * Handle the "target" page(s). These pages are to be freed if
565 * successfully cleaned. Target pages are always busy, and are
566 * wired exactly once. The initial target pages are not mapped,
567 * (so cannot be referenced or modified) but converted target
568 * pages may have been modified between the selection as an
569 * adjacent page and conversion to a target.
570 */
571 if (m->free_when_done) {
572 assert(m->busy);
573 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
574 assert(m->wire_count == 1);
575 m->cleaning = FALSE;
576 m->free_when_done = FALSE;
577 #if MACH_CLUSTER_STATS
578 if (m->wanted) vm_pageout_target_collisions++;
579 #endif
580 /*
581 * Revoke all access to the page. Since the object is
582 * locked, and the page is busy, this prevents the page
583 * from being dirtied after the pmap_disconnect() call
584 * returns.
585 *
586 * Since the page is left "dirty" but "not modified", we
587 * can detect whether the page was redirtied during
588 * pageout by checking the modify state.
589 */
590 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
591 SET_PAGE_DIRTY(m, FALSE);
592 } else {
593 m->dirty = FALSE;
594 }
595
596 if (m->dirty) {
597 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
598 vm_page_unwire(m, TRUE); /* reactivates */
599 VM_STAT_INCR(reactivations);
600 PAGE_WAKEUP_DONE(m);
601 } else {
602 CLUSTER_STAT(vm_pageout_target_page_freed++;)
603 vm_page_free(m);/* clears busy, etc. */
604 }
605 vm_page_unlock_queues();
606 continue;
607 }
608 /*
609 * Handle the "adjacent" pages. These pages were cleaned in
610 * place, and should be left alone.
611 * If the page was referenced, make it active; otherwise
612 * deactivate it.
613 */
614 if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) {
615 if (m->reference)
616 vm_page_activate(m);
617 else
618 vm_page_deactivate(m);
619 }
620 if (m->overwriting) {
621 /*
622 * the (COPY_OUT_FROM == FALSE) request_page_list case
623 */
624 if (m->busy) {
625 /*
626 * We do not re-set m->dirty !
627 * The page was busy so no extraneous activity
628 * could have occurred. COPY_INTO is a read into the
629 * new pages. CLEAN_IN_PLACE does actually write
630 * out the pages but handling outside of this code
631 * will take care of resetting dirty. We clear the
632 * modify however for the Programmed I/O case.
633 */
634 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
635
636 m->busy = FALSE;
637 m->absent = FALSE;
638 } else {
639 /*
640 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
641 * Occurs when the original page was wired
642 * at the time of the list request
643 */
644 assert(VM_PAGE_WIRED(m));
645 vm_page_unwire(m, TRUE); /* reactivates */
646 }
647 m->overwriting = FALSE;
648 } else {
649 /*
650 * Set the dirty state according to whether or not the page was
651 * modified during the pageout. Note that we purposefully do
652 * NOT call pmap_clear_modify since the page is still mapped.
653 * If the page were to be dirtied between the 2 calls, this
654 * fact would be lost. This code is only necessary to
655 * maintain statistics, since the pmap module is always
656 * consulted if m->dirty is false.
657 */
658 #if MACH_CLUSTER_STATS
659 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
660
661 if (m->dirty) vm_pageout_cluster_dirtied++;
662 else vm_pageout_cluster_cleaned++;
663 if (m->wanted) vm_pageout_cluster_collisions++;
664 #else
665 m->dirty = FALSE;
666 #endif
667 }
668 m->cleaning = FALSE;
669
670 /*
671 * Wakeup any thread waiting for the page to come out of the cleaning state.
672 */
673 PAGE_WAKEUP(m);
674 vm_page_unlock_queues();
675 }
676 /*
677 * Account for the paging reference taken in vm_paging_object_allocate.
678 */
679 vm_object_activity_end(shadow_object);
680 vm_object_unlock(shadow_object);
681
682 assert(object->ref_count == 0);
683 assert(object->paging_in_progress == 0);
684 assert(object->activity_in_progress == 0);
685 assert(object->resident_page_count == 0);
686 return;
687 }
688
689 /*
690 * Routine: vm_pageclean_setup
691 *
692 * Purpose: setup a page to be cleaned (made non-dirty), but not
693 * necessarily flushed from the VM page cache.
694 * This is accomplished by cleaning in place.
695 *
696 * The page must not be busy, and new_object
697 * must be locked.
698 *
699 */
700 static void
701 vm_pageclean_setup(
702 vm_page_t m,
703 vm_page_t new_m,
704 vm_object_t new_object,
705 vm_object_offset_t new_offset)
706 {
707 assert(!m->busy);
708 #if 0
709 assert(!m->cleaning);
710 #endif
711
712 XPR(XPR_VM_PAGEOUT,
713 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
714 VM_PAGE_OBJECT(m), m->offset, m,
715 new_m, new_offset);
716
717 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
718
719 /*
720 * Mark original page as cleaning in place.
721 */
722 m->cleaning = TRUE;
723 SET_PAGE_DIRTY(m, FALSE);
724 m->precious = FALSE;
725
726 /*
727 * Convert the fictitious page to a private shadow of
728 * the real page.
729 */
730 assert(new_m->fictitious);
731 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
732 new_m->fictitious = FALSE;
733 new_m->private = TRUE;
734 new_m->free_when_done = TRUE;
735 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
736
737 vm_page_lockspin_queues();
738 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
739 vm_page_unlock_queues();
740
741 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
742 assert(!new_m->wanted);
743 new_m->busy = FALSE;
744 }
745
746 /*
747 * Routine: vm_pageout_initialize_page
748 * Purpose:
749 * Causes the specified page to be initialized in
750 * the appropriate memory object. This routine is used to push
751 * pages into a copy-object when they are modified in the
752 * permanent object.
753 *
754 * The page is moved to a temporary object and paged out.
755 *
756 * In/out conditions:
757 * The page in question must not be on any pageout queues.
758 * The object to which it belongs must be locked.
759 * The page must be busy, but not hold a paging reference.
760 *
761 * Implementation:
762 * Move this page to a completely new object.
763 */
764 void
765 vm_pageout_initialize_page(
766 vm_page_t m)
767 {
768 vm_object_t object;
769 vm_object_offset_t paging_offset;
770 memory_object_t pager;
771
772 XPR(XPR_VM_PAGEOUT,
773 "vm_pageout_initialize_page, page 0x%X\n",
774 m, 0, 0, 0, 0);
775
776 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
777
778 object = VM_PAGE_OBJECT(m);
779
780 assert(m->busy);
781 assert(object->internal);
782
783 /*
784 * Verify that we really want to clean this page
785 */
786 assert(!m->absent);
787 assert(!m->error);
788 assert(m->dirty);
789
790 /*
791 * Create a paging reference to let us play with the object.
792 */
793 paging_offset = m->offset + object->paging_offset;
794
795 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
796 panic("reservation without pageout?"); /* alan */
797
798 VM_PAGE_FREE(m);
799 vm_object_unlock(object);
800
801 return;
802 }
803
804 /*
805 * If there's no pager, then we can't clean the page. This should
806 * never happen since this should be a copy object and therefore not
807 * an external object, so the pager should always be there.
808 */
809
810 pager = object->pager;
811
812 if (pager == MEMORY_OBJECT_NULL) {
813 panic("missing pager for copy object");
814
815 VM_PAGE_FREE(m);
816 return;
817 }
818
819 /*
820 * set the page for future call to vm_fault_list_request
821 */
822 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
823 SET_PAGE_DIRTY(m, FALSE);
824
825 /*
826 * keep the object from collapsing or terminating
827 */
828 vm_object_paging_begin(object);
829 vm_object_unlock(object);
830
831 /*
832 * Write the data to its pager.
833 * Note that the data is passed by naming the new object,
834 * not a virtual address; the pager interface has been
835 * manipulated to use the "internal memory" data type.
836 * [The object reference from its allocation is donated
837 * to the eventual recipient.]
838 */
839 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
840
841 vm_object_lock(object);
842 vm_object_paging_end(object);
843 }
844
845 #if MACH_CLUSTER_STATS
846 #define MAXCLUSTERPAGES 16
847 struct {
848 unsigned long pages_in_cluster;
849 unsigned long pages_at_higher_offsets;
850 unsigned long pages_at_lower_offsets;
851 } cluster_stats[MAXCLUSTERPAGES];
852 #endif /* MACH_CLUSTER_STATS */
853
854
855 /*
856 * vm_pageout_cluster:
857 *
858 * Given a page, queue it to the appropriate I/O thread,
859 * which will page it out and attempt to clean adjacent pages
860 * in the same operation.
861 *
862 * The object and queues must be locked. We will take a
863 * paging reference to prevent deallocation or collapse when we
864 * release the object lock back at the call site. The I/O thread
865 * is responsible for consuming this reference.
866 *
867 * The page must not be on any pageout queue.
868 */
869 int32_t vmct_active = 0;
870 typedef enum vmct_state_t {
871 VMCT_IDLE,
872 VMCT_AWAKENED,
873 VMCT_ACTIVE,
874 } vmct_state_t;
875 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
876
877 void
878 vm_pageout_cluster(vm_page_t m)
879 {
880 vm_object_t object = VM_PAGE_OBJECT(m);
881 struct vm_pageout_queue *q;
882
883
884 XPR(XPR_VM_PAGEOUT,
885 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
886 object, m->offset, m, 0, 0);
887
888 VM_PAGE_CHECK(m);
889 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
890 vm_object_lock_assert_exclusive(object);
891
892 /*
893 * Only a certain kind of page is appreciated here.
894 */
895 assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
896 assert(!m->cleaning && !m->laundry);
897 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
898
899 /*
900 * protect the object from collapse or termination
901 */
902 vm_object_activity_begin(object);
903
904 if (object->internal == TRUE) {
905 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
906
907 m->busy = TRUE;
908
909 q = &vm_pageout_queue_internal;
910 } else
911 q = &vm_pageout_queue_external;
912
913 /*
914 * pgo_laundry count is tied to the laundry bit
915 */
916 m->laundry = TRUE;
917 q->pgo_laundry++;
918
919 m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q;
920 vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
921
922 if (q->pgo_idle == TRUE) {
923 q->pgo_idle = FALSE;
924 thread_wakeup((event_t) &q->pgo_pending);
925 }
926 VM_PAGE_CHECK(m);
927 }
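/*
 * Illustrative sketch (not compiled) of the calling convention
 * documented above vm_pageout_cluster(): both the page's object and
 * the page queues must be locked, and the page must not already be on
 * a pageout queue.  The helper name is hypothetical; real callers
 * (e.g. vm_pageout_page_queue() further down) do considerably more
 * bookkeeping.
 */
#if 0
static void
vm_pageout_cluster_usage_sketch(vm_page_t m)
{
	vm_object_t object = VM_PAGE_OBJECT(m);

	vm_object_lock(object);		/* exclusive object lock */
	vm_page_lock_queues();		/* page queues lock */

	assert(m->dirty || m->precious);
	assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);

	vm_pageout_cluster(m);		/* takes a paging reference that the
					 * I/O thread later consumes */

	vm_page_unlock_queues();
	vm_object_unlock(object);
}
#endif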
928
929
930 unsigned long vm_pageout_throttle_up_count = 0;
931
932 /*
933 * A page is back from laundry or we are stealing it back from
934 * the laundering state. See if there are some pages waiting to
935 * go to laundry and if we can let some of them go now.
936 *
937 * Object and page queues must be locked.
938 */
939 void
940 vm_pageout_throttle_up(
941 vm_page_t m)
942 {
943 struct vm_pageout_queue *q;
944 vm_object_t m_object;
945
946 m_object = VM_PAGE_OBJECT(m);
947
948 assert(m_object != VM_OBJECT_NULL);
949 assert(m_object != kernel_object);
950
951 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
952 vm_object_lock_assert_exclusive(m_object);
953
954 vm_pageout_throttle_up_count++;
955
956 if (m_object->internal == TRUE)
957 q = &vm_pageout_queue_internal;
958 else
959 q = &vm_pageout_queue_external;
960
961 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
962
963 vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
964 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
965
966 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
967
968 vm_object_activity_end(m_object);
969 }
970 if (m->laundry == TRUE) {
971
972 m->laundry = FALSE;
973 q->pgo_laundry--;
974
975 if (q->pgo_throttled == TRUE) {
976 q->pgo_throttled = FALSE;
977 thread_wakeup((event_t) &q->pgo_laundry);
978 }
979 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
980 q->pgo_draining = FALSE;
981 thread_wakeup((event_t) (&q->pgo_laundry+1));
982 }
983 }
984 }
985
986
987 static void
988 vm_pageout_throttle_up_batch(
989 struct vm_pageout_queue *q,
990 int batch_cnt)
991 {
992 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
993
994 vm_pageout_throttle_up_count += batch_cnt;
995
996 q->pgo_laundry -= batch_cnt;
997
998 if (q->pgo_throttled == TRUE) {
999 q->pgo_throttled = FALSE;
1000 thread_wakeup((event_t) &q->pgo_laundry);
1001 }
1002 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
1003 q->pgo_draining = FALSE;
1004 thread_wakeup((event_t) (&q->pgo_laundry+1));
1005 }
1006 }
1007
1008
1009
1010 /*
1011 * VM memory pressure monitoring.
1012 *
1013 * vm_pageout_scan() keeps track of the number of pages it considers and
1014 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
1015 *
1016 * compute_memory_pressure() is called every second from compute_averages()
1017 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
1018 * of reclaimed pages in a new vm_pageout_stat[] bucket.
1019 *
1020 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
1021 * The caller provides the number of seconds ("nsecs") worth of statistics
1022 * it wants, up to 30 seconds.
1023 * It computes the number of pages reclaimed in the past "nsecs" seconds and
1024 * also returns the number of pages the system still needs to reclaim at this
1025 * moment in time.
1026 */
1027 #define VM_PAGEOUT_STAT_SIZE 31
1028 struct vm_pageout_stat {
1029 unsigned int considered;
1030 unsigned int reclaimed_clean;
1031 unsigned int pages_compressed;
1032 unsigned int pages_grabbed_by_compressor;
1033 unsigned int cleaned_dirty_external;
1034 unsigned int throttled_internal_q;
1035 unsigned int throttled_external_q;
1036 unsigned int failed_compressions;
1037 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0,0,0,0,0,0,0}, };
1038
1039 unsigned int vm_pageout_stat_now = 0;
1040 unsigned int vm_memory_pressure = 0;
1041
1042 #define VM_PAGEOUT_STAT_BEFORE(i) \
1043 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
1044 #define VM_PAGEOUT_STAT_AFTER(i) \
1045 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
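/*
 * Illustrative sketch (not compiled): how the ring buffer above is
 * read back.  Walk backwards from the live bucket with
 * VM_PAGEOUT_STAT_BEFORE(), summing one bucket per second of history
 * wanted; this mirrors the loop in mach_vm_pressure_monitor() further
 * down.  The helper name is hypothetical.
 */
#if 0
static unsigned int
vm_pageout_stats_reclaimed_last(unsigned int nsecs)
{
	unsigned int now = vm_pageout_stat_now;
	unsigned int then;
	unsigned int total = 0;

	for (then = VM_PAGEOUT_STAT_BEFORE(now);
	     then != now && nsecs-- != 0;
	     then = VM_PAGEOUT_STAT_BEFORE(then)) {
		total += vm_pageout_stats[then].reclaimed_clean;
	}
	return total;
}
#endif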
1046
1047 #if VM_PAGE_BUCKETS_CHECK
1048 int vm_page_buckets_check_interval = 10; /* in seconds */
1049 #endif /* VM_PAGE_BUCKETS_CHECK */
1050
1051 /*
1052 * Called from compute_averages().
1053 */
1054 void
1055 compute_memory_pressure(
1056 __unused void *arg)
1057 {
1058 unsigned int vm_pageout_next;
1059
1060 #if VM_PAGE_BUCKETS_CHECK
1061 /* check the consistency of VM page buckets at regular interval */
1062 static int counter = 0;
1063 if ((++counter % vm_page_buckets_check_interval) == 0) {
1064 vm_page_buckets_check();
1065 }
1066 #endif /* VM_PAGE_BUCKETS_CHECK */
1067
1068 vm_memory_pressure =
1069 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed_clean;
1070
1071 commpage_set_memory_pressure( vm_memory_pressure );
1072
1073 /* move "now" forward */
1074 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
1075 vm_pageout_stats[vm_pageout_next].considered = 0;
1076 vm_pageout_stats[vm_pageout_next].reclaimed_clean = 0;
1077 vm_pageout_stats[vm_pageout_next].throttled_internal_q = 0;
1078 vm_pageout_stats[vm_pageout_next].throttled_external_q = 0;
1079 vm_pageout_stats[vm_pageout_next].cleaned_dirty_external = 0;
1080 vm_pageout_stats[vm_pageout_next].pages_compressed = 0;
1081 vm_pageout_stats[vm_pageout_next].pages_grabbed_by_compressor = 0;
1082 vm_pageout_stats[vm_pageout_next].failed_compressions = 0;
1083
1084 vm_pageout_stat_now = vm_pageout_next;
1085 }
1086
1087
1088 /*
1089 * IMPORTANT
1090 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1091 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1092 * it must be safe in the restricted stackshot context. Locks and/or
1093 * blocking are not allowable.
1094 */
1095 unsigned int
1096 mach_vm_ctl_page_free_wanted(void)
1097 {
1098 unsigned int page_free_target, page_free_count, page_free_wanted;
1099
1100 page_free_target = vm_page_free_target;
1101 page_free_count = vm_page_free_count;
1102 if (page_free_target > page_free_count) {
1103 page_free_wanted = page_free_target - page_free_count;
1104 } else {
1105 page_free_wanted = 0;
1106 }
1107
1108 return page_free_wanted;
1109 }
1110
1111
1112 /*
1113 * IMPORTANT:
1114 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1115 * wait_for_pressure FALSE, so that code path must remain safe in the
1116 * restricted stackshot context. No blocking or locks are allowable.
1117 * on that code path.
1118 */
1119
1120 kern_return_t
1121 mach_vm_pressure_monitor(
1122 boolean_t wait_for_pressure,
1123 unsigned int nsecs_monitored,
1124 unsigned int *pages_reclaimed_p,
1125 unsigned int *pages_wanted_p)
1126 {
1127 wait_result_t wr;
1128 unsigned int vm_pageout_then, vm_pageout_now;
1129 unsigned int pages_reclaimed;
1130
1131 /*
1132 * We don't take the vm_page_queue_lock here because we don't want
1133 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1134 * thread when it's trying to reclaim memory. We don't need fully
1135 * accurate monitoring anyway...
1136 */
1137
1138 if (wait_for_pressure) {
1139 /* wait until there's memory pressure */
1140 while (vm_page_free_count >= vm_page_free_target) {
1141 wr = assert_wait((event_t) &vm_page_free_wanted,
1142 THREAD_INTERRUPTIBLE);
1143 if (wr == THREAD_WAITING) {
1144 wr = thread_block(THREAD_CONTINUE_NULL);
1145 }
1146 if (wr == THREAD_INTERRUPTED) {
1147 return KERN_ABORTED;
1148 }
1149 if (wr == THREAD_AWAKENED) {
1150 /*
1151 * The memory pressure might have already
1152 * been relieved but let's not block again
1153 * and let's report that there was memory
1154 * pressure at some point.
1155 */
1156 break;
1157 }
1158 }
1159 }
1160
1161 /* provide the number of pages the system wants to reclaim */
1162 if (pages_wanted_p != NULL) {
1163 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1164 }
1165
1166 if (pages_reclaimed_p == NULL) {
1167 return KERN_SUCCESS;
1168 }
1169
1170 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1171 vm_pageout_now = vm_pageout_stat_now;
1172 pages_reclaimed = 0;
1173 for (vm_pageout_then =
1174 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1175 vm_pageout_then != vm_pageout_now &&
1176 nsecs_monitored-- != 0;
1177 vm_pageout_then =
1178 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1179 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed_clean;
1180 }
1181 *pages_reclaimed_p = pages_reclaimed;
1182
1183 return KERN_SUCCESS;
1184 }
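/*
 * Illustrative sketch (not compiled): minimal in-kernel usage of the
 * interface above, asking for the last 10 seconds of reclaim history
 * without blocking.  The helper and local variable names are
 * hypothetical.
 */
#if 0
static void
vm_pressure_monitor_usage_sketch(void)
{
	unsigned int pages_reclaimed = 0;
	unsigned int pages_wanted = 0;
	kern_return_t kr;

	kr = mach_vm_pressure_monitor(FALSE,	/* wait_for_pressure */
				      10,	/* nsecs_monitored */
				      &pages_reclaimed,
				      &pages_wanted);
	if (kr == KERN_SUCCESS) {
		/* pages_wanted == 0 means the free target is currently met */
	}
}
#endif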
1185
1186
1187
1188 #if DEVELOPMENT || DEBUG
1189
1190 static void
1191 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1192
1193 /*
1194 * condition variable used to make sure there is
1195 * only a single sweep going on at a time
1196 */
1197 boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1198
1199
1200 void
1201 vm_pageout_disconnect_all_pages()
1202 {
1203 vm_page_lock_queues();
1204
1205 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1206 vm_page_unlock_queues();
1207 return;
1208 }
1209 vm_pageout_disconnect_all_pages_active = TRUE;
1210 vm_page_unlock_queues();
1211
1212 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1213 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1214 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1215
1216 vm_pageout_disconnect_all_pages_active = FALSE;
1217 }
1218
1219
1220 void
1221 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1222 {
1223 vm_page_t m;
1224 vm_object_t t_object = NULL;
1225 vm_object_t l_object = NULL;
1226 vm_object_t m_object = NULL;
1227 int delayed_unlock = 0;
1228 int try_failed_count = 0;
1229 int disconnected_count = 0;
1230 int paused_count = 0;
1231 int object_locked_count = 0;
1232
1233 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1234 q, qcount, 0, 0, 0);
1235
1236 vm_page_lock_queues();
1237
1238 while (qcount && !vm_page_queue_empty(q)) {
1239
1240 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1241
1242 m = (vm_page_t) vm_page_queue_first(q);
1243 m_object = VM_PAGE_OBJECT(m);
1244
1245 /*
1246 * check to see if we currently are working
1247 * with the same object... if so, we've
1248 * already got the lock
1249 */
1250 if (m_object != l_object) {
1251 /*
1252 * the object associated with candidate page is
1253 * different from the one we were just working
1254 * with... dump the lock if we still own it
1255 */
1256 if (l_object != NULL) {
1257 vm_object_unlock(l_object);
1258 l_object = NULL;
1259 }
1260 if (m_object != t_object)
1261 try_failed_count = 0;
1262
1263 /*
1264 * Try to lock object; since we've already got the
1265 * page queues lock, we can only 'try' for this one.
1266 * if the 'try' fails, we need to do a mutex_pause
1267 * to allow the owner of the object lock a chance to
1268 * run...
1269 */
1270 if ( !vm_object_lock_try_scan(m_object)) {
1271
1272 if (try_failed_count > 20) {
1273 goto reenter_pg_on_q;
1274 }
1275 vm_page_unlock_queues();
1276 mutex_pause(try_failed_count++);
1277 vm_page_lock_queues();
1278 delayed_unlock = 0;
1279
1280 paused_count++;
1281
1282 t_object = m_object;
1283 continue;
1284 }
1285 object_locked_count++;
1286
1287 l_object = m_object;
1288 }
1289 if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1290 /*
1291 * put it back on the head of its queue
1292 */
1293 goto reenter_pg_on_q;
1294 }
1295 if (m->pmapped == TRUE) {
1296
1297 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1298
1299 disconnected_count++;
1300 }
1301 reenter_pg_on_q:
1302 vm_page_queue_remove(q, m, vm_page_t, pageq);
1303 vm_page_queue_enter(q, m, vm_page_t, pageq);
1304
1305 qcount--;
1306 try_failed_count = 0;
1307
1308 if (delayed_unlock++ > 128) {
1309
1310 if (l_object != NULL) {
1311 vm_object_unlock(l_object);
1312 l_object = NULL;
1313 }
1314 lck_mtx_yield(&vm_page_queue_lock);
1315 delayed_unlock = 0;
1316 }
1317 }
1318 if (l_object != NULL) {
1319 vm_object_unlock(l_object);
1320 l_object = NULL;
1321 }
1322 vm_page_unlock_queues();
1323
1324 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1325 q, disconnected_count, object_locked_count, paused_count, 0);
1326 }
1327
1328 #endif
1329
1330
1331 static void
1332 vm_pageout_page_queue(vm_page_queue_head_t *, int);
1333
1334 /*
1335 * condition variable used to make sure there is
1336 * only a single sweep going on at a time
1337 */
1338 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1339
1340
1341 void
1342 vm_pageout_anonymous_pages()
1343 {
1344 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1345
1346 vm_page_lock_queues();
1347
1348 if (vm_pageout_anonymous_pages_active == TRUE) {
1349 vm_page_unlock_queues();
1350 return;
1351 }
1352 vm_pageout_anonymous_pages_active = TRUE;
1353 vm_page_unlock_queues();
1354
1355 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1356 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1357 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1358
1359 if (VM_CONFIG_SWAP_IS_PRESENT)
1360 vm_consider_swapping();
1361
1362 vm_page_lock_queues();
1363 vm_pageout_anonymous_pages_active = FALSE;
1364 vm_page_unlock_queues();
1365 }
1366 }
1367
1368
1369 void
1370 vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
1371 {
1372 vm_page_t m;
1373 vm_object_t t_object = NULL;
1374 vm_object_t l_object = NULL;
1375 vm_object_t m_object = NULL;
1376 int delayed_unlock = 0;
1377 int try_failed_count = 0;
1378 int refmod_state;
1379 int pmap_options;
1380 struct vm_pageout_queue *iq;
1381 ppnum_t phys_page;
1382
1383
1384 iq = &vm_pageout_queue_internal;
1385
1386 vm_page_lock_queues();
1387
1388 while (qcount && !vm_page_queue_empty(q)) {
1389
1390 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1391
1392 if (VM_PAGE_Q_THROTTLED(iq)) {
1393
1394 if (l_object != NULL) {
1395 vm_object_unlock(l_object);
1396 l_object = NULL;
1397 }
1398 iq->pgo_draining = TRUE;
1399
1400 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1401 vm_page_unlock_queues();
1402
1403 thread_block(THREAD_CONTINUE_NULL);
1404
1405 vm_page_lock_queues();
1406 delayed_unlock = 0;
1407 continue;
1408 }
1409 m = (vm_page_t) vm_page_queue_first(q);
1410 m_object = VM_PAGE_OBJECT(m);
1411
1412 /*
1413 * check to see if we currently are working
1414 * with the same object... if so, we've
1415 * already got the lock
1416 */
1417 if (m_object != l_object) {
1418 if ( !m_object->internal)
1419 goto reenter_pg_on_q;
1420
1421 /*
1422 * the object associated with candidate page is
1423 * different from the one we were just working
1424 * with... dump the lock if we still own it
1425 */
1426 if (l_object != NULL) {
1427 vm_object_unlock(l_object);
1428 l_object = NULL;
1429 }
1430 if (m_object != t_object)
1431 try_failed_count = 0;
1432
1433 /*
1434 * Try to lock object; since we've already got the
1435 * page queues lock, we can only 'try' for this one.
1436 * if the 'try' fails, we need to do a mutex_pause
1437 * to allow the owner of the object lock a chance to
1438 * run...
1439 */
1440 if ( !vm_object_lock_try_scan(m_object)) {
1441
1442 if (try_failed_count > 20) {
1443 goto reenter_pg_on_q;
1444 }
1445 vm_page_unlock_queues();
1446 mutex_pause(try_failed_count++);
1447 vm_page_lock_queues();
1448 delayed_unlock = 0;
1449
1450 t_object = m_object;
1451 continue;
1452 }
1453 l_object = m_object;
1454 }
1455 if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1456 /*
1457 * page is not to be cleaned
1458 * put it back on the head of its queue
1459 */
1460 goto reenter_pg_on_q;
1461 }
1462 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1463
1464 if (m->reference == FALSE && m->pmapped == TRUE) {
1465 refmod_state = pmap_get_refmod(phys_page);
1466
1467 if (refmod_state & VM_MEM_REFERENCED)
1468 m->reference = TRUE;
1469 if (refmod_state & VM_MEM_MODIFIED) {
1470 SET_PAGE_DIRTY(m, FALSE);
1471 }
1472 }
1473 if (m->reference == TRUE) {
1474 m->reference = FALSE;
1475 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1476 goto reenter_pg_on_q;
1477 }
1478 if (m->pmapped == TRUE) {
1479 if (m->dirty || m->precious) {
1480 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1481 } else {
1482 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1483 }
1484 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1485 if (refmod_state & VM_MEM_MODIFIED) {
1486 SET_PAGE_DIRTY(m, FALSE);
1487 }
1488 }
1489 if ( !m->dirty && !m->precious) {
1490 vm_page_unlock_queues();
1491 VM_PAGE_FREE(m);
1492 vm_page_lock_queues();
1493 delayed_unlock = 0;
1494
1495 goto next_pg;
1496 }
1497 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1498
1499 if (!m_object->pager_initialized) {
1500
1501 vm_page_unlock_queues();
1502
1503 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1504
1505 if (!m_object->pager_initialized)
1506 vm_object_compressor_pager_create(m_object);
1507
1508 vm_page_lock_queues();
1509 delayed_unlock = 0;
1510 }
1511 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL)
1512 goto reenter_pg_on_q;
1513 /*
1514 * vm_object_compressor_pager_create will drop the object lock
1515 * which means 'm' may no longer be valid to use
1516 */
1517 continue;
1518 }
1519 /*
1520 * we've already factored out pages in the laundry which
1521 * means this page can't be on the pageout queue so it's
1522 * safe to do the vm_page_queues_remove
1523 */
1524 vm_page_queues_remove(m, TRUE);
1525
1526 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1527
1528 vm_pageout_cluster(m);
1529
1530 goto next_pg;
1531
1532 reenter_pg_on_q:
1533 vm_page_queue_remove(q, m, vm_page_t, pageq);
1534 vm_page_queue_enter(q, m, vm_page_t, pageq);
1535 next_pg:
1536 qcount--;
1537 try_failed_count = 0;
1538
1539 if (delayed_unlock++ > 128) {
1540
1541 if (l_object != NULL) {
1542 vm_object_unlock(l_object);
1543 l_object = NULL;
1544 }
1545 lck_mtx_yield(&vm_page_queue_lock);
1546 delayed_unlock = 0;
1547 }
1548 }
1549 if (l_object != NULL) {
1550 vm_object_unlock(l_object);
1551 l_object = NULL;
1552 }
1553 vm_page_unlock_queues();
1554 }
1555
1556
1557
1558 /*
1559 * function in BSD to apply I/O throttle to the pageout thread
1560 */
1561 extern void vm_pageout_io_throttle(void);
1562
1563 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
1564 MACRO_BEGIN \
1565 /* \
1566 * If a "reusable" page somehow made it back into \
1567 * the active queue, it's been re-used and is not \
1568 * quite re-usable. \
1569 * If the VM object was "all_reusable", consider it \
1570 * as "all re-used" instead of converting it to \
1571 * "partially re-used", which could be expensive. \
1572 */ \
1573 assert(VM_PAGE_OBJECT((m)) == (obj)); \
1574 if ((m)->reusable || \
1575 (obj)->all_reusable) { \
1576 vm_object_reuse_pages((obj), \
1577 (m)->offset, \
1578 (m)->offset + PAGE_SIZE_64, \
1579 FALSE); \
1580 } \
1581 MACRO_END
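/*
 * Illustrative sketch (not compiled) of how the macro above is used:
 * with the page queues lock and the page's object lock held, it is
 * applied to a candidate page so that a stray "reusable" page is
 * accounted as re-used again.  The helper name is hypothetical.
 */
#if 0
static void
vm_pageout_reusable_page_sketch(vm_page_t m)
{
	vm_object_t m_object = VM_PAGE_OBJECT(m);

	/* caller assumed to hold the page queues lock and m_object's lock */
	VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
}
#endif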
1582
1583
1584 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1585 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1586
1587 #define FCS_IDLE 0
1588 #define FCS_DELAYED 1
1589 #define FCS_DEADLOCK_DETECTED 2
1590
1591 struct flow_control {
1592 int state;
1593 mach_timespec_t ts;
1594 };
1595
1596 #if CONFIG_BACKGROUND_QUEUE
1597 uint64_t vm_pageout_skipped_bq_internal = 0;
1598 uint64_t vm_pageout_considered_bq_internal = 0;
1599 uint64_t vm_pageout_considered_bq_external = 0;
1600 uint64_t vm_pageout_rejected_bq_internal = 0;
1601 uint64_t vm_pageout_rejected_bq_external = 0;
1602 #endif
1603
1604 uint32_t vm_pageout_no_victim = 0;
1605 uint32_t vm_pageout_considered_page = 0;
1606 uint32_t vm_page_filecache_min = 0;
1607
1608 #define ANONS_GRABBED_LIMIT 2
1609
1610 #if CONFIG_SECLUDED_MEMORY
1611 extern vm_page_t vm_page_grab_secluded(void);
1612 uint64_t vm_pageout_secluded_burst_count = 0;
1613 #endif /* CONFIG_SECLUDED_MEMORY */
1614
1615
1616 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1617 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1618
1619 #define VM_PAGEOUT_PB_NO_ACTION 0
1620 #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1621 #define VM_PAGEOUT_PB_THREAD_YIELD 2
1622
1623
1624 static void
1625 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1626 {
1627 if (*local_freeq) {
1628 vm_page_unlock_queues();
1629
1630 VM_DEBUG_EVENT(
1631 vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1632 vm_page_free_count, *local_freed, 0, 1);
1633
1634 vm_page_free_list(*local_freeq, TRUE);
1635
1636 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1637 vm_page_free_count, 0, 0, 1);
1638
1639 *local_freeq = NULL;
1640 *local_freed = 0;
1641
1642 vm_page_lock_queues();
1643 } else {
1644 lck_mtx_yield(&vm_page_queue_lock);
1645 }
1646 *delayed_unlock = 1;
1647 }
1648
1649
1650 static void
1651 vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1652 vm_page_t *local_freeq, int *local_freed, int action)
1653 {
1654 vm_page_unlock_queues();
1655
1656 if (*object != NULL) {
1657 vm_object_unlock(*object);
1658 *object = NULL;
1659 }
1660 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1661
1662 if (*local_freeq) {
1663
1664 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1665 vm_page_free_count, *local_freed, 0, 2);
1666
1667 vm_page_free_list(*local_freeq, TRUE);
1668
1669 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1670 vm_page_free_count, 0, 0, 2);
1671
1672 *local_freeq = NULL;
1673 *local_freed = 0;
1674 }
1675 *delayed_unlock = 1;
1676
1677 switch (action) {
1678
1679 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1680 vm_consider_waking_compactor_swapper();
1681 break;
1682 case VM_PAGEOUT_PB_THREAD_YIELD:
1683 thread_yield_internal(1);
1684 break;
1685 case VM_PAGEOUT_PB_NO_ACTION:
1686 default:
1687 break;
1688 }
1689 vm_page_lock_queues();
1690 }
1691
1692
1693 int last_vm_pageout_freed_from_inactive_clean = 0;
1694 int last_vm_pageout_freed_from_cleaned = 0;
1695 int last_vm_pageout_freed_from_speculative = 0;
1696 int last_vm_pageout_freed_after_compression = 0;
1697 int last_vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
1698 int last_vm_pageout_inactive_force_reclaim = 0;
1699 int last_vm_pageout_scan_inactive_throttled_external = 0;
1700 int last_vm_pageout_scan_inactive_throttled_internal = 0;
1701 int last_vm_pageout_reactivation_limit_exceeded = 0;
1702 int last_vm_pageout_considered_page = 0;
1703 int last_vm_compressor_pages_grabbed = 0;
1704 int last_vm_compressor_failed = 0;
1705
1706 void update_vm_info(void)
1707 {
1708 int tmp1, tmp2, tmp3;
1709
1710 if (!kdebug_enable)
1711 return;
1712
1713 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1714 vm_page_active_count,
1715 vm_page_speculative_count,
1716 vm_page_inactive_count,
1717 vm_page_anonymous_count,
1718 0);
1719
1720 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1721 vm_page_free_count,
1722 vm_page_wire_count,
1723 VM_PAGE_COMPRESSOR_COUNT,
1724 0, 0);
1725
1726 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1727 c_segment_pages_compressed,
1728 vm_page_internal_count,
1729 vm_page_external_count,
1730 vm_page_xpmapped_external_count,
1731 0);
1732
1733
1734 if ((vm_pageout_considered_page - last_vm_pageout_considered_page) == 0 &&
1735 (vm_pageout_enqueued_cleaned_from_inactive_dirty - last_vm_pageout_enqueued_cleaned_from_inactive_dirty == 0) &&
1736 (vm_pageout_freed_after_compression - last_vm_pageout_freed_after_compression == 0))
1737 return;
1738
1739
1740 tmp1 = vm_pageout_considered_page;
1741 tmp2 = vm_pageout_freed_from_speculative;
1742 tmp3 = vm_pageout_freed_from_inactive_clean;
1743
1744 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1745 tmp1 - last_vm_pageout_considered_page,
1746 tmp2 - last_vm_pageout_freed_from_speculative,
1747 tmp3 - last_vm_pageout_freed_from_inactive_clean,
1748 0, 0);
1749
1750 last_vm_pageout_considered_page = tmp1;
1751 last_vm_pageout_freed_from_speculative = tmp2;
1752 last_vm_pageout_freed_from_inactive_clean = tmp3;
1753
1754
1755 tmp1 = vm_pageout_scan_inactive_throttled_external;
1756 tmp2 = vm_pageout_enqueued_cleaned_from_inactive_dirty;
1757 tmp3 = vm_pageout_freed_from_cleaned;
1758
1759 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1760 tmp1 - last_vm_pageout_scan_inactive_throttled_external,
1761 tmp2 - last_vm_pageout_enqueued_cleaned_from_inactive_dirty,
1762 tmp3 - last_vm_pageout_freed_from_cleaned,
1763 0, 0);
1764
1765 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q += (tmp1 - last_vm_pageout_scan_inactive_throttled_external);
1766 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external += (tmp2 - last_vm_pageout_enqueued_cleaned_from_inactive_dirty);
1767
1768 last_vm_pageout_scan_inactive_throttled_external = tmp1;
1769 last_vm_pageout_enqueued_cleaned_from_inactive_dirty = tmp2;
1770 last_vm_pageout_freed_from_cleaned = tmp3;
1771
1772
1773 tmp1 = vm_pageout_scan_inactive_throttled_internal;
1774 tmp2 = vm_pageout_freed_after_compression;
1775 tmp3 = vm_compressor_pages_grabbed;
1776
1777 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1778 tmp1 - last_vm_pageout_scan_inactive_throttled_internal,
1779 tmp2 - last_vm_pageout_freed_after_compression,
1780 tmp3 - last_vm_compressor_pages_grabbed,
1781 0, 0);
1782
1783 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q += (tmp1 - last_vm_pageout_scan_inactive_throttled_internal);
1784 vm_pageout_stats[vm_pageout_stat_now].pages_compressed += (tmp2 - last_vm_pageout_freed_after_compression);
1785 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor += (tmp3 - last_vm_compressor_pages_grabbed);
1786
1787 last_vm_pageout_scan_inactive_throttled_internal = tmp1;
1788 last_vm_pageout_freed_after_compression = tmp2;
1789 last_vm_compressor_pages_grabbed = tmp3;
1790
1791
1792 if ((vm_pageout_reactivation_limit_exceeded - last_vm_pageout_reactivation_limit_exceeded) == 0 &&
1793 (vm_pageout_inactive_force_reclaim - last_vm_pageout_inactive_force_reclaim) == 0 &&
1794 (vm_compressor_failed - last_vm_compressor_failed) == 0)
1795 return;
1796
1797 tmp1 = vm_pageout_reactivation_limit_exceeded;
1798 tmp2 = vm_pageout_inactive_force_reclaim;
1799 tmp3 = vm_compressor_failed;
1800
1801 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1802 tmp1 - last_vm_pageout_reactivation_limit_exceeded,
1803 tmp2 - last_vm_pageout_inactive_force_reclaim,
1804 tmp3 - last_vm_compressor_failed,
1805 0, 0);
1806
1807 vm_pageout_stats[vm_pageout_stat_now].failed_compressions += (tmp3 - last_vm_compressor_failed);
1808
1809 last_vm_pageout_reactivation_limit_exceeded = tmp1;
1810 last_vm_pageout_inactive_force_reclaim = tmp2;
1811 last_vm_compressor_failed = tmp3;
1812 }
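/*
 * Illustrative sketch (not compiled): the snapshot/delta pattern used by
 * update_vm_info() above, reduced to a single counter.  Each call samples
 * the running counter once, reports only the change since the previous
 * call, and saves the sample for next time.  The names counter_now and
 * emit_delta are placeholders, not kernel interfaces.
 */
#if 0
static int counter_last;

static void
emit_counter_delta(int counter_now, void (*emit_delta)(int))
{
	int tmp;

	tmp = counter_now;			/* sample once so the delta and the snapshot agree */
	emit_delta(tmp - counter_last);		/* report only the change since the last call */
	counter_last = tmp;			/* remember this sample for the next call */
}
#endif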
1813
1814
1815 /*
1816 * vm_pageout_scan does the dirty work for the pageout daemon.
1817 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
1818 * held and vm_page_free_wanted == 0.
1819 */
1820 void
1821 vm_pageout_scan(void)
1822 {
1823 unsigned int loop_count = 0;
1824 unsigned int inactive_burst_count = 0;
1825 unsigned int active_burst_count = 0;
1826 unsigned int reactivated_this_call;
1827 unsigned int reactivate_limit;
1828 vm_page_t local_freeq = NULL;
1829 int local_freed = 0;
1830 int delayed_unlock;
1831 int delayed_unlock_limit = 0;
1832 int refmod_state = 0;
1833 int vm_pageout_deadlock_target = 0;
1834 struct vm_pageout_queue *iq;
1835 struct vm_pageout_queue *eq;
1836 struct vm_speculative_age_q *sq;
1837 struct flow_control flow_control = { 0, { 0, 0 } };
1838 boolean_t inactive_throttled = FALSE;
1839 boolean_t try_failed;
1840 mach_timespec_t ts;
1841 unsigned int msecs = 0;
1842 vm_object_t object = NULL;
1843 uint32_t inactive_reclaim_run;
1844 boolean_t exceeded_burst_throttle;
1845 boolean_t grab_anonymous = FALSE;
1846 boolean_t force_anonymous = FALSE;
1847 boolean_t force_speculative_aging = FALSE;
1848 int anons_grabbed = 0;
1849 int page_prev_q_state = 0;
1850 #if CONFIG_BACKGROUND_QUEUE
1851 boolean_t page_from_bg_q = FALSE;
1852 #endif
1853 int cache_evict_throttle = 0;
1854 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
1855 int force_purge = 0;
1856 #define DELAY_SPECULATIVE_AGE 1000
1857 int delay_speculative_age = 0;
1858 vm_object_t m_object = VM_OBJECT_NULL;
1859
1860 #if VM_PRESSURE_EVENTS
1861 vm_pressure_level_t pressure_level;
1862 #endif /* VM_PRESSURE_EVENTS */
1863
1864 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
1865 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1866 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1867
1868 flow_control.state = FCS_IDLE;
1869 iq = &vm_pageout_queue_internal;
1870 eq = &vm_pageout_queue_external;
1871 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1872
1873
1874 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1875
1876 /* Ask the pmap layer to return any pages it no longer needs. */
1877 pmap_release_pages_fast();
1878
1879 vm_page_lock_queues();
1880 delayed_unlock = 1;
1881
1882 /*
1883 * Calculate the max number of referenced pages on the inactive
1884 * queue that we will reactivate.
1885 */
1886 reactivated_this_call = 0;
1887 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1888 vm_page_inactive_count);
1889 inactive_reclaim_run = 0;
1890
1891 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
1892
1893 /*
1894 * We want to gradually dribble pages from the active queue
1895 * to the inactive queue. If we let the inactive queue get
1896 * very small, and then suddenly dump many pages into it,
1897 * those pages won't get a sufficient chance to be referenced
1898 * before we start taking them from the inactive queue.
1899 *
1900 * We must limit the rate at which we send pages to the pagers
1901 * so that we don't tie up too many pages in the I/O queues.
1902 * We implement a throttling mechanism using the laundry count
1903 * to limit the number of pages outstanding to the default
1904 * and external pagers. We can bypass the throttles and look
1905 * for clean pages if the pageout queues don't drain in a timely
1906 * fashion since this may indicate that the pageout paths are
1907 * stalled waiting for memory, which only we can provide.
1908 */
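/*
 * Minimal sketch (not compiled) of the laundry-count throttle described
 * above: a pageout queue is treated as throttled once the number of pages
 * it already has in flight (its laundry) reaches the maximum it is allowed
 * to keep outstanding.  The real test is the VM_PAGE_Q_THROTTLED() macro;
 * the helper below is hypothetical, though pgo_laundry and pgo_maxlaundry
 * are the fields used elsewhere in this file.
 */
#if 0
static boolean_t
pageout_queue_is_throttled(struct vm_pageout_queue *q)
{
	/* too many pages already handed to this pager? */
	return (q->pgo_laundry >= q->pgo_maxlaundry) ? TRUE : FALSE;
}
#endif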
1909
1910
1911 Restart:
1912
1913 assert(object == NULL);
1914 assert(delayed_unlock != 0);
1915
1916 /*
1917 * Recalculate vm_page_inactive_target.
1918 */
1919 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1920 vm_page_inactive_count +
1921 vm_page_speculative_count);
1922
1923 vm_page_anonymous_min = vm_page_inactive_target / 20;
1924
1925
1926 /*
1927 * don't want to wake the pageout_scan thread up every time we fall below
1928 * the targets... set a low water mark at 0.25% below the target
1929 */
1930 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1931
1932 if (vm_page_speculative_percentage > 50)
1933 vm_page_speculative_percentage = 50;
1934 else if (vm_page_speculative_percentage <= 0)
1935 vm_page_speculative_percentage = 1;
1936
1937 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1938 vm_page_inactive_count);
1939
1940 try_failed = FALSE;
1941
1942 for (;;) {
1943 vm_page_t m;
1944
1945 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1946
1947 if (vm_upl_wait_for_pages < 0)
1948 vm_upl_wait_for_pages = 0;
1949
1950 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
1951
1952 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX)
1953 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
1954
1955 #if CONFIG_SECLUDED_MEMORY
1956 /*
1957 * Deal with secluded_q overflow.
1958 */
1959 if (vm_page_secluded_count > vm_page_secluded_target) {
1960 unsigned int secluded_overflow;
1961 vm_page_t secluded_page;
1962
1963 if (object != NULL) {
1964 vm_object_unlock(object);
1965 object = NULL;
1966 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1967 }
1968 /*
1969 * SECLUDED_AGING_BEFORE_ACTIVE:
1970 * Excess secluded pages go to the active queue and
1971 * will later go to the inactive queue.
1972 */
1973 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1974 vm_page_secluded_count_inuse);
1975 secluded_overflow = (vm_page_secluded_count -
1976 vm_page_secluded_target);
1977 while (secluded_overflow-- > 0 &&
1978 vm_page_secluded_count > vm_page_secluded_target) {
1979 assert((vm_page_secluded_count_free +
1980 vm_page_secluded_count_inuse) ==
1981 vm_page_secluded_count);
1982 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1983 assert(secluded_page->vm_page_q_state ==
1984 VM_PAGE_ON_SECLUDED_Q);
1985 vm_page_queues_remove(secluded_page, FALSE);
1986 assert(!secluded_page->fictitious);
1987 assert(!VM_PAGE_WIRED(secluded_page));
1988 if (secluded_page->vm_page_object == 0) {
1989 /* transfer to free queue */
1990 assert(secluded_page->busy);
1991 secluded_page->snext = local_freeq;
1992 local_freeq = secluded_page;
1993 local_freed++;
1994 } else {
1995 /* transfer to head of active queue */
1996 vm_page_enqueue_active(secluded_page, FALSE);
1997 if (active_burst_count-- == 0) {
1998 vm_pageout_secluded_burst_count++;
1999 break;
2000 }
2001 }
2002 secluded_page = VM_PAGE_NULL;
2003
2004 if (delayed_unlock++ > delayed_unlock_limit) {
2005 vm_pageout_delayed_unlock(&delayed_unlock, &local_freed, &local_freeq);
2006 }
2007 }
2008 }
2009 #endif /* CONFIG_SECLUDED_MEMORY */
2010
2011 assert(delayed_unlock);
2012
2013 /*
2014 * Move pages from active to inactive if we're below the target
2015 */
2016 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
2017 goto done_moving_active_pages;
2018
2019 if (object != NULL) {
2020 vm_object_unlock(object);
2021 object = NULL;
2022 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2023 }
2024 /*
2025 * Don't sweep through active queue more than the throttle
2026 * which should be kept relatively low
2027 */
2028 active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
2029
2030 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
2031 vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
2032
2033 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
2034 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2035 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2036 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START);
2037
2038
2039 while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) {
2040
2041 vm_pageout_active++;
2042
2043 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2044
2045 assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
2046 assert(!m->laundry);
2047 assert(VM_PAGE_OBJECT(m) != kernel_object);
2048 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2049
2050 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2051
2052 /*
2053 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2054 *
2055 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2056 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2057 * new reference happens. If no further references happen on the page after that remote TLB flushes
2058 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2059 * by pageout_scan, which is just fine since the last reference would have happened quite far
2060 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2061 * have happened before we moved the page
2062 */
2063 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2064
2065 /*
2066 * The page might be absent or busy,
2067 * but vm_page_deactivate can handle that.
2068 * FALSE indicates that we don't want a H/W clear reference
2069 */
2070 vm_page_deactivate_internal(m, FALSE);
2071
2072 if (delayed_unlock++ > delayed_unlock_limit) {
2073 vm_pageout_delayed_unlock(&delayed_unlock, &local_freed, &local_freeq);
2074 }
2075 }
2076
2077 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
2078 vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
2079 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END);
2080
2081 /**********************************************************************
2082 * above this point we're playing with the active and secluded queues
2083 * below this point we're playing with the throttling mechanisms
2084 * and the inactive queue
2085 **********************************************************************/
2086
2087 done_moving_active_pages:
2088
2089 if (vm_page_free_count + local_freed >= vm_page_free_target)
2090 {
2091 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2092 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2093 /*
2094 * make sure the pageout I/O threads are running
2095 * throttled in case there are still requests
2096 * in the laundry... since we have met our targets
2097 * we don't need the laundry to be cleaned in a timely
2098 * fashion... so let's avoid interfering with foreground
2099 * activity
2100 */
2101 vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2102
2103 /*
2104 * recalculate vm_page_inactive_target
2105 */
2106 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2107 vm_page_inactive_count +
2108 vm_page_speculative_count);
2109 #ifndef CONFIG_EMBEDDED
2110 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
2111 !vm_page_queue_empty(&vm_page_queue_active)) {
2112 /*
2113 * inactive target still not met... keep going
2114 * until we get the queues balanced...
2115 */
2116 continue;
2117 }
2118 #endif
2119 lck_mtx_lock(&vm_page_queue_free_lock);
2120
2121 if ((vm_page_free_count >= vm_page_free_target) &&
2122 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2123 /*
2124 * done - we have met our target *and*
2125 * there is no one waiting for a page.
2126 */
2127 return_from_scan:
2128 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2129
2130 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
2131 vm_pageout_inactive, vm_pageout_inactive_used, 0, 0);
2132 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
2133 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2134 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2135
2136 return;
2137 }
2138 lck_mtx_unlock(&vm_page_queue_free_lock);
2139 }
2140
2141 /*
2142 * Before anything, we check if we have any ripe volatile
2143 * objects around. If so, try to purge the first object.
2144 * If the purge fails, fall through to reclaim a page instead.
2145 * If the purge succeeds, go back to the top and reevaluate
2146 * the new memory situation.
2147 */
2148
2149 assert(available_for_purge >= 0);
2150 force_purge = 0; /* no force-purging */
2151
2152 #if VM_PRESSURE_EVENTS
2153 pressure_level = memorystatus_vm_pressure_level;
2154
2155 if (pressure_level > kVMPressureNormal) {
2156
2157 if (pressure_level >= kVMPressureCritical) {
2158 force_purge = memorystatus_purge_on_critical;
2159 } else if (pressure_level >= kVMPressureUrgent) {
2160 force_purge = memorystatus_purge_on_urgent;
2161 } else if (pressure_level >= kVMPressureWarning) {
2162 force_purge = memorystatus_purge_on_warning;
2163 }
2164 }
2165 #endif /* VM_PRESSURE_EVENTS */
2166
2167 if (available_for_purge || force_purge) {
2168
2169 if (object != NULL) {
2170 vm_object_unlock(object);
2171 object = NULL;
2172 }
2173
2174 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2175
2176 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2177 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2178 vm_pageout_purged_objects++;
2179 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2180 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2181 continue;
2182 }
2183 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2184 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2185 }
2186
2187 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
2188 /*
2189 * try to pull pages from the aging bins...
2190 * see vm_page.h for an explanation of how
2191 * this mechanism works
2192 */
2193 struct vm_speculative_age_q *aq;
2194 boolean_t can_steal = FALSE;
2195 int num_scanned_queues;
2196
2197 aq = &vm_page_queue_speculative[speculative_steal_index];
2198
2199 num_scanned_queues = 0;
2200 while (vm_page_queue_empty(&aq->age_q) &&
2201 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2202
2203 speculative_steal_index++;
2204
2205 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
2206 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2207
2208 aq = &vm_page_queue_speculative[speculative_steal_index];
2209 }
2210
2211 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2212 /*
2213 * XXX We've scanned all the speculative
2214 * queues but still haven't found one
2215 * that is not empty, even though
2216 * vm_page_speculative_count is not 0.
2217 *
2218 * report the anomaly...
2219 */
2220 printf("vm_pageout_scan: "
2221 "all speculative queues empty "
2222 "but count=%d. Re-adjusting.\n",
2223 vm_page_speculative_count);
2224 if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
2225 vm_page_speculative_count_drift_max = vm_page_speculative_count;
2226 vm_page_speculative_count_drifts++;
2227 #if DEVELOPMENT || DEBUG
2228 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2229 #endif /* DEVELOPMENT || DEBUG */
2230 /* readjust... */
2231 vm_page_speculative_count = 0;
2232 /* ... and continue */
2233 continue;
2234 }
2235
2236 if (vm_page_speculative_count > vm_page_speculative_target || force_speculative_aging == TRUE)
2237 can_steal = TRUE;
2238 else {
2239 if (!delay_speculative_age) {
2240 mach_timespec_t ts_fully_aged;
2241
2242 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
2243 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
2244 * 1000 * NSEC_PER_USEC;
2245
2246 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2247
2248 clock_sec_t sec;
2249 clock_nsec_t nsec;
2250 clock_get_system_nanotime(&sec, &nsec);
2251 ts.tv_sec = (unsigned int) sec;
2252 ts.tv_nsec = nsec;
2253
2254 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
2255 can_steal = TRUE;
2256 else
2257 delay_speculative_age++;
2258 } else {
2259 delay_speculative_age++;
2260 if (delay_speculative_age == DELAY_SPECULATIVE_AGE)
2261 delay_speculative_age = 0;
2262 }
2263 }
2264 if (can_steal == TRUE)
2265 vm_page_speculate_ageit(aq);
2266 }
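/*
 * Worked sketch (not compiled) of the millisecond-to-mach_timespec split
 * used for ts_fully_aged above: whole seconds go in tv_sec and the
 * millisecond remainder is scaled up to nanoseconds.  The helper name is
 * hypothetical; the arithmetic mirrors the code above.
 */
#if 0
static void
ms_to_mach_timespec(unsigned int ms, mach_timespec_t *ts)
{
	ts->tv_sec  = ms / 1000;				/* whole seconds */
	ts->tv_nsec = (ms % 1000) * 1000 * NSEC_PER_USEC;	/* leftover milliseconds as ns */
}
#endif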
2267 force_speculative_aging = FALSE;
2268
2269 #if CONFIG_BACKGROUND_QUEUE
2270 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 &&
2271 ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target)))
2272 #else
2273 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0)
2274 #endif
2275 {
2276 int pages_evicted;
2277
2278 if (object != NULL) {
2279 vm_object_unlock(object);
2280 object = NULL;
2281 }
2282 pages_evicted = vm_object_cache_evict(100, 10);
2283
2284 if (pages_evicted) {
2285
2286 vm_pageout_cache_evicted += pages_evicted;
2287
2288 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2289 vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
2290 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2291
2292 /*
2293 * we just freed up to 100 pages,
2294 * so go back to the top of the main loop
2295 * and re-evaluate the memory situation
2296 */
2297 continue;
2298 } else
2299 cache_evict_throttle = 1000;
2300 }
2301 if (cache_evict_throttle)
2302 cache_evict_throttle--;
2303
2304 #if CONFIG_JETSAM
2305 /*
2306 * don't let the filecache_min fall below ~14% (1/7) of available memory
2307 * on systems with an active compressor that isn't nearing its
2308 * limits w/r to accepting new data
2309 *
2310 * on systems w/o the compressor/swapper, the filecache is always
2311 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2312 * since most (if not all) of the anonymous pages are in the
2313 * throttled queue (which isn't counted as available) which
2314 * effectively disables this filter
2315 */
2316 if (vm_compressor_low_on_space())
2317 vm_page_filecache_min = 0;
2318 else
2319 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 7);
2320 #else
2321 if (vm_compressor_out_of_space())
2322 vm_page_filecache_min = 0;
2323 else {
2324 /*
2325 * don't let the filecache_min fall below 33% of available memory...
2326 */
2327 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
2328 }
2329 #endif
2330 if (vm_page_free_count < (vm_page_free_reserved / 4))
2331 vm_page_filecache_min = 0;
2332
2333 exceeded_burst_throttle = FALSE;
2334 /*
2335 * Sometimes we have to pause:
2336 * 1) No inactive pages - nothing to do.
2337 * 2) Loop control - no acceptable pages found on the inactive queue
2338 * within the last vm_pageout_burst_inactive_throttle iterations
2339 * 3) Flow control - default pageout queue is full
2340 */
2341 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2342 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2343 vm_page_queue_empty(&sq->age_q)) {
2344 vm_pageout_scan_empty_throttle++;
2345 msecs = vm_pageout_empty_wait;
2346 goto vm_pageout_scan_delay;
2347
2348 } else if (inactive_burst_count >=
2349 MIN(vm_pageout_burst_inactive_throttle,
2350 (vm_page_inactive_count +
2351 vm_page_speculative_count))) {
2352 vm_pageout_scan_burst_throttle++;
2353 msecs = vm_pageout_burst_wait;
2354
2355 exceeded_burst_throttle = TRUE;
2356 goto vm_pageout_scan_delay;
2357
2358 } else if (vm_page_free_count > (vm_page_free_reserved / 4) &&
2359 VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
2360 vm_pageout_scan_swap_throttle++;
2361 msecs = vm_pageout_swap_wait;
2362 goto vm_pageout_scan_delay;
2363
2364 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2365 VM_DYNAMIC_PAGING_ENABLED()) {
2366 clock_sec_t sec;
2367 clock_nsec_t nsec;
2368
2369 switch (flow_control.state) {
2370
2371 case FCS_IDLE:
2372 if ((vm_page_free_count + local_freed) < vm_page_free_target) {
2373
2374 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2375 VM_PAGEOUT_PB_THREAD_YIELD);
2376 if (!VM_PAGE_Q_THROTTLED(iq)) {
2377 vm_pageout_scan_yield_unthrottled++;
2378 continue;
2379 }
2380 if (vm_page_pageable_external_count > vm_page_filecache_min &&
2381 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2382 anons_grabbed = ANONS_GRABBED_LIMIT;
2383 vm_pageout_scan_throttle_deferred++;
2384 goto consider_inactive;
2385 }
2386 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count)
2387 continue;
2388 }
2389 reset_deadlock_timer:
2390 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
2391 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2392 clock_get_system_nanotime(&sec, &nsec);
2393 flow_control.ts.tv_sec = (unsigned int) sec;
2394 flow_control.ts.tv_nsec = nsec;
2395 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
2396
2397 flow_control.state = FCS_DELAYED;
2398 msecs = vm_pageout_deadlock_wait;
2399
2400 break;
2401
2402 case FCS_DELAYED:
2403 clock_get_system_nanotime(&sec, &nsec);
2404 ts.tv_sec = (unsigned int) sec;
2405 ts.tv_nsec = nsec;
2406
2407 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
2408 /*
2409 * the pageout thread for the default pager is potentially
2410 * deadlocked since the
2411 * default pager queue has been throttled for more than the
2412 * allowable time... we need to move some clean pages or dirty
2413 * pages belonging to the external pagers if they aren't throttled
2414 * vm_page_free_wanted represents the number of threads currently
2415 * blocked waiting for pages... we'll move one page for each of
2416 * these plus a fixed amount to break the logjam... once we're done
2417 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2418 * with a new timeout target since we have no way of knowing
2419 * whether we've broken the deadlock except through observation
2420 * of the queue associated with the default pager... we need to
2421 * stop moving pages and allow the system to run to see what
2422 * state it settles into.
2423 */
2424 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
2425 vm_pageout_scan_deadlock_detected++;
2426 flow_control.state = FCS_DEADLOCK_DETECTED;
2427 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2428 goto consider_inactive;
2429 }
2430 /*
2431 * just resniff instead of trying
2432 * to compute a new delay time... we're going to be
2433 * awakened immediately upon a laundry completion,
2434 * so we won't wait any longer than necessary
2435 */
2436 msecs = vm_pageout_idle_wait;
2437 break;
2438
2439 case FCS_DEADLOCK_DETECTED:
2440 if (vm_pageout_deadlock_target)
2441 goto consider_inactive;
2442 goto reset_deadlock_timer;
2443
2444 }
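/*
 * Sketch (not compiled) of the flow-control state machine driven by the
 * switch above.  FCS_IDLE arms the deadlock timer and moves to FCS_DELAYED;
 * FCS_DELAYED keeps re-sniffing the laundry until the timer expires, at
 * which point FCS_DEADLOCK_DETECTED is entered; once the relief pages have
 * been moved, the timer is re-armed and we drop back to FCS_DELAYED.  The
 * helper and its parameters are hypothetical.
 */
#if 0
static void
flow_control_step(struct flow_control *fc, boolean_t timer_expired, boolean_t relief_done)
{
	switch (fc->state) {
	case FCS_IDLE:
		fc->state = FCS_DELAYED;			/* arm the deadlock timer */
		break;
	case FCS_DELAYED:
		if (timer_expired)
			fc->state = FCS_DEADLOCK_DETECTED;	/* queue stuck too long */
		break;
	case FCS_DEADLOCK_DETECTED:
		if (relief_done)
			fc->state = FCS_DELAYED;		/* re-arm and keep watching */
		break;
	}
}
#endif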
2445 vm_pageout_scan_delay:
2446 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
2447 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2448
2449 if (flow_control.state == FCS_DELAYED &&
2450 !VM_PAGE_Q_THROTTLED(iq)) {
2451 flow_control.state = FCS_IDLE;
2452 goto consider_inactive;
2453 }
2454
2455 if (vm_page_free_count >= vm_page_free_target) {
2456 /*
2457 * we're here because
2458 * 1) someone else freed up some pages while we had
2459 * the queues unlocked above
2460 * and we've hit one of the 3 conditions that
2461 * cause us to pause the pageout scan thread
2462 *
2463 * since we already have enough free pages,
2464 * let's avoid stalling and return normally
2465 *
2466 * before we return, make sure the pageout I/O threads
2467 * are running throttled in case there are still requests
2468 * in the laundry... since we have enough free pages
2469 * we don't need the laundry to be cleaned in a timely
2470 * fashion... so let's avoid interfering with foreground
2471 * activity
2472 *
2473 * we don't want to hold vm_page_queue_free_lock when
2474 * calling vm_pageout_adjust_eq_iothrottle (since it
2475 * may cause other locks to be taken), we do the initial
2476 * check outside of the lock. Once we take the lock,
2477 * we recheck the condition since it may have changed.
2478 * if it has, no problem, we will make the threads
2479 * non-throttled before actually blocking
2480 */
2481 vm_pageout_adjust_eq_iothrottle(eq, TRUE);
2482 }
2483 lck_mtx_lock(&vm_page_queue_free_lock);
2484
2485 if (vm_page_free_count >= vm_page_free_target &&
2486 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2487 goto return_from_scan;
2488 }
2489 lck_mtx_unlock(&vm_page_queue_free_lock);
2490
2491 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2492 /*
2493 * we're most likely about to block due to one of
2494 * the 3 conditions that cause vm_pageout_scan to
2495 * not be able to make forward progress w/r
2496 * to providing new pages to the free queue,
2497 * so unthrottle the I/O threads in case we
2498 * have laundry to be cleaned... it needs
2499 * to be completed ASAP.
2500 *
2501 * even if we don't block, we want the io threads
2502 * running unthrottled since the sum of free +
2503 * clean pages is still under our free target
2504 */
2505 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
2506 }
2507 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2508 /*
2509 * if we get here we're below our free target and
2510 * we're stalling due to a full laundry queue or
2511 * we don't have any inactive pages other than
2512 * those in the clean queue...
2513 * however, we have pages on the clean queue that
2514 * can be moved to the free queue, so let's not
2515 * stall the pageout scan
2516 */
2517 flow_control.state = FCS_IDLE;
2518 goto consider_inactive;
2519 }
2520 VM_CHECK_MEMORYSTATUS;
2521
2522 if (flow_control.state != FCS_IDLE)
2523 vm_pageout_scan_throttle++;
2524 iq->pgo_throttled = TRUE;
2525
2526 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
2527 counter(c_vm_pageout_scan_block++);
2528
2529 vm_page_unlock_queues();
2530
2531 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2532
2533 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2534 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2535 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2536
2537 thread_block(THREAD_CONTINUE_NULL);
2538
2539 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2540 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2541 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2542
2543 vm_page_lock_queues();
2544
2545 iq->pgo_throttled = FALSE;
2546
2547 if (loop_count >= vm_page_inactive_count)
2548 loop_count = 0;
2549 inactive_burst_count = 0;
2550
2551 goto Restart;
2552 /*NOTREACHED*/
2553 }
2554
2555
2556 flow_control.state = FCS_IDLE;
2557 consider_inactive:
2558 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
2559 vm_pageout_inactive_external_forced_reactivate_limit);
2560 loop_count++;
2561 inactive_burst_count++;
2562 vm_pageout_inactive++;
2563
2564
2565 /*
2566 * Choose a victim.
2567 */
2568 while (1) {
2569 uint32_t inactive_external_count;
2570
2571 #if CONFIG_BACKGROUND_QUEUE
2572 page_from_bg_q = FALSE;
2573 #endif /* CONFIG_BACKGROUND_QUEUE */
2574
2575 m = NULL;
2576 m_object = VM_OBJECT_NULL;
2577
2578 if (VM_DYNAMIC_PAGING_ENABLED()) {
2579 assert(vm_page_throttled_count == 0);
2580 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2581 }
2582
2583 /*
2584 * Try for a clean-queue inactive page.
2585 * These are pages that vm_pageout_scan tried to steal earlier, but
2586 * were dirty and had to be cleaned. Pick them up now that they are clean.
2587 */
2588 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2589 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2590
2591 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2592
2593 break;
2594 }
2595
2596 /*
2597 * The next most eligible pages are ones we paged in speculatively,
2598 * but which have not yet been touched and have been aged out.
2599 */
2600 if (!vm_page_queue_empty(&sq->age_q)) {
2601 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2602
2603 assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2604
2605 if (!m->dirty || force_anonymous == FALSE)
2606 break;
2607 else
2608 m = NULL;
2609 }
2610
2611 #if CONFIG_BACKGROUND_QUEUE
2612 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2613 vm_object_t bg_m_object = NULL;
2614
2615 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2616
2617 bg_m_object = VM_PAGE_OBJECT(m);
2618
2619 if (!VM_PAGE_PAGEABLE(m)) {
2620 /*
2621 * This page is on the background queue
2622 * but not on a pageable queue. This is
2623 * likely a transient state and whoever
2624 * took it out of its pageable queue
2625 * will likely put it back on a pageable
2626 * queue soon but we can't deal with it
2627 * at this point, so let's ignore this
2628 * page.
2629 */
2630 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2631
2632 if (bg_m_object->internal &&
2633 ((vm_compressor_out_of_space() == TRUE) ||
2634 (vm_page_free_count < (vm_page_free_reserved / 4)))) {
2635
2636 vm_pageout_skipped_bq_internal++;
2637 } else {
2638 page_from_bg_q = TRUE;
2639
2640 if (bg_m_object->internal)
2641 vm_pageout_considered_bq_internal++;
2642 else
2643 vm_pageout_considered_bq_external++;
2644
2645 break;
2646 }
2647 }
2648 }
2649 #endif
2650
2651 grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2652 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2653
2654 if ((vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) ||
2655 ((inactive_external_count < vm_page_anonymous_count) && (inactive_external_count < (vm_page_pageable_external_count / 3)))) {
2656 grab_anonymous = TRUE;
2657 anons_grabbed = 0;
2658 }
2659 #if CONFIG_JETSAM
2660 /* If the file-backed pool has accumulated
2661 * significantly more pages than the jetsam
2662 * threshold, prefer to reclaim those
2663 * inline to minimise compute overhead of reclaiming
2664 * anonymous pages.
2665 * This calculation does not account for the CPU local
2666 * external page queues, as those are expected to be
2667 * much smaller relative to the global pools.
2668 */
2669 if (grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2670 if (vm_page_pageable_external_count >
2671 vm_page_filecache_min) {
2672 if ((vm_page_pageable_external_count *
2673 vm_pageout_memorystatus_fb_factor_dr) >
2674 (memorystatus_available_pages_critical *
2675 vm_pageout_memorystatus_fb_factor_nr)) {
2676 grab_anonymous = FALSE;
2677 #if DEVELOPMENT || DEBUG
2678 vm_grab_anon_overrides++;
2679 #endif
2680 }
2681 }
2682 #if DEVELOPMENT || DEBUG
2683 if (grab_anonymous) {
2684 vm_grab_anon_nops++;
2685 }
2686 #endif
2687 }
2688 #endif /* CONFIG_JETSAM */
2689
2690 if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2691
2692 if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) {
2693 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2694
2695 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2696 anons_grabbed = 0;
2697
2698 if (vm_page_pageable_external_count < vm_page_filecache_min) {
2699 if ((++reactivated_this_call % 100))
2700 goto must_activate_page;
2701 /*
2702 * steal 1% of the file backed pages even if
2703 * we are under the limit that has been set
2704 * for a healthy filecache
2705 */
2706 }
2707 break;
2708 }
2709 }
2710 if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) {
2711 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2712
2713 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2714 anons_grabbed++;
2715
2716 break;
2717 }
2718
2719 /*
2720 * if we've gotten here, we have no victim page.
2721 * check to see if we've not finished balancing the queues
2722 * or we have a page on the aged speculative queue that we
2723 * skipped due to force_anonymous == TRUE.. or we have
2724 * speculative pages that we can prematurely age... if
2725 * one of these cases we'll keep going, else panic
2726 */
2727 force_anonymous = FALSE;
2728 vm_pageout_no_victim++;
2729
2730 if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
2731 goto done_with_inactivepage;
2732
2733 if (!vm_page_queue_empty(&sq->age_q))
2734 goto done_with_inactivepage;
2735
2736 if (vm_page_speculative_count) {
2737 force_speculative_aging = TRUE;
2738 goto done_with_inactivepage;
2739 }
2740 panic("vm_pageout: no victim");
2741
2742 /* NOTREACHED */
2743 }
2744 assert(VM_PAGE_PAGEABLE(m));
2745 m_object = VM_PAGE_OBJECT(m);
2746 force_anonymous = FALSE;
2747
2748 page_prev_q_state = m->vm_page_q_state;
2749 /*
2750 * we just found this page on one of our queues...
2751 * it can't also be on the pageout queue, so safe
2752 * to call vm_page_queues_remove
2753 */
2754 vm_page_queues_remove(m, TRUE);
2755
2756 assert(!m->laundry);
2757 assert(!m->private);
2758 assert(!m->fictitious);
2759 assert(m_object != kernel_object);
2760 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2761
2762 vm_pageout_stats[vm_pageout_stat_now].considered++;
2763 vm_pageout_considered_page++;
2764
2765 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2766
2767 /*
2768 * check to see if we currently are working
2769 * with the same object... if so, we've
2770 * already got the lock
2771 */
2772 if (m_object != object) {
2773 /*
2774 * the object associated with candidate page is
2775 * different from the one we were just working
2776 * with... dump the lock if we still own it
2777 */
2778 if (object != NULL) {
2779 vm_object_unlock(object);
2780 object = NULL;
2781 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2782 }
2783 /*
2784 * Try to lock object; since we've already got the
2785 * page queues lock, we can only 'try' for this one.
2786 * if the 'try' fails, we need to do a mutex_pause
2787 * to allow the owner of the object lock a chance to
2788 * run... otherwise, we're likely to trip over this
2789 * object in the same state as we work our way through
2790 * the queue... clumps of pages associated with the same
2791 * object are fairly typical on the inactive and active queues
2792 */
2793 if (!vm_object_lock_try_scan(m_object)) {
2794 vm_page_t m_want = NULL;
2795
2796 vm_pageout_inactive_nolock++;
2797
2798 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
2799 vm_pageout_cleaned_nolock++;
2800
2801 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2802 m->reference = FALSE;
2803
2804 /*
2805 * m->object must be stable since we hold the page queues lock...
2806 * we can update the scan_collisions field sans the object lock
2807 * since it is a separate field and this is the only spot that does
2808 * a read-modify-write operation and it is never executed concurrently...
2809 * we can asynchronously set this field to 0 when creating a UPL, so it
2810 * is possible for the value to be a bit non-deterministic, but that's ok
2811 * since it's only used as a hint
2812 */
2813 m_object->scan_collisions = 1;
2814
2815 if ( !vm_page_queue_empty(&sq->age_q) )
2816 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2817 else if ( !vm_page_queue_empty(&vm_page_queue_cleaned))
2818 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2819 else if ( !vm_page_queue_empty(&vm_page_queue_inactive) &&
2820 (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)))
2821 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2822 else if ( !vm_page_queue_empty(&vm_page_queue_anonymous))
2823 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2824
2825 /*
2826 * this is the next object we're going to be interested in
2827 * try to make sure it's available after the mutex_yield
2828 * returns control
2829 */
2830 if (m_want)
2831 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2832
2833 /*
2834 * force us to dump any collected free pages
2835 * and to pause before moving on
2836 */
2837 try_failed = TRUE;
2838
2839 goto requeue_page;
2840 }
2841 object = m_object;
2842 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2843
2844 try_failed = FALSE;
2845 }
2846 assert(m_object == object);
2847 assert(VM_PAGE_OBJECT(m) == m_object);
2848
2849 if (m->busy) {
2850 /*
2851 * Somebody is already playing with this page.
2852 * Put it back on the appropriate queue
2853 *
2854 */
2855 vm_pageout_inactive_busy++;
2856
2857 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
2858 vm_pageout_cleaned_busy++;
2859 requeue_page:
2860 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
2861 vm_page_enqueue_inactive(m, FALSE);
2862 else
2863 vm_page_activate(m);
2864 #if CONFIG_BACKGROUND_QUEUE
2865 if (page_from_bg_q == TRUE) {
2866 if (m_object->internal)
2867 vm_pageout_rejected_bq_internal++;
2868 else
2869 vm_pageout_rejected_bq_external++;
2870 }
2871 #endif
2872 goto done_with_inactivepage;
2873 }
2874
2875
2876 /*
2877 * If it's absent, in error or the object is no longer alive,
2878 * we can reclaim the page... in the no longer alive case,
2879 * there are 2 states the page can be in that preclude us
2880 * from reclaiming it - busy or cleaning - that we've already
2881 * dealt with
2882 */
2883 if (m->absent || m->error || !object->alive) {
2884
2885 if (m->absent)
2886 vm_pageout_inactive_absent++;
2887 else if (!object->alive)
2888 vm_pageout_inactive_notalive++;
2889 else
2890 vm_pageout_inactive_error++;
2891 reclaim_page:
2892 if (vm_pageout_deadlock_target) {
2893 vm_pageout_scan_inactive_throttle_success++;
2894 vm_pageout_deadlock_target--;
2895 }
2896
2897 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
2898
2899 if (object->internal) {
2900 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
2901 } else {
2902 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
2903 }
2904 assert(!m->cleaning);
2905 assert(!m->laundry);
2906
2907 m->busy = TRUE;
2908
2909 /*
2910 * remove page from object here since we're already
2911 * behind the object lock... defer the rest of the work
2912 * we'd normally do in vm_page_free_prepare_object
2913 * until 'vm_page_free_list' is called
2914 */
2915 if (m->tabled)
2916 vm_page_remove(m, TRUE);
2917
2918 assert(m->pageq.next == 0 && m->pageq.prev == 0);
2919 m->snext = local_freeq;
2920 local_freeq = m;
2921 local_freed++;
2922
2923 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
2924 vm_pageout_freed_from_speculative++;
2925 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
2926 vm_pageout_freed_from_cleaned++;
2927 else
2928 vm_pageout_freed_from_inactive_clean++;
2929
2930 vm_pageout_stats[vm_pageout_stat_now].reclaimed_clean++;
2931
2932 inactive_burst_count = 0;
2933 goto done_with_inactivepage;
2934 }
2935 /*
2936 * If the object is empty, the page must be reclaimed even
2937 * if dirty or used.
2938 * If the page belongs to a volatile object, we stick it back
2939 * on.
2940 */
2941 if (object->copy == VM_OBJECT_NULL) {
2942 if (object->purgable == VM_PURGABLE_EMPTY) {
2943 if (m->pmapped == TRUE) {
2944 /* unmap the page */
2945 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2946 if (refmod_state & VM_MEM_MODIFIED) {
2947 SET_PAGE_DIRTY(m, FALSE);
2948 }
2949 }
2950 if (m->dirty || m->precious) {
2951 /* we saved the cost of cleaning this page ! */
2952 vm_page_purged_count++;
2953 }
2954 goto reclaim_page;
2955 }
2956
2957 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
2958 /*
2959 * With the VM compressor, the cost of
2960 * reclaiming a page is much lower (no I/O),
2961 * so if we find a "volatile" page, it's better
2962 * to let it get compressed rather than letting
2963 * it occupy a full page until it gets purged.
2964 * So no need to check for "volatile" here.
2965 */
2966 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
2967 /*
2968 * Avoid cleaning a "volatile" page which might
2969 * be purged soon.
2970 */
2971
2972 /* if it's wired, we can't put it on our queue */
2973 assert(!VM_PAGE_WIRED(m));
2974
2975 /* just stick it back on! */
2976 reactivated_this_call++;
2977
2978 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
2979 vm_pageout_cleaned_volatile_reactivated++;
2980
2981 goto reactivate_page;
2982 }
2983 }
2984 /*
2985 * If it's being used, reactivate.
2986 * (Fictitious pages are either busy or absent.)
2987 * First, update the reference and dirty bits
2988 * to make sure the page is unreferenced.
2989 */
2990 refmod_state = -1;
2991
2992 if (m->reference == FALSE && m->pmapped == TRUE) {
2993 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
2994
2995 if (refmod_state & VM_MEM_REFERENCED)
2996 m->reference = TRUE;
2997 if (refmod_state & VM_MEM_MODIFIED) {
2998 SET_PAGE_DIRTY(m, FALSE);
2999 }
3000 }
3001
3002 /*
3003 * if (m->cleaning && !m->free_when_done)
3004 * If already cleaning this page in place and it hasn't
3005 * been recently referenced, just pull off the queue.
3006 * We can leave the page mapped, and upl_commit_range
3007 * will put it on the clean queue.
3008 *
3009 * if (m->free_when_done && !m->cleaning)
3010 * an msync INVALIDATE is in progress...
3011 * this page has been marked for destruction
3012 * after it has been cleaned,
3013 * but not yet gathered into a UPL
3014 * where 'cleaning' will be set...
3015 * just leave it off the paging queues
3016 *
3017 * if (m->free_when_done && m->cleaning)
3018 * an msync INVALIDATE is in progress
3019 * and the UPL has already gathered this page...
3020 * just leave it off the paging queues
3021 */
3022
3023 /*
3024 * page with m->free_when_done and still on the queues means that an
3025 * MS_INVALIDATE is in progress on this page... leave it alone
3026 */
3027 if (m->free_when_done) {
3028 goto done_with_inactivepage;
3029 }
3030
3031 /* if cleaning, reactivate if referenced. otherwise, just pull off queue */
3032 if (m->cleaning) {
3033 if (m->reference == TRUE) {
3034 reactivated_this_call++;
3035 goto reactivate_page;
3036 } else {
3037 goto done_with_inactivepage;
3038 }
3039 }
3040
3041 if (m->reference || m->dirty) {
3042 /* deal with a rogue "reusable" page */
3043 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3044 }
3045
3046 if (!m->no_cache &&
3047 #if CONFIG_BACKGROUND_QUEUE
3048 page_from_bg_q == FALSE &&
3049 #endif
3050 (m->reference ||
3051 (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) {
3052 /*
3053 * The page we pulled off the inactive list has
3054 * been referenced. It is possible for other
3055 * processors to be touching pages faster than we
3056 * can clear the referenced bit and traverse the
3057 * inactive queue, so we limit the number of
3058 * reactivations.
3059 */
3060 if (++reactivated_this_call >= reactivate_limit) {
3061 vm_pageout_reactivation_limit_exceeded++;
3062 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3063 vm_pageout_inactive_force_reclaim++;
3064 } else {
3065 uint32_t isinuse;
3066
3067 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3068 vm_pageout_cleaned_reference_reactivated++;
3069 reactivate_page:
3070 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
3071 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3072 /*
3073 * no explicit mappings of this object exist
3074 * and it's not open via the filesystem
3075 */
3076 vm_page_deactivate(m);
3077 vm_pageout_inactive_deactivated++;
3078 } else {
3079 must_activate_page:
3080 /*
3081 * The page was/is being used, so put back on active list.
3082 */
3083 vm_page_activate(m);
3084 VM_STAT_INCR(reactivations);
3085 inactive_burst_count = 0;
3086 }
3087 #if CONFIG_BACKGROUND_QUEUE
3088 if (page_from_bg_q == TRUE) {
3089 if (m_object->internal)
3090 vm_pageout_rejected_bq_internal++;
3091 else
3092 vm_pageout_rejected_bq_external++;
3093 }
3094 #endif
3095 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3096 vm_pageout_cleaned_reactivated++;
3097 vm_pageout_inactive_used++;
3098
3099 goto done_with_inactivepage;
3100 }
3101 /*
3102 * Make sure we call pmap_get_refmod() if it
3103 * wasn't already called just above, to update
3104 * the dirty bit.
3105 */
3106 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
3107 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3108 if (refmod_state & VM_MEM_MODIFIED) {
3109 SET_PAGE_DIRTY(m, FALSE);
3110 }
3111 }
3112 }
3113
3114 XPR(XPR_VM_PAGEOUT,
3115 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
3116 object, m->offset, m, 0,0);
3117
3118 /*
3119 * we've got a candidate page to steal...
3120 *
3121 * m->dirty is up to date courtesy of the
3122 * preceding check for m->reference... if
3123 * we get here, then m->reference had to be
3124 * FALSE (or possibly "reactivate_limit" was
3125 * exceeded), but in either case we called
3126 * pmap_get_refmod() and updated both
3127 * m->reference and m->dirty
3128 *
3129 * if it's dirty or precious we need to
3130 * see if the target queue is throttled
3131 * if it is, we need to skip over it by moving it back
3132 * to the end of the inactive queue
3133 */
3134
3135 inactive_throttled = FALSE;
3136
3137 if (m->dirty || m->precious) {
3138 if (object->internal) {
3139 if (VM_PAGE_Q_THROTTLED(iq))
3140 inactive_throttled = TRUE;
3141 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3142 inactive_throttled = TRUE;
3143 }
3144 }
3145 throttle_inactive:
3146 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3147 object->internal && m->dirty &&
3148 (object->purgable == VM_PURGABLE_DENY ||
3149 object->purgable == VM_PURGABLE_NONVOLATILE ||
3150 object->purgable == VM_PURGABLE_VOLATILE)) {
3151 vm_page_check_pageable_safe(m);
3152 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3153 vm_page_queue_enter(&vm_page_queue_throttled, m,
3154 vm_page_t, pageq);
3155 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
3156 vm_page_throttled_count++;
3157
3158 vm_pageout_scan_reclaimed_throttled++;
3159
3160 inactive_burst_count = 0;
3161 goto done_with_inactivepage;
3162 }
3163 if (inactive_throttled == TRUE) {
3164
3165 if (object->internal == FALSE) {
3166 /*
3167 * we need to break up the following potential deadlock case...
3168 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
3169 * b) The thread doing the writing is waiting for pages while holding the truncate lock
3170 * c) Most of the pages in the inactive queue belong to this file.
3171 *
3172 * we are potentially in this deadlock because...
3173 * a) the external pageout queue is throttled
3174 * b) we're done with the active queue and moved on to the inactive queue
3175 * c) we've got a dirty external page
3176 *
3177 * since we don't know the reason for the external pageout queue being throttled we
3178 * must suspect that we are deadlocked, so move the current page onto the active queue
3179 * in an effort to cause a page from the active queue to 'age' to the inactive queue
3180 *
3181 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
3182 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
3183 * pool the next time we select a victim page... if we can make enough new free pages,
3184 * the deadlock will break, the external pageout queue will empty and it will no longer
3185 * be throttled
3186 *
3187 * if we have jetsam configured, keep a count of the pages reactivated this way so
3188 * that we can try to find clean pages in the active/inactive queues before
3189 * deciding to jetsam a process
3190 */
3191 vm_pageout_scan_inactive_throttled_external++;
3192
3193 vm_page_check_pageable_safe(m);
3194 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3195 vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
3196 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
3197 vm_page_active_count++;
3198 vm_page_pageable_external_count++;
3199
3200 vm_pageout_adjust_eq_iothrottle(eq, FALSE);
3201
3202 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
3203 vm_pageout_inactive_external_forced_reactivate_limit--;
3204
3205 if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
3206 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3207 /*
3208 * Possible deadlock scenario so request jetsam action
3209 */
3210 assert(object);
3211 vm_object_unlock(object);
3212 object = VM_OBJECT_NULL;
3213 vm_page_unlock_queues();
3214
3215 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
3216 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
3217
3218 /* Kill first suitable process */
3219 if (memorystatus_kill_on_VM_page_shortage(FALSE) == FALSE) {
3220 panic("vm_pageout_scan: Jetsam request failed\n");
3221 }
3222
3223 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, 0, 0, 0, 0);
3224
3225 vm_pageout_inactive_external_forced_jetsam_count++;
3226 vm_page_lock_queues();
3227 delayed_unlock = 1;
3228 }
3229 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
3230 force_anonymous = TRUE;
3231 #endif
3232 inactive_burst_count = 0;
3233 goto done_with_inactivepage;
3234 } else {
3235 vm_pageout_scan_inactive_throttled_internal++;
3236 goto must_activate_page;
3237 }
3238 }
3239
3240 /*
3241 * we've got a page that we can steal...
3242 * eliminate all mappings and make sure
3243 * we have the up-to-date modified state
3244 *
3245 * if we need to do a pmap_disconnect then we
3246 * need to re-evaluate m->dirty since the pmap_disconnect
3247 * provides the true state atomically... the
3248 * page was still mapped up to the pmap_disconnect
3249 * and may have been dirtied at the last microsecond
3250 *
3251 * Note that if 'pmapped' is FALSE then the page is not
3252 * and has not been in any map, so there is no point calling
3253 * pmap_disconnect(). m->dirty could have been set in anticipation
3254 * of likely usage of the page.
3255 */
3256 if (m->pmapped == TRUE) {
3257 int pmap_options;
3258
3259 /*
3260 * Don't count this page as going into the compressor
3261 * if any of these are true:
3262 * 1) compressed pager isn't enabled
3263 * 2) Freezer enabled device with compressed pager
3264 * backend (exclusive use) i.e. most of the VM system
3265 * (including vm_pageout_scan) has no knowledge of
3266 * the compressor
3267 * 3) This page belongs to a file and hence will not be
3268 * sent into the compressor
3269 */
3270 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3271 object->internal == FALSE) {
3272 pmap_options = 0;
3273 } else if (m->dirty || m->precious) {
3274 /*
3275 * VM knows that this page is dirty (or
3276 * precious) and needs to be compressed
3277 * rather than freed.
3278 * Tell the pmap layer to count this page
3279 * as "compressed".
3280 */
3281 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3282 } else {
3283 /*
3284 * VM does not know if the page needs to
3285 * be preserved but the pmap layer might tell
3286 * us if any mapping has "modified" it.
3287 * Let the pmap layer count this page
3288 * as compressed if and only if it has been
3289 * modified.
3290 */
3291 pmap_options =
3292 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3293 }
3294 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3295 pmap_options,
3296 NULL);
3297 if (refmod_state & VM_MEM_MODIFIED) {
3298 SET_PAGE_DIRTY(m, FALSE);
3299 }
3300 }
3301 /*
3302 * reset our count of pages that have been reclaimed
3303 * since the last page was 'stolen'
3304 */
3305 inactive_reclaim_run = 0;
3306
3307 /*
3308 * If it's clean and not precious, we can free the page.
3309 */
3310 if (!m->dirty && !m->precious) {
3311
3312 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3313 vm_pageout_speculative_clean++;
3314 else {
3315 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
3316 vm_pageout_inactive_anonymous++;
3317 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3318 vm_pageout_cleaned_reclaimed++;
3319
3320 vm_pageout_inactive_clean++;
3321 }
3322 /*
3323 * OK, at this point we have found a page we are going to free.
3324 */
3325 #if CONFIG_PHANTOM_CACHE
3326 if (!object->internal)
3327 vm_phantom_cache_add_ghost(m);
3328 #endif
3329 goto reclaim_page;
3330 }
3331
3332 /*
3333 * The page may have been dirtied since the last check
3334 * for a throttled target queue (which may have been skipped
3335 * if the page was clean then). With the dirty page
3336 * disconnected here, we can make one final check.
3337 */
3338 if (object->internal) {
3339 if (VM_PAGE_Q_THROTTLED(iq))
3340 inactive_throttled = TRUE;
3341 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3342 inactive_throttled = TRUE;
3343 }
3344
3345 if (inactive_throttled == TRUE)
3346 goto throttle_inactive;
3347
3348 #if VM_PRESSURE_EVENTS
3349 #if CONFIG_JETSAM
3350
3351 /*
3352 * If Jetsam is enabled, then the sending
3353 * of memory pressure notifications is handled
3354 * from the same thread that takes care of high-water
3355 * and other jetsams i.e. the memorystatus_thread.
3356 */
3357
3358 #else /* CONFIG_JETSAM */
3359
3360 vm_pressure_response();
3361
3362 #endif /* CONFIG_JETSAM */
3363 #endif /* VM_PRESSURE_EVENTS */
3364
3365 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3366 vm_pageout_speculative_dirty++;
3367 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
3368 vm_pageout_inactive_anonymous++;
3369
3370 if (object->internal)
3371 vm_pageout_inactive_dirty_internal++;
3372 else
3373 vm_pageout_inactive_dirty_external++;
3374
3375 /*
3376 * do NOT set the pageout bit!
3377 * sure, we might need free pages, but this page is going to take time to become free
3378 * anyway, so we may as well put it on the clean queue first and take it from there later
3379 * if necessary. that way, we'll ensure we don't free up too much. -mj
3380 */
3381 vm_pageout_cluster(m);
3382
3383 done_with_inactivepage:
3384
3385 if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
3386
3387 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3388 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3389 if (try_failed == TRUE)
3390 lck_mtx_yield(&vm_page_queue_lock);
3391 }
3392
3393 /*
3394 * back to top of pageout scan loop
3395 */
3396 }
3397 }
3398
3399
3400 int vm_page_free_count_init;
3401
3402 void
3403 vm_page_free_reserve(
3404 int pages)
3405 {
3406 int free_after_reserve;
3407
3408 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3409
3410 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT))
3411 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3412 else
3413 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3414
3415 } else {
3416 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT)
3417 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3418 else
3419 vm_page_free_reserved += pages;
3420 }
3421 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
3422
3423 vm_page_free_min = vm_page_free_reserved +
3424 VM_PAGE_FREE_MIN(free_after_reserve);
3425
3426 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
3427 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3428
3429 vm_page_free_target = vm_page_free_reserved +
3430 VM_PAGE_FREE_TARGET(free_after_reserve);
3431
3432 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
3433 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3434
3435 if (vm_page_free_target < vm_page_free_min + 5)
3436 vm_page_free_target = vm_page_free_min + 5;
3437
3438 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3439 }
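/*
 * Hypothetical sanity check (not compiled) over the thresholds established
 * by vm_page_free_reserve() above, assuming the *_LIMIT clamps have not
 * inverted the intended ordering: the reserve sits below the wakeup
 * minimum, the minimum sits at least 5 pages below the target, and the
 * throttle limit is half of the target.
 */
#if 0
static void
check_free_threshold_ordering(void)
{
	assert(vm_page_free_reserved <= vm_page_free_min);
	assert(vm_page_free_min + 5 <= vm_page_free_target);
	assert(vm_page_throttle_limit == vm_page_free_target - (vm_page_free_target / 2));
}
#endif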
3440
3441 /*
3442 * vm_pageout is the high level pageout daemon.
3443 */
3444
3445 void
3446 vm_pageout_continue(void)
3447 {
3448 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3449 vm_pageout_scan_event_counter++;
3450
3451 #if !CONFIG_EMBEDDED
3452 lck_mtx_lock(&vm_page_queue_free_lock);
3453 vm_pageout_running = TRUE;
3454 lck_mtx_unlock(&vm_page_queue_free_lock);
3455 #endif /* CONFIG_EMBEDDED */
3456
3457 vm_pageout_scan();
3458 /*
3459 * we hold both the vm_page_queue_free_lock
3460 * and the vm_page_queues_lock at this point
3461 */
3462 assert(vm_page_free_wanted == 0);
3463 assert(vm_page_free_wanted_privileged == 0);
3464 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3465
3466 #if !CONFIG_EMBEDDED
3467 vm_pageout_running = FALSE;
3468 if (vm_pageout_waiter) {
3469 vm_pageout_waiter = FALSE;
3470 thread_wakeup((event_t)&vm_pageout_waiter);
3471 }
3472 #endif /* !CONFIG_EMBEDDED */
3473
3474 lck_mtx_unlock(&vm_page_queue_free_lock);
3475 vm_page_unlock_queues();
3476
3477 counter(c_vm_pageout_block++);
3478 thread_block((thread_continue_t)vm_pageout_continue);
3479 /*NOTREACHED*/
3480 }
3481
3482 #if !CONFIG_EMBEDDED
3483 kern_return_t
3484 vm_pageout_wait(uint64_t deadline)
3485 {
3486 kern_return_t kr;
3487
3488 lck_mtx_lock(&vm_page_queue_free_lock);
3489 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) {
3490 vm_pageout_waiter = TRUE;
3491 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3492 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3493 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3494 kr = KERN_OPERATION_TIMED_OUT;
3495 }
3496 }
3497 lck_mtx_unlock(&vm_page_queue_free_lock);
3498
3499 return (kr);
3500 }
3501 #endif /* !CONFIG_EMBEDDED */
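
/*
 * Usage sketch (hypothetical caller, not part of the original source): wait
 * up to 100ms for the in-flight pageout pass to finish, using the absolute
 * deadline convention that lck_mtx_sleep_deadline() expects above.
 */
#if 0
static void
example_wait_for_pageout_pass(void)
{
	uint64_t deadline;

	clock_interval_to_deadline(100, NSEC_PER_MSEC, &deadline);

	if (vm_pageout_wait(deadline) == KERN_OPERATION_TIMED_OUT) {
		/* the scan is still running; give up rather than block longer */
	}
}
#endif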
3502
3503
3504 static void
3505 vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3506 {
3507 vm_page_t m = NULL;
3508 vm_object_t object;
3509 vm_object_offset_t offset;
3510 memory_object_t pager;
3511
3512 /* If an internal (compressor) pageout thread exists, the external IO
3513 * thread clears its VM privileged bit to accommodate large allocations
3514 * (e.g. bulk UPL creation)
3515 */
3516 if (vm_pageout_internal_iothread != THREAD_NULL)
3517 current_thread()->options &= ~TH_OPT_VMPRIV;
3518
3519 vm_page_lockspin_queues();
3520
3521 while ( !vm_page_queue_empty(&q->pgo_pending) ) {
3522
3523 q->pgo_busy = TRUE;
3524 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3525
3526 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3527 VM_PAGE_CHECK(m);
3528 /*
3529 * grab a snapshot of the object and offset this
3530 * page is tabled in so that we can relookup this
3531 * page after we've taken the object lock - these
3532 * fields are stable while we hold the page queues lock
3533 * but as soon as we drop it, there is nothing to keep
3534 * this page in this object... we hold an activity_in_progress
3535 * on this object which will keep it from terminating
3536 */
3537 object = VM_PAGE_OBJECT(m);
3538 offset = m->offset;
3539
3540 if (object->object_slid) {
3541 panic("slid page %p not allowed on this path\n", m);
3542 }
3543 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3544 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3545
3546 vm_page_unlock_queues();
3547
3548 vm_object_lock(object);
3549
3550 m = vm_page_lookup(object, offset);
3551
3552 if (m == NULL ||
3553 m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
3554 /*
3555 * it's either the same page that someone else has
3556 * started cleaning (or it's finished cleaning or
3557 * been put back on the pageout queue), or
3558 * the page has been freed or we have found a
3559 * new page at this offset... in all of these cases
3560 * we merely need to release the activity_in_progress
3561 * we took when we put the page on the pageout queue
3562 */
3563 vm_object_activity_end(object);
3564 vm_object_unlock(object);
3565
3566 vm_page_lockspin_queues();
3567 continue;
3568 }
3569 pager = object->pager;
3570
3571 if (pager == MEMORY_OBJECT_NULL) {
3572 /*
3573 * This pager has been destroyed by either
3574 * memory_object_destroy or vm_object_destroy, and
3575 * so there is nowhere for the page to go.
3576 */
3577 if (m->free_when_done) {
3578 /*
3579 * Just free the page... VM_PAGE_FREE takes
3580 * care of cleaning up all the state...
3581 * including doing the vm_pageout_throttle_up
3582 */
3583 VM_PAGE_FREE(m);
3584 } else {
3585 vm_page_lockspin_queues();
3586
3587 vm_pageout_throttle_up(m);
3588 vm_page_activate(m);
3589
3590 vm_page_unlock_queues();
3591
3592 /*
3593 * And we are done with it.
3594 */
3595 }
3596 vm_object_activity_end(object);
3597 vm_object_unlock(object);
3598
3599 vm_page_lockspin_queues();
3600 continue;
3601 }
3602 #if 0
3603 /*
3604 * we don't hold the page queue lock
3605 * so this check isn't safe to make
3606 */
3607 VM_PAGE_CHECK(m);
3608 #endif
3609 /*
3610 * give back the activity_in_progress reference we
3611 * took when we queued up this page and replace it
3612 * with a paging_in_progress reference that will
3613 * also keep the paging offset from changing and
3614 * prevent the object from terminating
3615 */
3616 vm_object_activity_end(object);
3617 vm_object_paging_begin(object);
3618 vm_object_unlock(object);
3619
3620 /*
3621 * Send the data to the pager.
3622 * any pageout clustering happens there
3623 */
3624 memory_object_data_return(pager,
3625 m->offset + object->paging_offset,
3626 PAGE_SIZE,
3627 NULL,
3628 NULL,
3629 FALSE,
3630 FALSE,
3631 0);
3632
3633 vm_object_lock(object);
3634 vm_object_paging_end(object);
3635 vm_object_unlock(object);
3636
3637 vm_pageout_io_throttle();
3638
3639 vm_page_lockspin_queues();
3640 }
3641 q->pgo_busy = FALSE;
3642 q->pgo_idle = TRUE;
3643
3644 assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3645 vm_page_unlock_queues();
3646
3647 thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3648 /*NOTREACHED*/
3649 }
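
/*
 * Pattern sketch (illustrative only): the snapshot-and-relookup dance used
 * above.  The page queues lock keeps (object, offset) stable; once it is
 * dropped, only the activity_in_progress reference keeps the object from
 * terminating, so the page must be looked up again under the object lock
 * and may turn out to be gone or replaced.
 */
#if 0
static vm_page_t
example_relookup_after_dropping_queues_lock(vm_page_t m)
{
	vm_object_t		object = VM_PAGE_OBJECT(m);
	vm_object_offset_t	offset = m->offset;

	vm_page_unlock_queues();
	vm_object_lock(object);

	return vm_page_lookup(object, offset);	/* NULL or a different page is possible */
}
#endif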
3650
3651
3652 #define MAX_FREE_BATCH 32
3653 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3654 * this thread.
3655 */
3656
3657
3658 #if DEVELOPMENT || DEBUG
3659 uint64_t compressor_epoch_start, compressor_epoch_stop, compressor_threads_runtime;
3660 #endif
3661
3662 void
3663 vm_pageout_iothread_internal_continue(struct cq *);
3664 void
3665 vm_pageout_iothread_internal_continue(struct cq *cq)
3666 {
3667 struct vm_pageout_queue *q;
3668 vm_page_t m = NULL;
3669 boolean_t pgo_draining;
3670 vm_page_t local_q;
3671 int local_cnt;
3672 vm_page_t local_freeq = NULL;
3673 int local_freed = 0;
3674 int local_batch_size;
3675 int ncomps = 0;
3676 #if DEVELOPMENT || DEBUG
3677 boolean_t marked_active = FALSE;
3678 #endif
3679 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3680
3681 q = cq->q;
3682 local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 2);
3683
3684 #if RECORD_THE_COMPRESSED_DATA
3685 if (q->pgo_laundry)
3686 c_compressed_record_init();
3687 #endif
3688 while (TRUE) {
3689 int pages_left_on_q = 0;
3690
3691 local_cnt = 0;
3692 local_q = NULL;
3693
3694 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3695
3696 vm_page_lock_queues();
3697 #if DEVELOPMENT || DEBUG
3698 if (marked_active == FALSE) {
3699 vmct_active++;
3700 vmct_state[cq->id] = VMCT_ACTIVE;
3701 marked_active = TRUE;
3702 if (vmct_active == 1) {
3703 compressor_epoch_start = mach_absolute_time();
3704 }
3705 }
3706 #endif
3707 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3708
3709 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
3710
3711 while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
3712
3713 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3714 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3715 VM_PAGE_CHECK(m);
3716
3717 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3718 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3719 m->laundry = FALSE;
3720
3721 m->snext = local_q;
3722 local_q = m;
3723 local_cnt++;
3724 }
3725 if (local_q == NULL)
3726 break;
3727
3728 q->pgo_busy = TRUE;
3729
3730 if ((pgo_draining = q->pgo_draining) == FALSE) {
3731 vm_pageout_throttle_up_batch(q, local_cnt);
3732 pages_left_on_q = q->pgo_laundry;
3733 } else
3734 pages_left_on_q = q->pgo_laundry - local_cnt;
3735
3736 vm_page_unlock_queues();
3737
3738 #if !RECORD_THE_COMPRESSED_DATA
3739 if (pages_left_on_q >= local_batch_size && cq->id < (vm_compressor_thread_count - 1)) {
3740 thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
3741 }
3742 #endif
3743 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
3744
3745 while (local_q) {
3746
3747 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
3748
3749 m = local_q;
3750 local_q = m->snext;
3751 m->snext = NULL;
3752
3753 if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) {
3754 ncomps++;
3755 m->snext = local_freeq;
3756 local_freeq = m;
3757 local_freed++;
3758
3759 if (local_freed >= MAX_FREE_BATCH) {
3760 vm_pageout_freed_after_compression += local_freed;
3761
3762 vm_page_free_list(local_freeq, TRUE);
3763 local_freeq = NULL;
3764 local_freed = 0;
3765 }
3766 }
3767 #if !CONFIG_JETSAM
3768 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
3769 kern_return_t wait_result;
3770 int need_wakeup = 0;
3771
3772 if (local_freeq) {
3773 vm_pageout_freed_after_compression += local_freed;
3774
3775 vm_page_free_list(local_freeq, TRUE);
3776 local_freeq = NULL;
3777 local_freed = 0;
3778
3779 continue;
3780 }
3781 lck_mtx_lock_spin(&vm_page_queue_free_lock);
3782
3783 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
3784
3785 if (vm_page_free_wanted_privileged++ == 0)
3786 need_wakeup = 1;
3787 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
3788
3789 lck_mtx_unlock(&vm_page_queue_free_lock);
3790
3791 if (need_wakeup)
3792 thread_wakeup((event_t)&vm_page_free_wanted);
3793
3794 if (wait_result == THREAD_WAITING)
3795
3796 thread_block(THREAD_CONTINUE_NULL);
3797 } else
3798 lck_mtx_unlock(&vm_page_queue_free_lock);
3799 }
3800 #endif
3801 }
3802 if (local_freeq) {
3803 vm_pageout_freed_after_compression += local_freed;
3804
3805 vm_page_free_list(local_freeq, TRUE);
3806 local_freeq = NULL;
3807 local_freed = 0;
3808 }
3809 if (pgo_draining == TRUE) {
3810 vm_page_lockspin_queues();
3811 vm_pageout_throttle_up_batch(q, local_cnt);
3812 vm_page_unlock_queues();
3813 }
3814 }
3815 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
3816
3817 /*
3818 * queue lock is held and our q is empty
3819 */
3820 q->pgo_busy = FALSE;
3821 q->pgo_idle = TRUE;
3822
3823 assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
3824 #if DEVELOPMENT || DEBUG
3825 if (marked_active == TRUE) {
3826 vmct_active--;
3827 vmct_state[cq->id] = VMCT_IDLE;
3828
3829 if (vmct_active == 0) {
3830 compressor_epoch_stop = mach_absolute_time();
3831 assert(compressor_epoch_stop > compressor_epoch_start);
3832 /* This interval includes intervals where one or more
3833 * compressor threads were pre-empted
3834 */
3835 vmct_stats.vmct_cthreads_total += compressor_epoch_stop - compressor_epoch_start;
3836 }
3837
3838 }
3839 #endif
3840 vm_page_unlock_queues();
3841 #if DEVELOPMENT || DEBUG
3842 if (__improbable(vm_compressor_time_thread)) {
3843 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
3844 vmct_stats.vmct_pages[cq->id] += ncomps;
3845 vmct_stats.vmct_iterations[cq->id]++;
3846 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
3847 vmct_stats.vmct_maxpages[cq->id] = ncomps;
3848 }
3849 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
3850 vmct_stats.vmct_minpages[cq->id] = ncomps;
3851 }
3852 }
3853 #endif
3854
3855 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3856
3857 thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
3858 /*NOTREACHED*/
3859 }
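
/*
 * Event-addressing sketch (illustrative only): each compressor thread sleeps
 * on a distinct event derived from the shared queue address plus its id,
 * which is how the wakeup in the loop above can target exactly one sibling
 * thread (cq->id + 1) without disturbing the others.
 */
#if 0
static void
example_wake_one_compressor_thread(struct vm_pageout_queue *q, int id)
{
	thread_wakeup((event_t)((uintptr_t)&q->pgo_pending + id));
}
#endif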
3860
3861
3862 kern_return_t
3863 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, boolean_t object_locked_by_caller)
3864 {
3865 vm_object_t object;
3866 memory_object_t pager;
3867 int compressed_count_delta;
3868 kern_return_t retval;
3869
3870 object = VM_PAGE_OBJECT(m);
3871
3872 if (object->object_slid) {
3873 panic("slid page %p not allowed on this path\n", m);
3874 }
3875 assert(!m->free_when_done);
3876 assert(!m->laundry);
3877
3878 pager = object->pager;
3879
3880 if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)) {
3881
3882 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
3883
3884 vm_object_lock(object);
3885
3886 /*
3887 * If there is no memory object for the page, create
3888 * one and hand it to the compression pager.
3889 */
3890
3891 if (!object->pager_initialized)
3892 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
3893 if (!object->pager_initialized)
3894 vm_object_compressor_pager_create(object);
3895
3896 pager = object->pager;
3897
3898 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
3899 /*
3900 * Still no pager for the object,
3901 * or the pager has been destroyed.
3902 * Reactivate the page.
3903 *
3904 * Should only happen if there is no
3905 * compression pager
3906 */
3907 PAGE_WAKEUP_DONE(m);
3908
3909 vm_page_lockspin_queues();
3910 vm_page_activate(m);
3911 vm_pageout_dirty_no_pager++;
3912 vm_page_unlock_queues();
3913
3914 /*
3915 * And we are done with it.
3916 */
3917 vm_object_activity_end(object);
3918 vm_object_unlock(object);
3919
3920 return KERN_FAILURE;
3921 }
3922 vm_object_unlock(object);
3923
3924 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
3925 }
3926 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
3927
3928 if (object_locked_by_caller == FALSE)
3929 assert(object->activity_in_progress > 0);
3930
3931 retval = vm_compressor_pager_put(
3932 pager,
3933 m->offset + object->paging_offset,
3934 VM_PAGE_GET_PHYS_PAGE(m),
3935 current_chead,
3936 scratch_buf,
3937 &compressed_count_delta);
3938
3939 if (object_locked_by_caller == FALSE) {
3940 vm_object_lock(object);
3941
3942 assert(object->activity_in_progress > 0);
3943 assert(VM_PAGE_OBJECT(m) == object);
3944 }
3945
3946 vm_compressor_pager_count(pager,
3947 compressed_count_delta,
3948 FALSE, /* shared_lock */
3949 object);
3950
3951 assert( !VM_PAGE_WIRED(m));
3952
3953 if (retval == KERN_SUCCESS) {
3954 /*
3955 * If the object is purgeable, its owner's
3956 * purgeable ledgers will be updated in
3957 * vm_page_remove() but the page still
3958 * contributes to the owner's memory footprint,
3959 * so account for it as such.
3960 */
3961 if (object->purgable != VM_PURGABLE_DENY &&
3962 object->vo_purgeable_owner != NULL) {
3963 /* one more compressed purgeable page */
3964 vm_purgeable_compressed_update(object,
3965 +1);
3966 }
3967 VM_STAT_INCR(compressions);
3968
3969 if (m->tabled)
3970 vm_page_remove(m, TRUE);
3971
3972 } else {
3973 PAGE_WAKEUP_DONE(m);
3974
3975 vm_page_lockspin_queues();
3976
3977 vm_page_activate(m);
3978 vm_compressor_failed++;
3979
3980 vm_page_unlock_queues();
3981 }
3982 if (object_locked_by_caller == FALSE) {
3983 vm_object_activity_end(object);
3984 vm_object_unlock(object);
3985 }
3986 return retval;
3987 }
3988
3989
3990 static void
3991 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
3992 {
3993 uint32_t policy;
3994
3995 if (hibernate_cleaning_in_progress == TRUE)
3996 req_lowpriority = FALSE;
3997
3998 if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
3999
4000 vm_page_unlock_queues();
4001
4002 if (req_lowpriority == TRUE) {
4003 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4004 DTRACE_VM(laundrythrottle);
4005 } else {
4006 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4007 DTRACE_VM(laundryunthrottle);
4008 }
4009 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4010 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4011
4012 eq->pgo_lowpriority = req_lowpriority;
4013
4014 vm_page_lock_queues();
4015 }
4016 }
4017
4018
4019 static void
4020 vm_pageout_iothread_external(void)
4021 {
4022 thread_t self = current_thread();
4023
4024 self->options |= TH_OPT_VMPRIV;
4025
4026 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4027
4028 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4029 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4030
4031 vm_page_lock_queues();
4032
4033 vm_pageout_queue_external.pgo_tid = self->thread_id;
4034 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4035 vm_pageout_queue_external.pgo_inited = TRUE;
4036
4037 vm_page_unlock_queues();
4038
4039 vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
4040
4041 /*NOTREACHED*/
4042 }
4043
4044
4045 static void
4046 vm_pageout_iothread_internal(struct cq *cq)
4047 {
4048 thread_t self = current_thread();
4049
4050 self->options |= TH_OPT_VMPRIV;
4051
4052 vm_page_lock_queues();
4053
4054 vm_pageout_queue_internal.pgo_tid = self->thread_id;
4055 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4056 vm_pageout_queue_internal.pgo_inited = TRUE;
4057
4058 vm_page_unlock_queues();
4059
4060 if (vm_restricted_to_single_processor == TRUE)
4061 thread_vm_bind_group_add();
4062
4063
4064 thread_set_thread_name(current_thread(), "VM_compressor");
4065 #if DEVELOPMENT || DEBUG
4066 vmct_stats.vmct_minpages[cq->id] = INT32_MAX;
4067 #endif
4068 vm_pageout_iothread_internal_continue(cq);
4069
4070 /*NOTREACHED*/
4071 }
4072
4073 kern_return_t
4074 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4075 {
4076 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
4077 return KERN_SUCCESS;
4078 } else {
4079 return KERN_FAILURE; /* Already set */
4080 }
4081 }
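
/*
 * Registration sketch (hypothetical callback, not the real BSD-side code):
 * the callout slot is set with a single compare-and-swap, so only the first
 * registration succeeds and later ones get KERN_FAILURE.
 */
#if 0
static boolean_t
example_buffer_cache_collect(int all)
{
	/* return TRUE if a large zone-backed buffer was released */
	return (all != 0);
}

static void
example_register_buffer_cleanup(void)
{
	if (vm_set_buffer_cleanup_callout(example_buffer_cache_collect) != KERN_SUCCESS) {
		/* somebody registered a callout before us; nothing to do */
	}
}
#endif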
4082
4083 extern boolean_t memorystatus_manual_testing_on;
4084 extern unsigned int memorystatus_level;
4085
4086
4087 #if VM_PRESSURE_EVENTS
4088
4089 boolean_t vm_pressure_events_enabled = FALSE;
4090
4091 void
4092 vm_pressure_response(void)
4093 {
4094
4095 vm_pressure_level_t old_level = kVMPressureNormal;
4096 int new_level = -1;
4097 unsigned int total_pages;
4098 uint64_t available_memory = 0;
4099
4100 if (vm_pressure_events_enabled == FALSE)
4101 return;
4102
4103 #if CONFIG_EMBEDDED
4104
4105 available_memory = (uint64_t) memorystatus_available_pages;
4106
4107 #else /* CONFIG_EMBEDDED */
4108
4109 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4110 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4111
4112 #endif /* CONFIG_EMBEDDED */
4113
4114 total_pages = (unsigned int) atop_64(max_mem);
4115 #if CONFIG_SECLUDED_MEMORY
4116 total_pages -= vm_page_secluded_count;
4117 #endif /* CONFIG_SECLUDED_MEMORY */
4118 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4119
4120 if (memorystatus_manual_testing_on) {
4121 return;
4122 }
4123
4124 old_level = memorystatus_vm_pressure_level;
4125
4126 switch (memorystatus_vm_pressure_level) {
4127
4128 case kVMPressureNormal:
4129 {
4130 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4131 new_level = kVMPressureCritical;
4132 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4133 new_level = kVMPressureWarning;
4134 }
4135 break;
4136 }
4137
4138 case kVMPressureWarning:
4139 case kVMPressureUrgent:
4140 {
4141 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4142 new_level = kVMPressureNormal;
4143 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4144 new_level = kVMPressureCritical;
4145 }
4146 break;
4147 }
4148
4149 case kVMPressureCritical:
4150 {
4151 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4152 new_level = kVMPressureNormal;
4153 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4154 new_level = kVMPressureWarning;
4155 }
4156 break;
4157 }
4158
4159 default:
4160 return;
4161 }
4162
4163 if (new_level != -1) {
4164 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4165
4166 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != new_level)) {
4167 if (vm_pressure_thread_running == FALSE) {
4168 thread_wakeup(&vm_pressure_thread);
4169 }
4170
4171 if (old_level != new_level) {
4172 thread_wakeup(&vm_pressure_changed);
4173 }
4174 }
4175 }
4176
4177 }
4178 #endif /* VM_PRESSURE_EVENTS */
4179
4180 kern_return_t
4181 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) {
4182
4183 #if CONFIG_EMBEDDED
4184
4185 return KERN_FAILURE;
4186
4187 #elif !VM_PRESSURE_EVENTS
4188
4189 return KERN_FAILURE;
4190
4191 #else /* VM_PRESSURE_EVENTS */
4192
4193 kern_return_t kr = KERN_SUCCESS;
4194
4195 if (pressure_level != NULL) {
4196
4197 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4198
4199 if (wait_for_pressure == TRUE) {
4200 wait_result_t wr = 0;
4201
4202 while (old_level == *pressure_level) {
4203 wr = assert_wait((event_t) &vm_pressure_changed,
4204 THREAD_INTERRUPTIBLE);
4205 if (wr == THREAD_WAITING) {
4206 wr = thread_block(THREAD_CONTINUE_NULL);
4207 }
4208 if (wr == THREAD_INTERRUPTED) {
4209 return KERN_ABORTED;
4210 }
4211 if (wr == THREAD_AWAKENED) {
4212
4213 old_level = memorystatus_vm_pressure_level;
4214
4215 if (old_level != *pressure_level) {
4216 break;
4217 }
4218 }
4219 }
4220 }
4221
4222 *pressure_level = old_level;
4223 kr = KERN_SUCCESS;
4224 } else {
4225 kr = KERN_INVALID_ARGUMENT;
4226 }
4227
4228 return kr;
4229 #endif /* VM_PRESSURE_EVENTS */
4230 }
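
/*
 * Usage sketch (hypothetical caller): poll the current pressure level once,
 * then block until it changes.  The value comes back as a vm_pressure_level_t
 * (kVMPressureNormal and friends); KERN_ABORTED means the wait was
 * interrupted and the caller should retry or bail out.
 */
#if 0
static void
example_watch_pressure_level(void)
{
	unsigned int level = 0;

	/* non-blocking query of the current level */
	if (mach_vm_pressure_level_monitor(FALSE, &level) != KERN_SUCCESS)
		return;

	/* block until the level moves away from what we last saw */
	if (mach_vm_pressure_level_monitor(TRUE, &level) == KERN_SUCCESS) {
		/* `level' now holds the new pressure level */
	}
}
#endif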
4231
4232 #if VM_PRESSURE_EVENTS
4233 void
4234 vm_pressure_thread(void) {
4235 static boolean_t thread_initialized = FALSE;
4236
4237 if (thread_initialized == TRUE) {
4238 vm_pressure_thread_running = TRUE;
4239 consider_vm_pressure_events();
4240 vm_pressure_thread_running = FALSE;
4241 }
4242
4243 thread_initialized = TRUE;
4244 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4245 thread_block((thread_continue_t)vm_pressure_thread);
4246 }
4247 #endif /* VM_PRESSURE_EVENTS */
4248
4249
4250 uint32_t vm_pageout_considered_page_last = 0;
4251
4252 /*
4253 * called once per-second via "compute_averages"
4254 */
4255 void
4256 compute_pageout_gc_throttle(__unused void *arg)
4257 {
4258 if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
4259
4260 vm_pageout_considered_page_last = vm_pageout_considered_page;
4261
4262 thread_wakeup((event_t) &vm_pageout_garbage_collect);
4263 }
4264 }
4265
4266 /*
4267 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4268 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4269 * jetsams. We need to check if the zone map size is above its jetsam limit to
4270 * decide if this was indeed the case.
4271 *
4272 * We need to do this on a different thread because of the following reasons:
4273 *
4274 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4275 * itself causing the system to hang. We perform synchronous jetsams if we're
4276 * leaking in the VM map entries zone, so the leaking process could be doing a
4277 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4278 * jetsam itself. We also need the vm_map lock on the process termination path,
4279 * which would now lead the dying process to deadlock against itself.
4280 *
4281 * 2. The jetsam path might need to allocate zone memory itself. We could try
4282 * using the non-blocking variant of zalloc for this path, but we can still
4283 * end up trying to do a kernel_memory_allocate when the zone_map is almost
4284 * full.
4285 */
4286
4287 extern boolean_t is_zone_map_nearing_exhaustion(void);
4288
4289 void
4290 vm_pageout_garbage_collect(int collect)
4291 {
4292 if (collect) {
4293 if (is_zone_map_nearing_exhaustion()) {
4294 /*
4295 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4296 *
4297 * Bail out after calling zone_gc (which triggers the
4298 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4299 * operations that clear out a bunch of caches might allocate zone
4300 * memory themselves (for eg. vm_map operations would need VM map
4301 * entries). Since the zone map is almost full at this point, we
4302 * could end up with a panic. We just need to quickly jetsam a
4303 * process and exit here.
4304 *
4305 * It could so happen that we were woken up to relieve memory
4306 * pressure and the zone map also happened to be near its limit at
4307 * the time, in which case we'll skip out early. But that should be
4308 * ok; if memory pressure persists, the thread will simply be woken
4309 * up again.
4310 */
4311 consider_zone_gc(TRUE);
4312
4313 } else {
4314 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4315 boolean_t buf_large_zfree = FALSE;
4316 boolean_t first_try = TRUE;
4317
4318 stack_collect();
4319
4320 consider_machine_collect();
4321 m_drain();
4322
4323 do {
4324 if (consider_buffer_cache_collect != NULL) {
4325 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4326 }
4327 if (first_try == TRUE || buf_large_zfree == TRUE) {
4328 /*
4329 * consider_zone_gc should be last, because the other operations
4330 * might return memory to zones.
4331 */
4332 consider_zone_gc(FALSE);
4333 }
4334 first_try = FALSE;
4335
4336 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4337
4338 consider_machine_adjust();
4339 }
4340 }
4341
4342 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
4343
4344 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
4345 /*NOTREACHED*/
4346 }
4347
4348
4349 #if VM_PAGE_BUCKETS_CHECK
4350 #if VM_PAGE_FAKE_BUCKETS
4351 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4352 #endif /* VM_PAGE_FAKE_BUCKETS */
4353 #endif /* VM_PAGE_BUCKETS_CHECK */
4354
4355
4356
4357 void
4358 vm_set_restrictions()
4359 {
4360 host_basic_info_data_t hinfo;
4361 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
4362
4363 #define BSD_HOST 1
4364 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
4365
4366 assert(hinfo.max_cpus > 0);
4367
4368 if (hinfo.max_cpus <= 3) {
4369 /*
4370 * on systems with a limited number of CPUs, bind the
4371 * 4 major threads that can free memory and that tend to use
4372 * a fair bit of CPU under pressured conditions to a single processor.
4373 * This ensures that these threads don't hog all of the available CPUs
4374 * (important for camera launch), while allowing them to run independently
4375 * with respect to locks... the 4 threads are
4376 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4377 * vm_compressor_swap_trigger_thread (minor and major compactions),
4378 * memorystatus_thread (jetsams).
4379 *
4380 * the first time the thread is run, it is responsible for checking the
4381 * state of vm_restricted_to_single_processor, and if TRUE it calls
4382 * thread_bind_master... someday this should be replaced with a group
4383 * scheduling mechanism and KPI.
4384 */
4385 vm_restricted_to_single_processor = TRUE;
4386 }
4387 }
4388
4389 void
4390 vm_pageout(void)
4391 {
4392 thread_t self = current_thread();
4393 thread_t thread;
4394 kern_return_t result;
4395 spl_t s;
4396
4397 /*
4398 * Set thread privileges.
4399 */
4400 s = splsched();
4401
4402 thread_lock(self);
4403 self->options |= TH_OPT_VMPRIV;
4404 sched_set_thread_base_priority(self, BASEPRI_VM);
4405 thread_unlock(self);
4406
4407 if (!self->reserved_stack)
4408 self->reserved_stack = self->kernel_stack;
4409
4410 if (vm_restricted_to_single_processor == TRUE)
4411 thread_vm_bind_group_add();
4412
4413 splx(s);
4414
4415 thread_set_thread_name(current_thread(), "VM_pageout_scan");
4416
4417 /*
4418 * Initialize some paging parameters.
4419 */
4420
4421 if (vm_pageout_swap_wait == 0)
4422 vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
4423
4424 if (vm_pageout_idle_wait == 0)
4425 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
4426
4427 if (vm_pageout_burst_wait == 0)
4428 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
4429
4430 if (vm_pageout_empty_wait == 0)
4431 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
4432
4433 if (vm_pageout_deadlock_wait == 0)
4434 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
4435
4436 if (vm_pageout_deadlock_relief == 0)
4437 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
4438
4439 if (vm_pageout_inactive_relief == 0)
4440 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
4441
4442 if (vm_pageout_burst_active_throttle == 0)
4443 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
4444
4445 if (vm_pageout_burst_inactive_throttle == 0)
4446 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
4447
4448 /*
4449 * Set kernel task to low backing store privileged
4450 * status
4451 */
4452 task_lock(kernel_task);
4453 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
4454 task_unlock(kernel_task);
4455
4456 vm_page_free_count_init = vm_page_free_count;
4457
4458 /*
4459 * even if we've already called vm_page_free_reserve,
4460 * call it again here to ensure that the targets are
4461 * accurately calculated (it uses vm_page_free_count_init)...
4462 * calling it with an arg of 0 will not change the reserve
4463 * but will re-calculate free_min and free_target
4464 */
4465 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
4466 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
4467 } else
4468 vm_page_free_reserve(0);
4469
4470
4471 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
4472 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
4473 vm_pageout_queue_external.pgo_laundry = 0;
4474 vm_pageout_queue_external.pgo_idle = FALSE;
4475 vm_pageout_queue_external.pgo_busy = FALSE;
4476 vm_pageout_queue_external.pgo_throttled = FALSE;
4477 vm_pageout_queue_external.pgo_draining = FALSE;
4478 vm_pageout_queue_external.pgo_lowpriority = FALSE;
4479 vm_pageout_queue_external.pgo_tid = -1;
4480 vm_pageout_queue_external.pgo_inited = FALSE;
4481
4482 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
4483 vm_pageout_queue_internal.pgo_maxlaundry = 0;
4484 vm_pageout_queue_internal.pgo_laundry = 0;
4485 vm_pageout_queue_internal.pgo_idle = FALSE;
4486 vm_pageout_queue_internal.pgo_busy = FALSE;
4487 vm_pageout_queue_internal.pgo_throttled = FALSE;
4488 vm_pageout_queue_internal.pgo_draining = FALSE;
4489 vm_pageout_queue_internal.pgo_lowpriority = FALSE;
4490 vm_pageout_queue_internal.pgo_tid = -1;
4491 vm_pageout_queue_internal.pgo_inited = FALSE;
4492
4493 /* internal pageout thread started when default pager registered first time */
4494 /* external pageout and garbage collection threads started here */
4495
4496 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
4497 BASEPRI_VM,
4498 &vm_pageout_external_iothread);
4499 if (result != KERN_SUCCESS)
4500 panic("vm_pageout_iothread_external: create failed");
4501
4502 thread_deallocate(vm_pageout_external_iothread);
4503
4504 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
4505 BASEPRI_DEFAULT,
4506 &thread);
4507 if (result != KERN_SUCCESS)
4508 panic("vm_pageout_garbage_collect: create failed");
4509
4510 thread_deallocate(thread);
4511
4512 #if VM_PRESSURE_EVENTS
4513 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
4514 BASEPRI_DEFAULT,
4515 &thread);
4516
4517 if (result != KERN_SUCCESS)
4518 panic("vm_pressure_thread: create failed");
4519
4520 thread_deallocate(thread);
4521 #endif
4522
4523 vm_object_reaper_init();
4524
4525
4526 bzero(&vm_config, sizeof(vm_config));
4527
4528 switch(vm_compressor_mode) {
4529
4530 case VM_PAGER_DEFAULT:
4531 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4532
4533 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4534 vm_config.compressor_is_present = TRUE;
4535 vm_config.swap_is_present = TRUE;
4536 vm_config.compressor_is_active = TRUE;
4537 vm_config.swap_is_active = TRUE;
4538 break;
4539
4540 case VM_PAGER_COMPRESSOR_NO_SWAP:
4541 vm_config.compressor_is_present = TRUE;
4542 vm_config.swap_is_present = TRUE;
4543 vm_config.compressor_is_active = TRUE;
4544 break;
4545
4546 case VM_PAGER_FREEZER_DEFAULT:
4547 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4548
4549 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4550 vm_config.compressor_is_present = TRUE;
4551 vm_config.swap_is_present = TRUE;
4552 break;
4553
4554 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4555 vm_config.compressor_is_present = TRUE;
4556 vm_config.swap_is_present = TRUE;
4557 vm_config.compressor_is_active = TRUE;
4558 vm_config.freezer_swap_is_active = TRUE;
4559 break;
4560
4561 case VM_PAGER_NOT_CONFIGURED:
4562 break;
4563
4564 default:
4565 printf("unknown compressor mode - %x\n", vm_compressor_mode);
4566 break;
4567 }
4568 if (VM_CONFIG_COMPRESSOR_IS_PRESENT)
4569 vm_compressor_pager_init();
4570
4571 #if VM_PRESSURE_EVENTS
4572 vm_pressure_events_enabled = TRUE;
4573 #endif /* VM_PRESSURE_EVENTS */
4574
4575 #if CONFIG_PHANTOM_CACHE
4576 vm_phantom_cache_init();
4577 #endif
4578 #if VM_PAGE_BUCKETS_CHECK
4579 #if VM_PAGE_FAKE_BUCKETS
4580 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
4581 (uint64_t) vm_page_fake_buckets_start,
4582 (uint64_t) vm_page_fake_buckets_end);
4583 pmap_protect(kernel_pmap,
4584 vm_page_fake_buckets_start,
4585 vm_page_fake_buckets_end,
4586 VM_PROT_READ);
4587 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
4588 #endif /* VM_PAGE_FAKE_BUCKETS */
4589 #endif /* VM_PAGE_BUCKETS_CHECK */
4590
4591 #if VM_OBJECT_TRACKING
4592 vm_object_tracking_init();
4593 #endif /* VM_OBJECT_TRACKING */
4594
4595 vm_tests();
4596
4597 vm_pageout_continue();
4598
4599 /*
4600 * Unreached code!
4601 *
4602 * The vm_pageout_continue() call above never returns, so the code below is never
4603 * executed. We take advantage of this to declare several DTrace VM related probe
4604 * points that our kernel doesn't have an analog for. These are probe points that
4605 * exist in Solaris and are in the DTrace documentation, so people may have written
4606 * scripts that use them. Declaring the probe points here means their scripts will
4607 * compile and execute which we want for portability of the scripts, but since this
4608 * section of code is never reached, the probe points will simply never fire. Yes,
4609 * this is basically a hack. The problem is the DTrace probe points were chosen with
4610 * Solaris specific VM events in mind, not portability to different VM implementations.
4611 */
4612
4613 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
4614 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
4615 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
4616 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
4617 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
4618 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
4619 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
4620 /*NOTREACHED*/
4621 }
4622
4623
4624
4625 #if CONFIG_EMBEDDED
4626 int vm_compressor_thread_count = 1;
4627 #else
4628 int vm_compressor_thread_count = 2;
4629 #endif
4630
4631 kern_return_t
4632 vm_pageout_internal_start(void)
4633 {
4634 kern_return_t result;
4635 int i;
4636 host_basic_info_data_t hinfo;
4637
4638 assert (VM_CONFIG_COMPRESSOR_IS_PRESENT);
4639
4640 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
4641 #define BSD_HOST 1
4642 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
4643
4644 assert(hinfo.max_cpus > 0);
4645
4646 PE_parse_boot_argn("vmcomp_threads", &vm_compressor_thread_count, sizeof(vm_compressor_thread_count));
4647 if (vm_compressor_thread_count >= hinfo.max_cpus)
4648 vm_compressor_thread_count = hinfo.max_cpus - 1;
4649 if (vm_compressor_thread_count <= 0)
4650 vm_compressor_thread_count = 1;
4651 else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
4652 vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
4653
4654 vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
4655
4656 PE_parse_boot_argn("vmpgoi_maxlaundry", &vm_pageout_queue_internal.pgo_maxlaundry, sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
4657
4658 for (i = 0; i < vm_compressor_thread_count; i++) {
4659 ciq[i].id = i;
4660 ciq[i].q = &vm_pageout_queue_internal;
4661 ciq[i].current_chead = NULL;
4662 ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE);
4663
4664 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_VM, &vm_pageout_internal_iothread);
4665
4666 if (result == KERN_SUCCESS)
4667 thread_deallocate(vm_pageout_internal_iothread);
4668 else
4669 break;
4670 }
4671 return result;
4672 }
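
/*
 * Worked example (hypothetical values): with hinfo.max_cpus == 4 and a
 * boot-arg of vmcomp_threads=8, the thread count is clamped to
 * max_cpus - 1 == 3 (and further capped by MAX_COMPRESSOR_THREAD_COUNT if
 * that is smaller), so the internal queue's pgo_maxlaundry becomes
 * (3 * 4) * VM_PAGE_LAUNDRY_MAX unless vmpgoi_maxlaundry overrides it.
 */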
4673
4674 #if CONFIG_IOSCHED
4675 /*
4676 * To support I/O Expedite for compressed files we mark the upls with special flags.
4677 * The way decmpfs works is that we create a big upl which marks all the pages needed to
4678 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
4679 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
4680 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
4681 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
4682 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
4683 * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
4684 * unless the real I/O upl is being destroyed).
4685 */
4686
4687
4688 static void
4689 upl_set_decmp_info(upl_t upl, upl_t src_upl)
4690 {
4691 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
4692
4693 upl_lock(src_upl);
4694 if (src_upl->decmp_io_upl) {
4695 /*
4696 * If there is already an alive real I/O UPL, ignore this new UPL.
4697 * This case should rarely happen and even if it does, it just means
4698 * that we might issue a spurious expedite which the driver is expected
4699 * to handle.
4700 */
4701 upl_unlock(src_upl);
4702 return;
4703 }
4704 src_upl->decmp_io_upl = (void *)upl;
4705 src_upl->ref_count++;
4706
4707 upl->flags |= UPL_DECMP_REAL_IO;
4708 upl->decmp_io_upl = (void *)src_upl;
4709 upl_unlock(src_upl);
4710 }
4711 #endif /* CONFIG_IOSCHED */
4712
4713 #if UPL_DEBUG
4714 int upl_debug_enabled = 1;
4715 #else
4716 int upl_debug_enabled = 0;
4717 #endif
4718
4719 static upl_t
4720 upl_create(int type, int flags, upl_size_t size)
4721 {
4722 upl_t upl;
4723 vm_size_t page_field_size = 0;
4724 int upl_flags = 0;
4725 vm_size_t upl_size = sizeof(struct upl);
4726
4727 size = round_page_32(size);
4728
4729 if (type & UPL_CREATE_LITE) {
4730 page_field_size = (atop(size) + 7) >> 3;
4731 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
4732
4733 upl_flags |= UPL_LITE;
4734 }
4735 if (type & UPL_CREATE_INTERNAL) {
4736 upl_size += sizeof(struct upl_page_info) * atop(size);
4737
4738 upl_flags |= UPL_INTERNAL;
4739 }
4740 upl = (upl_t)kalloc(upl_size + page_field_size);
4741
4742 if (page_field_size)
4743 bzero((char *)upl + upl_size, page_field_size);
4744
4745 upl->flags = upl_flags | flags;
4746 upl->kaddr = (vm_offset_t)0;
4747 upl->size = 0;
4748 upl->map_object = NULL;
4749 upl->ref_count = 1;
4750 upl->ext_ref_count = 0;
4751 upl->highest_page = 0;
4752 upl_lock_init(upl);
4753 upl->vector_upl = NULL;
4754 upl->associated_upl = NULL;
4755 #if CONFIG_IOSCHED
4756 if (type & UPL_CREATE_IO_TRACKING) {
4757 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
4758 }
4759
4760 upl->upl_reprio_info = 0;
4761 upl->decmp_io_upl = 0;
4762 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
4763 /* Only support expedite on internal UPLs */
4764 thread_t curthread = current_thread();
4765 upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size));
4766 bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size)));
4767 upl->flags |= UPL_EXPEDITE_SUPPORTED;
4768 if (curthread->decmp_upl != NULL)
4769 upl_set_decmp_info(upl, curthread->decmp_upl);
4770 }
4771 #endif
4772 #if CONFIG_IOSCHED || UPL_DEBUG
4773 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
4774 upl->upl_creator = current_thread();
4775 upl->uplq.next = 0;
4776 upl->uplq.prev = 0;
4777 upl->flags |= UPL_TRACKED_BY_OBJECT;
4778 }
4779 #endif
4780
4781 #if UPL_DEBUG
4782 upl->ubc_alias1 = 0;
4783 upl->ubc_alias2 = 0;
4784
4785 upl->upl_state = 0;
4786 upl->upl_commit_index = 0;
4787 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
4788
4789 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4790 #endif /* UPL_DEBUG */
4791
4792 return(upl);
4793 }
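
/*
 * Layout sketch (illustrative only): for an INTERNAL | LITE UPL, the single
 * kalloc() block built above is carved up exactly the way
 * vm_object_upl_request() below recomputes the pointers: the struct upl,
 * then the upl_page_info_t array, then the lite bitmap.
 */
#if 0
static void
example_internal_lite_upl_layout(upl_t upl, upl_size_t size)
{
	upl_page_info_t	*page_list;
	wpl_array_t	lite_list;

	page_list = (upl_page_info_t *)(((uintptr_t)upl) + sizeof(struct upl));
	lite_list = (wpl_array_t)(((uintptr_t)page_list) +
	    ((size / PAGE_SIZE) * sizeof(upl_page_info_t)));

	(void)page_list;
	(void)lite_list;
}
#endif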
4794
4795 static void
4796 upl_destroy(upl_t upl)
4797 {
4798 int page_field_size; /* bit field in word size buf */
4799 int size;
4800
4801 if (upl->ext_ref_count) {
4802 panic("upl(%p) ext_ref_count", upl);
4803 }
4804
4805 #if CONFIG_IOSCHED
4806 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
4807 upl_t src_upl;
4808 src_upl = upl->decmp_io_upl;
4809 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
4810 upl_lock(src_upl);
4811 src_upl->decmp_io_upl = NULL;
4812 upl_unlock(src_upl);
4813 upl_deallocate(src_upl);
4814 }
4815 #endif /* CONFIG_IOSCHED */
4816
4817 #if CONFIG_IOSCHED || UPL_DEBUG
4818 if ((upl->flags & UPL_TRACKED_BY_OBJECT) && !(upl->flags & UPL_VECTOR)) {
4819 vm_object_t object;
4820
4821 if (upl->flags & UPL_SHADOWED) {
4822 object = upl->map_object->shadow;
4823 } else {
4824 object = upl->map_object;
4825 }
4826
4827 vm_object_lock(object);
4828 queue_remove(&object->uplq, upl, upl_t, uplq);
4829 vm_object_activity_end(object);
4830 vm_object_collapse(object, 0, TRUE);
4831 vm_object_unlock(object);
4832 }
4833 #endif
4834 /*
4835 * drop a reference on the map_object whether or
4836 * not a pageout object is inserted
4837 */
4838 if (upl->flags & UPL_SHADOWED)
4839 vm_object_deallocate(upl->map_object);
4840
4841 if (upl->flags & UPL_DEVICE_MEMORY)
4842 size = PAGE_SIZE;
4843 else
4844 size = upl->size;
4845 page_field_size = 0;
4846
4847 if (upl->flags & UPL_LITE) {
4848 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4849 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
4850 }
4851 upl_lock_destroy(upl);
4852 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
4853
4854 #if CONFIG_IOSCHED
4855 if (upl->flags & UPL_EXPEDITE_SUPPORTED)
4856 kfree(upl->upl_reprio_info, sizeof(uint64_t) * (size/PAGE_SIZE));
4857 #endif
4858
4859 if (upl->flags & UPL_INTERNAL) {
4860 kfree(upl,
4861 sizeof(struct upl) +
4862 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
4863 + page_field_size);
4864 } else {
4865 kfree(upl, sizeof(struct upl) + page_field_size);
4866 }
4867 }
4868
4869 void
4870 upl_deallocate(upl_t upl)
4871 {
4872 upl_lock(upl);
4873 if (--upl->ref_count == 0) {
4874 if(vector_upl_is_valid(upl))
4875 vector_upl_deallocate(upl);
4876 upl_unlock(upl);
4877 upl_destroy(upl);
4878 }
4879 else
4880 upl_unlock(upl);
4881 }
4882
4883 #if CONFIG_IOSCHED
4884 void
4885 upl_mark_decmp(upl_t upl)
4886 {
4887 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
4888 upl->flags |= UPL_DECMP_REQ;
4889 upl->upl_creator->decmp_upl = (void *)upl;
4890 }
4891 }
4892
4893 void
4894 upl_unmark_decmp(upl_t upl)
4895 {
4896 if(upl && (upl->flags & UPL_DECMP_REQ)) {
4897 upl->upl_creator->decmp_upl = NULL;
4898 }
4899 }
4900
4901 #endif /* CONFIG_IOSCHED */
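
/*
 * Flow sketch (hypothetical decmpfs-side caller): the big request UPL is
 * marked on the creating thread, any real-I/O UPLs created by that thread
 * while the mark is in place get linked back to it by upl_create(), and
 * the mark is cleared once the compressed reads have been issued.
 */
#if 0
static void
example_decmp_request(upl_t req_upl)
{
	upl_mark_decmp(req_upl);	/* subsequent UPLs from this thread are real I/O for req_upl */

	/* ... create and issue the smaller real-I/O UPLs here ... */

	upl_unmark_decmp(req_upl);	/* done issuing I/O on behalf of req_upl */
}
#endif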
4902
4903 #define VM_PAGE_Q_BACKING_UP(q) \
4904 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
4905
4906 boolean_t must_throttle_writes(void);
4907
4908 boolean_t
4909 must_throttle_writes()
4910 {
4911 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
4912 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10)
4913 return (TRUE);
4914
4915 return (FALSE);
4916 }
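
/*
 * Usage sketch (illustrative only): vm_object_upl_request() below is the real
 * consumer.  "Backing up" means the external queue's laundry count has
 * reached 80% of pgo_maxlaundry; when that happens and external pageable
 * pages exceed roughly 60% of available non-compressed memory, a writer
 * simply stalls, with per-page delays mirroring the values used below.
 */
#if 0
static void
example_throttle_writer(unsigned int size_in_pages, boolean_t isSSD)
{
	if (must_throttle_writes() == TRUE)
		delay((isSSD ? 1000 : 5000) * size_in_pages);	/* microseconds per page */
}
#endif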
4917
4918
4919 #if DEVELOPMENT || DEBUG
4920 /*
4921 * Statistics about UPL enforcement of copy-on-write obligations.
4922 */
4923 unsigned long upl_cow = 0;
4924 unsigned long upl_cow_again = 0;
4925 unsigned long upl_cow_pages = 0;
4926 unsigned long upl_cow_again_pages = 0;
4927
4928 unsigned long iopl_cow = 0;
4929 unsigned long iopl_cow_pages = 0;
4930 #endif
4931
4932 /*
4933 * Routine: vm_object_upl_request
4934 * Purpose:
4935 * Cause the population of a portion of a vm_object.
4936 * Depending on the nature of the request, the pages
4937 * returned may contain valid data or be uninitialized.
4938 * A page list structure, listing the physical pages,
4939 * will be returned upon request.
4940 * This function is called by the file system or any other
4941 * supplier of backing store to a pager.
4942 * IMPORTANT NOTE: The caller must still respect the relationship
4943 * between the vm_object and its backing memory object. The
4944 * caller MUST NOT substitute changes in the backing file
4945 * without first doing a memory_object_lock_request on the
4946 * target range unless it is known that the pages are not
4947 * shared with another entity at the pager level.
4948 * Copy_in_to:
4949 * if a page list structure is present
4950 * return the mapped physical pages, where a
4951 * page is not present, return a non-initialized
4952 * one. If the no_sync bit is turned on, don't
4953 * call the pager unlock to synchronize with other
4954 * possible copies of the page. Leave pages busy
4955 * in the original object, if a page list structure
4956 * was specified. When a commit of the page list
4957 * pages is done, the dirty bit will be set for each one.
4958 * Copy_out_from:
4959 * If a page list structure is present, return
4960 * all mapped pages. Where a page does not exist
4961 * map a zero filled one. Leave pages busy in
4962 * the original object. If a page list structure
4963 * is not specified, this call is a no-op.
4964 *
4965 * Note: access of default pager objects has a rather interesting
4966 * twist. The caller of this routine, presumably the file system
4967 * page cache handling code, will never actually make a request
4968 * against a default pager backed object. Only the default
4969 * pager will make requests on backing store related vm_objects.
4970 * In this way the default pager can maintain the relationship
4971 * between backing store files (abstract memory objects) and
4972 * the vm_objects (cache objects) they support.
4973 *
4974 */
4975
4976 __private_extern__ kern_return_t
4977 vm_object_upl_request(
4978 vm_object_t object,
4979 vm_object_offset_t offset,
4980 upl_size_t size,
4981 upl_t *upl_ptr,
4982 upl_page_info_array_t user_page_list,
4983 unsigned int *page_list_count,
4984 upl_control_flags_t cntrl_flags,
4985 vm_tag_t tag)
4986 {
4987 vm_page_t dst_page = VM_PAGE_NULL;
4988 vm_object_offset_t dst_offset;
4989 upl_size_t xfer_size;
4990 unsigned int size_in_pages;
4991 boolean_t dirty;
4992 boolean_t hw_dirty;
4993 upl_t upl = NULL;
4994 unsigned int entry;
4995 #if MACH_CLUSTER_STATS
4996 boolean_t encountered_lrp = FALSE;
4997 #endif
4998 vm_page_t alias_page = NULL;
4999 int refmod_state = 0;
5000 wpl_array_t lite_list = NULL;
5001 vm_object_t last_copy_object;
5002 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5003 struct vm_page_delayed_work *dwp;
5004 int dw_count;
5005 int dw_limit;
5006 int io_tracking_flag = 0;
5007 int grab_options;
5008 ppnum_t phys_page;
5009
5010 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5011 /*
5012 * For forward compatibility's sake,
5013 * reject any unknown flag.
5014 */
5015 return KERN_INVALID_VALUE;
5016 }
5017 if ( (!object->internal) && (object->paging_offset != 0) )
5018 panic("vm_object_upl_request: external object with non-zero paging offset\n");
5019 if (object->phys_contiguous)
5020 panic("vm_object_upl_request: contiguous object specified\n");
5021
5022
5023 if (size > MAX_UPL_SIZE_BYTES)
5024 size = MAX_UPL_SIZE_BYTES;
5025
5026 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
5027 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5028
5029 #if CONFIG_IOSCHED || UPL_DEBUG
5030 if (object->io_tracking || upl_debug_enabled)
5031 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5032 #endif
5033 #if CONFIG_IOSCHED
5034 if (object->io_tracking)
5035 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5036 #endif
5037
5038 if (cntrl_flags & UPL_SET_INTERNAL) {
5039 if (cntrl_flags & UPL_SET_LITE) {
5040
5041 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5042
5043 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5044 lite_list = (wpl_array_t)
5045 (((uintptr_t)user_page_list) +
5046 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5047 if (size == 0) {
5048 user_page_list = NULL;
5049 lite_list = NULL;
5050 }
5051 } else {
5052 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5053
5054 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5055 if (size == 0) {
5056 user_page_list = NULL;
5057 }
5058 }
5059 } else {
5060 if (cntrl_flags & UPL_SET_LITE) {
5061
5062 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5063
5064 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5065 if (size == 0) {
5066 lite_list = NULL;
5067 }
5068 } else {
5069 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5070 }
5071 }
5072 *upl_ptr = upl;
5073
5074 if (user_page_list)
5075 user_page_list[0].device = FALSE;
5076
5077 if (cntrl_flags & UPL_SET_LITE) {
5078 upl->map_object = object;
5079 } else {
5080 upl->map_object = vm_object_allocate(size);
5081 /*
5082 * No need to lock the new object: nobody else knows
5083 * about it yet, so it's all ours so far.
5084 */
5085 upl->map_object->shadow = object;
5086 upl->map_object->pageout = TRUE;
5087 upl->map_object->can_persist = FALSE;
5088 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5089 upl->map_object->vo_shadow_offset = offset;
5090 upl->map_object->wimg_bits = object->wimg_bits;
5091
5092 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5093
5094 upl->flags |= UPL_SHADOWED;
5095 }
5096 if (cntrl_flags & UPL_FOR_PAGEOUT)
5097 upl->flags |= UPL_PAGEOUT;
5098
5099 vm_object_lock(object);
5100 vm_object_activity_begin(object);
5101
5102 grab_options = 0;
5103 #if CONFIG_SECLUDED_MEMORY
5104 if (object->can_grab_secluded) {
5105 grab_options |= VM_PAGE_GRAB_SECLUDED;
5106 }
5107 #endif /* CONFIG_SECLUDED_MEMORY */
5108
5109 /*
5110 * we can lock in the paging_offset once paging_in_progress is set
5111 */
5112 upl->size = size;
5113 upl->offset = offset + object->paging_offset;
5114
5115 #if CONFIG_IOSCHED || UPL_DEBUG
5116 if (object->io_tracking || upl_debug_enabled) {
5117 vm_object_activity_begin(object);
5118 queue_enter(&object->uplq, upl, upl_t, uplq);
5119 }
5120 #endif
5121 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5122 /*
5123 * Honor copy-on-write obligations
5124 *
5125 * The caller is gathering these pages and
5126 * might modify their contents. We need to
5127 * make sure that the copy object has its own
5128 * private copies of these pages before we let
5129 * the caller modify them.
5130 */
5131 vm_object_update(object,
5132 offset,
5133 size,
5134 NULL,
5135 NULL,
5136 FALSE, /* should_return */
5137 MEMORY_OBJECT_COPY_SYNC,
5138 VM_PROT_NO_CHANGE);
5139 #if DEVELOPMENT || DEBUG
5140 upl_cow++;
5141 upl_cow_pages += size >> PAGE_SHIFT;
5142 #endif
5143 }
5144 /*
5145 * remember which copy object we synchronized with
5146 */
5147 last_copy_object = object->copy;
5148 entry = 0;
5149
5150 xfer_size = size;
5151 dst_offset = offset;
5152 size_in_pages = size / PAGE_SIZE;
5153
5154 dwp = &dw_array[0];
5155 dw_count = 0;
5156 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5157
5158 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5159 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT))
5160 object->scan_collisions = 0;
5161
5162 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5163 boolean_t isSSD = FALSE;
5164
5165 #if CONFIG_EMBEDDED
5166 isSSD = TRUE;
5167 #else
5168 vnode_pager_get_isSSD(object->pager, &isSSD);
5169 #endif
5170 vm_object_unlock(object);
5171
5172 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5173
5174 if (isSSD == TRUE)
5175 delay(1000 * size_in_pages);
5176 else
5177 delay(5000 * size_in_pages);
5178 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5179
5180 vm_object_lock(object);
5181 }
5182
5183 while (xfer_size) {
5184
5185 dwp->dw_mask = 0;
5186
5187 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5188 vm_object_unlock(object);
5189 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5190 vm_object_lock(object);
5191 }
5192 if (cntrl_flags & UPL_COPYOUT_FROM) {
5193 upl->flags |= UPL_PAGE_SYNC_DONE;
5194
5195 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5196 dst_page->fictitious ||
5197 dst_page->absent ||
5198 dst_page->error ||
5199 dst_page->cleaning ||
5200 (VM_PAGE_WIRED(dst_page))) {
5201
5202 if (user_page_list)
5203 user_page_list[entry].phys_addr = 0;
5204
5205 goto try_next_page;
5206 }
5207 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5208
5209 /*
5210 * grab this up front...
5211 * a high percentage of the time we're going to
5212 * need the hardware modification state a bit later
5213 * anyway... so we can eliminate an extra call into
5214 * the pmap layer by grabbing it here and recording it
5215 */
5216 if (dst_page->pmapped)
5217 refmod_state = pmap_get_refmod(phys_page);
5218 else
5219 refmod_state = 0;
5220
5221 if ( (refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5222 /*
5223 * page is on inactive list and referenced...
5224 * reactivate it now... this gets it out of the
5225 * way of vm_pageout_scan which would have to
5226 * reactivate it upon tripping over it
5227 */
5228 dwp->dw_mask |= DW_vm_page_activate;
5229 }
5230 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5231 /*
5232 * we're only asking for DIRTY pages to be returned
5233 */
5234 if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5235 /*
5236 * if we were the page stolen by vm_pageout_scan to be
5237 * cleaned (as opposed to a buddy being clustered in),
5238 * or this request is not being driven by a PAGEOUT cluster,
5239 * then we only need to check for the page being dirty or
5240 * precious to decide whether to return it
5241 */
5242 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
5243 goto check_busy;
5244 goto dont_return;
5245 }
5246 /*
5247 * this is a request for a PAGEOUT cluster and this page
5248 * is merely along for the ride as a 'buddy'... not only
5249 * does it have to be dirty to be returned, but it also
5250 * can't have been referenced recently...
5251 */
5252 if ( (hibernate_cleaning_in_progress == TRUE ||
5253 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) ||
5254 (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5255 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
5256 goto check_busy;
5257 }
5258 dont_return:
5259 /*
5260 * if we reach here, we're not to return
5261 * the page... go on to the next one
5262 */
5263 if (dst_page->laundry == TRUE) {
5264 /*
5265 * if we get here, the page is not 'cleaning' (filtered out above).
5266 * since it has been referenced, remove it from the laundry
5267 * so we don't pay the cost of an I/O to clean a page
5268 * we're just going to take back
5269 */
5270 vm_page_lockspin_queues();
5271
5272 vm_pageout_steal_laundry(dst_page, TRUE);
5273 vm_page_activate(dst_page);
5274
5275 vm_page_unlock_queues();
5276 }
5277 if (user_page_list)
5278 user_page_list[entry].phys_addr = 0;
5279
5280 goto try_next_page;
5281 }
5282 check_busy:
5283 if (dst_page->busy) {
5284 if (cntrl_flags & UPL_NOBLOCK) {
5285 if (user_page_list)
5286 user_page_list[entry].phys_addr = 0;
5287 dwp->dw_mask = 0;
5288
5289 goto try_next_page;
5290 }
5291 /*
5292 * someone else is playing with the
5293 * page. We will have to wait.
5294 */
5295 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5296
5297 continue;
5298 }
5299 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5300
5301 vm_page_lockspin_queues();
5302
5303 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5304 /*
5305 * we've buddied up a page for a clustered pageout
5306 * that has already been moved to the pageout
5307 * queue by pageout_scan... we need to remove
5308 * it from the queue and drop the laundry count
5309 * on that queue
5310 */
5311 vm_pageout_throttle_up(dst_page);
5312 }
5313 vm_page_unlock_queues();
5314 }
5315 #if MACH_CLUSTER_STATS
5316 /*
5317 * pageout statistics gathering. count
5318 * all the pages we will page out that
5319 * were not counted in the initial
5320 * vm_pageout_scan work
5321 */
5322 if (dst_page->pageout)
5323 encountered_lrp = TRUE;
5324 if ((dst_page->dirty || (object->internal && dst_page->precious))) {
5325 if (encountered_lrp)
5326 CLUSTER_STAT(pages_at_higher_offsets++;)
5327 else
5328 CLUSTER_STAT(pages_at_lower_offsets++;)
5329 }
5330 #endif
5331 hw_dirty = refmod_state & VM_MEM_MODIFIED;
5332 dirty = hw_dirty ? TRUE : dst_page->dirty;
5333
5334 if (phys_page > upl->highest_page)
5335 upl->highest_page = phys_page;
5336
5337 assert (!pmap_is_noencrypt(phys_page));
5338
5339 if (cntrl_flags & UPL_SET_LITE) {
5340 unsigned int pg_num;
5341
5342 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5343 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5344 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
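				/*
				 * Editor's note on the indexing just above (illustration only,
				 * no new behavior): lite_list is a bitmap with one bit per page
				 * of the UPL.  pg_num >> 5 selects the 32-bit word and
				 * pg_num & 31 selects the bit within it, e.g. pg_num == 37
				 * sets bit 5 of lite_list[1].
				 */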
5345
5346 if (hw_dirty)
5347 pmap_clear_modify(phys_page);
5348
5349 /*
5350 * Mark original page as cleaning
5351 * in place.
5352 */
5353 dst_page->cleaning = TRUE;
5354 dst_page->precious = FALSE;
5355 } else {
5356 /*
5357 				 * use pageclean setup; it is more
5358 * convenient even for the pageout
5359 * cases here
5360 */
5361 vm_object_lock(upl->map_object);
5362 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5363 vm_object_unlock(upl->map_object);
5364
5365 alias_page->absent = FALSE;
5366 alias_page = NULL;
5367 }
5368 if (dirty) {
5369 SET_PAGE_DIRTY(dst_page, FALSE);
5370 } else {
5371 dst_page->dirty = FALSE;
5372 }
5373
5374 if (!dirty)
5375 dst_page->precious = TRUE;
5376
5377 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
5378 if ( !VM_PAGE_WIRED(dst_page))
5379 dst_page->free_when_done = TRUE;
5380 }
5381 } else {
5382 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
5383 /*
5384 * Honor copy-on-write obligations
5385 *
5386 * The copy object has changed since we
5387 * last synchronized for copy-on-write.
5388 * Another copy object might have been
5389 * inserted while we released the object's
5390 * lock. Since someone could have seen the
5391 * original contents of the remaining pages
5392 * through that new object, we have to
5393 * synchronize with it again for the remaining
5394 * pages only. The previous pages are "busy"
5395 * so they can not be seen through the new
5396 * mapping. The new mapping will see our
5397 * upcoming changes for those previous pages,
5398 * but that's OK since they couldn't see what
5399 * was there before. It's just a race anyway
5400 * and there's no guarantee of consistency or
5401 * atomicity. We just don't want new mappings
5402 * to see both the *before* and *after* pages.
5403 */
5404 if (object->copy != VM_OBJECT_NULL) {
5405 vm_object_update(
5406 object,
5407 dst_offset,/* current offset */
5408 xfer_size, /* remaining size */
5409 NULL,
5410 NULL,
5411 FALSE, /* should_return */
5412 MEMORY_OBJECT_COPY_SYNC,
5413 VM_PROT_NO_CHANGE);
5414
5415 #if DEVELOPMENT || DEBUG
5416 upl_cow_again++;
5417 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
5418 #endif
5419 }
5420 /*
5421 * remember the copy object we synced with
5422 */
5423 last_copy_object = object->copy;
5424 }
5425 dst_page = vm_page_lookup(object, dst_offset);
5426
5427 if (dst_page != VM_PAGE_NULL) {
5428
5429 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5430 /*
5431 * skip over pages already present in the cache
5432 */
5433 if (user_page_list)
5434 user_page_list[entry].phys_addr = 0;
5435
5436 goto try_next_page;
5437 }
5438 if (dst_page->fictitious) {
5439 panic("need corner case for fictitious page");
5440 }
5441
5442 if (dst_page->busy || dst_page->cleaning) {
5443 /*
5444 * someone else is playing with the
5445 * page. We will have to wait.
5446 */
5447 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5448
5449 continue;
5450 }
5451 if (dst_page->laundry)
5452 vm_pageout_steal_laundry(dst_page, FALSE);
5453 } else {
5454 if (object->private) {
5455 /*
5456 * This is a nasty wrinkle for users
5457 * of upl who encounter device or
5458 				 * private memory; however, it is
5459 				 * unavoidable: only a fault can
5460 * resolve the actual backing
5461 * physical page by asking the
5462 * backing device.
5463 */
5464 if (user_page_list)
5465 user_page_list[entry].phys_addr = 0;
5466
5467 goto try_next_page;
5468 }
5469 if (object->scan_collisions) {
5470 /*
5471 * the pageout_scan thread is trying to steal
5472 * pages from this object, but has run into our
5473 * lock... grab 2 pages from the head of the object...
5474 * the first is freed on behalf of pageout_scan, the
5475 * 2nd is for our own use... we use vm_object_page_grab
5476 * in both cases to avoid taking pages from the free
5477 * list since we are under memory pressure and our
5478 * lock on this object is getting in the way of
5479 * relieving it
5480 */
5481 dst_page = vm_object_page_grab(object);
5482
5483 if (dst_page != VM_PAGE_NULL)
5484 vm_page_release(dst_page,
5485 FALSE);
5486
5487 dst_page = vm_object_page_grab(object);
5488 }
5489 if (dst_page == VM_PAGE_NULL) {
5490 /*
5491 * need to allocate a page
5492 */
5493 dst_page = vm_page_grab_options(grab_options);
5494 }
5495 if (dst_page == VM_PAGE_NULL) {
5496 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
5497 /*
5498 * we don't want to stall waiting for pages to come onto the free list
5499 					 * while we're already holding absent pages in this UPL...
5500 * the caller will deal with the empty slots
5501 */
5502 if (user_page_list)
5503 user_page_list[entry].phys_addr = 0;
5504
5505 goto try_next_page;
5506 }
5507 /*
5508 * no pages available... wait
5509 * then try again for the same
5510 * offset...
5511 */
5512 vm_object_unlock(object);
5513
5514 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5515
5516 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
5517
5518 VM_PAGE_WAIT();
5519 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5520
5521 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
5522
5523 vm_object_lock(object);
5524
5525 continue;
5526 }
5527 vm_page_insert(dst_page, object, dst_offset);
5528
5529 dst_page->absent = TRUE;
5530 dst_page->busy = FALSE;
5531
5532 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
5533 /*
5534 * if UPL_RET_ONLY_ABSENT was specified,
5535 					 * then we're definitely setting up a
5536 					 * UPL for a clustered read/pagein
5537 * operation... mark the pages as clustered
5538 * so upl_commit_range can put them on the
5539 * speculative list
5540 */
5541 dst_page->clustered = TRUE;
5542
5543 if ( !(cntrl_flags & UPL_FILE_IO))
5544 VM_STAT_INCR(pageins);
5545 }
5546 }
5547 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5548
5549 dst_page->overwriting = TRUE;
5550
5551 if (dst_page->pmapped) {
5552 if ( !(cntrl_flags & UPL_FILE_IO))
5553 /*
5554 * eliminate all mappings from the
5555 					 * original object and its progeny
5556 */
5557 refmod_state = pmap_disconnect(phys_page);
5558 else
5559 refmod_state = pmap_get_refmod(phys_page);
5560 } else
5561 refmod_state = 0;
5562
5563 hw_dirty = refmod_state & VM_MEM_MODIFIED;
5564 dirty = hw_dirty ? TRUE : dst_page->dirty;
5565
5566 if (cntrl_flags & UPL_SET_LITE) {
5567 unsigned int pg_num;
5568
5569 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5570 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5571 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5572
5573 if (hw_dirty)
5574 pmap_clear_modify(phys_page);
5575
5576 /*
5577 * Mark original page as cleaning
5578 * in place.
5579 */
5580 dst_page->cleaning = TRUE;
5581 dst_page->precious = FALSE;
5582 } else {
5583 /*
5584 				 * use pageclean setup; it is more
5585 * convenient even for the pageout
5586 * cases here
5587 */
5588 vm_object_lock(upl->map_object);
5589 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5590 vm_object_unlock(upl->map_object);
5591
5592 alias_page->absent = FALSE;
5593 alias_page = NULL;
5594 }
5595
5596 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
5597 upl->flags &= ~UPL_CLEAR_DIRTY;
5598 upl->flags |= UPL_SET_DIRTY;
5599 dirty = TRUE;
5601 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
5602 /*
5603 * clean in place for read implies
5604 * that a write will be done on all
5605 * the pages that are dirty before
5606 * a upl commit is done. The caller
5607 * is obligated to preserve the
5608 * contents of all pages marked dirty
5609 */
5610 upl->flags |= UPL_CLEAR_DIRTY;
5611 }
5612 dst_page->dirty = dirty;
5613
5614 if (!dirty)
5615 dst_page->precious = TRUE;
5616
5617 if ( !VM_PAGE_WIRED(dst_page)) {
5618 /*
5619 * deny access to the target page while
5620 * it is being worked on
5621 */
5622 dst_page->busy = TRUE;
5623 } else
5624 dwp->dw_mask |= DW_vm_page_wire;
5625
5626 /*
5627 * We might be about to satisfy a fault which has been
5628 * requested. So no need for the "restart" bit.
5629 */
5630 dst_page->restart = FALSE;
5631 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
5632 /*
5633 * expect the page to be used
5634 */
5635 dwp->dw_mask |= DW_set_reference;
5636 }
5637 if (cntrl_flags & UPL_PRECIOUS) {
5638 if (object->internal) {
5639 SET_PAGE_DIRTY(dst_page, FALSE);
5640 dst_page->precious = FALSE;
5641 } else {
5642 dst_page->precious = TRUE;
5643 }
5644 } else {
5645 dst_page->precious = FALSE;
5646 }
5647 }
5648 if (dst_page->busy)
5649 upl->flags |= UPL_HAS_BUSY;
5650
5651 if (phys_page > upl->highest_page)
5652 upl->highest_page = phys_page;
5653 assert (!pmap_is_noencrypt(phys_page));
5654 if (user_page_list) {
5655 user_page_list[entry].phys_addr = phys_page;
5656 user_page_list[entry].free_when_done = dst_page->free_when_done;
5657 user_page_list[entry].absent = dst_page->absent;
5658 user_page_list[entry].dirty = dst_page->dirty;
5659 user_page_list[entry].precious = dst_page->precious;
5660 user_page_list[entry].device = FALSE;
5661 user_page_list[entry].needed = FALSE;
5662 if (dst_page->clustered == TRUE)
5663 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
5664 else
5665 user_page_list[entry].speculative = FALSE;
5666 user_page_list[entry].cs_validated = dst_page->cs_validated;
5667 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
5668 user_page_list[entry].cs_nx = dst_page->cs_nx;
5669 user_page_list[entry].mark = FALSE;
5670 }
5671 /*
5672 * if UPL_RET_ONLY_ABSENT is set, then
5673 * we are working with a fresh page and we've
5674 * just set the clustered flag on it to
5675 		 * indicate that it was dragged in as part of a
5676 * speculative cluster... so leave it alone
5677 */
5678 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5679 /*
5680 * someone is explicitly grabbing this page...
5681 * update clustered and speculative state
5682 *
5683 */
5684 if (dst_page->clustered)
5685 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5686 }
5687 try_next_page:
5688 if (dwp->dw_mask) {
5689 if (dwp->dw_mask & DW_vm_page_activate)
5690 VM_STAT_INCR(reactivations);
5691
5692 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
5693
5694 if (dw_count >= dw_limit) {
5695 vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
5696
5697 dwp = &dw_array[0];
5698 dw_count = 0;
5699 }
5700 }
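		/*
		 * Editor's note: the delayed-work array batches page state changes
		 * (activate, wire, set-reference, ...) so that
		 * vm_page_do_delayed_work() can apply up to dw_limit of them under
		 * a single acquisition of the page-queues lock, instead of taking
		 * that lock once per page.
		 */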
5701 entry++;
5702 dst_offset += PAGE_SIZE_64;
5703 xfer_size -= PAGE_SIZE;
5704 }
5705 if (dw_count)
5706 vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
5707
5708 if (alias_page != NULL) {
5709 VM_PAGE_FREE(alias_page);
5710 }
5711
5712 if (page_list_count != NULL) {
5713 if (upl->flags & UPL_INTERNAL)
5714 *page_list_count = 0;
5715 else if (*page_list_count > entry)
5716 *page_list_count = entry;
5717 }
5718 #if UPL_DEBUG
5719 upl->upl_state = 1;
5720 #endif
5721 vm_object_unlock(object);
5722
5723 return KERN_SUCCESS;
5724 }
5725
5726 /*
5727 * Routine: vm_object_super_upl_request
5728 * Purpose:
5729 * Cause the population of a portion of a vm_object
5730 * in much the same way as memory_object_upl_request.
5731 * Depending on the nature of the request, the pages
5732  * returned may contain valid data or be uninitialized.
5733 * However, the region may be expanded up to the super
5734 * cluster size provided.
5735 */
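/*
 * Worked example (editor's sketch; the numbers are hypothetical):
 * with super_cluster = 0x10000 (64KB), a request for offset 0x1a000 and
 * size 0x3000 expands as follows in the code below:
 *
 *	base_offset = 0x1a000 & ~(0x10000 - 1)             = 0x10000
 *	super_size  = (0x1a000 + 0x3000) > 0x20000 ? 0x20000 : 0x10000
 *	            = 0x10000
 *
 * so the UPL covers 64KB starting at 0x10000, assuming the object is large
 * enough that no clipping against vo_size is required.
 */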
5736
5737 __private_extern__ kern_return_t
5738 vm_object_super_upl_request(
5739 vm_object_t object,
5740 vm_object_offset_t offset,
5741 upl_size_t size,
5742 upl_size_t super_cluster,
5743 upl_t *upl,
5744 upl_page_info_t *user_page_list,
5745 unsigned int *page_list_count,
5746 upl_control_flags_t cntrl_flags,
5747 vm_tag_t tag)
5748 {
5749 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
5750 return KERN_FAILURE;
5751
5752 assert(object->paging_in_progress);
5753 offset = offset - object->paging_offset;
5754
5755 if (super_cluster > size) {
5756
5757 vm_object_offset_t base_offset;
5758 upl_size_t super_size;
5759 vm_object_size_t super_size_64;
5760
5761 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
5762 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
5763 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
5764 super_size = (upl_size_t) super_size_64;
5765 assert(super_size == super_size_64);
5766
5767 if (offset > (base_offset + super_size)) {
5768 panic("vm_object_super_upl_request: Missed target pageout"
5769 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
5770 offset, base_offset, super_size, super_cluster,
5771 size, object->paging_offset);
5772 }
5773 /*
5774 * apparently there is a case where the vm requests a
5775 		 * page to be written out whose offset is beyond the
5776 * object size
5777 */
5778 if ((offset + size) > (base_offset + super_size)) {
5779 super_size_64 = (offset + size) - base_offset;
5780 super_size = (upl_size_t) super_size_64;
5781 assert(super_size == super_size_64);
5782 }
5783
5784 offset = base_offset;
5785 size = super_size;
5786 }
5787 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
5788 }
5789
5790 #if CONFIG_EMBEDDED
5791 int cs_executable_create_upl = 0;
5792 extern int proc_selfpid(void);
5793 extern char *proc_name_address(void *p);
5794 #endif /* CONFIG_EMBEDDED */
5795
5796 kern_return_t
5797 vm_map_create_upl(
5798 vm_map_t map,
5799 vm_map_address_t offset,
5800 upl_size_t *upl_size,
5801 upl_t *upl,
5802 upl_page_info_array_t page_list,
5803 unsigned int *count,
5804 upl_control_flags_t *flags,
5805 vm_tag_t tag)
5806 {
5807 vm_map_entry_t entry;
5808 upl_control_flags_t caller_flags;
5809 int force_data_sync;
5810 int sync_cow_data;
5811 vm_object_t local_object;
5812 vm_map_offset_t local_offset;
5813 vm_map_offset_t local_start;
5814 kern_return_t ret;
5815
5816 assert(page_aligned(offset));
5817
5818 caller_flags = *flags;
5819
5820 if (caller_flags & ~UPL_VALID_FLAGS) {
5821 /*
5822 * For forward compatibility's sake,
5823 * reject any unknown flag.
5824 */
5825 return KERN_INVALID_VALUE;
5826 }
5827 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
5828 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
5829
5830 if (upl == NULL)
5831 return KERN_INVALID_ARGUMENT;
5832
5833 REDISCOVER_ENTRY:
5834 vm_map_lock_read(map);
5835
5836 if (!vm_map_lookup_entry(map, offset, &entry)) {
5837 vm_map_unlock_read(map);
5838 return KERN_FAILURE;
5839 }
5840
5841 if ((entry->vme_end - offset) < *upl_size) {
5842 *upl_size = (upl_size_t) (entry->vme_end - offset);
5843 assert(*upl_size == entry->vme_end - offset);
5844 }
5845
5846 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
5847 *flags = 0;
5848
5849 if (!entry->is_sub_map &&
5850 VME_OBJECT(entry) != VM_OBJECT_NULL) {
5851 if (VME_OBJECT(entry)->private)
5852 *flags = UPL_DEV_MEMORY;
5853
5854 if (VME_OBJECT(entry)->phys_contiguous)
5855 *flags |= UPL_PHYS_CONTIG;
5856 }
5857 vm_map_unlock_read(map);
5858 return KERN_SUCCESS;
5859 }
5860
5861 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
5862 !VME_OBJECT(entry)->phys_contiguous) {
5863 if (*upl_size > MAX_UPL_SIZE_BYTES)
5864 *upl_size = MAX_UPL_SIZE_BYTES;
5865 }
5866
5867 /*
5868 * Create an object if necessary.
5869 */
5870 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
5871
5872 if (vm_map_lock_read_to_write(map))
5873 goto REDISCOVER_ENTRY;
5874
5875 VME_OBJECT_SET(entry,
5876 vm_object_allocate((vm_size_t)
5877 (entry->vme_end -
5878 entry->vme_start)));
5879 VME_OFFSET_SET(entry, 0);
5880
5881 vm_map_lock_write_to_read(map);
5882 }
5883
5884 if (!(caller_flags & UPL_COPYOUT_FROM) &&
5885 !(entry->protection & VM_PROT_WRITE)) {
5886 vm_map_unlock_read(map);
5887 return KERN_PROTECTION_FAILURE;
5888 }
5889
5890 #if CONFIG_EMBEDDED
5891 if (map->pmap != kernel_pmap &&
5892 (caller_flags & UPL_COPYOUT_FROM) &&
5893 (entry->protection & VM_PROT_EXECUTE) &&
5894 !(entry->protection & VM_PROT_WRITE)) {
5895 vm_offset_t kaddr;
5896 vm_size_t ksize;
5897
5898 /*
5899 * We're about to create a read-only UPL backed by
5900 * memory from an executable mapping.
5901 * Wiring the pages would result in the pages being copied
5902 * (due to the "MAP_PRIVATE" mapping) and no longer
5903 * code-signed, so no longer eligible for execution.
5904 * Instead, let's copy the data into a kernel buffer and
5905 * create the UPL from this kernel buffer.
5906 * The kernel buffer is then freed, leaving the UPL holding
5907 * the last reference on the VM object, so the memory will
5908 * be released when the UPL is committed.
5909 */
5910
5911 vm_map_unlock_read(map);
5912 /* allocate kernel buffer */
5913 ksize = round_page(*upl_size);
5914 kaddr = 0;
5915 ret = kmem_alloc_pageable(kernel_map,
5916 &kaddr,
5917 ksize,
5918 tag);
5919 if (ret == KERN_SUCCESS) {
5920 /* copyin the user data */
5921 assert(page_aligned(offset));
5922 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
5923 }
5924 if (ret == KERN_SUCCESS) {
5925 if (ksize > *upl_size) {
5926 /* zero out the extra space in kernel buffer */
5927 memset((void *)(kaddr + *upl_size),
5928 0,
5929 ksize - *upl_size);
5930 }
5931 /* create the UPL from the kernel buffer */
5932 ret = vm_map_create_upl(kernel_map, kaddr, upl_size,
5933 upl, page_list, count, flags, tag);
5934 }
5935 if (kaddr != 0) {
5936 /* free the kernel buffer */
5937 kmem_free(kernel_map, kaddr, ksize);
5938 kaddr = 0;
5939 ksize = 0;
5940 }
5941 #if DEVELOPMENT || DEBUG
5942 DTRACE_VM4(create_upl_from_executable,
5943 vm_map_t, map,
5944 vm_map_address_t, offset,
5945 upl_size_t, *upl_size,
5946 kern_return_t, ret);
5947 #endif /* DEVELOPMENT || DEBUG */
5948 return ret;
5949 }
5950 #endif /* CONFIG_EMBEDDED */
5951
5952 local_object = VME_OBJECT(entry);
5953 assert(local_object != VM_OBJECT_NULL);
5954
5955 if (!entry->is_sub_map &&
5956 !entry->needs_copy &&
5957 *upl_size != 0 &&
5958 local_object->vo_size > *upl_size && /* partial UPL */
5959 entry->wired_count == 0 && /* No COW for entries that are wired */
5960 (map->pmap != kernel_pmap) && /* alias checks */
5961 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
5962 ||
5963 (/* case 2 */
5964 local_object->internal &&
5965 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
5966 local_object->ref_count > 1))) {
5967 vm_prot_t prot;
5968
5969 /*
5970 * Case 1:
5971 * Set up the targeted range for copy-on-write to avoid
5972 * applying true_share/copy_delay to the entire object.
5973 *
5974 * Case 2:
5975 * This map entry covers only part of an internal
5976 * object. There could be other map entries covering
5977 * other areas of this object and some of these map
5978 * entries could be marked as "needs_copy", which
5979 * assumes that the object is COPY_SYMMETRIC.
5980 * To avoid marking this object as COPY_DELAY and
5981 * "true_share", let's shadow it and mark the new
5982 * (smaller) object as "true_share" and COPY_DELAY.
5983 */
5984
5985 if (vm_map_lock_read_to_write(map)) {
5986 goto REDISCOVER_ENTRY;
5987 }
5988 vm_map_lock_assert_exclusive(map);
5989 assert(VME_OBJECT(entry) == local_object);
5990
5991 vm_map_clip_start(map,
5992 entry,
5993 vm_map_trunc_page(offset,
5994 VM_MAP_PAGE_MASK(map)));
5995 vm_map_clip_end(map,
5996 entry,
5997 vm_map_round_page(offset + *upl_size,
5998 VM_MAP_PAGE_MASK(map)));
5999 if ((entry->vme_end - offset) < *upl_size) {
6000 *upl_size = (upl_size_t) (entry->vme_end - offset);
6001 assert(*upl_size == entry->vme_end - offset);
6002 }
6003
6004 prot = entry->protection & ~VM_PROT_WRITE;
6005 if (override_nx(map, VME_ALIAS(entry)) && prot)
6006 prot |= VM_PROT_EXECUTE;
6007 vm_object_pmap_protect(local_object,
6008 VME_OFFSET(entry),
6009 entry->vme_end - entry->vme_start,
6010 ((entry->is_shared ||
6011 map->mapped_in_other_pmaps)
6012 ? PMAP_NULL
6013 : map->pmap),
6014 entry->vme_start,
6015 prot);
6016
6017 assert(entry->wired_count == 0);
6018
6019 /*
6020 * Lock the VM object and re-check its status: if it's mapped
6021 * in another address space, we could still be racing with
6022 * another thread holding that other VM map exclusively.
6023 */
6024 vm_object_lock(local_object);
6025 if (local_object->true_share) {
6026 /* object is already in proper state: no COW needed */
6027 assert(local_object->copy_strategy !=
6028 MEMORY_OBJECT_COPY_SYMMETRIC);
6029 } else {
6030 /* not true_share: ask for copy-on-write below */
6031 assert(local_object->copy_strategy ==
6032 MEMORY_OBJECT_COPY_SYMMETRIC);
6033 entry->needs_copy = TRUE;
6034 }
6035 vm_object_unlock(local_object);
6036
6037 vm_map_lock_write_to_read(map);
6038 }
6039
6040 if (entry->needs_copy) {
6041 /*
6042 * Honor copy-on-write for COPY_SYMMETRIC
6043 * strategy.
6044 */
6045 vm_map_t local_map;
6046 vm_object_t object;
6047 vm_object_offset_t new_offset;
6048 vm_prot_t prot;
6049 boolean_t wired;
6050 vm_map_version_t version;
6051 vm_map_t real_map;
6052 vm_prot_t fault_type;
6053
6054 local_map = map;
6055
6056 if (caller_flags & UPL_COPYOUT_FROM) {
6057 fault_type = VM_PROT_READ | VM_PROT_COPY;
6058 vm_counters.create_upl_extra_cow++;
6059 vm_counters.create_upl_extra_cow_pages +=
6060 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6061 } else {
6062 fault_type = VM_PROT_WRITE;
6063 }
6064 if (vm_map_lookup_locked(&local_map,
6065 offset, fault_type,
6066 OBJECT_LOCK_EXCLUSIVE,
6067 &version, &object,
6068 &new_offset, &prot, &wired,
6069 NULL,
6070 &real_map) != KERN_SUCCESS) {
6071 if (fault_type == VM_PROT_WRITE) {
6072 vm_counters.create_upl_lookup_failure_write++;
6073 } else {
6074 vm_counters.create_upl_lookup_failure_copy++;
6075 }
6076 vm_map_unlock_read(local_map);
6077 return KERN_FAILURE;
6078 }
6079 if (real_map != map)
6080 vm_map_unlock(real_map);
6081 vm_map_unlock_read(local_map);
6082
6083 vm_object_unlock(object);
6084
6085 goto REDISCOVER_ENTRY;
6086 }
6087
6088 if (entry->is_sub_map) {
6089 vm_map_t submap;
6090
6091 submap = VME_SUBMAP(entry);
6092 local_start = entry->vme_start;
6093 local_offset = VME_OFFSET(entry);
6094
6095 vm_map_reference(submap);
6096 vm_map_unlock_read(map);
6097
6098 ret = vm_map_create_upl(submap,
6099 local_offset + (offset - local_start),
6100 upl_size, upl, page_list, count, flags, tag);
6101 vm_map_deallocate(submap);
6102
6103 return ret;
6104 }
6105
6106 if (sync_cow_data &&
6107 (VME_OBJECT(entry)->shadow ||
6108 VME_OBJECT(entry)->copy)) {
6109 local_object = VME_OBJECT(entry);
6110 local_start = entry->vme_start;
6111 local_offset = VME_OFFSET(entry);
6112
6113 vm_object_reference(local_object);
6114 vm_map_unlock_read(map);
6115
6116 if (local_object->shadow && local_object->copy) {
6117 vm_object_lock_request(local_object->shadow,
6118 ((vm_object_offset_t)
6119 ((offset - local_start) +
6120 local_offset) +
6121 local_object->vo_shadow_offset),
6122 *upl_size, FALSE,
6123 MEMORY_OBJECT_DATA_SYNC,
6124 VM_PROT_NO_CHANGE);
6125 }
6126 sync_cow_data = FALSE;
6127 vm_object_deallocate(local_object);
6128
6129 goto REDISCOVER_ENTRY;
6130 }
6131 if (force_data_sync) {
6132 local_object = VME_OBJECT(entry);
6133 local_start = entry->vme_start;
6134 local_offset = VME_OFFSET(entry);
6135
6136 vm_object_reference(local_object);
6137 vm_map_unlock_read(map);
6138
6139 vm_object_lock_request(local_object,
6140 ((vm_object_offset_t)
6141 ((offset - local_start) +
6142 local_offset)),
6143 (vm_object_size_t)*upl_size,
6144 FALSE,
6145 MEMORY_OBJECT_DATA_SYNC,
6146 VM_PROT_NO_CHANGE);
6147
6148 force_data_sync = FALSE;
6149 vm_object_deallocate(local_object);
6150
6151 goto REDISCOVER_ENTRY;
6152 }
6153 if (VME_OBJECT(entry)->private)
6154 *flags = UPL_DEV_MEMORY;
6155 else
6156 *flags = 0;
6157
6158 if (VME_OBJECT(entry)->phys_contiguous)
6159 *flags |= UPL_PHYS_CONTIG;
6160
6161 local_object = VME_OBJECT(entry);
6162 local_offset = VME_OFFSET(entry);
6163 local_start = entry->vme_start;
6164
6165 #if CONFIG_EMBEDDED
6166 /*
6167 * Wiring will copy the pages to the shadow object.
6168 * The shadow object will not be code-signed so
6169 * attempting to execute code from these copied pages
6170 * would trigger a code-signing violation.
6171 */
6172 if (entry->protection & VM_PROT_EXECUTE) {
6173 #if MACH_ASSERT
6174 printf("pid %d[%s] create_upl out of executable range from "
6175 "0x%llx to 0x%llx: side effects may include "
6176 "code-signing violations later on\n",
6177 proc_selfpid(),
6178 (current_task()->bsd_info
6179 ? proc_name_address(current_task()->bsd_info)
6180 : "?"),
6181 (uint64_t) entry->vme_start,
6182 (uint64_t) entry->vme_end);
6183 #endif /* MACH_ASSERT */
6184 DTRACE_VM2(cs_executable_create_upl,
6185 uint64_t, (uint64_t)entry->vme_start,
6186 uint64_t, (uint64_t)entry->vme_end);
6187 cs_executable_create_upl++;
6188 }
6189 #endif /* CONFIG_EMBEDDED */
6190
6191 vm_object_lock(local_object);
6192
6193 /*
6194 * Ensure that this object is "true_share" and "copy_delay" now,
6195 * while we're still holding the VM map lock. After we unlock the map,
6196 * anything could happen to that mapping, including some copy-on-write
6197 * activity. We need to make sure that the IOPL will point at the
6198 * same memory as the mapping.
6199 */
6200 if (local_object->true_share) {
6201 assert(local_object->copy_strategy !=
6202 MEMORY_OBJECT_COPY_SYMMETRIC);
6203 } else if (local_object != kernel_object &&
6204 local_object != compressor_object &&
6205 !local_object->phys_contiguous) {
6206 #if VM_OBJECT_TRACKING_OP_TRUESHARE
6207 if (!local_object->true_share &&
6208 vm_object_tracking_inited) {
6209 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
6210 int num = 0;
6211 num = OSBacktrace(bt,
6212 VM_OBJECT_TRACKING_BTDEPTH);
6213 btlog_add_entry(vm_object_tracking_btlog,
6214 local_object,
6215 VM_OBJECT_TRACKING_OP_TRUESHARE,
6216 bt,
6217 num);
6218 }
6219 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
6220 local_object->true_share = TRUE;
6221 if (local_object->copy_strategy ==
6222 MEMORY_OBJECT_COPY_SYMMETRIC) {
6223 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6224 }
6225 }
6226
6227 vm_object_reference_locked(local_object);
6228 vm_object_unlock(local_object);
6229
6230 vm_map_unlock_read(map);
6231
6232 ret = vm_object_iopl_request(local_object,
6233 ((vm_object_offset_t)
6234 ((offset - local_start) + local_offset)),
6235 *upl_size,
6236 upl,
6237 page_list,
6238 count,
6239 caller_flags,
6240 tag);
6241 vm_object_deallocate(local_object);
6242
6243 return ret;
6244 }
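/*
 * Minimal call sketch for the routine above (editor's illustration only;
 * 'map', 'uaddr', 'wanted_size' and the flag mix are hypothetical, and real
 * callers normally reach this through higher-level wrappers):
 *
 *	upl_t               upl = NULL;
 *	unsigned int        pl_count = 0;
 *	upl_size_t          upl_size = wanted_size;
 *	upl_control_flags_t upl_flags = UPL_COPYOUT_FROM | UPL_SET_INTERNAL |
 *	                                UPL_SET_LITE | UPL_FILE_IO;
 *	kern_return_t       kr;
 *
 *	kr = vm_map_create_upl(map, uaddr, &upl_size, &upl,
 *	                       NULL, &pl_count, &upl_flags, VM_KERN_MEMORY_FILE);
 *
 * On return, *upl_size may have been trimmed to the containing map entry and
 * *upl_flags may carry UPL_DEV_MEMORY / UPL_PHYS_CONTIG, as set in the body
 * above.
 */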
6245
6246 /*
6247 * Internal routine to enter a UPL into a VM map.
6248 *
6249 * JMM - This should just be doable through the standard
6250 * vm_map_enter() API.
6251 */
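/*
 * Hedged usage sketch (editor's illustration; in-tree callers such as
 * ubc_upl_map()/ubc_upl_unmap() are the definitive reference):
 *
 *	vm_map_offset_t kaddr = 0;
 *	kern_return_t   kr;
 *
 *	kr = vm_map_enter_upl(kernel_map, upl, &kaddr);
 *	if (kr == KERN_SUCCESS) {
 *		// access the UPL's pages through 'kaddr' ...
 *		(void) vm_map_remove_upl(kernel_map, upl);
 *	}
 *
 * The mapping takes an extra reference on the UPL (upl->ref_count++ below);
 * vm_map_remove_upl() drops it again when the mapping is torn down.
 */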
6252 kern_return_t
6253 vm_map_enter_upl(
6254 vm_map_t map,
6255 upl_t upl,
6256 vm_map_offset_t *dst_addr)
6257 {
6258 vm_map_size_t size;
6259 vm_object_offset_t offset;
6260 vm_map_offset_t addr;
6261 vm_page_t m;
6262 kern_return_t kr;
6263 int isVectorUPL = 0, curr_upl=0;
6264 upl_t vector_upl = NULL;
6265 vm_offset_t vector_upl_dst_addr = 0;
6266 vm_map_t vector_upl_submap = NULL;
6267 upl_offset_t subupl_offset = 0;
6268 upl_size_t subupl_size = 0;
6269
6270 if (upl == UPL_NULL)
6271 return KERN_INVALID_ARGUMENT;
6272
6273 if((isVectorUPL = vector_upl_is_valid(upl))) {
6274 int mapped=0,valid_upls=0;
6275 vector_upl = upl;
6276
6277 upl_lock(vector_upl);
6278 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6279 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6280 if(upl == NULL)
6281 continue;
6282 valid_upls++;
6283 if (UPL_PAGE_LIST_MAPPED & upl->flags)
6284 mapped++;
6285 }
6286
6287 if(mapped) {
6288 if(mapped != valid_upls)
6289 				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
6290 else {
6291 upl_unlock(vector_upl);
6292 return KERN_FAILURE;
6293 }
6294 }
6295
6296 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE,
6297 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
6298 &vector_upl_submap);
6299 if( kr != KERN_SUCCESS )
6300 panic("Vector UPL submap allocation failed\n");
6301 map = vector_upl_submap;
6302 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
6303 curr_upl=0;
6304 }
6305 else
6306 upl_lock(upl);
6307
6308 process_upl_to_enter:
6309 if(isVectorUPL){
6310 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6311 *dst_addr = vector_upl_dst_addr;
6312 upl_unlock(vector_upl);
6313 return KERN_SUCCESS;
6314 }
6315 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
6316 if(upl == NULL)
6317 goto process_upl_to_enter;
6318
6319 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
6320 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
6321 } else {
6322 /*
6323 * check to see if already mapped
6324 */
6325 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6326 upl_unlock(upl);
6327 return KERN_FAILURE;
6328 }
6329 }
6330 if ((!(upl->flags & UPL_SHADOWED)) &&
6331 ((upl->flags & UPL_HAS_BUSY) ||
6332 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
6333
6334 vm_object_t object;
6335 vm_page_t alias_page;
6336 vm_object_offset_t new_offset;
6337 unsigned int pg_num;
6338 wpl_array_t lite_list;
6339
6340 if (upl->flags & UPL_INTERNAL) {
6341 lite_list = (wpl_array_t)
6342 ((((uintptr_t)upl) + sizeof(struct upl))
6343 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
6344 } else {
6345 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
6346 }
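		/*
		 * Layout note (editor's annotation, derived from the arithmetic
		 * above): an internal UPL is laid out as
		 *
		 *	[ struct upl | upl_page_info_t[size/PAGE_SIZE] | lite bitmap ]
		 *
		 * while an external UPL keeps only the lite bitmap immediately
		 * after the upl structure, hence the two different lite_list
		 * starting addresses.
		 */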
6347 object = upl->map_object;
6348 upl->map_object = vm_object_allocate(upl->size);
6349
6350 vm_object_lock(upl->map_object);
6351
6352 upl->map_object->shadow = object;
6353 upl->map_object->pageout = TRUE;
6354 upl->map_object->can_persist = FALSE;
6355 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6356 upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset;
6357 upl->map_object->wimg_bits = object->wimg_bits;
6358 offset = upl->map_object->vo_shadow_offset;
6359 new_offset = 0;
6360 size = upl->size;
6361
6362 upl->flags |= UPL_SHADOWED;
6363
6364 while (size) {
6365 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
6366 assert(pg_num == new_offset / PAGE_SIZE);
6367
6368 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
6369
6370 VM_PAGE_GRAB_FICTITIOUS(alias_page);
6371
6372 vm_object_lock(object);
6373
6374 m = vm_page_lookup(object, offset);
6375 if (m == VM_PAGE_NULL) {
6376 panic("vm_upl_map: page missing\n");
6377 }
6378
6379 /*
6380 * Convert the fictitious page to a private
6381 * shadow of the real page.
6382 */
6383 assert(alias_page->fictitious);
6384 alias_page->fictitious = FALSE;
6385 alias_page->private = TRUE;
6386 alias_page->free_when_done = TRUE;
6387 /*
6388 * since m is a page in the upl it must
6389 * already be wired or BUSY, so it's
6390 * safe to assign the underlying physical
6391 * page to the alias
6392 */
6393 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
6394
6395 vm_object_unlock(object);
6396
6397 vm_page_lockspin_queues();
6398 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
6399 vm_page_unlock_queues();
6400
6401 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
6402
6403 assert(!alias_page->wanted);
6404 alias_page->busy = FALSE;
6405 alias_page->absent = FALSE;
6406 }
6407 size -= PAGE_SIZE;
6408 offset += PAGE_SIZE_64;
6409 new_offset += PAGE_SIZE_64;
6410 }
6411 vm_object_unlock(upl->map_object);
6412 }
6413 if (upl->flags & UPL_SHADOWED)
6414 offset = 0;
6415 else
6416 offset = upl->offset - upl->map_object->paging_offset;
6417
6418 size = upl->size;
6419
6420 vm_object_reference(upl->map_object);
6421
6422 if(!isVectorUPL) {
6423 *dst_addr = 0;
6424 /*
6425 * NEED A UPL_MAP ALIAS
6426 */
6427 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
6428 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
6429 upl->map_object, offset, FALSE,
6430 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6431
6432 if (kr != KERN_SUCCESS) {
6433 vm_object_deallocate(upl->map_object);
6434 upl_unlock(upl);
6435 return(kr);
6436 }
6437 }
6438 else {
6439 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
6440 VM_FLAGS_FIXED, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_OSFMK,
6441 upl->map_object, offset, FALSE,
6442 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6443 if(kr)
6444 panic("vm_map_enter failed for a Vector UPL\n");
6445 }
6446 vm_object_lock(upl->map_object);
6447
6448 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
6449 m = vm_page_lookup(upl->map_object, offset);
6450
6451 if (m) {
6452 m->pmapped = TRUE;
6453
6454 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
6455 * but only in kernel space. If this was on a user map,
6456 * we'd have to set the wpmapped bit. */
6457 /* m->wpmapped = TRUE; */
6458 assert(map->pmap == kernel_pmap);
6459
6460 PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE, kr);
6461
6462 assert(kr == KERN_SUCCESS);
6463 #if KASAN
6464 kasan_notify_address(addr, PAGE_SIZE_64);
6465 #endif
6466 }
6467 offset += PAGE_SIZE_64;
6468 }
6469 vm_object_unlock(upl->map_object);
6470
6471 /*
6472 * hold a reference for the mapping
6473 */
6474 upl->ref_count++;
6475 upl->flags |= UPL_PAGE_LIST_MAPPED;
6476 upl->kaddr = (vm_offset_t) *dst_addr;
6477 assert(upl->kaddr == *dst_addr);
6478
6479 if(isVectorUPL)
6480 goto process_upl_to_enter;
6481
6482 upl_unlock(upl);
6483
6484 return KERN_SUCCESS;
6485 }
6486
6487 /*
6488 * Internal routine to remove a UPL mapping from a VM map.
6489 *
6490 * XXX - This should just be doable through a standard
6491 * vm_map_remove() operation. Otherwise, implicit clean-up
6492 * of the target map won't be able to correctly remove
6493 * these (and release the reference on the UPL). Having
6494 * to do this means we can't map these into user-space
6495 * maps yet.
6496 */
6497 kern_return_t
6498 vm_map_remove_upl(
6499 vm_map_t map,
6500 upl_t upl)
6501 {
6502 vm_address_t addr;
6503 upl_size_t size;
6504 int isVectorUPL = 0, curr_upl = 0;
6505 upl_t vector_upl = NULL;
6506
6507 if (upl == UPL_NULL)
6508 return KERN_INVALID_ARGUMENT;
6509
6510 if((isVectorUPL = vector_upl_is_valid(upl))) {
6511 int unmapped=0, valid_upls=0;
6512 vector_upl = upl;
6513 upl_lock(vector_upl);
6514 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6515 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6516 if(upl == NULL)
6517 continue;
6518 valid_upls++;
6519 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
6520 unmapped++;
6521 }
6522
6523 if(unmapped) {
6524 if(unmapped != valid_upls)
6525 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
6526 else {
6527 upl_unlock(vector_upl);
6528 return KERN_FAILURE;
6529 }
6530 }
6531 curr_upl=0;
6532 }
6533 else
6534 upl_lock(upl);
6535
6536 process_upl_to_remove:
6537 if(isVectorUPL) {
6538 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6539 vm_map_t v_upl_submap;
6540 vm_offset_t v_upl_submap_dst_addr;
6541 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
6542
6543 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
6544 vm_map_deallocate(v_upl_submap);
6545 upl_unlock(vector_upl);
6546 return KERN_SUCCESS;
6547 }
6548
6549 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
6550 if(upl == NULL)
6551 goto process_upl_to_remove;
6552 }
6553
6554 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
6555 addr = upl->kaddr;
6556 size = upl->size;
6557
6558 assert(upl->ref_count > 1);
6559 upl->ref_count--; /* removing mapping ref */
6560
6561 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
6562 upl->kaddr = (vm_offset_t) 0;
6563
6564 if(!isVectorUPL) {
6565 upl_unlock(upl);
6566
6567 vm_map_remove(
6568 map,
6569 vm_map_trunc_page(addr,
6570 VM_MAP_PAGE_MASK(map)),
6571 vm_map_round_page(addr + size,
6572 VM_MAP_PAGE_MASK(map)),
6573 VM_MAP_NO_FLAGS);
6574
6575 return KERN_SUCCESS;
6576 }
6577 else {
6578 /*
6579 * If it's a Vectored UPL, we'll be removing the entire
6580 		 * submap anyway, so no need to remove individual UPL
6581 * element mappings from within the submap
6582 */
6583 goto process_upl_to_remove;
6584 }
6585 }
6586 upl_unlock(upl);
6587
6588 return KERN_FAILURE;
6589 }
6590
6591
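/*
 * Typical commit pattern (editor's hedged sketch; 'chunk', 'io_size', 'pl'
 * and 'pl_count' are illustrative names, not taken from a specific caller):
 *
 *	upl_offset_t  off = 0;
 *	boolean_t     empty = FALSE;
 *	kern_return_t kr;
 *
 *	while (off < io_size) {
 *		kr = upl_commit_range(upl, off, chunk, 0, pl, pl_count, &empty);
 *		if (kr != KERN_SUCCESS)
 *			break;
 *		off += chunk;
 *	}
 *	if (empty)
 *		upl_deallocate(upl);
 *
 * Note that *empty is only reported for UPLs carrying UPL_COMMIT_NOTIFY_EMPTY
 * (or for sub-UPLs of a vector UPL) once no occupied pages remain; see the
 * 'occupied' handling near the end of this routine.
 */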
6592 kern_return_t
6593 upl_commit_range(
6594 upl_t upl,
6595 upl_offset_t offset,
6596 upl_size_t size,
6597 int flags,
6598 upl_page_info_t *page_list,
6599 mach_msg_type_number_t count,
6600 boolean_t *empty)
6601 {
6602 upl_size_t xfer_size, subupl_size = size;
6603 vm_object_t shadow_object;
6604 vm_object_t object;
6605 vm_object_t m_object;
6606 vm_object_offset_t target_offset;
6607 upl_offset_t subupl_offset = offset;
6608 int entry;
6609 wpl_array_t lite_list;
6610 int occupied;
6611 int clear_refmod = 0;
6612 int pgpgout_count = 0;
6613 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
6614 struct vm_page_delayed_work *dwp;
6615 int dw_count;
6616 int dw_limit;
6617 int isVectorUPL = 0;
6618 upl_t vector_upl = NULL;
6619 boolean_t should_be_throttled = FALSE;
6620
6621 vm_page_t nxt_page = VM_PAGE_NULL;
6622 int fast_path_possible = 0;
6623 int fast_path_full_commit = 0;
6624 int throttle_page = 0;
6625 int unwired_count = 0;
6626 int local_queue_count = 0;
6627 vm_page_t first_local, last_local;
6628
6629 *empty = FALSE;
6630
6631 if (upl == UPL_NULL)
6632 return KERN_INVALID_ARGUMENT;
6633
6634 if (count == 0)
6635 page_list = NULL;
6636
6637 if((isVectorUPL = vector_upl_is_valid(upl))) {
6638 vector_upl = upl;
6639 upl_lock(vector_upl);
6640 }
6641 else
6642 upl_lock(upl);
6643
6644 process_upl_to_commit:
6645
6646 if(isVectorUPL) {
6647 size = subupl_size;
6648 offset = subupl_offset;
6649 if(size == 0) {
6650 upl_unlock(vector_upl);
6651 return KERN_SUCCESS;
6652 }
6653 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
6654 if(upl == NULL) {
6655 upl_unlock(vector_upl);
6656 return KERN_FAILURE;
6657 }
6658 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
6659 subupl_size -= size;
6660 subupl_offset += size;
6661 }
6662
6663 #if UPL_DEBUG
6664 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
6665 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
6666
6667 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
6668 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
6669
6670 upl->upl_commit_index++;
6671 }
6672 #endif
6673 if (upl->flags & UPL_DEVICE_MEMORY)
6674 xfer_size = 0;
6675 else if ((offset + size) <= upl->size)
6676 xfer_size = size;
6677 else {
6678 if(!isVectorUPL)
6679 upl_unlock(upl);
6680 else {
6681 upl_unlock(vector_upl);
6682 }
6683 return KERN_FAILURE;
6684 }
6685 if (upl->flags & UPL_SET_DIRTY)
6686 flags |= UPL_COMMIT_SET_DIRTY;
6687 if (upl->flags & UPL_CLEAR_DIRTY)
6688 flags |= UPL_COMMIT_CLEAR_DIRTY;
6689
6690 if (upl->flags & UPL_INTERNAL)
6691 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
6692 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
6693 else
6694 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
6695
6696 object = upl->map_object;
6697
6698 if (upl->flags & UPL_SHADOWED) {
6699 vm_object_lock(object);
6700 shadow_object = object->shadow;
6701 } else {
6702 shadow_object = object;
6703 }
6704 entry = offset/PAGE_SIZE;
6705 target_offset = (vm_object_offset_t)offset;
6706
6707 assert(!(target_offset & PAGE_MASK));
6708 assert(!(xfer_size & PAGE_MASK));
6709
6710 if (upl->flags & UPL_KERNEL_OBJECT)
6711 vm_object_lock_shared(shadow_object);
6712 else
6713 vm_object_lock(shadow_object);
6714
6715 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
6716
6717 if (upl->flags & UPL_ACCESS_BLOCKED) {
6718 assert(shadow_object->blocked_access);
6719 shadow_object->blocked_access = FALSE;
6720 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
6721 }
6722
6723 if (shadow_object->code_signed) {
6724 /*
6725 * CODE SIGNING:
6726 * If the object is code-signed, do not let this UPL tell
6727 * us if the pages are valid or not. Let the pages be
6728 * validated by VM the normal way (when they get mapped or
6729 * copied).
6730 */
6731 flags &= ~UPL_COMMIT_CS_VALIDATED;
6732 }
6733 if (! page_list) {
6734 /*
6735 * No page list to get the code-signing info from !?
6736 */
6737 flags &= ~UPL_COMMIT_CS_VALIDATED;
6738 }
6739 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal)
6740 should_be_throttled = TRUE;
6741
6742 dwp = &dw_array[0];
6743 dw_count = 0;
6744 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
6745
6746 if ((upl->flags & UPL_IO_WIRE) &&
6747 !(flags & UPL_COMMIT_FREE_ABSENT) &&
6748 !isVectorUPL &&
6749 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
6750 shadow_object->purgable != VM_PURGABLE_EMPTY) {
6751
6752 if (!vm_page_queue_empty(&shadow_object->memq)) {
6753
6754 if (size == shadow_object->vo_size) {
6755 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
6756 fast_path_full_commit = 1;
6757 }
6758 fast_path_possible = 1;
6759
6760 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
6761 (shadow_object->purgable == VM_PURGABLE_DENY ||
6762 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
6763 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
6764 throttle_page = 1;
6765 }
6766 }
6767 }
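	/*
	 * Editor's note: 'fast_path_possible' marks the common I/O-wire commit
	 * case against a non-purgeable object, in which pages are unwired and
	 * collected on a local first_local/last_local list during the loop
	 * below, then spliced onto the appropriate global queue in a single
	 * pass afterwards, rather than being handed to the delayed-work
	 * machinery one at a time.
	 */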
6768 first_local = VM_PAGE_NULL;
6769 last_local = VM_PAGE_NULL;
6770
6771 while (xfer_size) {
6772 vm_page_t t, m;
6773
6774 dwp->dw_mask = 0;
6775 clear_refmod = 0;
6776
6777 m = VM_PAGE_NULL;
6778
6779 if (upl->flags & UPL_LITE) {
6780 unsigned int pg_num;
6781
6782 if (nxt_page != VM_PAGE_NULL) {
6783 m = nxt_page;
6784 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
6785 target_offset = m->offset;
6786 }
6787 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
6788 assert(pg_num == target_offset/PAGE_SIZE);
6789
6790 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
6791 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
6792
6793 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
6794 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
6795 } else
6796 m = NULL;
6797 }
6798 if (upl->flags & UPL_SHADOWED) {
6799 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
6800
6801 t->free_when_done = FALSE;
6802
6803 VM_PAGE_FREE(t);
6804
6805 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
6806 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
6807 }
6808 }
6809 if (m == VM_PAGE_NULL)
6810 goto commit_next_page;
6811
6812 m_object = VM_PAGE_OBJECT(m);
6813
6814 if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6815 assert(m->busy);
6816
6817 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
6818 goto commit_next_page;
6819 }
6820
6821 if (flags & UPL_COMMIT_CS_VALIDATED) {
6822 /*
6823 * CODE SIGNING:
6824 * Set the code signing bits according to
6825 * what the UPL says they should be.
6826 */
6827 m->cs_validated = page_list[entry].cs_validated;
6828 m->cs_tainted = page_list[entry].cs_tainted;
6829 m->cs_nx = page_list[entry].cs_nx;
6830 }
6831 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL)
6832 m->written_by_kernel = TRUE;
6833
6834 if (upl->flags & UPL_IO_WIRE) {
6835
6836 if (page_list)
6837 page_list[entry].phys_addr = 0;
6838
6839 if (flags & UPL_COMMIT_SET_DIRTY) {
6840 SET_PAGE_DIRTY(m, FALSE);
6841 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
6842 m->dirty = FALSE;
6843
6844 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
6845 m->cs_validated && !m->cs_tainted) {
6846 /*
6847 * CODE SIGNING:
6848 * This page is no longer dirty
6849 * but could have been modified,
6850 * so it will need to be
6851 * re-validated.
6852 */
6853 if (m->slid) {
6854 panic("upl_commit_range(%p): page %p was slid\n",
6855 upl, m);
6856 }
6857 assert(!m->slid);
6858 m->cs_validated = FALSE;
6859 #if DEVELOPMENT || DEBUG
6860 vm_cs_validated_resets++;
6861 #endif
6862 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
6863 }
6864 clear_refmod |= VM_MEM_MODIFIED;
6865 }
6866 if (upl->flags & UPL_ACCESS_BLOCKED) {
6867 /*
6868 * We blocked access to the pages in this UPL.
6869 * Clear the "busy" bit and wake up any waiter
6870 * for this page.
6871 */
6872 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
6873 }
6874 if (fast_path_possible) {
6875 assert(m_object->purgable != VM_PURGABLE_EMPTY);
6876 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
6877 if (m->absent) {
6878 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
6879 assert(m->wire_count == 0);
6880 assert(m->busy);
6881
6882 m->absent = FALSE;
6883 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
6884 } else {
6885 if (m->wire_count == 0)
6886 panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object);
6887 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
6888
6889 /*
6890 * XXX FBDP need to update some other
6891 * counters here (purgeable_wired_count)
6892 * (ledgers), ...
6893 */
6894 assert(m->wire_count > 0);
6895 m->wire_count--;
6896
6897 if (m->wire_count == 0) {
6898 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
6899 unwired_count++;
6900 }
6901 }
6902 if (m->wire_count == 0) {
6903 assert(m->pageq.next == 0 && m->pageq.prev == 0);
6904
6905 if (last_local == VM_PAGE_NULL) {
6906 assert(first_local == VM_PAGE_NULL);
6907
6908 last_local = m;
6909 first_local = m;
6910 } else {
6911 assert(first_local != VM_PAGE_NULL);
6912
6913 m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
6914 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
6915 first_local = m;
6916 }
6917 local_queue_count++;
6918
6919 if (throttle_page) {
6920 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
6921 } else {
6922 if (flags & UPL_COMMIT_INACTIVATE) {
6923 if (shadow_object->internal)
6924 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
6925 else
6926 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
6927 } else
6928 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
6929 }
6930 }
6931 } else {
6932 if (flags & UPL_COMMIT_INACTIVATE) {
6933 dwp->dw_mask |= DW_vm_page_deactivate_internal;
6934 clear_refmod |= VM_MEM_REFERENCED;
6935 }
6936 if (m->absent) {
6937 if (flags & UPL_COMMIT_FREE_ABSENT)
6938 dwp->dw_mask |= DW_vm_page_free;
6939 else {
6940 m->absent = FALSE;
6941 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
6942
6943 if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
6944 dwp->dw_mask |= DW_vm_page_activate;
6945 }
6946 } else
6947 dwp->dw_mask |= DW_vm_page_unwire;
6948 }
6949 goto commit_next_page;
6950 }
6951 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6952
6953 if (page_list)
6954 page_list[entry].phys_addr = 0;
6955
6956 /*
6957 * make sure to clear the hardware
6958 * modify or reference bits before
6959 * releasing the BUSY bit on this page
6960 * otherwise we risk losing a legitimate
6961 * change of state
6962 */
6963 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
6964 m->dirty = FALSE;
6965
6966 clear_refmod |= VM_MEM_MODIFIED;
6967 }
6968 if (m->laundry)
6969 dwp->dw_mask |= DW_vm_pageout_throttle_up;
6970
6971 if (VM_PAGE_WIRED(m))
6972 m->free_when_done = FALSE;
6973
6974 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
6975 m->cs_validated && !m->cs_tainted) {
6976 /*
6977 * CODE SIGNING:
6978 * This page is no longer dirty
6979 * but could have been modified,
6980 * so it will need to be
6981 * re-validated.
6982 */
6983 if (m->slid) {
6984 panic("upl_commit_range(%p): page %p was slid\n",
6985 upl, m);
6986 }
6987 assert(!m->slid);
6988 m->cs_validated = FALSE;
6989 #if DEVELOPMENT || DEBUG
6990 vm_cs_validated_resets++;
6991 #endif
6992 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
6993 }
6994 if (m->overwriting) {
6995 /*
6996 * the (COPY_OUT_FROM == FALSE) request_page_list case
6997 */
6998 if (m->busy) {
6999 #if CONFIG_PHANTOM_CACHE
7000 if (m->absent && !m_object->internal)
7001 dwp->dw_mask |= DW_vm_phantom_cache_update;
7002 #endif
7003 m->absent = FALSE;
7004
7005 dwp->dw_mask |= DW_clear_busy;
7006 } else {
7007 /*
7008 * alternate (COPY_OUT_FROM == FALSE) page_list case
7009 * Occurs when the original page was wired
7010 * at the time of the list request
7011 */
7012 assert(VM_PAGE_WIRED(m));
7013
7014 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7015 }
7016 m->overwriting = FALSE;
7017 }
7018 m->cleaning = FALSE;
7019
7020 if (m->free_when_done) {
7021 /*
7022 * With the clean queue enabled, UPL_PAGEOUT should
7023 			 * no longer set the pageout bit. Its pages now go
7024 * to the clean queue.
7025 */
7026 assert(!(flags & UPL_PAGEOUT));
7027 assert(!m_object->internal);
7028
7029 m->free_when_done = FALSE;
7030 #if MACH_CLUSTER_STATS
7031 if (m->wanted) vm_pageout_target_collisions++;
7032 #endif
7033 if ((flags & UPL_COMMIT_SET_DIRTY) ||
7034 (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7035 /*
7036 * page was re-dirtied after we started
7037 * the pageout... reactivate it since
7038 * we don't know whether the on-disk
7039 * copy matches what is now in memory
7040 */
7041 SET_PAGE_DIRTY(m, FALSE);
7042
7043 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7044
7045 if (upl->flags & UPL_PAGEOUT) {
7046 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
7047 VM_STAT_INCR(reactivations);
7048 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7049 }
7050 } else {
7051 /*
7052 * page has been successfully cleaned
7053 * go ahead and free it for other use
7054 */
7055 if (m_object->internal) {
7056 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7057 } else {
7058 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7059 }
7060 m->dirty = FALSE;
7061 m->busy = TRUE;
7062
7063 dwp->dw_mask |= DW_vm_page_free;
7064 }
7065 goto commit_next_page;
7066 }
7067 #if MACH_CLUSTER_STATS
7068 if (m->wpmapped)
7069 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
7070
7071 if (m->dirty) vm_pageout_cluster_dirtied++;
7072 else vm_pageout_cluster_cleaned++;
7073 if (m->wanted) vm_pageout_cluster_collisions++;
7074 #endif
7075 /*
7076 		 * It is part of the semantics of COPYOUT_FROM
7077 		 * UPLs that a commit implies cache sync
7078 		 * between the vm page and the backing store;
7079 * this can be used to strip the precious bit
7080 * as well as clean
7081 */
7082 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
7083 m->precious = FALSE;
7084
7085 if (flags & UPL_COMMIT_SET_DIRTY) {
7086 SET_PAGE_DIRTY(m, FALSE);
7087 } else {
7088 m->dirty = FALSE;
7089 }
7090
7091 /* with the clean queue on, move *all* cleaned pages to the clean queue */
7092 if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
7093 pgpgout_count++;
7094
7095 VM_STAT_INCR(pageouts);
7096 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
7097
7098 dwp->dw_mask |= DW_enqueue_cleaned;
7099 vm_pageout_enqueued_cleaned_from_inactive_dirty++;
7100 } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) {
7101 /*
7102 * page coming back in from being 'frozen'...
7103 * it was dirty before it was frozen, so keep it so
7104 * the vm_page_activate will notice that it really belongs
7105 * on the throttle queue and put it there
7106 */
7107 SET_PAGE_DIRTY(m, FALSE);
7108 dwp->dw_mask |= DW_vm_page_activate;
7109
7110 } else {
7111 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
7112 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7113 clear_refmod |= VM_MEM_REFERENCED;
7114 } else if ( !VM_PAGE_PAGEABLE(m)) {
7115
7116 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
7117 dwp->dw_mask |= DW_vm_page_speculate;
7118 else if (m->reference)
7119 dwp->dw_mask |= DW_vm_page_activate;
7120 else {
7121 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7122 clear_refmod |= VM_MEM_REFERENCED;
7123 }
7124 }
7125 }
7126 if (upl->flags & UPL_ACCESS_BLOCKED) {
7127 /*
7128 			 * We blocked access to the pages in this UPL.
7129 * Clear the "busy" bit on this page before we
7130 * wake up any waiter.
7131 */
7132 dwp->dw_mask |= DW_clear_busy;
7133 }
7134 /*
7135 		 * Wake up any thread waiting for the page to no longer be 'cleaning'.
7136 */
7137 dwp->dw_mask |= DW_PAGE_WAKEUP;
7138
7139 commit_next_page:
7140 if (clear_refmod)
7141 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
7142
7143 target_offset += PAGE_SIZE_64;
7144 xfer_size -= PAGE_SIZE;
7145 entry++;
7146
7147 if (dwp->dw_mask) {
7148 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7149 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7150
7151 if (dw_count >= dw_limit) {
7152 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7153
7154 dwp = &dw_array[0];
7155 dw_count = 0;
7156 }
7157 } else {
7158 if (dwp->dw_mask & DW_clear_busy)
7159 m->busy = FALSE;
7160
7161 if (dwp->dw_mask & DW_PAGE_WAKEUP)
7162 PAGE_WAKEUP(m);
7163 }
7164 }
7165 }
7166 if (dw_count)
7167 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7168
7169 if (fast_path_possible) {
7170
7171 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7172 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7173
7174 if (local_queue_count || unwired_count) {
7175
7176 if (local_queue_count) {
7177 vm_page_t first_target;
7178 vm_page_queue_head_t *target_queue;
7179
7180 if (throttle_page)
7181 target_queue = &vm_page_queue_throttled;
7182 else {
7183 if (flags & UPL_COMMIT_INACTIVATE) {
7184 if (shadow_object->internal)
7185 target_queue = &vm_page_queue_anonymous;
7186 else
7187 target_queue = &vm_page_queue_inactive;
7188 } else
7189 target_queue = &vm_page_queue_active;
7190 }
7191 /*
7192 				 * Transfer the entire local queue to the appropriate regular LRU page queue.
7193 */
7194 vm_page_lockspin_queues();
7195
7196 first_target = (vm_page_t) vm_page_queue_first(target_queue);
7197
7198 if (vm_page_queue_empty(target_queue))
7199 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7200 else
7201 first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7202
7203 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7204 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
7205 last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
7206
7207 /*
7208 * Adjust the global page counts.
7209 */
7210 if (throttle_page) {
7211 vm_page_throttled_count += local_queue_count;
7212 } else {
7213 if (flags & UPL_COMMIT_INACTIVATE) {
7214 if (shadow_object->internal)
7215 vm_page_anonymous_count += local_queue_count;
7216 vm_page_inactive_count += local_queue_count;
7217
7218 token_new_pagecount += local_queue_count;
7219 } else
7220 vm_page_active_count += local_queue_count;
7221
7222 if (shadow_object->internal)
7223 vm_page_pageable_internal_count += local_queue_count;
7224 else
7225 vm_page_pageable_external_count += local_queue_count;
7226 }
7227 } else {
7228 vm_page_lockspin_queues();
7229 }
7230 if (unwired_count) {
7231 vm_page_wire_count -= unwired_count;
7232 VM_CHECK_MEMORYSTATUS;
7233 }
7234 vm_page_unlock_queues();
7235
7236 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
7237 }
7238 }
7239 occupied = 1;
7240
7241 if (upl->flags & UPL_DEVICE_MEMORY) {
7242 occupied = 0;
7243 } else if (upl->flags & UPL_LITE) {
7244 int pg_num;
7245 int i;
7246
7247 occupied = 0;
7248
7249 if (!fast_path_full_commit) {
7250 pg_num = upl->size/PAGE_SIZE;
7251 pg_num = (pg_num + 31) >> 5;
7252
7253 for (i = 0; i < pg_num; i++) {
7254 if (lite_list[i] != 0) {
7255 occupied = 1;
7256 break;
7257 }
7258 }
7259 }
7260 } else {
7261 if (vm_page_queue_empty(&upl->map_object->memq))
7262 occupied = 0;
7263 }
7264 if (occupied == 0) {
7265 /*
7266 * If this UPL element belongs to a Vector UPL and is
7267 * empty, then this is the right function to deallocate
7268 		 * it. So go ahead and set the *empty variable. The flag
7269 		 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view,
7270 		 * should be considered relevant for the Vector UPL and not
7271 * the internal UPLs.
7272 */
7273 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
7274 *empty = TRUE;
7275
7276 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
7277 /*
7278 * this is not a paging object
7279 * so we need to drop the paging reference
7280 * that was taken when we created the UPL
7281 * against this object
7282 */
7283 vm_object_activity_end(shadow_object);
7284 vm_object_collapse(shadow_object, 0, TRUE);
7285 } else {
7286 /*
7287 			 * we donated the paging reference to
7288 * the map object... vm_pageout_object_terminate
7289 * will drop this reference
7290 */
7291 }
7292 }
7293 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
7294 vm_object_unlock(shadow_object);
7295 if (object != shadow_object)
7296 vm_object_unlock(object);
7297
7298 if(!isVectorUPL)
7299 upl_unlock(upl);
7300 else {
7301 /*
7302 * If we completed our operations on an UPL that is
7303 * part of a Vectored UPL and if empty is TRUE, then
7304 * we should go ahead and deallocate this UPL element.
7305 * Then we check if this was the last of the UPL elements
7306 * within that Vectored UPL. If so, set empty to TRUE
7307 * so that in ubc_upl_commit_range or ubc_upl_commit, we
7308 * can go ahead and deallocate the Vector UPL too.
7309 */
7310 if(*empty==TRUE) {
7311 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
7312 upl_deallocate(upl);
7313 }
7314 goto process_upl_to_commit;
7315 }
7316 if (pgpgout_count) {
7317 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
7318 }
7319
7320 return KERN_SUCCESS;
7321 }
7322
7323 kern_return_t
7324 upl_abort_range(
7325 upl_t upl,
7326 upl_offset_t offset,
7327 upl_size_t size,
7328 int error,
7329 boolean_t *empty)
7330 {
7331 upl_page_info_t *user_page_list = NULL;
7332 upl_size_t xfer_size, subupl_size = size;
7333 vm_object_t shadow_object;
7334 vm_object_t object;
7335 vm_object_offset_t target_offset;
7336 upl_offset_t subupl_offset = offset;
7337 int entry;
7338 wpl_array_t lite_list;
7339 int occupied;
7340 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7341 struct vm_page_delayed_work *dwp;
7342 int dw_count;
7343 int dw_limit;
7344 int isVectorUPL = 0;
7345 upl_t vector_upl = NULL;
7346
7347 *empty = FALSE;
7348
7349 if (upl == UPL_NULL)
7350 return KERN_INVALID_ARGUMENT;
7351
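/*
 * Aborting an I/O-wired UPL without dumping its pages is treated
 * as a commit that frees any still-absent pages.
 */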
7352 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
7353 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
7354
7355 if((isVectorUPL = vector_upl_is_valid(upl))) {
7356 vector_upl = upl;
7357 upl_lock(vector_upl);
7358 }
7359 else
7360 upl_lock(upl);
7361
7362 process_upl_to_abort:
7363 if(isVectorUPL) {
7364 size = subupl_size;
7365 offset = subupl_offset;
7366 if(size == 0) {
7367 upl_unlock(vector_upl);
7368 return KERN_SUCCESS;
7369 }
7370 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7371 if(upl == NULL) {
7372 upl_unlock(vector_upl);
7373 return KERN_FAILURE;
7374 }
7375 subupl_size -= size;
7376 subupl_offset += size;
7377 }
7378
7379 *empty = FALSE;
7380
7381 #if UPL_DEBUG
7382 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7383 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7384
7385 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7386 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7387 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
7388
7389 upl->upl_commit_index++;
7390 }
7391 #endif
7392 if (upl->flags & UPL_DEVICE_MEMORY)
7393 xfer_size = 0;
7394 else if ((offset + size) <= upl->size)
7395 xfer_size = size;
7396 else {
7397 if(!isVectorUPL)
7398 upl_unlock(upl);
7399 else {
7400 upl_unlock(vector_upl);
7401 }
7402
7403 return KERN_FAILURE;
7404 }
7405 if (upl->flags & UPL_INTERNAL) {
7406 lite_list = (wpl_array_t)
7407 ((((uintptr_t)upl) + sizeof(struct upl))
7408 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
7409
7410 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
7411 } else {
7412 lite_list = (wpl_array_t)
7413 (((uintptr_t)upl) + sizeof(struct upl));
7414 }
7415 object = upl->map_object;
7416
7417 if (upl->flags & UPL_SHADOWED) {
7418 vm_object_lock(object);
7419 shadow_object = object->shadow;
7420 } else
7421 shadow_object = object;
7422
7423 entry = offset/PAGE_SIZE;
7424 target_offset = (vm_object_offset_t)offset;
7425
7426 assert(!(target_offset & PAGE_MASK));
7427 assert(!(xfer_size & PAGE_MASK));
7428
7429 if (upl->flags & UPL_KERNEL_OBJECT)
7430 vm_object_lock_shared(shadow_object);
7431 else
7432 vm_object_lock(shadow_object);
7433
7434 if (upl->flags & UPL_ACCESS_BLOCKED) {
7435 assert(shadow_object->blocked_access);
7436 shadow_object->blocked_access = FALSE;
7437 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7438 }
7439
7440 dwp = &dw_array[0];
7441 dw_count = 0;
7442 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7443
7444 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
7445 panic("upl_abort_range: kernel_object being DUMPED");
7446
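/*
 * Page-level updates are batched in dw_array and applied by
 * vm_page_do_delayed_work() once dw_limit entries accumulate,
 * so the page queues lock is taken per batch rather than per page.
 */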
7447 while (xfer_size) {
7448 vm_page_t t, m;
7449 unsigned int pg_num;
7450 boolean_t needed;
7451
7452 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7453 assert(pg_num == target_offset/PAGE_SIZE);
7454
7455 needed = FALSE;
7456
7457 if (user_page_list)
7458 needed = user_page_list[pg_num].needed;
7459
7460 dwp->dw_mask = 0;
7461 m = VM_PAGE_NULL;
7462
7463 if (upl->flags & UPL_LITE) {
7464
7465 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
7466 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
7467
7468 if ( !(upl->flags & UPL_KERNEL_OBJECT))
7469 m = vm_page_lookup(shadow_object, target_offset +
7470 (upl->offset - shadow_object->paging_offset));
7471 }
7472 }
7473 if (upl->flags & UPL_SHADOWED) {
7474 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7475 t->free_when_done = FALSE;
7476
7477 VM_PAGE_FREE(t);
7478
7479 if (m == VM_PAGE_NULL)
7480 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7481 }
7482 }
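/*
 * A UPL against the kernel object carries no page-level state to
 * undo here (only the lite_list bit, which was cleared above), so
 * skip the per-page work below.
 */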
7483 if ((upl->flags & UPL_KERNEL_OBJECT))
7484 goto abort_next_page;
7485
7486 if (m != VM_PAGE_NULL) {
7487
7488 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7489
7490 if (m->absent) {
7491 boolean_t must_free = TRUE;
7492
7493 /*
7494 * COPYOUT = FALSE case
7495 * check for error conditions which must
7496 * be passed back to the page's customer
7497 */
7498 if (error & UPL_ABORT_RESTART) {
7499 m->restart = TRUE;
7500 m->absent = FALSE;
7501 m->unusual = TRUE;
7502 must_free = FALSE;
7503 } else if (error & UPL_ABORT_UNAVAILABLE) {
7504 m->restart = FALSE;
7505 m->unusual = TRUE;
7506 must_free = FALSE;
7507 } else if (error & UPL_ABORT_ERROR) {
7508 m->restart = FALSE;
7509 m->absent = FALSE;
7510 m->error = TRUE;
7511 m->unusual = TRUE;
7512 must_free = FALSE;
7513 }
7514 if (m->clustered && needed == FALSE) {
7515 /*
7516 * This page was a part of a speculative
7517 * read-ahead initiated by the kernel
7518 * itself. No one is expecting this
7519 * page and no one will clean up its
7520 * error state if it ever becomes valid
7521 * in the future.
7522 * We have to free it here.
7523 */
7524 must_free = TRUE;
7525 }
7526 m->cleaning = FALSE;
7527
7528 if (m->overwriting && !m->busy) {
7529 /*
7530 * this shouldn't happen since
7531 * this is an 'absent' page, but
7532 * it doesn't hurt to check for
7533 * the 'alternate' method of
7534 * stabilizing the page...
7535 * we will mark 'busy' to be cleared
7536 * in the following code which will
7537 * take care of the primary stabilization
7538 * method (i.e. setting 'busy' to TRUE)
7539 */
7540 dwp->dw_mask |= DW_vm_page_unwire;
7541 }
7542 m->overwriting = FALSE;
7543
7544 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7545
7546 if (must_free == TRUE)
7547 dwp->dw_mask |= DW_vm_page_free;
7548 else
7549 dwp->dw_mask |= DW_vm_page_activate;
7550 } else {
7551 /*
7552 * Handle the trusted pager throttle.
7553 */
7554 if (m->laundry)
7555 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7556
7557 if (upl->flags & UPL_ACCESS_BLOCKED) {
7558 /*
7559 * We blocked access to the pages in this UPL.
7560 * Clear the "busy" bit and wake up any waiter
7561 * for this page.
7562 */
7563 dwp->dw_mask |= DW_clear_busy;
7564 }
7565 if (m->overwriting) {
7566 if (m->busy)
7567 dwp->dw_mask |= DW_clear_busy;
7568 else {
7569 /*
7570 * deal with the 'alternate' method
7571 * of stabilizing the page...
7572 * we will either free the page
7573 * or mark 'busy' to be cleared
7574 * in the following code which will
7575 * take care of the primary stabilization
7576 * method (i.e. setting 'busy' to TRUE)
7577 */
7578 dwp->dw_mask |= DW_vm_page_unwire;
7579 }
7580 m->overwriting = FALSE;
7581 }
7582 m->free_when_done = FALSE;
7583 m->cleaning = FALSE;
7584
7585 if (error & UPL_ABORT_DUMP_PAGES) {
7586 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7587
7588 dwp->dw_mask |= DW_vm_page_free;
7589 } else {
7590 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
7591 if (error & UPL_ABORT_REFERENCE) {
7592 /*
7593 * we've been told to explicitly
7594 * reference this page... for
7595 * file I/O, this is done by
7596 * implementing an LRU on the inactive q
7597 */
7598 dwp->dw_mask |= DW_vm_page_lru;
7599
7600 } else if ( !VM_PAGE_PAGEABLE(m))
7601 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7602 }
7603 dwp->dw_mask |= DW_PAGE_WAKEUP;
7604 }
7605 }
7606 }
7607 abort_next_page:
7608 target_offset += PAGE_SIZE_64;
7609 xfer_size -= PAGE_SIZE;
7610 entry++;
7611
7612 if (dwp->dw_mask) {
7613 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7614 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7615
7616 if (dw_count >= dw_limit) {
7617 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7618
7619 dwp = &dw_array[0];
7620 dw_count = 0;
7621 }
7622 } else {
7623 if (dwp->dw_mask & DW_clear_busy)
7624 m->busy = FALSE;
7625
7626 if (dwp->dw_mask & DW_PAGE_WAKEUP)
7627 PAGE_WAKEUP(m);
7628 }
7629 }
7630 }
7631 if (dw_count)
7632 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7633
7634 occupied = 1;
7635
7636 if (upl->flags & UPL_DEVICE_MEMORY) {
7637 occupied = 0;
7638 } else if (upl->flags & UPL_LITE) {
7639 int pg_num;
7640 int i;
7641
7642 pg_num = upl->size/PAGE_SIZE;
7643 pg_num = (pg_num + 31) >> 5;
7644 occupied = 0;
7645
7646 for (i = 0; i < pg_num; i++) {
7647 if (lite_list[i] != 0) {
7648 occupied = 1;
7649 break;
7650 }
7651 }
7652 } else {
7653 if (vm_page_queue_empty(&upl->map_object->memq))
7654 occupied = 0;
7655 }
7656 if (occupied == 0) {
7657 /*
7658 * If this UPL element belongs to a Vector UPL and is
7659 * empty, then this is the right function to deallocate
7660 * it. So go ahead and set the *empty variable. The flag
7661 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
7662 * should be considered relevant for the Vector UPL and
7663 * not the internal UPLs.
7664 */
7665 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
7666 *empty = TRUE;
7667
7668 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
7669 /*
7670 * this is not a paging object
7671 * so we need to drop the paging reference
7672 * that was taken when we created the UPL
7673 * against this object
7674 */
7675 vm_object_activity_end(shadow_object);
7676 vm_object_collapse(shadow_object, 0, TRUE);
7677 } else {
7678 /*
7679 * we donated the paging reference to
7680 * the map object... vm_pageout_object_terminate
7681 * will drop this reference
7682 */
7683 }
7684 }
7685 vm_object_unlock(shadow_object);
7686 if (object != shadow_object)
7687 vm_object_unlock(object);
7688
7689 if(!isVectorUPL)
7690 upl_unlock(upl);
7691 else {
7692 /*
7693 * If we completed our operations on an UPL that is
7694 * part of a Vectored UPL and if empty is TRUE, then
7695 * we should go ahead and deallocate this UPL element.
7696 * Then we check if this was the last of the UPL elements
7697 * within that Vectored UPL. If so, set empty to TRUE
7698 * so that in ubc_upl_abort_range or ubc_upl_abort, we
7699 * can go ahead and deallocate the Vector UPL too.
7700 */
7701 if(*empty == TRUE) {
7702 *empty = vector_upl_set_subupl(vector_upl, upl,0);
7703 upl_deallocate(upl);
7704 }
7705 goto process_upl_to_abort;
7706 }
7707
7708 return KERN_SUCCESS;
7709 }
7710
7711
7712 kern_return_t
7713 upl_abort(
7714 upl_t upl,
7715 int error)
7716 {
7717 boolean_t empty;
7718
7719 if (upl == UPL_NULL)
7720 return KERN_INVALID_ARGUMENT;
7721
7722 return upl_abort_range(upl, 0, upl->size, error, &empty);
7723 }
7724
7725
7726 /* an option on commit should be wire */
7727 kern_return_t
7728 upl_commit(
7729 upl_t upl,
7730 upl_page_info_t *page_list,
7731 mach_msg_type_number_t count)
7732 {
7733 boolean_t empty;
7734
7735 if (upl == UPL_NULL)
7736 return KERN_INVALID_ARGUMENT;
7737
7738 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
7739 }
7740
7741
7742 void
7743 iopl_valid_data(
7744 upl_t upl,
7745 vm_tag_t tag)
7746 {
7747 vm_object_t object;
7748 vm_offset_t offset;
7749 vm_page_t m, nxt_page = VM_PAGE_NULL;
7750 upl_size_t size;
7751 int wired_count = 0;
7752
7753 if (upl == NULL)
7754 panic("iopl_valid_data: NULL upl");
7755 if (vector_upl_is_valid(upl))
7756 panic("iopl_valid_data: vector upl");
7757 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
7758 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
7759
7760 object = upl->map_object;
7761
7762 if (object == kernel_object || object == compressor_object)
7763 panic("iopl_valid_data: object == kernel or compressor");
7764
7765 if (object->purgable == VM_PURGABLE_VOLATILE ||
7766 object->purgable == VM_PURGABLE_EMPTY)
7767 panic("iopl_valid_data: object %p purgable %d",
7768 object, object->purgable);
7769
7770 size = upl->size;
7771
7772 vm_object_lock(object);
7773 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
7774
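/*
 * If the object is fully resident and exactly the size of this UPL,
 * walk its resident page list directly; otherwise look up each page
 * by its offset in the object.
 */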
7775 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE))
7776 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
7777 else
7778 offset = 0 + upl->offset - object->paging_offset;
7779
7780 while (size) {
7781
7782 if (nxt_page != VM_PAGE_NULL) {
7783 m = nxt_page;
7784 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
7785 } else {
7786 m = vm_page_lookup(object, offset);
7787 offset += PAGE_SIZE;
7788
7789 if (m == VM_PAGE_NULL)
7790 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
7791 }
7792 if (m->busy) {
7793 if (!m->absent)
7794 panic("iopl_valid_data: busy page w/o absent");
7795
7796 if (m->pageq.next || m->pageq.prev)
7797 panic("iopl_valid_data: busy+absent page on page queue");
7798 if (m->reusable) {
7799 panic("iopl_valid_data: %p is reusable", m);
7800 }
7801
7802 m->absent = FALSE;
7803 m->dirty = TRUE;
7804 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
7805 assert(m->wire_count == 0);
7806 m->wire_count++;
7807 assert(m->wire_count);
7808 if (m->wire_count == 1) {
7809 m->vm_page_q_state = VM_PAGE_IS_WIRED;
7810 wired_count++;
7811 } else {
7812 panic("iopl_valid_data: %p already wired\n", m);
7813 }
7814
7815 PAGE_WAKEUP_DONE(m);
7816 }
7817 size -= PAGE_SIZE;
7818 }
7819 if (wired_count) {
7820
7821 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
7822 assert(object->resident_page_count >= object->wired_page_count);
7823
7824 /* no need to adjust purgeable accounting for this object: */
7825 assert(object->purgable != VM_PURGABLE_VOLATILE);
7826 assert(object->purgable != VM_PURGABLE_EMPTY);
7827
7828 vm_page_lockspin_queues();
7829 vm_page_wire_count += wired_count;
7830 vm_page_unlock_queues();
7831 }
7832 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
7833 vm_object_unlock(object);
7834 }
7835
7836
7837 void
7838 vm_object_set_pmap_cache_attr(
7839 vm_object_t object,
7840 upl_page_info_array_t user_page_list,
7841 unsigned int num_pages,
7842 boolean_t batch_pmap_op)
7843 {
7844 unsigned int cache_attr = 0;
7845
7846 cache_attr = object->wimg_bits & VM_WIMG_MASK;
7847 assert(user_page_list);
7848 if (cache_attr != VM_WIMG_USE_DEFAULT) {
7849 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
7850 }
7851 }
7852
7853
7854 boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
7855 kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int);
7856
7857
7858
7859 boolean_t
7860 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
7861 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag)
7862 {
7863 vm_page_t dst_page;
7864 unsigned int entry;
7865 int page_count;
7866 int delayed_unlock = 0;
7867 boolean_t retval = TRUE;
7868 ppnum_t phys_page;
7869
7870 vm_object_lock_assert_exclusive(object);
7871 assert(object->purgable != VM_PURGABLE_VOLATILE);
7872 assert(object->purgable != VM_PURGABLE_EMPTY);
7873 assert(object->pager == NULL);
7874 assert(object->copy == NULL);
7875 assert(object->shadow == NULL);
7876
7877 page_count = object->resident_page_count;
7878 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
7879
7880 vm_page_lock_queues();
7881
7882 while (page_count--) {
7883
7884 if (dst_page->busy ||
7885 dst_page->fictitious ||
7886 dst_page->absent ||
7887 dst_page->error ||
7888 dst_page->cleaning ||
7889 dst_page->restart ||
7890 dst_page->laundry) {
7891 retval = FALSE;
7892 goto done;
7893 }
7894 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
7895 retval = FALSE;
7896 goto done;
7897 }
7898 dst_page->reference = TRUE;
7899
7900 vm_page_wire(dst_page, tag, FALSE);
7901
7902 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
7903 SET_PAGE_DIRTY(dst_page, FALSE);
7904 }
7905 entry = (unsigned int)(dst_page->offset / PAGE_SIZE);
7906 assert(entry >= 0 && entry < object->resident_page_count);
7907 lite_list[entry>>5] |= 1 << (entry & 31);
7908
7909 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
7910
7911 if (phys_page > upl->highest_page)
7912 upl->highest_page = phys_page;
7913
7914 if (user_page_list) {
7915 user_page_list[entry].phys_addr = phys_page;
7916 user_page_list[entry].absent = dst_page->absent;
7917 user_page_list[entry].dirty = dst_page->dirty;
7918 user_page_list[entry].free_when_done = dst_page->free_when_done;
7919 user_page_list[entry].precious = dst_page->precious;
7920 user_page_list[entry].device = FALSE;
7921 user_page_list[entry].speculative = FALSE;
7922 user_page_list[entry].cs_validated = FALSE;
7923 user_page_list[entry].cs_tainted = FALSE;
7924 user_page_list[entry].cs_nx = FALSE;
7925 user_page_list[entry].needed = FALSE;
7926 user_page_list[entry].mark = FALSE;
7927 }
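/*
 * Yield the page queues lock periodically so a large object
 * doesn't keep it held for the entire walk.
 */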
7928 if (delayed_unlock++ > 256) {
7929 delayed_unlock = 0;
7930 lck_mtx_yield(&vm_page_queue_lock);
7931
7932 VM_CHECK_MEMORYSTATUS;
7933 }
7934 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq);
7935 }
7936 done:
7937 vm_page_unlock_queues();
7938
7939 VM_CHECK_MEMORYSTATUS;
7940
7941 return (retval);
7942 }
7943
7944
7945 kern_return_t
7946 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
7947 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset, int page_count)
7948 {
7949 vm_page_t dst_page;
7950 boolean_t no_zero_fill = FALSE;
7951 int interruptible;
7952 int pages_wired = 0;
7953 int pages_inserted = 0;
7954 int entry = 0;
7955 uint64_t delayed_ledger_update = 0;
7956 kern_return_t ret = KERN_SUCCESS;
7957 int grab_options;
7958 ppnum_t phys_page;
7959
7960 vm_object_lock_assert_exclusive(object);
7961 assert(object->purgable != VM_PURGABLE_VOLATILE);
7962 assert(object->purgable != VM_PURGABLE_EMPTY);
7963 assert(object->pager == NULL);
7964 assert(object->copy == NULL);
7965 assert(object->shadow == NULL);
7966
7967 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
7968 interruptible = THREAD_ABORTSAFE;
7969 else
7970 interruptible = THREAD_UNINT;
7971
7972 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
7973 no_zero_fill = TRUE;
7974
7975 grab_options = 0;
7976 #if CONFIG_SECLUDED_MEMORY
7977 if (object->can_grab_secluded) {
7978 grab_options |= VM_PAGE_GRAB_SECLUDED;
7979 }
7980 #endif /* CONFIG_SECLUDED_MEMORY */
7981
7982 while (page_count--) {
7983
7984 while ((dst_page = vm_page_grab_options(grab_options))
7985 == VM_PAGE_NULL) {
7986
7987 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
7988
7989 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
7990
7991 if (vm_page_wait(interruptible) == FALSE) {
7992 /*
7993 * interrupted case
7994 */
7995 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
7996
7997 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
7998
7999 ret = MACH_SEND_INTERRUPTED;
8000 goto done;
8001 }
8002 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8003
8004 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8005 }
8006 if (no_zero_fill == FALSE)
8007 vm_page_zero_fill(dst_page);
8008 else
8009 dst_page->absent = TRUE;
8010
8011 dst_page->reference = TRUE;
8012
8013 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8014 SET_PAGE_DIRTY(dst_page, FALSE);
8015 }
8016 if (dst_page->absent == FALSE) {
8017 assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8018 assert(dst_page->wire_count == 0);
8019 dst_page->wire_count++;
8020 dst_page->vm_page_q_state = VM_PAGE_IS_WIRED;
8021 assert(dst_page->wire_count);
8022 pages_wired++;
8023 PAGE_WAKEUP_DONE(dst_page);
8024 }
8025 pages_inserted++;
8026
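/*
 * Purgeable ledger charges for these insertions are accumulated
 * in delayed_ledger_update and credited once after the loop
 * instead of per page.
 */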
8027 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8028
8029 lite_list[entry>>5] |= 1 << (entry & 31);
8030
8031 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8032
8033 if (phys_page > upl->highest_page)
8034 upl->highest_page = phys_page;
8035
8036 if (user_page_list) {
8037 user_page_list[entry].phys_addr = phys_page;
8038 user_page_list[entry].absent = dst_page->absent;
8039 user_page_list[entry].dirty = dst_page->dirty;
8040 user_page_list[entry].free_when_done = FALSE;
8041 user_page_list[entry].precious = FALSE;
8042 user_page_list[entry].device = FALSE;
8043 user_page_list[entry].speculative = FALSE;
8044 user_page_list[entry].cs_validated = FALSE;
8045 user_page_list[entry].cs_tainted = FALSE;
8046 user_page_list[entry].cs_nx = FALSE;
8047 user_page_list[entry].needed = FALSE;
8048 user_page_list[entry].mark = FALSE;
8049 }
8050 entry++;
8051 *dst_offset += PAGE_SIZE_64;
8052 }
8053 done:
8054 if (pages_wired) {
8055 vm_page_lockspin_queues();
8056 vm_page_wire_count += pages_wired;
8057 vm_page_unlock_queues();
8058 }
8059 if (pages_inserted) {
8060 if (object->internal) {
8061 OSAddAtomic(pages_inserted, &vm_page_internal_count);
8062 } else {
8063 OSAddAtomic(pages_inserted, &vm_page_external_count);
8064 }
8065 }
8066 if (delayed_ledger_update) {
8067 task_t owner;
8068
8069 owner = object->vo_purgeable_owner;
8070 assert(owner);
8071
8072 /* more non-volatile bytes */
8073 ledger_credit(owner->ledger,
8074 task_ledgers.purgeable_nonvolatile,
8075 delayed_ledger_update);
8076 /* more footprint */
8077 ledger_credit(owner->ledger,
8078 task_ledgers.phys_footprint,
8079 delayed_ledger_update);
8080 }
8081 return (ret);
8082 }
8083
8084
8085 unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
8086
8087
8088 kern_return_t
8089 vm_object_iopl_request(
8090 vm_object_t object,
8091 vm_object_offset_t offset,
8092 upl_size_t size,
8093 upl_t *upl_ptr,
8094 upl_page_info_array_t user_page_list,
8095 unsigned int *page_list_count,
8096 upl_control_flags_t cntrl_flags,
8097 vm_tag_t tag)
8098 {
8099 vm_page_t dst_page;
8100 vm_object_offset_t dst_offset;
8101 upl_size_t xfer_size;
8102 upl_t upl = NULL;
8103 unsigned int entry;
8104 wpl_array_t lite_list = NULL;
8105 int no_zero_fill = FALSE;
8106 unsigned int size_in_pages;
8107 u_int32_t psize;
8108 kern_return_t ret;
8109 vm_prot_t prot;
8110 struct vm_object_fault_info fault_info;
8111 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
8112 struct vm_page_delayed_work *dwp;
8113 int dw_count;
8114 int dw_limit;
8115 int dw_index;
8116 boolean_t caller_lookup;
8117 int io_tracking_flag = 0;
8118 int interruptible;
8119 ppnum_t phys_page;
8120
8121 boolean_t set_cache_attr_needed = FALSE;
8122 boolean_t free_wired_pages = FALSE;
8123 boolean_t fast_path_empty_req = FALSE;
8124 boolean_t fast_path_full_req = FALSE;
8125
8126 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8127 /*
8128 * For forward compatibility's sake,
8129 * reject any unknown flag.
8130 */
8131 return KERN_INVALID_VALUE;
8132 }
8133 if (vm_lopage_needed == FALSE)
8134 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8135
8136 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8137 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
8138 return KERN_INVALID_VALUE;
8139
8140 if (object->phys_contiguous) {
8141 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
8142 return KERN_INVALID_ADDRESS;
8143
8144 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
8145 return KERN_INVALID_ADDRESS;
8146 }
8147 }
8148 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
8149 no_zero_fill = TRUE;
8150
8151 if (cntrl_flags & UPL_COPYOUT_FROM)
8152 prot = VM_PROT_READ;
8153 else
8154 prot = VM_PROT_READ | VM_PROT_WRITE;
8155
8156 if ((!object->internal) && (object->paging_offset != 0))
8157 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
8158
8159 #if CONFIG_IOSCHED || UPL_DEBUG
8160 if ((object->io_tracking && object != kernel_object) || upl_debug_enabled)
8161 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8162 #endif
8163
8164 #if CONFIG_IOSCHED
8165 if (object->io_tracking) {
8166 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8167 if (object != kernel_object)
8168 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8169 }
8170 #endif
8171
8172 if (object->phys_contiguous)
8173 psize = PAGE_SIZE;
8174 else
8175 psize = size;
8176
8177 if (cntrl_flags & UPL_SET_INTERNAL) {
8178 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8179
8180 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8181 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
8182 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
8183 if (size == 0) {
8184 user_page_list = NULL;
8185 lite_list = NULL;
8186 }
8187 } else {
8188 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8189
8190 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
8191 if (size == 0) {
8192 lite_list = NULL;
8193 }
8194 }
8195 if (user_page_list)
8196 user_page_list[0].device = FALSE;
8197 *upl_ptr = upl;
8198
8199 upl->map_object = object;
8200 upl->size = size;
8201
8202 size_in_pages = size / PAGE_SIZE;
8203
8204 if (object == kernel_object &&
8205 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8206 upl->flags |= UPL_KERNEL_OBJECT;
8207 #if UPL_DEBUG
8208 vm_object_lock(object);
8209 #else
8210 vm_object_lock_shared(object);
8211 #endif
8212 } else {
8213 vm_object_lock(object);
8214 vm_object_activity_begin(object);
8215 }
8216 /*
8217 * paging in progress also protects the paging_offset
8218 */
8219 upl->offset = offset + object->paging_offset;
8220
8221 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8222 /*
8223 * The user requested that access to the pages in this UPL
8224 * be blocked until the UPL is committed or aborted.
8225 */
8226 upl->flags |= UPL_ACCESS_BLOCKED;
8227 }
8228
8229 #if CONFIG_IOSCHED || UPL_DEBUG
8230 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
8231 vm_object_activity_begin(object);
8232 queue_enter(&object->uplq, upl, upl_t, uplq);
8233 }
8234 #endif
8235
8236 if (object->phys_contiguous) {
8237
8238 if (upl->flags & UPL_ACCESS_BLOCKED) {
8239 assert(!object->blocked_access);
8240 object->blocked_access = TRUE;
8241 }
8242
8243 vm_object_unlock(object);
8244
8245 /*
8246 * don't need any shadow mappings for this one
8247 * since it is already I/O memory
8248 */
8249 upl->flags |= UPL_DEVICE_MEMORY;
8250
8251 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT);
8252
8253 if (user_page_list) {
8254 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT);
8255 user_page_list[0].device = TRUE;
8256 }
8257 if (page_list_count != NULL) {
8258 if (upl->flags & UPL_INTERNAL)
8259 *page_list_count = 0;
8260 else
8261 *page_list_count = 1;
8262 }
8263 return KERN_SUCCESS;
8264 }
8265 if (object != kernel_object && object != compressor_object) {
8266 /*
8267 * Protect user space from future COW operations
8268 */
8269 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8270 if (!object->true_share &&
8271 vm_object_tracking_inited) {
8272 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
8273 int num = 0;
8274
8275 num = OSBacktrace(bt,
8276 VM_OBJECT_TRACKING_BTDEPTH);
8277 btlog_add_entry(vm_object_tracking_btlog,
8278 object,
8279 VM_OBJECT_TRACKING_OP_TRUESHARE,
8280 bt,
8281 num);
8282 }
8283 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8284
8285 vm_object_lock_assert_exclusive(object);
8286 object->true_share = TRUE;
8287
8288 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
8289 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8290 }
8291
8292 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8293 object->copy != VM_OBJECT_NULL) {
8294 /*
8295 * Honor copy-on-write obligations
8296 *
8297 * The caller is gathering these pages and
8298 * might modify their contents. We need to
8299 * make sure that the copy object has its own
8300 * private copies of these pages before we let
8301 * the caller modify them.
8302 *
8303 * NOTE: someone else could map the original object
8304 * after we've done this copy-on-write here, and they
8305 * could then see an inconsistent picture of the memory
8306 * while it's being modified via the UPL. To prevent this,
8307 * we would have to block access to these pages until the
8308 * UPL is released. We could use the UPL_BLOCK_ACCESS
8309 * code path for that...
8310 */
8311 vm_object_update(object,
8312 offset,
8313 size,
8314 NULL,
8315 NULL,
8316 FALSE, /* should_return */
8317 MEMORY_OBJECT_COPY_SYNC,
8318 VM_PROT_NO_CHANGE);
8319 #if DEVELOPMENT || DEBUG
8320 iopl_cow++;
8321 iopl_cow_pages += size >> PAGE_SHIFT;
8322 #endif
8323 }
8324 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8325 object->purgable != VM_PURGABLE_VOLATILE &&
8326 object->purgable != VM_PURGABLE_EMPTY &&
8327 object->copy == NULL &&
8328 size == object->vo_size &&
8329 offset == 0 &&
8330 object->shadow == NULL &&
8331 object->pager == NULL)
8332 {
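/*
 * A private, pager-less object covered exactly by this request can
 * take one of two fast paths: wire the pages in place if the object
 * is fully resident, or grab and wire fresh pages if it is empty.
 */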
8333 if (object->resident_page_count == size_in_pages)
8334 {
8335 assert(object != compressor_object);
8336 assert(object != kernel_object);
8337 fast_path_full_req = TRUE;
8338 }
8339 else if (object->resident_page_count == 0)
8340 {
8341 assert(object != compressor_object);
8342 assert(object != kernel_object);
8343 fast_path_empty_req = TRUE;
8344 set_cache_attr_needed = TRUE;
8345 }
8346 }
8347
8348 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8349 interruptible = THREAD_ABORTSAFE;
8350 else
8351 interruptible = THREAD_UNINT;
8352
8353 entry = 0;
8354
8355 xfer_size = size;
8356 dst_offset = offset;
8357 dw_count = 0;
8358
8359 if (fast_path_full_req) {
8360
8361 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags, tag) == TRUE)
8362 goto finish;
8363 /*
8364 * we couldn't complete the processing of this request on the fast path
8365 * so fall through to the slow path and finish up
8366 */
8367
8368 } else if (fast_path_empty_req) {
8369
8370 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8371 ret = KERN_MEMORY_ERROR;
8372 goto return_err;
8373 }
8374 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages);
8375
8376 if (ret) {
8377 free_wired_pages = TRUE;
8378 goto return_err;
8379 }
8380 goto finish;
8381 }
8382
8383 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8384 fault_info.user_tag = 0;
8385 fault_info.lo_offset = offset;
8386 fault_info.hi_offset = offset + xfer_size;
8387 fault_info.no_cache = FALSE;
8388 fault_info.stealth = FALSE;
8389 fault_info.io_sync = FALSE;
8390 fault_info.cs_bypass = FALSE;
8391 fault_info.mark_zf_absent = TRUE;
8392 fault_info.interruptible = interruptible;
8393 fault_info.batch_pmap_op = TRUE;
8394
8395 dwp = &dw_array[0];
8396 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8397
8398 while (xfer_size) {
8399 vm_fault_return_t result;
8400
8401 dwp->dw_mask = 0;
8402
8403 if (fast_path_full_req) {
8404 /*
8405 * if we get here, it means that we ran into a page
8406 * state we couldn't handle in the fast path and
8407 * bailed out to the slow path... since the order
8408 * we look at pages is different between the 2 paths,
8409 * the following check is needed to determine whether
8410 * this page was already processed in the fast path
8411 */
8412 if (lite_list[entry>>5] & (1 << (entry & 31)))
8413 goto skip_page;
8414 }
8415 dst_page = vm_page_lookup(object, dst_offset);
8416
8417 if (dst_page == VM_PAGE_NULL ||
8418 dst_page->busy ||
8419 dst_page->error ||
8420 dst_page->restart ||
8421 dst_page->absent ||
8422 dst_page->fictitious) {
8423
8424 if (object == kernel_object)
8425 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
8426 if (object == compressor_object)
8427 panic("vm_object_iopl_request: missing/bad page in compressor object\n");
8428
8429 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8430 ret = KERN_MEMORY_ERROR;
8431 goto return_err;
8432 }
8433 set_cache_attr_needed = TRUE;
8434
8435 /*
8436 * We just looked up the page and the result remains valid
8437 * until the object lock is released, so send it to
8438 * vm_fault_page() (as "dst_page"), to avoid having to
8439 * look it up again there.
8440 */
8441 caller_lookup = TRUE;
8442
8443 do {
8444 vm_page_t top_page;
8445 kern_return_t error_code;
8446
8447 fault_info.cluster_size = xfer_size;
8448
8449 vm_object_paging_begin(object);
8450
8451 result = vm_fault_page(object, dst_offset,
8452 prot | VM_PROT_WRITE, FALSE,
8453 caller_lookup,
8454 &prot, &dst_page, &top_page,
8455 (int *)0,
8456 &error_code, no_zero_fill,
8457 FALSE, &fault_info);
8458
8459 /* our lookup is no longer valid at this point */
8460 caller_lookup = FALSE;
8461
8462 switch (result) {
8463
8464 case VM_FAULT_SUCCESS:
8465
8466 if ( !dst_page->absent) {
8467 PAGE_WAKEUP_DONE(dst_page);
8468 } else {
8469 /*
8470 * we only get back an absent page if we
8471 * requested that it not be zero-filled
8472 * because we are about to fill it via I/O
8473 *
8474 * absent pages should be left BUSY
8475 * to prevent them from being faulted
8476 * into an address space before we've
8477 * had a chance to complete the I/O on
8478 * them since they may contain info that
8479 * shouldn't be seen by the faulting task
8480 */
8481 }
8482 /*
8483 * Release paging references and
8484 * top-level placeholder page, if any.
8485 */
8486 if (top_page != VM_PAGE_NULL) {
8487 vm_object_t local_object;
8488
8489 local_object = VM_PAGE_OBJECT(top_page);
8490
8491 /*
8492 * comparing 2 packed pointers
8493 */
8494 if (top_page->vm_page_object != dst_page->vm_page_object) {
8495 vm_object_lock(local_object);
8496 VM_PAGE_FREE(top_page);
8497 vm_object_paging_end(local_object);
8498 vm_object_unlock(local_object);
8499 } else {
8500 VM_PAGE_FREE(top_page);
8501 vm_object_paging_end(local_object);
8502 }
8503 }
8504 vm_object_paging_end(object);
8505 break;
8506
8507 case VM_FAULT_RETRY:
8508 vm_object_lock(object);
8509 break;
8510
8511 case VM_FAULT_MEMORY_SHORTAGE:
8512 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
8513
8514 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8515
8516 if (vm_page_wait(interruptible)) {
8517 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8518
8519 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8520 vm_object_lock(object);
8521
8522 break;
8523 }
8524 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
8525
8526 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8527
8528 /* fall thru */
8529
8530 case VM_FAULT_INTERRUPTED:
8531 error_code = MACH_SEND_INTERRUPTED;
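/* fall thru to the memory error case */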
8532 case VM_FAULT_MEMORY_ERROR:
8533 memory_error:
8534 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
8535
8536 vm_object_lock(object);
8537 goto return_err;
8538
8539 case VM_FAULT_SUCCESS_NO_VM_PAGE:
8540 /* success but no page: fail */
8541 vm_object_paging_end(object);
8542 vm_object_unlock(object);
8543 goto memory_error;
8544
8545 default:
8546 panic("vm_object_iopl_request: unexpected error"
8547 " 0x%x from vm_fault_page()\n", result);
8548 }
8549 } while (result != VM_FAULT_SUCCESS);
8550
8551 }
8552 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8553
8554 if (upl->flags & UPL_KERNEL_OBJECT)
8555 goto record_phys_addr;
8556
8557 if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
8558 dst_page->busy = TRUE;
8559 goto record_phys_addr;
8560 }
8561
8562 if (dst_page->cleaning) {
8563 /*
8564 * Someone else is cleaning this page in place.
8565 * In theory, we should be able to proceed and use this
8566 * page, but the cleaner will probably clear the "busy"
8567 * bit in upl_commit_range() even though it didn't set it,
8568 * which would clear our "busy" bit and open us to race
8569 * conditions.
8570 * We'd better wait for the cleaning to complete and
8571 * then try again.
8572 */
8573 vm_object_iopl_request_sleep_for_cleaning++;
8574 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
8575 continue;
8576 }
8577 if (dst_page->laundry)
8578 vm_pageout_steal_laundry(dst_page, FALSE);
8579
8580 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
8581 phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
8582 vm_page_t low_page;
8583 int refmod;
8584
8585 /*
8586 * support devices that can't DMA above 32 bits
8587 * by substituting pages from a pool of low address
8588 * memory for any pages we find above the 4G mark.
8589 * We can't substitute if the page is already wired, because
8590 * we don't know whether that physical address has been
8591 * handed out to some other 64-bit-capable DMA device to use.
8592 */
8593 if (VM_PAGE_WIRED(dst_page)) {
8594 ret = KERN_PROTECTION_FAILURE;
8595 goto return_err;
8596 }
8597 low_page = vm_page_grablo();
8598
8599 if (low_page == VM_PAGE_NULL) {
8600 ret = KERN_RESOURCE_SHORTAGE;
8601 goto return_err;
8602 }
8603 /*
8604 * from here until the vm_page_replace completes
8605 * we mustn't drop the object lock... we don't
8606 * want anyone refaulting this page in and using
8607 * it after we disconnect it... we want the fault
8608 * to find the new page being substituted.
8609 */
8610 if (dst_page->pmapped)
8611 refmod = pmap_disconnect(phys_page);
8612 else
8613 refmod = 0;
8614
8615 if (!dst_page->absent)
8616 vm_page_copy(dst_page, low_page);
8617
8618 low_page->reference = dst_page->reference;
8619 low_page->dirty = dst_page->dirty;
8620 low_page->absent = dst_page->absent;
8621
8622 if (refmod & VM_MEM_REFERENCED)
8623 low_page->reference = TRUE;
8624 if (refmod & VM_MEM_MODIFIED) {
8625 SET_PAGE_DIRTY(low_page, FALSE);
8626 }
8627
8628 vm_page_replace(low_page, object, dst_offset);
8629
8630 dst_page = low_page;
8631 /*
8632 * vm_page_grablo returned the page marked
8633 * BUSY... we don't need a PAGE_WAKEUP_DONE
8634 * here, because we've never dropped the object lock
8635 */
8636 if ( !dst_page->absent)
8637 dst_page->busy = FALSE;
8638
8639 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8640 }
8641 if ( !dst_page->busy)
8642 dwp->dw_mask |= DW_vm_page_wire;
8643
8644 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8645 /*
8646 * Mark the page "busy" to block any future page fault
8647 * on this page in addition to wiring it.
8648 * We'll also remove the mapping
8649 * of all these pages before leaving this routine.
8650 */
8651 assert(!dst_page->fictitious);
8652 dst_page->busy = TRUE;
8653 }
8654 /*
8655 * expect the page to be used
8656 * page queues lock must be held to set 'reference'
8657 */
8658 dwp->dw_mask |= DW_set_reference;
8659
8660 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8661 SET_PAGE_DIRTY(dst_page, TRUE);
8662 }
8663 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
8664 pmap_sync_page_attributes_phys(phys_page);
8665 dst_page->written_by_kernel = FALSE;
8666 }
8667
8668 record_phys_addr:
8669 if (dst_page->busy)
8670 upl->flags |= UPL_HAS_BUSY;
8671
8672 lite_list[entry>>5] |= 1 << (entry & 31);
8673
8674 if (phys_page > upl->highest_page)
8675 upl->highest_page = phys_page;
8676
8677 if (user_page_list) {
8678 user_page_list[entry].phys_addr = phys_page;
8679 user_page_list[entry].free_when_done = dst_page->free_when_done;
8680 user_page_list[entry].absent = dst_page->absent;
8681 user_page_list[entry].dirty = dst_page->dirty;
8682 user_page_list[entry].precious = dst_page->precious;
8683 user_page_list[entry].device = FALSE;
8684 user_page_list[entry].needed = FALSE;
8685 if (dst_page->clustered == TRUE)
8686 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
8687 else
8688 user_page_list[entry].speculative = FALSE;
8689 user_page_list[entry].cs_validated = dst_page->cs_validated;
8690 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
8691 user_page_list[entry].cs_nx = dst_page->cs_nx;
8692 user_page_list[entry].mark = FALSE;
8693 }
8694 if (object != kernel_object && object != compressor_object) {
8695 /*
8696 * someone is explicitly grabbing this page...
8697 * update clustered and speculative state
8698 *
8699 */
8700 if (dst_page->clustered)
8701 VM_PAGE_CONSUME_CLUSTERED(dst_page);
8702 }
8703 skip_page:
8704 entry++;
8705 dst_offset += PAGE_SIZE_64;
8706 xfer_size -= PAGE_SIZE;
8707
8708 if (dwp->dw_mask) {
8709 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
8710
8711 if (dw_count >= dw_limit) {
8712 vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
8713
8714 dwp = &dw_array[0];
8715 dw_count = 0;
8716 }
8717 }
8718 }
8719 assert(entry == size_in_pages);
8720
8721 if (dw_count)
8722 vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
8723 finish:
8724 if (user_page_list && set_cache_attr_needed == TRUE)
8725 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
8726
8727 if (page_list_count != NULL) {
8728 if (upl->flags & UPL_INTERNAL)
8729 *page_list_count = 0;
8730 else if (*page_list_count > size_in_pages)
8731 *page_list_count = size_in_pages;
8732 }
8733 vm_object_unlock(object);
8734
8735 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8736 /*
8737 * We've marked all the pages "busy" so that future
8738 * page faults will block.
8739 * Now remove the mapping for these pages, so that they
8740 * can't be accessed without causing a page fault.
8741 */
8742 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
8743 PMAP_NULL, 0, VM_PROT_NONE);
8744 assert(!object->blocked_access);
8745 object->blocked_access = TRUE;
8746 }
8747
8748 return KERN_SUCCESS;
8749
8750 return_err:
8751 dw_index = 0;
8752
8753 for (; offset < dst_offset; offset += PAGE_SIZE) {
8754 boolean_t need_unwire;
8755
8756 dst_page = vm_page_lookup(object, offset);
8757
8758 if (dst_page == VM_PAGE_NULL)
8759 panic("vm_object_iopl_request: Wired page missing. \n");
8760
8761 /*
8762 * if we've already processed this page in an earlier
8763 * dw_do_work, we need to undo the wiring... we will
8764 * leave the dirty and reference bits on if they
8765 * were set, since we don't have a good way of knowing
8766 * what the previous state was and we won't get here
8767 * under any normal circumstances... we will always
8768 * clear BUSY and wakeup any waiters via vm_page_free
8769 * or PAGE_WAKEUP_DONE
8770 */
8771 need_unwire = TRUE;
8772
8773 if (dw_count) {
8774 if (dw_array[dw_index].dw_m == dst_page) {
8775 /*
8776 * still in the deferred work list
8777 * which means we haven't yet called
8778 * vm_page_wire on this page
8779 */
8780 need_unwire = FALSE;
8781
8782 dw_index++;
8783 dw_count--;
8784 }
8785 }
8786 vm_page_lock_queues();
8787
8788 if (dst_page->absent || free_wired_pages == TRUE) {
8789 vm_page_free(dst_page);
8790
8791 need_unwire = FALSE;
8792 } else {
8793 if (need_unwire == TRUE)
8794 vm_page_unwire(dst_page, TRUE);
8795
8796 PAGE_WAKEUP_DONE(dst_page);
8797 }
8798 vm_page_unlock_queues();
8799
8800 if (need_unwire == TRUE)
8801 VM_STAT_INCR(reactivations);
8802 }
8803 #if UPL_DEBUG
8804 upl->upl_state = 2;
8805 #endif
8806 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
8807 vm_object_activity_end(object);
8808 vm_object_collapse(object, 0, TRUE);
8809 }
8810 vm_object_unlock(object);
8811 upl_destroy(upl);
8812
8813 return ret;
8814 }
8815
8816 kern_return_t
8817 upl_transpose(
8818 upl_t upl1,
8819 upl_t upl2)
8820 {
8821 kern_return_t retval;
8822 boolean_t upls_locked;
8823 vm_object_t object1, object2;
8824
8825 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
8826 return KERN_INVALID_ARGUMENT;
8827 }
8828
8829 upls_locked = FALSE;
8830
8831 /*
8832 * Since we need to lock both UPLs at the same time,
8833 * avoid deadlocks by always taking locks in the same order.
8834 */
8835 if (upl1 < upl2) {
8836 upl_lock(upl1);
8837 upl_lock(upl2);
8838 } else {
8839 upl_lock(upl2);
8840 upl_lock(upl1);
8841 }
8842 upls_locked = TRUE; /* the UPLs will need to be unlocked */
8843
8844 object1 = upl1->map_object;
8845 object2 = upl2->map_object;
8846
8847 if (upl1->offset != 0 || upl2->offset != 0 ||
8848 upl1->size != upl2->size) {
8849 /*
8850 * We deal only with full objects, not subsets.
8851 * That's because we exchange the entire backing store info
8852 * for the objects: pager, resident pages, etc... We can't do
8853 * only part of it.
8854 */
8855 retval = KERN_INVALID_VALUE;
8856 goto done;
8857 }
8858
8859 /*
8860 * Transpose the VM objects' backing store.
8861 */
8862 retval = vm_object_transpose(object1, object2,
8863 (vm_object_size_t) upl1->size);
8864
8865 if (retval == KERN_SUCCESS) {
8866 /*
8867 * Make each UPL point to the correct VM object, i.e. the
8868 * object holding the pages that the UPL refers to...
8869 */
8870 #if CONFIG_IOSCHED || UPL_DEBUG
8871 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8872 vm_object_lock(object1);
8873 vm_object_lock(object2);
8874 }
8875 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
8876 queue_remove(&object1->uplq, upl1, upl_t, uplq);
8877 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
8878 queue_remove(&object2->uplq, upl2, upl_t, uplq);
8879 #endif
8880 upl1->map_object = object2;
8881 upl2->map_object = object1;
8882
8883 #if CONFIG_IOSCHED || UPL_DEBUG
8884 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
8885 queue_enter(&object2->uplq, upl1, upl_t, uplq);
8886 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
8887 queue_enter(&object1->uplq, upl2, upl_t, uplq);
8888 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
8889 vm_object_unlock(object2);
8890 vm_object_unlock(object1);
8891 }
8892 #endif
8893 }
8894
8895 done:
8896 /*
8897 * Cleanup.
8898 */
8899 if (upls_locked) {
8900 upl_unlock(upl1);
8901 upl_unlock(upl2);
8902 upls_locked = FALSE;
8903 }
8904
8905 return retval;
8906 }
8907
8908 void
8909 upl_range_needed(
8910 upl_t upl,
8911 int index,
8912 int count)
8913 {
8914 upl_page_info_t *user_page_list;
8915 int size_in_pages;
8916
8917 if ( !(upl->flags & UPL_INTERNAL) || count <= 0)
8918 return;
8919
8920 size_in_pages = upl->size / PAGE_SIZE;
8921
8922 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8923
8924 while (count-- && index < size_in_pages)
8925 user_page_list[index++].needed = TRUE;
8926 }
8927
8928
8929 /*
8930 * Reserve of virtual addresses in the kernel address space.
8931 * We need to map the physical pages in the kernel, so that we
8932 * can call the code-signing or slide routines with a kernel
8933 * virtual address. We keep this pool of pre-allocated kernel
8934 * virtual addresses so that we don't have to scan the kernel's
8935 * virtual address space each time we need to work with
8936 * a physical page.
8937 */
8938 decl_simple_lock_data(,vm_paging_lock)
8939 #define VM_PAGING_NUM_PAGES 64
8940 vm_map_offset_t vm_paging_base_address = 0;
8941 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
8942 int vm_paging_max_index = 0;
8943 int vm_paging_page_waiter = 0;
8944 int vm_paging_page_waiter_total = 0;
8945 unsigned long vm_paging_no_kernel_page = 0;
8946 unsigned long vm_paging_objects_mapped = 0;
8947 unsigned long vm_paging_pages_mapped = 0;
8948 unsigned long vm_paging_objects_mapped_slow = 0;
8949 unsigned long vm_paging_pages_mapped_slow = 0;
8950
8951 void
8952 vm_paging_map_init(void)
8953 {
8954 kern_return_t kr;
8955 vm_map_offset_t page_map_offset;
8956 vm_map_entry_t map_entry;
8957
8958 assert(vm_paging_base_address == 0);
8959
8960 /*
8961 * Initialize our pool of pre-allocated kernel
8962 * virtual addresses.
8963 */
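/*
 * The pool is a single permanent, VM_PROT_NONE reservation against
 * kernel_object; individual physical pages are entered into it
 * later with PMAP_ENTER as callers need them.
 */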
8964 page_map_offset = 0;
8965 kr = vm_map_find_space(kernel_map,
8966 &page_map_offset,
8967 VM_PAGING_NUM_PAGES * PAGE_SIZE,
8968 0,
8969 0,
8970 VM_MAP_KERNEL_FLAGS_NONE,
8971 VM_KERN_MEMORY_NONE,
8972 &map_entry);
8973 if (kr != KERN_SUCCESS) {
8974 panic("vm_paging_map_init: kernel_map full\n");
8975 }
8976 VME_OBJECT_SET(map_entry, kernel_object);
8977 VME_OFFSET_SET(map_entry, page_map_offset);
8978 map_entry->protection = VM_PROT_NONE;
8979 map_entry->max_protection = VM_PROT_NONE;
8980 map_entry->permanent = TRUE;
8981 vm_object_reference(kernel_object);
8982 vm_map_unlock(kernel_map);
8983
8984 assert(vm_paging_base_address == 0);
8985 vm_paging_base_address = page_map_offset;
8986 }
8987
8988 /*
8989 * vm_paging_map_object:
8990 * Maps part of a VM object's pages in the kernel
8991 * virtual address space, using the pre-allocated
8992 * kernel virtual addresses, if possible.
8993 * Context:
8994 * The VM object is locked. This lock will get
8995 * dropped and re-acquired though, so the caller
8996 * must make sure the VM object is kept alive
8997 * (by holding a VM map that has a reference
8998 * on it, for example, or taking an extra reference).
8999 * The page should also be kept busy to prevent
9000 * it from being reclaimed.
9001 */
9002 kern_return_t
9003 vm_paging_map_object(
9004 vm_page_t page,
9005 vm_object_t object,
9006 vm_object_offset_t offset,
9007 vm_prot_t protection,
9008 boolean_t can_unlock_object,
9009 vm_map_size_t *size, /* IN/OUT */
9010 vm_map_offset_t *address, /* OUT */
9011 boolean_t *need_unmap) /* OUT */
9012 {
9013 kern_return_t kr;
9014 vm_map_offset_t page_map_offset;
9015 vm_map_size_t map_size;
9016 vm_object_offset_t object_offset;
9017 int i;
9018
9019 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9020 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9021 #if __x86_64__
9022 *address = (vm_map_offset_t)
9023 PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
9024 PAGE_SHIFT);
9025 *need_unmap = FALSE;
9026 return KERN_SUCCESS;
9027 #elif __arm__ || __arm64__
9028 *address = (vm_map_offset_t)
9029 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
9030 *need_unmap = FALSE;
9031 return KERN_SUCCESS;
9032 #else
9033 #warning "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
9034 #endif
9035
9036 assert(page->busy);
9037 /*
9038 * Use one of the pre-allocated kernel virtual addresses
9039 * and just enter the VM page in the kernel address space
9040 * at that virtual address.
9041 */
9042 simple_lock(&vm_paging_lock);
9043
9044 /*
9045 * Try and find an available kernel virtual address
9046 * from our pre-allocated pool.
9047 */
9048 page_map_offset = 0;
9049 for (;;) {
9050 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9051 if (vm_paging_page_inuse[i] == FALSE) {
9052 page_map_offset =
9053 vm_paging_base_address +
9054 (i * PAGE_SIZE);
9055 break;
9056 }
9057 }
9058 if (page_map_offset != 0) {
9059 /* found a space to map our page ! */
9060 break;
9061 }
9062
9063 if (can_unlock_object) {
9064 /*
9065 * If we can afford to unlock the VM object,
9066 * let's take the slow path now...
9067 */
9068 break;
9069 }
9070 /*
9071 * We can't afford to unlock the VM object, so
9072 * let's wait for a space to become available...
9073 */
9074 vm_paging_page_waiter_total++;
9075 vm_paging_page_waiter++;
9076 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9077 if (kr == THREAD_WAITING) {
9078 simple_unlock(&vm_paging_lock);
9079 kr = thread_block(THREAD_CONTINUE_NULL);
9080 simple_lock(&vm_paging_lock);
9081 }
9082 vm_paging_page_waiter--;
9083 /* ... and try again */
9084 }
9085
9086 if (page_map_offset != 0) {
9087 /*
9088 * We found a kernel virtual address;
9089 * map the physical page to that virtual address.
9090 */
9091 if (i > vm_paging_max_index) {
9092 vm_paging_max_index = i;
9093 }
9094 vm_paging_page_inuse[i] = TRUE;
9095 simple_unlock(&vm_paging_lock);
9096
9097 page->pmapped = TRUE;
9098
9099 /*
9100 * Keep the VM object locked over the PMAP_ENTER
9101 * and the actual use of the page by the kernel,
9102 * or this pmap mapping might get undone by a
9103 * vm_object_pmap_protect() call...
9104 */
9105 PMAP_ENTER(kernel_pmap,
9106 page_map_offset,
9107 page,
9108 protection,
9109 VM_PROT_NONE,
9110 0,
9111 TRUE,
9112 kr);
9113 assert(kr == KERN_SUCCESS);
9114 vm_paging_objects_mapped++;
9115 vm_paging_pages_mapped++;
9116 *address = page_map_offset;
9117 *need_unmap = TRUE;
9118
9119 #if KASAN
9120 kasan_notify_address(page_map_offset, PAGE_SIZE);
9121 #endif
9122
9123 /* all done and mapped, ready to use ! */
9124 return KERN_SUCCESS;
9125 }
9126
9127 /*
9128 * We ran out of pre-allocated kernel virtual
9129 * addresses. Just map the page in the kernel
9130 * the slow and regular way.
9131 */
9132 vm_paging_no_kernel_page++;
9133 simple_unlock(&vm_paging_lock);
9134 }
9135
9136 if (! can_unlock_object) {
9137 *address = 0;
9138 *size = 0;
9139 *need_unmap = FALSE;
9140 return KERN_NOT_SUPPORTED;
9141 }
9142
9143 object_offset = vm_object_trunc_page(offset);
9144 map_size = vm_map_round_page(*size,
9145 VM_MAP_PAGE_MASK(kernel_map));
9146
9147 /*
9148 * Try and map the required range of the object
9149 * in the kernel_map
9150 */
9151
9152 vm_object_reference_locked(object); /* for the map entry */
9153 vm_object_unlock(object);
9154
9155 kr = vm_map_enter(kernel_map,
9156 address,
9157 map_size,
9158 0,
9159 VM_FLAGS_ANYWHERE,
9160 VM_MAP_KERNEL_FLAGS_NONE,
9161 VM_KERN_MEMORY_NONE,
9162 object,
9163 object_offset,
9164 FALSE,
9165 protection,
9166 VM_PROT_ALL,
9167 VM_INHERIT_NONE);
9168 if (kr != KERN_SUCCESS) {
9169 *address = 0;
9170 *size = 0;
9171 *need_unmap = FALSE;
9172 vm_object_deallocate(object); /* for the map entry */
9173 vm_object_lock(object);
9174 return kr;
9175 }
9176
9177 *size = map_size;
9178
9179 /*
9180 * Enter the mapped pages in the page table now.
9181 */
9182 vm_object_lock(object);
9183 /*
9184 * VM object must be kept locked from before PMAP_ENTER()
9185 * until after the kernel is done accessing the page(s).
9186 * Otherwise, the pmap mappings in the kernel could be
9187 * undone by a call to vm_object_pmap_protect().
9188 */
9189
9190 for (page_map_offset = 0;
9191 map_size != 0;
9192 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9193
9194 page = vm_page_lookup(object, offset + page_map_offset);
9195 if (page == VM_PAGE_NULL) {
9196 printf("vm_paging_map_object: no page !?");
9197 vm_object_unlock(object);
9198 kr = vm_map_remove(kernel_map, *address, *size,
9199 VM_MAP_NO_FLAGS);
9200 assert(kr == KERN_SUCCESS);
9201 *address = 0;
9202 *size = 0;
9203 *need_unmap = FALSE;
9204 vm_object_lock(object);
9205 return KERN_MEMORY_ERROR;
9206 }
9207 page->pmapped = TRUE;
9208
9209 //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
9210 PMAP_ENTER(kernel_pmap,
9211 *address + page_map_offset,
9212 page,
9213 protection,
9214 VM_PROT_NONE,
9215 0,
9216 TRUE,
9217 kr);
9218 assert(kr == KERN_SUCCESS);
9219 #if KASAN
9220 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
9221 #endif
9222 }
9223
9224 vm_paging_objects_mapped_slow++;
9225 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9226
9227 *need_unmap = TRUE;
9228
9229 return KERN_SUCCESS;
9230 }
9231
9232 /*
9233 * vm_paging_unmap_object:
9234 * Unmaps part of a VM object's pages from the kernel
9235 * virtual address space.
9236 * Context:
9237 * The VM object is locked. This lock will get
9238 * dropped and re-acquired though.
9239 */
9240 void
9241 vm_paging_unmap_object(
9242 vm_object_t object,
9243 vm_map_offset_t start,
9244 vm_map_offset_t end)
9245 {
9246 kern_return_t kr;
9247 int i;
9248
9249 if ((vm_paging_base_address == 0) ||
9250 (start < vm_paging_base_address) ||
9251 (end > (vm_paging_base_address
9252 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9253 /*
9254 * We didn't use our pre-allocated pool of
9255 * kernel virtual addresses. Deallocate the
9256 * virtual memory.
9257 */
9258 if (object != VM_OBJECT_NULL) {
9259 vm_object_unlock(object);
9260 }
9261 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
9262 if (object != VM_OBJECT_NULL) {
9263 vm_object_lock(object);
9264 }
9265 assert(kr == KERN_SUCCESS);
9266 } else {
9267 /*
9268 * We used a kernel virtual address from our
9269 * pre-allocated pool. Put it back in the pool
9270 * for next time.
9271 */
9272 assert(end - start == PAGE_SIZE);
9273 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9274 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9275
9276 /* undo the pmap mapping */
9277 pmap_remove(kernel_pmap, start, end);
9278
9279 simple_lock(&vm_paging_lock);
9280 vm_paging_page_inuse[i] = FALSE;
9281 if (vm_paging_page_waiter) {
9282 thread_wakeup(&vm_paging_page_waiter);
9283 }
9284 simple_unlock(&vm_paging_lock);
9285 }
9286 }
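
/*
 * Illustrative sketch (editorial addition, not part of the kernel sources
 * and not compiled): a hypothetical caller pairing vm_paging_map_object()
 * with vm_paging_unmap_object(), modeled on the usage in vm_page_slide()
 * later in this file.  The caller is assumed to hold the object lock and a
 * paging-in-progress reference; the function name and the VM_PROT_READ
 * protection are illustrative assumptions.
 */
#if 0
static kern_return_t
example_touch_page_contents(vm_page_t page, vm_object_t object)
{
	kern_return_t	kr;
	vm_map_size_t	map_size = PAGE_SIZE;
	vm_map_offset_t	kernel_addr = 0;
	boolean_t	needs_unmap = FALSE;

	/* map this one page into the kernel's address space */
	kr = vm_paging_map_object(page,
				  object,
				  page->offset,
				  VM_PROT_READ,
				  FALSE,	/* can_unlock_object */
				  &map_size,
				  &kernel_addr,
				  &needs_unmap);
	if (kr != KERN_SUCCESS) {
		/* e.g. KERN_NOT_SUPPORTED when the pool is exhausted
		 * and we told it not to unlock the object */
		return kr;
	}

	/* ... read the page's contents through kernel_addr here ... */

	/* tear the mapping down if one was established on our behalf */
	if (needs_unmap) {
		vm_paging_unmap_object(object,
				       kernel_addr,
				       kernel_addr + PAGE_SIZE);
	}
	return kr;
}
#endif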
9287
9288
9289 /*
9290 * page->object must be locked
9291 */
9292 void
9293 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
9294 {
9295 if (!queues_locked) {
9296 vm_page_lockspin_queues();
9297 }
9298
9299 page->free_when_done = FALSE;
9300 /*
9301 * We need to drop the laundry count and we may
9302 * also need to remove the page from the I/O
9303 * paging queue.  vm_pageout_throttle_up() handles
9304 * both cases and clears the laundry and
9305 * pageout_queue flags.
9306 *
9307 */
9308 vm_pageout_throttle_up(page);
9309
9310 vm_page_steal_pageout_page++;
9311
9312 if (!queues_locked) {
9313 vm_page_unlock_queues();
9314 }
9315 }
9316
9317 upl_t
9318 vector_upl_create(vm_offset_t upl_offset)
9319 {
9320 int vector_upl_size = sizeof(struct _vector_upl);
9321 int i=0;
9322 upl_t upl;
9323 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
9324
9325 upl = upl_create(0,UPL_VECTOR,0);
9326 upl->vector_upl = vector_upl;
9327 upl->offset = upl_offset;
9328 vector_upl->size = 0;
9329 vector_upl->offset = upl_offset;
9330 vector_upl->invalid_upls=0;
9331 vector_upl->num_upls=0;
9332 vector_upl->pagelist = NULL;
9333
9334 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
9335 vector_upl->upl_iostates[i].size = 0;
9336 vector_upl->upl_iostates[i].offset = 0;
9337
9338 }
9339 return upl;
9340 }
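
/*
 * Illustrative sketch (editorial addition, not part of the kernel sources
 * and not compiled): a hypothetical helper that aggregates two already
 * created sub-UPLs into a vector UPL using the routines in this file.
 * The sub-UPLs are assumed to carry internal page lists; their creation,
 * eventual commit/abort and the final vector_upl_deallocate() happen
 * elsewhere.  The function name, the offset-0 anchor and the io_size
 * parameters are illustrative assumptions.
 */
#if 0
static upl_t
example_build_vector_upl(upl_t sub0, upl_size_t io_size0,
			 upl_t sub1, upl_size_t io_size1)
{
	/* create an empty vector UPL anchored at offset 0 */
	upl_t vupl = vector_upl_create(0);

	/* append each sub-UPL; the vector's size grows by each io_size */
	vector_upl_set_subupl(vupl, sub0, io_size0);
	vector_upl_set_subupl(vupl, sub1, io_size1);

	/* record where each sub-UPL's I/O lands within the vector */
	vector_upl_set_iostate(vupl, sub0, 0, io_size0);
	vector_upl_set_iostate(vupl, sub1, io_size0, io_size1);

	/* build the aggregate page list from the sub-UPLs' page lists */
	vector_upl_set_pagelist(vupl);

	return vupl;
}
#endif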
9341
9342 void
9343 vector_upl_deallocate(upl_t upl)
9344 {
9345 if(upl) {
9346 vector_upl_t vector_upl = upl->vector_upl;
9347 if(vector_upl) {
9348 if(vector_upl->invalid_upls != vector_upl->num_upls)
9349 panic("Deallocating non-empty Vectored UPL\n");
9350 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
9351 vector_upl->invalid_upls=0;
9352 vector_upl->num_upls = 0;
9353 vector_upl->pagelist = NULL;
9354 vector_upl->size = 0;
9355 vector_upl->offset = 0;
9356 kfree(vector_upl, sizeof(struct _vector_upl));
9357 vector_upl = (vector_upl_t)0xfeedfeed;
9358 }
9359 else
9360 panic("vector_upl_deallocate was passed a non-vectored upl\n");
9361 }
9362 else
9363 panic("vector_upl_deallocate was passed a NULL upl\n");
9364 }
9365
9366 boolean_t
9367 vector_upl_is_valid(upl_t upl)
9368 {
9369 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
9370 vector_upl_t vector_upl = upl->vector_upl;
9371 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef)
9372 return FALSE;
9373 else
9374 return TRUE;
9375 }
9376 return FALSE;
9377 }
9378
9379 boolean_t
9380 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
9381 {
9382 if(vector_upl_is_valid(upl)) {
9383 vector_upl_t vector_upl = upl->vector_upl;
9384
9385 if(vector_upl) {
9386 if(subupl) {
9387 if(io_size) {
9388 if(io_size < PAGE_SIZE)
9389 io_size = PAGE_SIZE;
9390 subupl->vector_upl = (void*)vector_upl;
9391 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
9392 vector_upl->size += io_size;
9393 upl->size += io_size;
9394 }
9395 else {
9396 uint32_t i=0,invalid_upls=0;
9397 for(i = 0; i < vector_upl->num_upls; i++) {
9398 if(vector_upl->upl_elems[i] == subupl)
9399 break;
9400 }
9401 if(i == vector_upl->num_upls)
9402 panic("Trying to remove sub-upl when none exists");
9403
9404 vector_upl->upl_elems[i] = NULL;
9405 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
9406 if(invalid_upls == vector_upl->num_upls)
9407 return TRUE;
9408 else
9409 return FALSE;
9410 }
9411 }
9412 else
9413 panic("vector_upl_set_subupl was passed a NULL upl element\n");
9414 }
9415 else
9416 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
9417 }
9418 else
9419 panic("vector_upl_set_subupl was passed a NULL upl\n");
9420
9421 return FALSE;
9422 }
9423
9424 void
9425 vector_upl_set_pagelist(upl_t upl)
9426 {
9427 if(vector_upl_is_valid(upl)) {
9428 uint32_t i=0;
9429 vector_upl_t vector_upl = upl->vector_upl;
9430
9431 if(vector_upl) {
9432 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
9433
9434 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
9435
9436 for(i=0; i < vector_upl->num_upls; i++) {
9437 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
9438 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
9439 pagelist_size += cur_upl_pagelist_size;
9440 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
9441 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
9442 }
9443 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
9444 }
9445 else
9446 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
9447 }
9448 else
9449 panic("vector_upl_set_pagelist was passed a NULL upl\n");
9450
9451 }
9452
9453 upl_t
9454 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
9455 {
9456 if(vector_upl_is_valid(upl)) {
9457 vector_upl_t vector_upl = upl->vector_upl;
9458 if(vector_upl) {
9459 if(index < vector_upl->num_upls)
9460 return vector_upl->upl_elems[index];
9461 }
9462 else
9463 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
9464 }
9465 return NULL;
9466 }
9467
9468 upl_t
9469 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
9470 {
9471 if(vector_upl_is_valid(upl)) {
9472 uint32_t i=0;
9473 vector_upl_t vector_upl = upl->vector_upl;
9474
9475 if(vector_upl) {
9476 upl_t subupl = NULL;
9477 vector_upl_iostates_t subupl_state;
9478
9479 for(i=0; i < vector_upl->num_upls; i++) {
9480 subupl = vector_upl->upl_elems[i];
9481 subupl_state = vector_upl->upl_iostates[i];
9482 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
9483 /* We could have been passed an offset/size pair that belongs
9484 * to a UPL element that has already been committed/aborted.
9485 * If so, return NULL.
9486 */
9487 if(subupl == NULL)
9488 return NULL;
9489 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
9490 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
9491 if(*upl_size > subupl_state.size)
9492 *upl_size = subupl_state.size;
9493 }
9494 if(*upl_offset >= subupl_state.offset)
9495 *upl_offset -= subupl_state.offset;
9496 else if(i)
9497 panic("Vector UPL offset miscalculation\n");
9498 return subupl;
9499 }
9500 }
9501 }
9502 else
9503 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
9504 }
9505 return NULL;
9506 }
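
/*
 * Worked example of the lookup above (sizes are hypothetical): with two
 * sub-UPLs whose iostates are (offset 0, size 64KB) and (offset 64KB,
 * size 64KB), a query for *upl_offset = 80KB, *upl_size = 32KB skips the
 * first element, matches the second, rewrites *upl_offset to 16KB
 * (80KB - 64KB), leaves *upl_size at 32KB and returns the second sub-UPL.
 * A query that runs past the matching element's end has *upl_size clipped
 * to the bytes that element actually covers.
 */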
9507
9508 void
9509 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
9510 {
9511 *v_upl_submap = NULL;
9512
9513 if(vector_upl_is_valid(upl)) {
9514 vector_upl_t vector_upl = upl->vector_upl;
9515 if(vector_upl) {
9516 *v_upl_submap = vector_upl->submap;
9517 *submap_dst_addr = vector_upl->submap_dst_addr;
9518 }
9519 else
9520 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
9521 }
9522 else
9523 panic("vector_upl_get_submap was passed a null UPL\n");
9524 }
9525
9526 void
9527 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
9528 {
9529 if(vector_upl_is_valid(upl)) {
9530 vector_upl_t vector_upl = upl->vector_upl;
9531 if(vector_upl) {
9532 vector_upl->submap = submap;
9533 vector_upl->submap_dst_addr = submap_dst_addr;
9534 }
9535 else
9536 panic("vector_upl_set_submap was passed a non-vectored UPL\n");
9537 }
9538 else
9539 panic("vector_upl_set_submap was passed a NULL UPL\n");
9540 }
9541
9542 void
9543 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
9544 {
9545 if(vector_upl_is_valid(upl)) {
9546 uint32_t i = 0;
9547 vector_upl_t vector_upl = upl->vector_upl;
9548
9549 if(vector_upl) {
9550 for(i = 0; i < vector_upl->num_upls; i++) {
9551 if(vector_upl->upl_elems[i] == subupl)
9552 break;
9553 }
9554
9555 if(i == vector_upl->num_upls)
9556 panic("setting sub-upl iostate when none exists");
9557
9558 vector_upl->upl_iostates[i].offset = offset;
9559 if(size < PAGE_SIZE)
9560 size = PAGE_SIZE;
9561 vector_upl->upl_iostates[i].size = size;
9562 }
9563 else
9564 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
9565 }
9566 else
9567 panic("vector_upl_set_iostate was passed a NULL UPL\n");
9568 }
9569
9570 void
9571 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
9572 {
9573 if(vector_upl_is_valid(upl)) {
9574 uint32_t i = 0;
9575 vector_upl_t vector_upl = upl->vector_upl;
9576
9577 if(vector_upl) {
9578 for(i = 0; i < vector_upl->num_upls; i++) {
9579 if(vector_upl->upl_elems[i] == subupl)
9580 break;
9581 }
9582
9583 if(i == vector_upl->num_upls)
9584 panic("getting sub-upl iostate when none exists");
9585
9586 *offset = vector_upl->upl_iostates[i].offset;
9587 *size = vector_upl->upl_iostates[i].size;
9588 }
9589 else
9590 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
9591 }
9592 else
9593 panic("vector_upl_get_iostate was passed a NULL UPL\n");
9594 }
9595
9596 void
9597 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
9598 {
9599 if(vector_upl_is_valid(upl)) {
9600 vector_upl_t vector_upl = upl->vector_upl;
9601 if(vector_upl) {
9602 if(index < vector_upl->num_upls) {
9603 *offset = vector_upl->upl_iostates[index].offset;
9604 *size = vector_upl->upl_iostates[index].size;
9605 }
9606 else
9607 *offset = *size = 0;
9608 }
9609 else
9610 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
9611 }
9612 else
9613 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
9614 }
9615
9616 upl_page_info_t *
9617 upl_get_internal_vectorupl_pagelist(upl_t upl)
9618 {
9619 return ((vector_upl_t)(upl->vector_upl))->pagelist;
9620 }
9621
9622 void *
9623 upl_get_internal_vectorupl(upl_t upl)
9624 {
9625 return upl->vector_upl;
9626 }
9627
9628 vm_size_t
9629 upl_get_internal_pagelist_offset(void)
9630 {
9631 return sizeof(struct upl);
9632 }
9633
9634 void
9635 upl_clear_dirty(
9636 upl_t upl,
9637 boolean_t value)
9638 {
9639 if (value) {
9640 upl->flags |= UPL_CLEAR_DIRTY;
9641 } else {
9642 upl->flags &= ~UPL_CLEAR_DIRTY;
9643 }
9644 }
9645
9646 void
9647 upl_set_referenced(
9648 upl_t upl,
9649 boolean_t value)
9650 {
9651 upl_lock(upl);
9652 if (value) {
9653 upl->ext_ref_count++;
9654 } else {
9655 if (!upl->ext_ref_count) {
9656 panic("upl_set_referenced: %p has no external references to release\n", upl);
9657 }
9658 upl->ext_ref_count--;
9659 }
9660 upl_unlock(upl);
9661 }
9662
9663 #if CONFIG_IOSCHED
9664 void
9665 upl_set_blkno(
9666 upl_t upl,
9667 vm_offset_t upl_offset,
9668 int io_size,
9669 int64_t blkno)
9670 {
9671 int i,j;
9672 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
9673 return;
9674
9675 assert(upl->upl_reprio_info != 0);
9676 for(i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
9677 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
9678 }
9679 }
9680 #endif
9681
9682 boolean_t
9683 vm_page_is_slideable(vm_page_t m)
9684 {
9685 boolean_t result = FALSE;
9686 vm_shared_region_slide_info_t si;
9687 vm_object_t m_object;
9688
9689 m_object = VM_PAGE_OBJECT(m);
9690
9691 vm_object_lock_assert_held(m_object);
9692
9693 /* make sure our page belongs to the one object allowed to do this */
9694 if (!m_object->object_slid) {
9695 goto done;
9696 }
9697
9698 si = m_object->vo_slide_info;
9699 if (si == NULL) {
9700 goto done;
9701 }
9702
9703 if(!m->slid && (si->start <= m->offset && si->end > m->offset)) {
9704 result = TRUE;
9705 }
9706
9707 done:
9708 return result;
9709 }
9710
9711 int vm_page_slide_counter = 0;
9712 int vm_page_slide_errors = 0;
9713 kern_return_t
9714 vm_page_slide(
9715 vm_page_t page,
9716 vm_map_offset_t kernel_mapping_offset)
9717 {
9718 kern_return_t kr;
9719 vm_map_size_t kernel_mapping_size;
9720 boolean_t kernel_mapping_needs_unmap;
9721 vm_offset_t kernel_vaddr;
9722 uint32_t pageIndex;
9723 uint32_t slide_chunk;
9724 vm_object_t page_object;
9725
9726 page_object = VM_PAGE_OBJECT(page);
9727
9728 assert(!page->slid);
9729 assert(page_object->object_slid);
9730 vm_object_lock_assert_exclusive(page_object);
9731
9732 if (page->error)
9733 return KERN_FAILURE;
9734
9735 /*
9736 * Take a paging-in-progress reference to keep the object
9737 * alive even if we have to unlock it (in vm_paging_map_object()
9738 * for example)...
9739 */
9740 vm_object_paging_begin(page_object);
9741
9742 if (kernel_mapping_offset == 0) {
9743 /*
9744 * The page hasn't already been mapped in kernel space
9745 * by the caller. Map it now, so that we can access
9746 * its contents and slide them.
9747 */
9748 kernel_mapping_size = PAGE_SIZE;
9749 kernel_mapping_needs_unmap = FALSE;
9750 kr = vm_paging_map_object(page,
9751 page_object,
9752 page->offset,
9753 VM_PROT_READ | VM_PROT_WRITE,
9754 FALSE,
9755 &kernel_mapping_size,
9756 &kernel_mapping_offset,
9757 &kernel_mapping_needs_unmap);
9758 if (kr != KERN_SUCCESS) {
9759 panic("vm_page_slide: "
9760 "could not map page in kernel: 0x%x\n",
9761 kr);
9762 }
9763 } else {
9764 kernel_mapping_size = 0;
9765 kernel_mapping_needs_unmap = FALSE;
9766 }
9767 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
9768
9769 /*
9770 * Slide the pointers on the page.
9771 */
9772
9773 /* assert that slide_file_info.start/end are page-aligned? */
9774
9775 assert(!page->slid);
9776 assert(page_object->object_slid);
9777
9778 pageIndex = (uint32_t)((page->offset -
9779 page_object->vo_slide_info->start) /
9780 PAGE_SIZE_FOR_SR_SLIDE);
9781 for (slide_chunk = 0;
9782 slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE;
9783 slide_chunk++) {
9784 kr = vm_shared_region_slide_page(page_object->vo_slide_info,
9785 (kernel_vaddr +
9786 (slide_chunk *
9787 PAGE_SIZE_FOR_SR_SLIDE)),
9788 (pageIndex + slide_chunk));
9789 if (kr != KERN_SUCCESS) {
9790 break;
9791 }
9792 }
9793
9794 vm_page_slide_counter++;
9795
9796 /*
9797 * Unmap the page from the kernel's address space.
9798 */
9799 if (kernel_mapping_needs_unmap) {
9800 vm_paging_unmap_object(page_object,
9801 kernel_vaddr,
9802 kernel_vaddr + PAGE_SIZE);
9803 }
9804
9805 page->dirty = FALSE;
9806 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
9807
9808 if (kr != KERN_SUCCESS || cs_debug > 1) {
9809 printf("vm_page_slide(%p): "
9810 "obj %p off 0x%llx mobj %p moff 0x%llx\n",
9811 page,
9812 page_object, page->offset,
9813 page_object->pager,
9814 page->offset + page_object->paging_offset);
9815 }
9816
9817 if (kr == KERN_SUCCESS) {
9818 page->slid = TRUE;
9819 } else {
9820 page->error = TRUE;
9821 vm_page_slide_errors++;
9822 }
9823
9824 vm_object_paging_end(page_object);
9825
9826 return kr;
9827 }
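
/*
 * Worked example of the chunking above (assuming a 4KB
 * PAGE_SIZE_FOR_SR_SLIDE and a 16KB kernel PAGE_SIZE; both values are
 * configuration-dependent): a page whose offset is 0x8000 bytes past
 * vo_slide_info->start gets pageIndex 8 and is slid in four 4KB chunks,
 * i.e. vm_shared_region_slide_page() is called for sub-page indices
 * 8 through 11.
 */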
9828
9829 void inline memoryshot(unsigned int event, unsigned int control)
9830 {
9831 if (vm_debug_events) {
9832 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
9833 vm_page_active_count, vm_page_inactive_count,
9834 vm_page_free_count, vm_page_speculative_count,
9835 vm_page_throttled_count);
9836 } else {
9837 (void) event;
9838 (void) control;
9839 }
9840
9841 }
9842
9843 #ifdef MACH_BSD
9844
9845 boolean_t upl_device_page(upl_page_info_t *upl)
9846 {
9847 return(UPL_DEVICE_PAGE(upl));
9848 }
9849 boolean_t upl_page_present(upl_page_info_t *upl, int index)
9850 {
9851 return(UPL_PAGE_PRESENT(upl, index));
9852 }
9853 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
9854 {
9855 return(UPL_SPECULATIVE_PAGE(upl, index));
9856 }
9857 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
9858 {
9859 return(UPL_DIRTY_PAGE(upl, index));
9860 }
9861 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
9862 {
9863 return(UPL_VALID_PAGE(upl, index));
9864 }
9865 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
9866 {
9867 return(UPL_PHYS_PAGE(upl, index));
9868 }
9869
9870 void upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
9871 {
9872 upl[index].mark = v;
9873 }
9874
9875 boolean_t upl_page_get_mark(upl_page_info_t *upl, int index)
9876 {
9877 return upl[index].mark;
9878 }
9879
9880 void
9881 vm_countdirtypages(void)
9882 {
9883 vm_page_t m;
9884 int dpages;
9885 int pgopages;
9886 int precpages;
9887
9888
9889 dpages=0;
9890 pgopages=0;
9891 precpages=0;
9892
9893 vm_page_lock_queues();
9894 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
9895 do {
9896 if (m ==(vm_page_t )0) break;
9897
9898 if(m->dirty) dpages++;
9899 if(m->free_when_done) pgopages++;
9900 if(m->precious) precpages++;
9901
9902 assert(VM_PAGE_OBJECT(m) != kernel_object);
9903 m = (vm_page_t) vm_page_queue_next(&m->pageq);
9904 if (m ==(vm_page_t )0) break;
9905
9906 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
9907 vm_page_unlock_queues();
9908
9909 vm_page_lock_queues();
9910 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
9911 do {
9912 if (m ==(vm_page_t )0) break;
9913
9914 dpages++;
9915 assert(m->dirty);
9916 assert(!m->free_when_done);
9917 assert(VM_PAGE_OBJECT(m) != kernel_object);
9918 m = (vm_page_t) vm_page_queue_next(&m->pageq);
9919 if (m ==(vm_page_t )0) break;
9920
9921 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
9922 vm_page_unlock_queues();
9923
9924 vm_page_lock_queues();
9925 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9926 do {
9927 if (m ==(vm_page_t )0) break;
9928
9929 if(m->dirty) dpages++;
9930 if(m->free_when_done) pgopages++;
9931 if(m->precious) precpages++;
9932
9933 assert(VM_PAGE_OBJECT(m) != kernel_object);
9934 m = (vm_page_t) vm_page_queue_next(&m->pageq);
9935 if (m ==(vm_page_t )0) break;
9936
9937 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9938 vm_page_unlock_queues();
9939
9940 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
9941
9942 dpages=0;
9943 pgopages=0;
9944 precpages=0;
9945
9946 vm_page_lock_queues();
9947 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
9948
9949 do {
9950 if(m == (vm_page_t )0) break;
9951 if(m->dirty) dpages++;
9952 if(m->free_when_done) pgopages++;
9953 if(m->precious) precpages++;
9954
9955 assert(VM_PAGE_OBJECT(m) != kernel_object);
9956 m = (vm_page_t) vm_page_queue_next(&m->pageq);
9957 if(m == (vm_page_t )0) break;
9958
9959 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
9960 vm_page_unlock_queues();
9961
9962 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
9963
9964 }
9965 #endif /* MACH_BSD */
9966
9967
9968 #if CONFIG_IOSCHED
9969 int upl_get_cached_tier(upl_t upl)
9970 {
9971 assert(upl);
9972 if (upl->flags & UPL_TRACKED_BY_OBJECT)
9973 return (upl->upl_priority);
9974 return (-1);
9975 }
9976 #endif /* CONFIG_IOSCHED */
9977
9978 ppnum_t upl_get_highest_page(
9979 upl_t upl)
9980 {
9981 return upl->highest_page;
9982 }
9983
9984 upl_size_t upl_get_size(
9985 upl_t upl)
9986 {
9987 return upl->size;
9988 }
9989
9990 upl_t upl_associated_upl(upl_t upl)
9991 {
9992 return upl->associated_upl;
9993 }
9994
9995 void upl_set_associated_upl(upl_t upl, upl_t associated_upl)
9996 {
9997 upl->associated_upl = associated_upl;
9998 }
9999
10000 struct vnode * upl_lookup_vnode(upl_t upl)
10001 {
10002 if (!upl->map_object->internal)
10003 return vnode_pager_lookup_vnode(upl->map_object->pager);
10004 else
10005 return NULL;
10006 }
10007
10008 #if UPL_DEBUG
10009 kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
10010 {
10011 upl->ubc_alias1 = alias1;
10012 upl->ubc_alias2 = alias2;
10013 return KERN_SUCCESS;
10014 }
10015 int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
10016 {
10017 if(al)
10018 *al = upl->ubc_alias1;
10019 if(al2)
10020 *al2 = upl->ubc_alias2;
10021 return KERN_SUCCESS;
10022 }
10023 #endif /* UPL_DEBUG */
10024
10025 #if VM_PRESSURE_EVENTS
10026 /*
10027 * Upward trajectory.
10028 */
10029 extern boolean_t vm_compressor_low_on_space(void);
10030
10031 boolean_t
10032 VM_PRESSURE_NORMAL_TO_WARNING(void) {
10033
10034 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10035
10036 /* Available pages below our threshold */
10037 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
10038 /* No frozen processes to kill */
10039 if (memorystatus_frozen_count == 0) {
10040 /* Not enough suspended processes available. */
10041 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
10042 return TRUE;
10043 }
10044 }
10045 }
10046 return FALSE;
10047
10048 } else {
10049 return ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0);
10050 }
10051 }
10052
10053 boolean_t
10054 VM_PRESSURE_WARNING_TO_CRITICAL(void) {
10055
10056 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10057
10058 /* Available pages below our threshold */
10059 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
10060 return TRUE;
10061 }
10062 return FALSE;
10063 } else {
10064 return (vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
10065 }
10066 }
10067
10068 /*
10069 * Downward trajectory.
10070 */
10071 boolean_t
10072 VM_PRESSURE_WARNING_TO_NORMAL(void) {
10073
10074 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10075
10076 /* Available pages above our threshold */
10077 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
10078 if (memorystatus_available_pages > target_threshold) {
10079 return TRUE;
10080 }
10081 return FALSE;
10082 } else {
10083 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0);
10084 }
10085 }
10086
10087 boolean_t
10088 VM_PRESSURE_CRITICAL_TO_WARNING(void) {
10089
10090 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
10091
10092 /* Available pages above our threshold */
10093 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
10094 if (memorystatus_available_pages > target_threshold) {
10095 return TRUE;
10096 }
10097 return FALSE;
10098 } else {
10099 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
10100 }
10101 }
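
/*
 * Example of the hysteresis above in the non-compressor configuration
 * (page counts are hypothetical): with
 * memorystatus_available_pages_pressure == 4000, the WARNING level can
 * be entered once available pages drop below 4000 (provided there are
 * no frozen processes and too few suspended ones to reclaim), but a
 * return to NORMAL is not reported until they climb back above
 * 4000 + 15% == 4600.  The same 15% margin applies to the
 * CRITICAL <-> WARNING transitions around
 * memorystatus_available_pages_critical.
 */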
10102 #endif /* VM_PRESSURE_EVENTS */
10103
10104
10105
10106 #define VM_TEST_COLLAPSE_COMPRESSOR 0
10107 #define VM_TEST_WIRE_AND_EXTRACT 0
10108 #define VM_TEST_PAGE_WIRE_OVERFLOW_PANIC 0
10109 #if __arm64__
10110 #define VM_TEST_KERNEL_OBJECT_FAULT 0
10111 #endif /* __arm64__ */
10112 #define VM_TEST_DEVICE_PAGER_TRANSPOSE (DEVELOPMENT || DEBUG)
10113
10114 #if VM_TEST_COLLAPSE_COMPRESSOR
10115 extern boolean_t vm_object_collapse_compressor_allowed;
10116 #include <IOKit/IOLib.h>
10117 static void
10118 vm_test_collapse_compressor(void)
10119 {
10120 vm_object_size_t backing_size, top_size;
10121 vm_object_t backing_object, top_object;
10122 vm_map_offset_t backing_offset, top_offset;
10123 unsigned char *backing_address, *top_address;
10124 kern_return_t kr;
10125
10126 printf("VM_TEST_COLLAPSE_COMPRESSOR:\n");
10127
10128 /* create backing object */
10129 backing_size = 15 * PAGE_SIZE;
10130 backing_object = vm_object_allocate(backing_size);
10131 assert(backing_object != VM_OBJECT_NULL);
10132 printf("VM_TEST_COLLAPSE_COMPRESSOR: created backing object %p\n",
10133 backing_object);
10134 /* map backing object */
10135 backing_offset = 0;
10136 kr = vm_map_enter(kernel_map, &backing_offset, backing_size, 0,
10137 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE,
10138 backing_object, 0, FALSE,
10139 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
10140 assert(kr == KERN_SUCCESS);
10141 backing_address = (unsigned char *) backing_offset;
10142 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10143 "mapped backing object %p at 0x%llx\n",
10144 backing_object, (uint64_t) backing_offset);
10145 /* populate with pages to be compressed in backing object */
10146 backing_address[0x1*PAGE_SIZE] = 0xB1;
10147 backing_address[0x4*PAGE_SIZE] = 0xB4;
10148 backing_address[0x7*PAGE_SIZE] = 0xB7;
10149 backing_address[0xa*PAGE_SIZE] = 0xBA;
10150 backing_address[0xd*PAGE_SIZE] = 0xBD;
10151 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10152 "populated pages to be compressed in "
10153 "backing_object %p\n", backing_object);
10154 /* compress backing object */
10155 vm_object_pageout(backing_object);
10156 printf("VM_TEST_COLLAPSE_COMPRESSOR: compressing backing_object %p\n",
10157 backing_object);
10158 /* wait for all the pages to be gone */
10159 while (*(volatile int *)&backing_object->resident_page_count != 0)
10160 IODelay(10);
10161 printf("VM_TEST_COLLAPSE_COMPRESSOR: backing_object %p compressed\n",
10162 backing_object);
10163 /* populate with pages to be resident in backing object */
10164 backing_address[0x0*PAGE_SIZE] = 0xB0;
10165 backing_address[0x3*PAGE_SIZE] = 0xB3;
10166 backing_address[0x6*PAGE_SIZE] = 0xB6;
10167 backing_address[0x9*PAGE_SIZE] = 0xB9;
10168 backing_address[0xc*PAGE_SIZE] = 0xBC;
10169 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10170 "populated pages to be resident in "
10171 "backing_object %p\n", backing_object);
10172 /* leave the other pages absent */
10173 /* mess with the paging_offset of the backing_object */
10174 assert(backing_object->paging_offset == 0);
10175 backing_object->paging_offset = 0x3000;
10176
10177 /* create top object */
10178 top_size = 9 * PAGE_SIZE;
10179 top_object = vm_object_allocate(top_size);
10180 assert(top_object != VM_OBJECT_NULL);
10181 printf("VM_TEST_COLLAPSE_COMPRESSOR: created top object %p\n",
10182 top_object);
10183 /* map top object */
10184 top_offset = 0;
10185 kr = vm_map_enter(kernel_map, &top_offset, top_size, 0,
10186 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE,
10187 top_object, 0, FALSE,
10188 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
10189 assert(kr == KERN_SUCCESS);
10190 top_address = (unsigned char *) top_offset;
10191 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10192 "mapped top object %p at 0x%llx\n",
10193 top_object, (uint64_t) top_offset);
10194 /* populate with pages to be compressed in top object */
10195 top_address[0x3*PAGE_SIZE] = 0xA3;
10196 top_address[0x4*PAGE_SIZE] = 0xA4;
10197 top_address[0x5*PAGE_SIZE] = 0xA5;
10198 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10199 "populated pages to be compressed in "
10200 "top_object %p\n", top_object);
10201 /* compress top object */
10202 vm_object_pageout(top_object);
10203 printf("VM_TEST_COLLAPSE_COMPRESSOR: compressing top_object %p\n",
10204 top_object);
10205 /* wait for all the pages to be gone */
10206 while (top_object->resident_page_count != 0)
10207 IODelay(10);
10208 printf("VM_TEST_COLLAPSE_COMPRESSOR: top_object %p compressed\n",
10209 top_object);
10210 /* populate with pages to be resident in top object */
10211 top_address[0x0*PAGE_SIZE] = 0xA0;
10212 top_address[0x1*PAGE_SIZE] = 0xA1;
10213 top_address[0x2*PAGE_SIZE] = 0xA2;
10214 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10215 "populated pages to be resident in "
10216 "top_object %p\n", top_object);
10217 /* leave the other pages absent */
10218
10219 /* link the 2 objects */
10220 vm_object_reference(backing_object);
10221 top_object->shadow = backing_object;
10222 top_object->vo_shadow_offset = 0x3000;
10223 printf("VM_TEST_COLLAPSE_COMPRESSOR: linked %p and %p\n",
10224 top_object, backing_object);
10225
10226 /* unmap backing object */
10227 vm_map_remove(kernel_map,
10228 backing_offset,
10229 backing_offset + backing_size,
10230 0);
10231 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10232 "unmapped backing_object %p [0x%llx:0x%llx]\n",
10233 backing_object,
10234 (uint64_t) backing_offset,
10235 (uint64_t) (backing_offset + backing_size));
10236
10237 /* collapse */
10238 printf("VM_TEST_COLLAPSE_COMPRESSOR: collapsing %p\n", top_object);
10239 vm_object_lock(top_object);
10240 vm_object_collapse(top_object, 0, FALSE);
10241 vm_object_unlock(top_object);
10242 printf("VM_TEST_COLLAPSE_COMPRESSOR: collapsed %p\n", top_object);
10243
10244 /* did it work? */
10245 if (top_object->shadow != VM_OBJECT_NULL) {
10246 printf("VM_TEST_COLLAPSE_COMPRESSOR: not collapsed\n");
10247 printf("VM_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
10248 if (vm_object_collapse_compressor_allowed) {
10249 panic("VM_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
10250 }
10251 } else {
10252 /* check the contents of the mapping */
10253 unsigned char expect[9] =
10254 { 0xA0, 0xA1, 0xA2, /* resident in top */
10255 0xA3, 0xA4, 0xA5, /* compressed in top */
10256 0xB9, /* resident in backing + shadow_offset */
10257 0xBD, /* compressed in backing + shadow_offset + paging_offset */
10258 0x00 }; /* absent in both */
10259 unsigned char actual[9];
10260 unsigned int i, errors;
10261
10262 errors = 0;
10263 for (i = 0; i < sizeof (actual); i++) {
10264 actual[i] = (unsigned char) top_address[i*PAGE_SIZE];
10265 if (actual[i] != expect[i]) {
10266 errors++;
10267 }
10268 }
10269 printf("VM_TEST_COLLAPSE_COMPRESSOR: "
10270 "actual [%x %x %x %x %x %x %x %x %x] "
10271 "expect [%x %x %x %x %x %x %x %x %x] "
10272 "%d errors\n",
10273 actual[0], actual[1], actual[2], actual[3],
10274 actual[4], actual[5], actual[6], actual[7],
10275 actual[8],
10276 expect[0], expect[1], expect[2], expect[3],
10277 expect[4], expect[5], expect[6], expect[7],
10278 expect[8],
10279 errors);
10280 if (errors) {
10281 panic("VM_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
10282 } else {
10283 printf("VM_TEST_COLLAPSE_COMPRESSOR: PASS\n");
10284 }
10285 }
10286 }
10287 #else /* VM_TEST_COLLAPSE_COMPRESSOR */
10288 #define vm_test_collapse_compressor()
10289 #endif /* VM_TEST_COLLAPSE_COMPRESSOR */
10290
10291 #if VM_TEST_WIRE_AND_EXTRACT
10292 extern ledger_template_t task_ledger_template;
10293 #include <mach/mach_vm.h>
10294 extern ppnum_t vm_map_get_phys_page(vm_map_t map,
10295 vm_offset_t offset);
10296 static void
10297 vm_test_wire_and_extract(void)
10298 {
10299 ledger_t ledger;
10300 vm_map_t user_map, wire_map;
10301 mach_vm_address_t user_addr, wire_addr;
10302 mach_vm_size_t user_size, wire_size;
10303 mach_vm_offset_t cur_offset;
10304 vm_prot_t cur_prot, max_prot;
10305 ppnum_t user_ppnum, wire_ppnum;
10306 kern_return_t kr;
10307
10308 ledger = ledger_instantiate(task_ledger_template,
10309 LEDGER_CREATE_ACTIVE_ENTRIES);
10310 user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT),
10311 0x100000000ULL,
10312 0x200000000ULL,
10313 TRUE);
10314 wire_map = vm_map_create(NULL,
10315 0x100000000ULL,
10316 0x200000000ULL,
10317 TRUE);
10318 user_addr = 0;
10319 user_size = 0x10000;
10320 kr = mach_vm_allocate(user_map,
10321 &user_addr,
10322 user_size,
10323 VM_FLAGS_ANYWHERE);
10324 assert(kr == KERN_SUCCESS);
10325 wire_addr = 0;
10326 wire_size = user_size;
10327 kr = mach_vm_remap(wire_map,
10328 &wire_addr,
10329 wire_size,
10330 0,
10331 VM_FLAGS_ANYWHERE,
10332 user_map,
10333 user_addr,
10334 FALSE,
10335 &cur_prot,
10336 &max_prot,
10337 VM_INHERIT_NONE);
10338 assert(kr == KERN_SUCCESS);
10339 for (cur_offset = 0;
10340 cur_offset < wire_size;
10341 cur_offset += PAGE_SIZE) {
10342 kr = vm_map_wire_and_extract(wire_map,
10343 wire_addr + cur_offset,
10344 VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK),
10345 TRUE,
10346 &wire_ppnum);
10347 assert(kr == KERN_SUCCESS);
10348 user_ppnum = vm_map_get_phys_page(user_map,
10349 user_addr + cur_offset);
10350 printf("VM_TEST_WIRE_AND_EXTRACT: kr=0x%x "
10351 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
10352 kr,
10353 user_map, user_addr + cur_offset, user_ppnum,
10354 wire_map, wire_addr + cur_offset, wire_ppnum);
10355 if (kr != KERN_SUCCESS ||
10356 wire_ppnum == 0 ||
10357 wire_ppnum != user_ppnum) {
10358 panic("VM_TEST_WIRE_AND_EXTRACT: FAIL\n");
10359 }
10360 }
10361 cur_offset -= PAGE_SIZE;
10362 kr = vm_map_wire_and_extract(wire_map,
10363 wire_addr + cur_offset,
10364 VM_PROT_DEFAULT,
10365 TRUE,
10366 &wire_ppnum);
10367 assert(kr == KERN_SUCCESS);
10368 printf("VM_TEST_WIRE_AND_EXTRACT: re-wire kr=0x%x "
10369 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
10370 kr,
10371 user_map, user_addr + cur_offset, user_ppnum,
10372 wire_map, wire_addr + cur_offset, wire_ppnum);
10373 if (kr != KERN_SUCCESS ||
10374 wire_ppnum == 0 ||
10375 wire_ppnum != user_ppnum) {
10376 panic("VM_TEST_WIRE_AND_EXTRACT: FAIL\n");
10377 }
10378
10379 printf("VM_TEST_WIRE_AND_EXTRACT: PASS\n");
10380 }
10381 #else /* VM_TEST_WIRE_AND_EXTRACT */
10382 #define vm_test_wire_and_extract()
10383 #endif /* VM_TEST_WIRE_AND_EXTRACT */
10384
10385 #if VM_TEST_PAGE_WIRE_OVERFLOW_PANIC
10386 static void
10387 vm_test_page_wire_overflow_panic(void)
10388 {
10389 vm_object_t object;
10390 vm_page_t page;
10391
10392 printf("VM_TEST_PAGE_WIRE_OVERFLOW_PANIC: starting...\n");
10393
10394 object = vm_object_allocate(PAGE_SIZE);
10395 vm_object_lock(object);
10396 page = vm_page_alloc(object, 0x0);
10397 vm_page_lock_queues();
10398 do {
10399 vm_page_wire(page, 1, FALSE);
10400 } while (page->wire_count != 0);
10401 vm_page_unlock_queues();
10402 vm_object_unlock(object);
10403 panic("FBDP(%p,%p): wire_count overflow not detected\n",
10404 object, page);
10405 }
10406 #else /* VM_TEST_PAGE_WIRE_OVERFLOW_PANIC */
10407 #define vm_test_page_wire_overflow_panic()
10408 #endif /* VM_TEST_PAGE_WIRE_OVERFLOW_PANIC */
10409
10410 #if __arm64__ && VM_TEST_KERNEL_OBJECT_FAULT
10411 extern int copyinframe(vm_address_t fp, char *frame, boolean_t is64bit);
10412 static void
10413 vm_test_kernel_object_fault(void)
10414 {
10415 kern_return_t kr;
10416 vm_offset_t stack;
10417 uintptr_t frameb[2];
10418 int ret;
10419
10420 kr = kernel_memory_allocate(kernel_map, &stack,
10421 kernel_stack_size + (2*PAGE_SIZE),
10422 0,
10423 (KMA_KSTACK | KMA_KOBJECT |
10424 KMA_GUARD_FIRST | KMA_GUARD_LAST),
10425 VM_KERN_MEMORY_STACK);
10426 if (kr != KERN_SUCCESS) {
10427 panic("VM_TEST_KERNEL_OBJECT_FAULT: kernel_memory_allocate kr 0x%x\n", kr);
10428 }
10429 ret = copyinframe((uintptr_t)stack, (char *)frameb, TRUE);
10430 if (ret != 0) {
10431 printf("VM_TEST_KERNEL_OBJECT_FAULT: PASS\n");
10432 } else {
10433 printf("VM_TEST_KERNEL_OBJECT_FAULT: FAIL\n");
10434 }
10435 vm_map_remove(kernel_map,
10436 stack,
10437 stack + kernel_stack_size + (2*PAGE_SIZE),
10438 VM_MAP_REMOVE_KUNWIRE);
10439 stack = 0;
10440 }
10441 #else /* __arm64__ && VM_TEST_KERNEL_OBJECT_FAULT */
10442 #define vm_test_kernel_object_fault()
10443 #endif /* __arm64__ && VM_TEST_KERNEL_OBJECT_FAULT */
10444
10445 #if VM_TEST_DEVICE_PAGER_TRANSPOSE
10446 static void
10447 vm_test_device_pager_transpose(void)
10448 {
10449 memory_object_t device_pager;
10450 vm_object_t anon_object, device_object;
10451 vm_size_t size;
10452 vm_map_offset_t anon_mapping, device_mapping;
10453 kern_return_t kr;
10454
10455 size = 3 * PAGE_SIZE;
10456 anon_object = vm_object_allocate(size);
10457 assert(anon_object != VM_OBJECT_NULL);
10458 device_pager = device_pager_setup(NULL, 0, size, 0);
10459 assert(device_pager != NULL);
10460 device_object = memory_object_to_vm_object(device_pager);
10461 assert(device_object != VM_OBJECT_NULL);
10462 anon_mapping = 0;
10463 kr = vm_map_enter(kernel_map, &anon_mapping, size, 0,
10464 VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE,
10465 anon_object, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
10466 VM_INHERIT_DEFAULT);
10467 assert(kr == KERN_SUCCESS);
10468 device_mapping = 0;
10469 kr = vm_map_enter_mem_object(kernel_map, &device_mapping, size, 0,
10470 VM_FLAGS_ANYWHERE,
10471 VM_MAP_KERNEL_FLAGS_NONE,
10472 VM_KERN_MEMORY_NONE,
10473 (void *)device_pager, 0, FALSE,
10474 VM_PROT_DEFAULT, VM_PROT_ALL,
10475 VM_INHERIT_DEFAULT);
10476 assert(kr == KERN_SUCCESS);
10477 memory_object_deallocate(device_pager);
10478
10479 vm_object_lock(anon_object);
10480 vm_object_activity_begin(anon_object);
10481 anon_object->blocked_access = TRUE;
10482 vm_object_unlock(anon_object);
10483 vm_object_lock(device_object);
10484 vm_object_activity_begin(device_object);
10485 device_object->blocked_access = TRUE;
10486 vm_object_unlock(device_object);
10487
10488 assert(anon_object->ref_count == 1);
10489 assert(!anon_object->named);
10490 assert(device_object->ref_count == 2);
10491 assert(device_object->named);
10492
10493 kr = vm_object_transpose(device_object, anon_object, size);
10494 assert(kr == KERN_SUCCESS);
10495
10496 vm_object_lock(anon_object);
10497 vm_object_activity_end(anon_object);
10498 anon_object->blocked_access = FALSE;
10499 vm_object_unlock(anon_object);
10500 vm_object_lock(device_object);
10501 vm_object_activity_end(device_object);
10502 device_object->blocked_access = FALSE;
10503 vm_object_unlock(device_object);
10504
10505 assert(anon_object->ref_count == 2);
10506 assert(anon_object->named);
10507 kr = vm_deallocate(kernel_map, anon_mapping, size);
10508 assert(kr == KERN_SUCCESS);
10509 assert(device_object->ref_count == 1);
10510 assert(!device_object->named);
10511 kr = vm_deallocate(kernel_map, device_mapping, size);
10512 assert(kr == KERN_SUCCESS);
10513
10514 printf("VM_TEST_DEVICE_PAGER_TRANSPOSE: PASS\n");
10515 }
10516 #else /* VM_TEST_DEVICE_PAGER_TRANSPOSE */
10517 #define vm_test_device_pager_transpose()
10518 #endif /* VM_TEST_DEVICE_PAGER_TRANSPOSE */
10519
10520 void
10521 vm_tests(void)
10522 {
10523 vm_test_collapse_compressor();
10524 vm_test_wire_and_extract();
10525 vm_test_page_wire_overflow_panic();
10526 vm_test_kernel_object_fault();
10527 vm_test_device_pager_transpose();
10528 }