apple/xnu (xnu-3789.21.4) - osfmk/vm/vm_pageout.c
1 /*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71
72 #include <mach/mach_types.h>
73 #include <mach/memory_object.h>
74 #include <mach/memory_object_default.h>
75 #include <mach/memory_object_control_server.h>
76 #include <mach/mach_host_server.h>
77 #include <mach/upl.h>
78 #include <mach/vm_map.h>
79 #include <mach/vm_param.h>
80 #include <mach/vm_statistics.h>
81 #include <mach/sdt.h>
82
83 #include <kern/kern_types.h>
84 #include <kern/counters.h>
85 #include <kern/host_statistics.h>
86 #include <kern/machine.h>
87 #include <kern/misc_protos.h>
88 #include <kern/sched.h>
89 #include <kern/thread.h>
90 #include <kern/xpr.h>
91 #include <kern/kalloc.h>
92 #include <kern/policy_internal.h>
93
94 #include <machine/vm_tuning.h>
95 #include <machine/commpage.h>
96
97 #include <vm/pmap.h>
98 #include <vm/vm_compressor_pager.h>
99 #include <vm/vm_fault.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_page.h>
103 #include <vm/vm_pageout.h>
104 #include <vm/vm_protos.h> /* must be last */
105 #include <vm/memory_object.h>
106 #include <vm/vm_purgeable_internal.h>
107 #include <vm/vm_shared_region.h>
108 #include <vm/vm_compressor.h>
109
110 #if CONFIG_PHANTOM_CACHE
111 #include <vm/vm_phantom_cache.h>
112 #endif
113 /*
114 * ENCRYPTED SWAP:
115 */
116 #include <libkern/crypto/aes.h>
117 extern u_int32_t random(void); /* from <libkern/libkern.h> */
118
119 extern int cs_debug;
120
121 #if UPL_DEBUG
122 #include <libkern/OSDebug.h>
123 #endif
124
125 extern void m_drain(void);
126
127 #if VM_PRESSURE_EVENTS
128 extern unsigned int memorystatus_available_pages;
129 extern unsigned int memorystatus_available_pages_pressure;
130 extern unsigned int memorystatus_available_pages_critical;
131 extern unsigned int memorystatus_frozen_count;
132 extern unsigned int memorystatus_suspended_count;
133
134 extern vm_pressure_level_t memorystatus_vm_pressure_level;
135 int memorystatus_purge_on_warning = 2;
136 int memorystatus_purge_on_urgent = 5;
137 int memorystatus_purge_on_critical = 8;
138
139 void vm_pressure_response(void);
140 boolean_t vm_pressure_thread_running = FALSE;
141 extern void consider_vm_pressure_events(void);
142
143 #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
144 #endif /* VM_PRESSURE_EVENTS */
145
146 boolean_t vm_pressure_changed = FALSE;
147
148 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
149 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
150 #endif
151
152 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
153 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
154 #endif
155
156 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
157 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
158 #endif
159
160 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
161 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
162 #endif
163
164 #ifndef VM_PAGE_LAUNDRY_MAX
165 #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
166 #endif /* VM_PAGE_LAUNDRY_MAX */
167
168 #ifndef VM_PAGEOUT_BURST_WAIT
169 #define VM_PAGEOUT_BURST_WAIT 10 /* milliseconds */
170 #endif /* VM_PAGEOUT_BURST_WAIT */
171
172 #ifndef VM_PAGEOUT_EMPTY_WAIT
173 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
174 #endif /* VM_PAGEOUT_EMPTY_WAIT */
175
176 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
177 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
178 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
179
180 #ifndef VM_PAGEOUT_IDLE_WAIT
181 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
182 #endif /* VM_PAGEOUT_IDLE_WAIT */
183
184 #ifndef VM_PAGEOUT_SWAP_WAIT
185 #define VM_PAGEOUT_SWAP_WAIT 50 /* milliseconds */
186 #endif /* VM_PAGEOUT_SWAP_WAIT */
187
188 #ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
189 #define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED 1000 /* maximum pages considered before we issue a pressure event */
190 #endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
191
192 #ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
193 #define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS 5 /* seconds */
194 #endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
195
196 unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
197 unsigned int vm_page_speculative_percentage = 5;
198
199 #ifndef VM_PAGE_SPECULATIVE_TARGET
200 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
201 #endif /* VM_PAGE_SPECULATIVE_TARGET */
202
203
204 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
205 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
206 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
207
208
209 /*
210 * To obtain a reasonable LRU approximation, the inactive queue
211 * needs to be large enough to give pages on it a chance to be
212 * referenced a second time. This macro defines the fraction
213 * of active+inactive pages that should be inactive.
214 * The pageout daemon uses it to update vm_page_inactive_target.
215 *
216 * If vm_page_free_count falls below vm_page_free_target and
217 * vm_page_inactive_count is below vm_page_inactive_target,
218 * then the pageout daemon starts running.
219 */
220
221 #ifndef VM_PAGE_INACTIVE_TARGET
222 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
223 #endif /* VM_PAGE_INACTIVE_TARGET */
224
225 /*
226 * Once the pageout daemon starts running, it keeps going
227 * until vm_page_free_count meets or exceeds vm_page_free_target.
228 */
229
230 #ifndef VM_PAGE_FREE_TARGET
231 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
232 #endif /* VM_PAGE_FREE_TARGET */
233
234
235 /*
236 * The pageout daemon always starts running once vm_page_free_count
237 * falls below vm_page_free_min.
238 */
239
240 #ifndef VM_PAGE_FREE_MIN
241 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
242 #endif /* VM_PAGE_FREE_MIN */
243
244 #define VM_PAGE_FREE_RESERVED_LIMIT 1700
245 #define VM_PAGE_FREE_MIN_LIMIT 3500
246 #define VM_PAGE_FREE_TARGET_LIMIT 4000
247
248 /*
249 * When vm_page_free_count falls below vm_page_free_reserved,
250 * only vm-privileged threads can allocate pages. vm-privilege
251 * allows the pageout daemon and default pager (and any other
252 * associated threads needed for default pageout) to continue
253 * operation by dipping into the reserved pool of pages.
254 */
255
256 #ifndef VM_PAGE_FREE_RESERVED
257 #define VM_PAGE_FREE_RESERVED(n) \
258 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
259 #endif /* VM_PAGE_FREE_RESERVED */
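/*
 * Illustrative sketch (not part of xnu): the tuning macros above are plain
 * integer arithmetic on page counts.  The standalone user-space program
 * below mirrors that arithmetic for a hypothetical count of 262144 pages
 * (1 GB of 4 KB pages) and the default vm_page_speculative_percentage of 5;
 * the printed numbers are examples only, not values the kernel guarantees.
 */
#if 0 /* example only; never built as part of the kernel */
#include <stdio.h>

#define EX_LAUNDRY_MAX            128u  /* mirrors VM_PAGE_LAUNDRY_MAX */
#define EX_SPECULATIVE_PERCENTAGE 5u    /* mirrors vm_page_speculative_percentage */

int
main(void)
{
	unsigned int pages = 262144;    /* hypothetical page count fed to each macro */

	/* VM_PAGE_INACTIVE_TARGET(avail): half of active+inactive should be inactive */
	unsigned int inactive_target = pages / 2;

	/* VM_PAGE_SPECULATIVE_TARGET(total): integer division turns 5% into total/20 */
	unsigned int speculative_target = pages * 1 / (100 / EX_SPECULATIVE_PERCENTAGE);

	/* VM_PAGE_FREE_TARGET(free) and VM_PAGE_FREE_MIN(free) */
	unsigned int free_target = 15 + pages / 80;
	unsigned int free_min    = 10 + pages / 100;

	/* VM_PAGE_FREE_RESERVED(n): 6 * VM_PAGE_LAUNDRY_MAX + n */
	unsigned int free_reserved = 6 * EX_LAUNDRY_MAX + 0;

	printf("inactive_target    = %u\n", inactive_target);    /* 131072 */
	printf("speculative_target = %u\n", speculative_target); /* 13107 */
	printf("free_target        = %u\n", free_target);        /* 3291 */
	printf("free_min           = %u\n", free_min);           /* 2631 */
	printf("free_reserved(0)   = %u\n", free_reserved);      /* 768 */
	return 0;
}
#endif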
260
261 /*
262 * When we dequeue pages from the inactive list, they are
263 * reactivated (i.e., put back on the active queue) if referenced.
264 * However, it is possible to starve the free list if other
265 * processors are referencing pages faster than we can turn off
266 * the referenced bit. So we limit the number of reactivations
267 * we will make per call of vm_pageout_scan().
268 */
269 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270 #ifndef VM_PAGE_REACTIVATE_LIMIT
271 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
272 #endif /* VM_PAGE_REACTIVATE_LIMIT */
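/*
 * Illustrative note (not part of xnu): VM_PAGE_REACTIVATE_LIMIT() evaluates
 * to the larger of avail/20 and VM_PAGE_REACTIVATE_LIMIT_MAX, so the per-call
 * reactivation budget computed here is never smaller than 20000 pages.  A
 * minimal sketch of that arithmetic, with hypothetical page counts shown in
 * the comments:
 */
#if 0 /* example only; never built as part of the kernel */
#define EX_MAX(a, b) ((a) > (b) ? (a) : (b))

/* avail = 100000  -> EX_MAX(5000, 20000)  = 20000 (the 20000 floor wins) */
/* avail = 1000000 -> EX_MAX(50000, 20000) = 50000 (avail/20 wins)        */
static unsigned int
ex_reactivate_limit(unsigned int avail)
{
	return EX_MAX(avail * 1 / 20, 20000u);
}
#endif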
273 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
274
275
276 extern boolean_t hibernate_cleaning_in_progress;
277
278 /*
279 * Exported variable used to broadcast the activation of the pageout scan.
280 * Working Set uses this to throttle its use of pmap removes. In this
281 * way, code which runs within memory in an uncontested context does
282 * not keep encountering soft faults.
283 */
284
285 unsigned int vm_pageout_scan_event_counter = 0;
286
287 /*
288 * Forward declarations for internal routines.
289 */
290 struct cq {
291 struct vm_pageout_queue *q;
292 void *current_chead;
293 char *scratch_buf;
294 int id;
295 };
296 #define MAX_COMPRESSOR_THREAD_COUNT 8
297
298 struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
299
300 void *vm_pageout_immediate_chead;
301 char *vm_pageout_immediate_scratch_buf;
302
303
304 #if VM_PRESSURE_EVENTS
305 void vm_pressure_thread(void);
306
307 boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
308 boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
309
310 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
311 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
312 #endif
313 static void vm_pageout_garbage_collect(int);
314 static void vm_pageout_iothread_external(void);
315 static void vm_pageout_iothread_internal(struct cq *cq);
316 static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t);
317
318 extern void vm_pageout_continue(void);
319 extern void vm_pageout_scan(void);
320
321 static void vm_pageout_immediate(vm_page_t, boolean_t);
322 boolean_t vm_compressor_immediate_preferred = FALSE;
323 boolean_t vm_compressor_immediate_preferred_override = FALSE;
324 boolean_t vm_restricted_to_single_processor = FALSE;
325 static boolean_t vm_pageout_waiter = FALSE;
326 static boolean_t vm_pageout_running = FALSE;
327
328
329 static thread_t vm_pageout_external_iothread = THREAD_NULL;
330 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
331
332 unsigned int vm_pageout_reserved_internal = 0;
333 unsigned int vm_pageout_reserved_really = 0;
334
335 unsigned int vm_pageout_swap_wait = 0;
336 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
337 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
338 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
339 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
340 unsigned int vm_pageout_deadlock_relief = 0;
341 unsigned int vm_pageout_inactive_relief = 0;
342 unsigned int vm_pageout_burst_active_throttle = 0;
343 unsigned int vm_pageout_burst_inactive_throttle = 0;
344
345 int vm_upl_wait_for_pages = 0;
346
347
348 /*
349 * These variables record the pageout daemon's actions:
350 * how many pages it looks at and what happens to those pages.
351 * No locking needed because only one thread modifies the variables.
352 */
353
354 unsigned int vm_pageout_active = 0; /* debugging */
355 unsigned int vm_pageout_inactive = 0; /* debugging */
356 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
357 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
358 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
359 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
360 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
361 unsigned int vm_pageout_inactive_error = 0; /* debugging */
362 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
363 unsigned int vm_pageout_inactive_notalive = 0; /* debugging */
364 unsigned int vm_pageout_inactive_used = 0; /* debugging */
365 unsigned int vm_pageout_cache_evicted = 0; /* debugging */
366 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
367 unsigned int vm_pageout_speculative_clean = 0; /* debugging */
368
369 unsigned int vm_pageout_freed_from_cleaned = 0;
370 unsigned int vm_pageout_freed_from_speculative = 0;
371 unsigned int vm_pageout_freed_from_inactive_clean = 0;
372
373 unsigned int vm_pageout_enqueued_cleaned_from_inactive_clean = 0;
374 unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
375
376 unsigned int vm_pageout_cleaned_reclaimed = 0; /* debugging; how many cleaned pages are reclaimed by the pageout scan */
377 unsigned int vm_pageout_cleaned_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
378 unsigned int vm_pageout_cleaned_reference_reactivated = 0;
379 unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
380 unsigned int vm_pageout_cleaned_fault_reactivated = 0;
381 unsigned int vm_pageout_cleaned_commit_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
382 unsigned int vm_pageout_cleaned_busy = 0;
383 unsigned int vm_pageout_cleaned_nolock = 0;
384
385 unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */
386 unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */
387 unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
388 unsigned int vm_pageout_inactive_anonymous = 0; /* debugging */
389 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
390 unsigned int vm_pageout_purged_objects = 0; /* used for sysctl vm stats */
391 unsigned int vm_stat_discard = 0; /* debugging */
392 unsigned int vm_stat_discard_sent = 0; /* debugging */
393 unsigned int vm_stat_discard_failure = 0; /* debugging */
394 unsigned int vm_stat_discard_throttle = 0; /* debugging */
395 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
396 unsigned int vm_pageout_catch_ups = 0; /* debugging */
397 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
398
399 unsigned int vm_pageout_scan_reclaimed_throttled = 0;
400 unsigned int vm_pageout_scan_active_throttled = 0;
401 unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
402 unsigned int vm_pageout_scan_inactive_throttled_external = 0;
403 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
404 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
405 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
406 unsigned int vm_pageout_scan_swap_throttle = 0; /* debugging */
407 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
408 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
409 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
410 unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0; /* debugging */
411 unsigned int vm_pageout_scan_throttle_deferred = 0; /* debugging */
412 unsigned int vm_pageout_scan_yield_unthrottled = 0; /* debugging */
413 unsigned int vm_page_speculative_count_drifts = 0;
414 unsigned int vm_page_speculative_count_drift_max = 0;
415
416
417 /*
418 * Backing store throttle when BS is exhausted
419 */
420 unsigned int vm_backing_store_low = 0;
421
422 unsigned int vm_pageout_out_of_line = 0;
423 unsigned int vm_pageout_in_place = 0;
424
425 unsigned int vm_page_steal_pageout_page = 0;
426
427 struct vm_config vm_config;
428
429 /*
430 * ENCRYPTED SWAP:
431 * counters and statistics...
432 */
433 unsigned long vm_page_decrypt_counter = 0;
434 unsigned long vm_page_decrypt_for_upl_counter = 0;
435 unsigned long vm_page_encrypt_counter = 0;
436 unsigned long vm_page_encrypt_abort_counter = 0;
437 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
438 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
439
440 struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
441 struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
442
443 unsigned int vm_page_speculative_target = 0;
444
445 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
446
447 boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
448
449 #if DEVELOPMENT || DEBUG
450 unsigned long vm_cs_validated_resets = 0;
451 #endif
452
453 int vm_debug_events = 0;
454
455 #if CONFIG_MEMORYSTATUS
456 #if !CONFIG_JETSAM
457 extern boolean_t memorystatus_idle_exit_from_VM(void);
458 #endif
459 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
460 extern void memorystatus_on_pageout_scan_end(void);
461
462 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
463 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
464 #if DEVELOPMENT || DEBUG
465 uint32_t vm_grab_anon_overrides = 0;
466 uint32_t vm_grab_anon_nops = 0;
467 #endif
468
469 #endif
470
471 /*
472 * Routine: vm_backing_store_disable
473 * Purpose:
474 * Suspend non-privileged threads wishing to extend
475 * backing store when we are low on backing store
476 * (Synchronized by caller)
477 */
478 void
479 vm_backing_store_disable(
480 boolean_t disable)
481 {
482 if(disable) {
483 vm_backing_store_low = 1;
484 } else {
485 if(vm_backing_store_low) {
486 vm_backing_store_low = 0;
487 thread_wakeup((event_t) &vm_backing_store_low);
488 }
489 }
490 }
491
492
493 #if MACH_CLUSTER_STATS
494 unsigned long vm_pageout_cluster_dirtied = 0;
495 unsigned long vm_pageout_cluster_cleaned = 0;
496 unsigned long vm_pageout_cluster_collisions = 0;
497 unsigned long vm_pageout_cluster_clusters = 0;
498 unsigned long vm_pageout_cluster_conversions = 0;
499 unsigned long vm_pageout_target_collisions = 0;
500 unsigned long vm_pageout_target_page_dirtied = 0;
501 unsigned long vm_pageout_target_page_freed = 0;
502 #define CLUSTER_STAT(clause) clause
503 #else /* MACH_CLUSTER_STATS */
504 #define CLUSTER_STAT(clause)
505 #endif /* MACH_CLUSTER_STATS */
506
507 /*
508 * Routine: vm_pageout_object_terminate
509 * Purpose:
510 * Destroy the pageout_object, and perform all of the
511 * required cleanup actions.
512 *
513 * In/Out conditions:
514 * The object must be locked, and will be returned locked.
515 */
516 void
517 vm_pageout_object_terminate(
518 vm_object_t object)
519 {
520 vm_object_t shadow_object;
521
522 /*
523 * Deal with the deallocation (last reference) of a pageout object
524 * (used for cleaning-in-place) by dropping the paging references/
525 * freeing pages in the original object.
526 */
527
528 assert(object->pageout);
529 shadow_object = object->shadow;
530 vm_object_lock(shadow_object);
531
532 while (!vm_page_queue_empty(&object->memq)) {
533 vm_page_t p, m;
534 vm_object_offset_t offset;
535
536 p = (vm_page_t) vm_page_queue_first(&object->memq);
537
538 assert(p->private);
539 assert(p->free_when_done);
540 p->free_when_done = FALSE;
541 assert(!p->cleaning);
542 assert(!p->laundry);
543
544 offset = p->offset;
545 VM_PAGE_FREE(p);
546 p = VM_PAGE_NULL;
547
548 m = vm_page_lookup(shadow_object,
549 offset + object->vo_shadow_offset);
550
551 if(m == VM_PAGE_NULL)
552 continue;
553
554 assert((m->dirty) || (m->precious) ||
555 (m->busy && m->cleaning));
556
557 /*
558 * Handle the trusted pager throttle.
559 * Also decrement the burst throttle (if external).
560 */
561 vm_page_lock_queues();
562 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)
563 vm_pageout_throttle_up(m);
564
565 /*
566 * Handle the "target" page(s). These pages are to be freed if
567 * successfully cleaned. Target pages are always busy, and are
568 * wired exactly once. The initial target pages are not mapped,
569 * (so cannot be referenced or modified) but converted target
570 * pages may have been modified between the selection as an
571 * adjacent page and conversion to a target.
572 */
573 if (m->free_when_done) {
574 assert(m->busy);
575 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
576 assert(m->wire_count == 1);
577 m->cleaning = FALSE;
578 m->encrypted_cleaning = FALSE;
579 m->free_when_done = FALSE;
580 #if MACH_CLUSTER_STATS
581 if (m->wanted) vm_pageout_target_collisions++;
582 #endif
583 /*
584 * Revoke all access to the page. Since the object is
585 * locked, and the page is busy, this prevents the page
586 * from being dirtied after the pmap_disconnect() call
587 * returns.
588 *
589 * Since the page is left "dirty" but "not modified", we
590 * can detect whether the page was redirtied during
591 * pageout by checking the modify state.
592 */
593 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
594 SET_PAGE_DIRTY(m, FALSE);
595 } else {
596 m->dirty = FALSE;
597 }
598
599 if (m->dirty) {
600 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
601 vm_page_unwire(m, TRUE); /* reactivates */
602 VM_STAT_INCR(reactivations);
603 PAGE_WAKEUP_DONE(m);
604 } else {
605 CLUSTER_STAT(vm_pageout_target_page_freed++;)
606 vm_page_free(m);/* clears busy, etc. */
607 }
608 vm_page_unlock_queues();
609 continue;
610 }
611 /*
612 * Handle the "adjacent" pages. These pages were cleaned in
613 * place, and should be left alone.
614 * If prep_pin_count is nonzero, then someone is using the
615 * page, so make it active.
616 */
617 if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) {
618 if (m->reference)
619 vm_page_activate(m);
620 else
621 vm_page_deactivate(m);
622 }
623 if (m->overwriting) {
624 /*
625 * the (COPY_OUT_FROM == FALSE) request_page_list case
626 */
627 if (m->busy) {
628 /*
629 * We do not re-set m->dirty !
630 * The page was busy so no extraneous activity
631 * could have occurred. COPY_INTO is a read into the
632 * new pages. CLEAN_IN_PLACE does actually write
633 * out the pages but handling outside of this code
634 * will take care of resetting dirty. We clear the
635 * modify however for the Programmed I/O case.
636 */
637 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
638
639 m->busy = FALSE;
640 m->absent = FALSE;
641 } else {
642 /*
643 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
644 * Occurs when the original page was wired
645 * at the time of the list request
646 */
647 assert(VM_PAGE_WIRED(m));
648 vm_page_unwire(m, TRUE); /* reactivates */
649 }
650 m->overwriting = FALSE;
651 } else {
652 /*
653 * Set the dirty state according to whether or not the page was
654 * modified during the pageout. Note that we purposefully do
655 * NOT call pmap_clear_modify since the page is still mapped.
656 * If the page were to be dirtied between the 2 calls, this
657 * fact would be lost. This code is only necessary to
658 * maintain statistics, since the pmap module is always
659 * consulted if m->dirty is false.
660 */
661 #if MACH_CLUSTER_STATS
662 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
663
664 if (m->dirty) vm_pageout_cluster_dirtied++;
665 else vm_pageout_cluster_cleaned++;
666 if (m->wanted) vm_pageout_cluster_collisions++;
667 #else
668 m->dirty = FALSE;
669 #endif
670 }
671 if (m->encrypted_cleaning == TRUE) {
672 m->encrypted_cleaning = FALSE;
673 m->busy = FALSE;
674 }
675 m->cleaning = FALSE;
676
677 /*
678 * Wakeup any thread waiting for the page to be un-cleaning.
679 */
680 PAGE_WAKEUP(m);
681 vm_page_unlock_queues();
682 }
683 /*
684 * Account for the paging reference taken in vm_paging_object_allocate.
685 */
686 vm_object_activity_end(shadow_object);
687 vm_object_unlock(shadow_object);
688
689 assert(object->ref_count == 0);
690 assert(object->paging_in_progress == 0);
691 assert(object->activity_in_progress == 0);
692 assert(object->resident_page_count == 0);
693 return;
694 }
695
696 /*
697 * Routine: vm_pageclean_setup
698 *
699 * Purpose: setup a page to be cleaned (made non-dirty), but not
700 * necessarily flushed from the VM page cache.
701 * This is accomplished by cleaning in place.
702 *
703 * The page must not be busy, and new_object
704 * must be locked.
705 *
706 */
707 static void
708 vm_pageclean_setup(
709 vm_page_t m,
710 vm_page_t new_m,
711 vm_object_t new_object,
712 vm_object_offset_t new_offset)
713 {
714 assert(!m->busy);
715 #if 0
716 assert(!m->cleaning);
717 #endif
718
719 XPR(XPR_VM_PAGEOUT,
720 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
721 VM_PAGE_OBJECT(m), m->offset, m,
722 new_m, new_offset);
723
724 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
725
726 /*
727 * Mark original page as cleaning in place.
728 */
729 m->cleaning = TRUE;
730 SET_PAGE_DIRTY(m, FALSE);
731 m->precious = FALSE;
732
733 /*
734 * Convert the fictitious page to a private shadow of
735 * the real page.
736 */
737 assert(new_m->fictitious);
738 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
739 new_m->fictitious = FALSE;
740 new_m->private = TRUE;
741 new_m->free_when_done = TRUE;
742 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
743
744 vm_page_lockspin_queues();
745 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
746 vm_page_unlock_queues();
747
748 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
749 assert(!new_m->wanted);
750 new_m->busy = FALSE;
751 }
752
753 /*
754 * Routine: vm_pageout_initialize_page
755 * Purpose:
756 * Causes the specified page to be initialized in
757 * the appropriate memory object. This routine is used to push
758 * pages into a copy-object when they are modified in the
759 * permanent object.
760 *
761 * The page is moved to a temporary object and paged out.
762 *
763 * In/out conditions:
764 * The page in question must not be on any pageout queues.
765 * The object to which it belongs must be locked.
766 * The page must be busy, but not hold a paging reference.
767 *
768 * Implementation:
769 * Move this page to a completely new object.
770 */
771 void
772 vm_pageout_initialize_page(
773 vm_page_t m)
774 {
775 vm_object_t object;
776 vm_object_offset_t paging_offset;
777 memory_object_t pager;
778
779 XPR(XPR_VM_PAGEOUT,
780 "vm_pageout_initialize_page, page 0x%X\n",
781 m, 0, 0, 0, 0);
782
783 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
784
785 object = VM_PAGE_OBJECT(m);
786
787 assert(m->busy);
788 assert(object->internal);
789
790 /*
791 * Verify that we really want to clean this page
792 */
793 assert(!m->absent);
794 assert(!m->error);
795 assert(m->dirty);
796
797 /*
798 * Create a paging reference to let us play with the object.
799 */
800 paging_offset = m->offset + object->paging_offset;
801
802 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
803 panic("reservation without pageout?"); /* alan */
804
805 VM_PAGE_FREE(m);
806 vm_object_unlock(object);
807
808 return;
809 }
810
811 /*
812 * If there's no pager, then we can't clean the page. This should
813 * never happen since this should be a copy object and therefore not
814 * an external object, so the pager should always be there.
815 */
816
817 pager = object->pager;
818
819 if (pager == MEMORY_OBJECT_NULL) {
820 panic("missing pager for copy object");
821
822 VM_PAGE_FREE(m);
823 return;
824 }
825
826 /*
827 * set the page for future call to vm_fault_list_request
828 */
829 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
830 SET_PAGE_DIRTY(m, FALSE);
831
832 /*
833 * keep the object from collapsing or terminating
834 */
835 vm_object_paging_begin(object);
836 vm_object_unlock(object);
837
838 /*
839 * Write the data to its pager.
840 * Note that the data is passed by naming the new object,
841 * not a virtual address; the pager interface has been
842 * manipulated to use the "internal memory" data type.
843 * [The object reference from its allocation is donated
844 * to the eventual recipient.]
845 */
846 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
847
848 vm_object_lock(object);
849 vm_object_paging_end(object);
850 }
851
852 #if MACH_CLUSTER_STATS
853 #define MAXCLUSTERPAGES 16
854 struct {
855 unsigned long pages_in_cluster;
856 unsigned long pages_at_higher_offsets;
857 unsigned long pages_at_lower_offsets;
858 } cluster_stats[MAXCLUSTERPAGES];
859 #endif /* MACH_CLUSTER_STATS */
860
861
862 /*
863 * vm_pageout_cluster:
864 *
865 * Given a page, queue it to the appropriate I/O thread,
866 * which will page it out and attempt to clean adjacent pages
867 * in the same operation.
868 *
869 * The object and queues must be locked. We will take a
870 * paging reference to prevent deallocation or collapse when we
871 * release the object lock back at the call site. The I/O thread
872 * is responsible for consuming this reference.
873 *
874 * The page must not be on any pageout queue.
875 */
876
877 int
878 vm_pageout_cluster(vm_page_t m, boolean_t immediate_ok, boolean_t keep_object_locked)
879 {
880 vm_object_t object = VM_PAGE_OBJECT(m);
881 struct vm_pageout_queue *q;
882
883
884 XPR(XPR_VM_PAGEOUT,
885 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
886 object, m->offset, m, 0, 0);
887
888 VM_PAGE_CHECK(m);
889 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
890 vm_object_lock_assert_exclusive(object);
891
892 /*
893 * Only a certain kind of page is appreciated here.
894 */
895 assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
896 assert(!m->cleaning && !m->laundry);
897 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
898
899 /*
900 * protect the object from collapse or termination
901 */
902 vm_object_activity_begin(object);
903
904 if (object->internal == TRUE) {
905 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
906
907 m->busy = TRUE;
908
909 if (vm_compressor_immediate_preferred == TRUE && immediate_ok == TRUE) {
910 panic("immediate compressor mode no longer supported\n");
911
912 if (keep_object_locked == FALSE)
913 vm_object_unlock(object);
914 vm_page_unlock_queues();
915
916 vm_pageout_immediate(m, keep_object_locked);
917
918 return (1);
919 }
920 q = &vm_pageout_queue_internal;
921 } else
922 q = &vm_pageout_queue_external;
923
924 /*
925 * pgo_laundry count is tied to the laundry bit
926 */
927 m->laundry = TRUE;
928 q->pgo_laundry++;
929
930 m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q;
931 vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
932
933 if (q->pgo_idle == TRUE) {
934 q->pgo_idle = FALSE;
935 thread_wakeup((event_t) &q->pgo_pending);
936 }
937 VM_PAGE_CHECK(m);
938
939 return (0);
940 }
941
942
943 unsigned long vm_pageout_throttle_up_count = 0;
944
945 /*
946 * A page is back from laundry or we are stealing it back from
947 * the laundering state. See if there are some pages waiting to
948 * go to laundry and if we can let some of them go now.
949 *
950 * Object and page queues must be locked.
951 */
952 void
953 vm_pageout_throttle_up(
954 vm_page_t m)
955 {
956 struct vm_pageout_queue *q;
957 vm_object_t m_object;
958
959 m_object = VM_PAGE_OBJECT(m);
960
961 assert(m_object != VM_OBJECT_NULL);
962 assert(m_object != kernel_object);
963
964 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
965 vm_object_lock_assert_exclusive(m_object);
966
967 vm_pageout_throttle_up_count++;
968
969 if (m_object->internal == TRUE)
970 q = &vm_pageout_queue_internal;
971 else
972 q = &vm_pageout_queue_external;
973
974 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
975
976 vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
977 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
978
979 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
980
981 vm_object_activity_end(m_object);
982 }
983 if (m->laundry == TRUE) {
984
985 m->laundry = FALSE;
986 q->pgo_laundry--;
987
988 if (q->pgo_throttled == TRUE) {
989 q->pgo_throttled = FALSE;
990 thread_wakeup((event_t) &q->pgo_laundry);
991 }
992 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
993 q->pgo_draining = FALSE;
994 thread_wakeup((event_t) (&q->pgo_laundry+1));
995 }
996 }
997 }
998
999
1000 static void
1001 vm_pageout_throttle_up_batch(
1002 struct vm_pageout_queue *q,
1003 int batch_cnt)
1004 {
1005 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1006
1007 vm_pageout_throttle_up_count += batch_cnt;
1008
1009 q->pgo_laundry -= batch_cnt;
1010
1011 if (q->pgo_throttled == TRUE) {
1012 q->pgo_throttled = FALSE;
1013 thread_wakeup((event_t) &q->pgo_laundry);
1014 }
1015 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
1016 q->pgo_draining = FALSE;
1017 thread_wakeup((event_t) (&q->pgo_laundry+1));
1018 }
1019 }
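/*
 * Illustrative sketch (not part of xnu): a deliberately simplified,
 * single-threaded model of the pgo_laundry bookkeeping shared by
 * vm_pageout_cluster() and vm_pageout_throttle_up() above.  Real wakeups go
 * through assert_wait()/thread_wakeup(); here they are reduced to booleans
 * so the counter handshake is easy to follow.  All ex_ names are
 * hypothetical.
 */
#if 0 /* example only; never built as part of the kernel */
#include <assert.h>
#include <stdbool.h>

struct ex_pageout_queue {
	unsigned int laundry;     /* pages handed to the I/O thread, not yet done */
	bool         throttled;   /* scan is waiting for the laundry to drain a bit */
	bool         draining;    /* someone is waiting for the laundry to hit zero */
};

/* models the tail of vm_pageout_cluster(): queue a page for pageout */
static void
ex_cluster(struct ex_pageout_queue *q)
{
	q->laundry++;                      /* pgo_laundry is tied to m->laundry */
}

/* models vm_pageout_throttle_up(): a page came back from the laundry */
static void
ex_throttle_up(struct ex_pageout_queue *q)
{
	assert(q->laundry > 0);
	q->laundry--;
	if (q->throttled) {
		q->throttled = false;      /* would thread_wakeup(&q->pgo_laundry) */
	}
	if (q->draining && q->laundry == 0) {
		q->draining = false;       /* would thread_wakeup(&q->pgo_laundry + 1) */
	}
}
#endif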
1020
1021
1022
1023 /*
1024 * VM memory pressure monitoring.
1025 *
1026 * vm_pageout_scan() keeps track of the number of pages it considers and
1027 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
1028 *
1029 * compute_memory_pressure() is called every second from compute_averages()
1030 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
1031 * of reclaimed pages in a new vm_pageout_stat[] bucket.
1032 *
1033 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
1034 * The caller provides the number of seconds ("nsecs") worth of statistics
1035 * it wants, up to 30 seconds.
1036 * It computes the number of pages reclaimed in the past "nsecs" seconds and
1037 * also returns the number of pages the system still needs to reclaim at this
1038 * moment in time.
1039 */
1040 #define VM_PAGEOUT_STAT_SIZE 31
1041 struct vm_pageout_stat {
1042 unsigned int considered;
1043 unsigned int reclaimed;
1044 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
1045 unsigned int vm_pageout_stat_now = 0;
1046 unsigned int vm_memory_pressure = 0;
1047
1048 #define VM_PAGEOUT_STAT_BEFORE(i) \
1049 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
1050 #define VM_PAGEOUT_STAT_AFTER(i) \
1051 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
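/*
 * Illustrative sketch (not part of xnu): the two macros above implement the
 * index arithmetic for a 31-slot ring buffer.  The standalone helper below
 * walks backwards from the current slot and sums "reclaimed" over the last
 * nsecs one-second buckets, which is the same traversal that
 * mach_vm_pressure_monitor() performs further down in this file.  Names
 * prefixed ex_ are hypothetical.
 */
#if 0 /* example only; never built as part of the kernel */
#define EX_STAT_SIZE 31
#define EX_STAT_BEFORE(i) (((i) == 0) ? EX_STAT_SIZE - 1 : (i) - 1)

struct ex_stat {
	unsigned int considered;
	unsigned int reclaimed;
};

static unsigned int
ex_reclaimed_in_last(const struct ex_stat stats[EX_STAT_SIZE],
    unsigned int now, unsigned int nsecs)
{
	unsigned int total = 0;
	unsigned int then;

	for (then = EX_STAT_BEFORE(now);
	    then != now && nsecs-- != 0;
	    then = EX_STAT_BEFORE(then)) {
		total += stats[then].reclaimed;
	}
	return total;
}
#endif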
1052
1053 #if VM_PAGE_BUCKETS_CHECK
1054 int vm_page_buckets_check_interval = 10; /* in seconds */
1055 #endif /* VM_PAGE_BUCKETS_CHECK */
1056
1057 /*
1058 * Called from compute_averages().
1059 */
1060 void
1061 compute_memory_pressure(
1062 __unused void *arg)
1063 {
1064 unsigned int vm_pageout_next;
1065
1066 #if VM_PAGE_BUCKETS_CHECK
1067 /* check the consistency of VM page buckets at regular interval */
1068 static int counter = 0;
1069 if ((++counter % vm_page_buckets_check_interval) == 0) {
1070 vm_page_buckets_check();
1071 }
1072 #endif /* VM_PAGE_BUCKETS_CHECK */
1073
1074 vm_memory_pressure =
1075 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
1076
1077 commpage_set_memory_pressure( vm_memory_pressure );
1078
1079 /* move "now" forward */
1080 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
1081 vm_pageout_stats[vm_pageout_next].considered = 0;
1082 vm_pageout_stats[vm_pageout_next].reclaimed = 0;
1083 vm_pageout_stat_now = vm_pageout_next;
1084 }
1085
1086
1087 /*
1088 * IMPORTANT
1089 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1090 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1091 * it must be safe in the restricted stackshot context. Locks and/or
1092 * blocking are not allowable.
1093 */
1094 unsigned int
1095 mach_vm_ctl_page_free_wanted(void)
1096 {
1097 unsigned int page_free_target, page_free_count, page_free_wanted;
1098
1099 page_free_target = vm_page_free_target;
1100 page_free_count = vm_page_free_count;
1101 if (page_free_target > page_free_count) {
1102 page_free_wanted = page_free_target - page_free_count;
1103 } else {
1104 page_free_wanted = 0;
1105 }
1106
1107 return page_free_wanted;
1108 }
1109
1110
1111 /*
1112 * IMPORTANT:
1113 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1114 * wait_for_pressure FALSE, so that code path must remain safe in the
1115 * restricted stackshot context. No blocking or locks are allowable
1116 * on that code path.
1117 */
1118
1119 kern_return_t
1120 mach_vm_pressure_monitor(
1121 boolean_t wait_for_pressure,
1122 unsigned int nsecs_monitored,
1123 unsigned int *pages_reclaimed_p,
1124 unsigned int *pages_wanted_p)
1125 {
1126 wait_result_t wr;
1127 unsigned int vm_pageout_then, vm_pageout_now;
1128 unsigned int pages_reclaimed;
1129
1130 /*
1131 * We don't take the vm_page_queue_lock here because we don't want
1132 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1133 * thread when it's trying to reclaim memory. We don't need fully
1134 * accurate monitoring anyway...
1135 */
1136
1137 if (wait_for_pressure) {
1138 /* wait until there's memory pressure */
1139 while (vm_page_free_count >= vm_page_free_target) {
1140 wr = assert_wait((event_t) &vm_page_free_wanted,
1141 THREAD_INTERRUPTIBLE);
1142 if (wr == THREAD_WAITING) {
1143 wr = thread_block(THREAD_CONTINUE_NULL);
1144 }
1145 if (wr == THREAD_INTERRUPTED) {
1146 return KERN_ABORTED;
1147 }
1148 if (wr == THREAD_AWAKENED) {
1149 /*
1150 * The memory pressure might have already
1151 * been relieved but let's not block again
1152 * and let's report that there was memory
1153 * pressure at some point.
1154 */
1155 break;
1156 }
1157 }
1158 }
1159
1160 /* provide the number of pages the system wants to reclaim */
1161 if (pages_wanted_p != NULL) {
1162 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1163 }
1164
1165 if (pages_reclaimed_p == NULL) {
1166 return KERN_SUCCESS;
1167 }
1168
1169 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1170 vm_pageout_now = vm_pageout_stat_now;
1171 pages_reclaimed = 0;
1172 for (vm_pageout_then =
1173 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1174 vm_pageout_then != vm_pageout_now &&
1175 nsecs_monitored-- != 0;
1176 vm_pageout_then =
1177 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1178 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1179 }
1180 *pages_reclaimed_p = pages_reclaimed;
1181
1182 return KERN_SUCCESS;
1183 }
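/*
 * Illustrative usage sketch (hypothetical caller, not part of xnu): ask for
 * the pages reclaimed over the last 10 one-second buckets without blocking,
 * plus the current shortfall.  The signature matches the definition above.
 */
#if 0 /* example only; never built as part of the kernel */
static void
ex_report_pressure(void)
{
	unsigned int reclaimed = 0;
	unsigned int wanted = 0;
	kern_return_t kr;

	kr = mach_vm_pressure_monitor(FALSE,    /* don't block waiting for pressure */
	    10,                                 /* last 10 one-second buckets */
	    &reclaimed, &wanted);
	if (kr == KERN_SUCCESS) {
		/*
		 * reclaimed: pages freed recently;
		 * wanted: vm_page_free_target minus vm_page_free_count (0 if not short)
		 */
	}
}
#endif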
1184
1185
1186
1187 #if DEVELOPMENT || DEBUG
1188
1189 static void
1190 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1191
1192 /*
1193 * condition variable used to make sure there is
1194 * only a single sweep going on at a time
1195 */
1196 boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1197
1198
1199 void
1200 vm_pageout_disconnect_all_pages()
1201 {
1202 vm_page_lock_queues();
1203
1204 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1205 vm_page_unlock_queues();
1206 return;
1207 }
1208 vm_pageout_disconnect_all_pages_active = TRUE;
1209 vm_page_unlock_queues();
1210
1211 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1212 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1213 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1214
1215 vm_pageout_disconnect_all_pages_active = FALSE;
1216 }
1217
1218
1219 void
1220 vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1221 {
1222 vm_page_t m;
1223 vm_object_t t_object = NULL;
1224 vm_object_t l_object = NULL;
1225 vm_object_t m_object = NULL;
1226 int delayed_unlock = 0;
1227 int try_failed_count = 0;
1228 int disconnected_count = 0;
1229 int paused_count = 0;
1230 int object_locked_count = 0;
1231
1232 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1233 q, qcount, 0, 0, 0);
1234
1235 vm_page_lock_queues();
1236
1237 while (qcount && !vm_page_queue_empty(q)) {
1238
1239 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1240
1241 m = (vm_page_t) vm_page_queue_first(q);
1242 m_object = VM_PAGE_OBJECT(m);
1243
1244 /*
1245 * check to see if we currently are working
1246 * with the same object... if so, we've
1247 * already got the lock
1248 */
1249 if (m_object != l_object) {
1250 /*
1251 * the object associated with candidate page is
1252 * different from the one we were just working
1253 * with... dump the lock if we still own it
1254 */
1255 if (l_object != NULL) {
1256 vm_object_unlock(l_object);
1257 l_object = NULL;
1258 }
1259 if (m_object != t_object)
1260 try_failed_count = 0;
1261
1262 /*
1263 * Try to lock object; since we've already got the
1264 * page queues lock, we can only 'try' for this one.
1265 * if the 'try' fails, we need to do a mutex_pause
1266 * to allow the owner of the object lock a chance to
1267 * run...
1268 */
1269 if ( !vm_object_lock_try_scan(m_object)) {
1270
1271 if (try_failed_count > 20) {
1272 goto reenter_pg_on_q;
1273 }
1274 vm_page_unlock_queues();
1275 mutex_pause(try_failed_count++);
1276 vm_page_lock_queues();
1277 delayed_unlock = 0;
1278
1279 paused_count++;
1280
1281 t_object = m_object;
1282 continue;
1283 }
1284 object_locked_count++;
1285
1286 l_object = m_object;
1287 }
1288 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1289 /*
1290 * put it back on the head of its queue
1291 */
1292 goto reenter_pg_on_q;
1293 }
1294 if (m->pmapped == TRUE) {
1295
1296 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1297
1298 disconnected_count++;
1299 }
1300 reenter_pg_on_q:
1301 vm_page_queue_remove(q, m, vm_page_t, pageq);
1302 vm_page_queue_enter(q, m, vm_page_t, pageq);
1303
1304 qcount--;
1305 try_failed_count = 0;
1306
1307 if (delayed_unlock++ > 128) {
1308
1309 if (l_object != NULL) {
1310 vm_object_unlock(l_object);
1311 l_object = NULL;
1312 }
1313 lck_mtx_yield(&vm_page_queue_lock);
1314 delayed_unlock = 0;
1315 }
1316 }
1317 if (l_object != NULL) {
1318 vm_object_unlock(l_object);
1319 l_object = NULL;
1320 }
1321 vm_page_unlock_queues();
1322
1323 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1324 q, disconnected_count, object_locked_count, paused_count, 0);
1325 }
1326
1327 #endif
1328
1329
1330 static void
1331 vm_pageout_page_queue(vm_page_queue_head_t *, int);
1332
1333 /*
1334 * condition variable used to make sure there is
1335 * only a single sweep going on at a time
1336 */
1337 boolean_t vm_pageout_anonymous_pages_active = FALSE;
1338
1339
1340 void
1341 vm_pageout_anonymous_pages()
1342 {
1343 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1344
1345 vm_page_lock_queues();
1346
1347 if (vm_pageout_anonymous_pages_active == TRUE) {
1348 vm_page_unlock_queues();
1349 return;
1350 }
1351 vm_pageout_anonymous_pages_active = TRUE;
1352 vm_page_unlock_queues();
1353
1354 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1355 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1356 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1357
1358 if (VM_CONFIG_SWAP_IS_PRESENT)
1359 vm_consider_swapping();
1360
1361 vm_page_lock_queues();
1362 vm_pageout_anonymous_pages_active = FALSE;
1363 vm_page_unlock_queues();
1364 }
1365 }
1366
1367
1368 void
1369 vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
1370 {
1371 vm_page_t m;
1372 vm_object_t t_object = NULL;
1373 vm_object_t l_object = NULL;
1374 vm_object_t m_object = NULL;
1375 int delayed_unlock = 0;
1376 int try_failed_count = 0;
1377 int refmod_state;
1378 int pmap_options;
1379 struct vm_pageout_queue *iq;
1380 ppnum_t phys_page;
1381
1382
1383 iq = &vm_pageout_queue_internal;
1384
1385 vm_page_lock_queues();
1386
1387 while (qcount && !vm_page_queue_empty(q)) {
1388
1389 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1390
1391 if (VM_PAGE_Q_THROTTLED(iq)) {
1392
1393 if (l_object != NULL) {
1394 vm_object_unlock(l_object);
1395 l_object = NULL;
1396 }
1397 iq->pgo_draining = TRUE;
1398
1399 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1400 vm_page_unlock_queues();
1401
1402 thread_block(THREAD_CONTINUE_NULL);
1403
1404 vm_page_lock_queues();
1405 delayed_unlock = 0;
1406 continue;
1407 }
1408 m = (vm_page_t) vm_page_queue_first(q);
1409 m_object = VM_PAGE_OBJECT(m);
1410
1411 /*
1412 * check to see if we currently are working
1413 * with the same object... if so, we've
1414 * already got the lock
1415 */
1416 if (m_object != l_object) {
1417 if ( !m_object->internal)
1418 goto reenter_pg_on_q;
1419
1420 /*
1421 * the object associated with candidate page is
1422 * different from the one we were just working
1423 * with... dump the lock if we still own it
1424 */
1425 if (l_object != NULL) {
1426 vm_object_unlock(l_object);
1427 l_object = NULL;
1428 }
1429 if (m_object != t_object)
1430 try_failed_count = 0;
1431
1432 /*
1433 * Try to lock object; since we've already got the
1434 * page queues lock, we can only 'try' for this one.
1435 * if the 'try' fails, we need to do a mutex_pause
1436 * to allow the owner of the object lock a chance to
1437 * run...
1438 */
1439 if ( !vm_object_lock_try_scan(m_object)) {
1440
1441 if (try_failed_count > 20) {
1442 goto reenter_pg_on_q;
1443 }
1444 vm_page_unlock_queues();
1445 mutex_pause(try_failed_count++);
1446 vm_page_lock_queues();
1447 delayed_unlock = 0;
1448
1449 t_object = m_object;
1450 continue;
1451 }
1452 l_object = m_object;
1453 }
1454 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1455 /*
1456 * page is not to be cleaned
1457 * put it back on the head of its queue
1458 */
1459 goto reenter_pg_on_q;
1460 }
1461 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1462
1463 if (m->reference == FALSE && m->pmapped == TRUE) {
1464 refmod_state = pmap_get_refmod(phys_page);
1465
1466 if (refmod_state & VM_MEM_REFERENCED)
1467 m->reference = TRUE;
1468 if (refmod_state & VM_MEM_MODIFIED) {
1469 SET_PAGE_DIRTY(m, FALSE);
1470 }
1471 }
1472 if (m->reference == TRUE) {
1473 m->reference = FALSE;
1474 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1475 goto reenter_pg_on_q;
1476 }
1477 if (m->pmapped == TRUE) {
1478 if (m->dirty || m->precious) {
1479 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1480 } else {
1481 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1482 }
1483 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1484 if (refmod_state & VM_MEM_MODIFIED) {
1485 SET_PAGE_DIRTY(m, FALSE);
1486 }
1487 }
1488 if ( !m->dirty && !m->precious) {
1489 vm_page_unlock_queues();
1490 VM_PAGE_FREE(m);
1491 vm_page_lock_queues();
1492 delayed_unlock = 0;
1493
1494 goto next_pg;
1495 }
1496 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1497
1498 if (!m_object->pager_initialized) {
1499
1500 vm_page_unlock_queues();
1501
1502 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1503
1504 if (!m_object->pager_initialized)
1505 vm_object_compressor_pager_create(m_object);
1506
1507 vm_page_lock_queues();
1508 delayed_unlock = 0;
1509 }
1510 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL)
1511 goto reenter_pg_on_q;
1512 /*
1513 * vm_object_compressor_pager_create will drop the object lock
1514 * which means 'm' may no longer be valid to use
1515 */
1516 continue;
1517 }
1518 /*
1519 * we've already factored out pages in the laundry which
1520 * means this page can't be on the pageout queue so it's
1521 * safe to do the vm_page_queues_remove
1522 */
1523 vm_page_queues_remove(m, TRUE);
1524
1525 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1526
1527 vm_pageout_cluster(m, FALSE, FALSE);
1528
1529 goto next_pg;
1530
1531 reenter_pg_on_q:
1532 vm_page_queue_remove(q, m, vm_page_t, pageq);
1533 vm_page_queue_enter(q, m, vm_page_t, pageq);
1534 next_pg:
1535 qcount--;
1536 try_failed_count = 0;
1537
1538 if (delayed_unlock++ > 128) {
1539
1540 if (l_object != NULL) {
1541 vm_object_unlock(l_object);
1542 l_object = NULL;
1543 }
1544 lck_mtx_yield(&vm_page_queue_lock);
1545 delayed_unlock = 0;
1546 }
1547 }
1548 if (l_object != NULL) {
1549 vm_object_unlock(l_object);
1550 l_object = NULL;
1551 }
1552 vm_page_unlock_queues();
1553 }
1554
1555
1556
1557 /*
1558 * function in BSD to apply I/O throttle to the pageout thread
1559 */
1560 extern void vm_pageout_io_throttle(void);
1561
1562 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
1563 MACRO_BEGIN \
1564 /* \
1565 * If a "reusable" page somehow made it back into \
1566 * the active queue, it's been re-used and is not \
1567 * quite re-usable. \
1568 * If the VM object was "all_reusable", consider it \
1569 * as "all re-used" instead of converting it to \
1570 * "partially re-used", which could be expensive. \
1571 */ \
1572 assert(VM_PAGE_OBJECT((m)) == (obj)); \
1573 if ((m)->reusable || \
1574 (obj)->all_reusable) { \
1575 vm_object_reuse_pages((obj), \
1576 (m)->offset, \
1577 (m)->offset + PAGE_SIZE_64, \
1578 FALSE); \
1579 } \
1580 MACRO_END
1581
1582
1583 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1584 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1585
1586 #define FCS_IDLE 0
1587 #define FCS_DELAYED 1
1588 #define FCS_DEADLOCK_DETECTED 2
1589
1590 struct flow_control {
1591 int state;
1592 mach_timespec_t ts;
1593 };
1594
1595 #if CONFIG_BACKGROUND_QUEUE
1596 uint64_t vm_pageout_considered_bq_internal = 0;
1597 uint64_t vm_pageout_considered_bq_external = 0;
1598 uint64_t vm_pageout_rejected_bq_internal = 0;
1599 uint64_t vm_pageout_rejected_bq_external = 0;
1600 #endif
1601 uint32_t vm_pageout_considered_page = 0;
1602 uint32_t vm_page_filecache_min = 0;
1603
1604 #define ANONS_GRABBED_LIMIT 2
1605
1606 #if CONFIG_SECLUDED_MEMORY
1607 extern vm_page_t vm_page_grab_secluded(void);
1608 uint64_t vm_pageout_freed_from_secluded = 0;
1609 uint64_t vm_pageout_secluded_reactivated = 0; /* debugging; how many secluded pages are found to be referenced on pageout (and are therefore reactivated) */
1610 uint64_t vm_pageout_secluded_burst_count = 0;
1611 #endif /* CONFIG_SECLUDED_MEMORY */
1612
1613 /*
1614 * vm_pageout_scan does the dirty work for the pageout daemon.
1615 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
1616 * held and vm_page_free_wanted == 0.
1617 */
1618 void
1619 vm_pageout_scan(void)
1620 {
1621 unsigned int loop_count = 0;
1622 unsigned int inactive_burst_count = 0;
1623 unsigned int active_burst_count = 0;
1624 unsigned int reactivated_this_call;
1625 unsigned int reactivate_limit;
1626 vm_page_t local_freeq = NULL;
1627 int local_freed = 0;
1628 int delayed_unlock;
1629 int delayed_unlock_limit = 0;
1630 int refmod_state = 0;
1631 int vm_pageout_deadlock_target = 0;
1632 struct vm_pageout_queue *iq;
1633 struct vm_pageout_queue *eq;
1634 struct vm_speculative_age_q *sq;
1635 struct flow_control flow_control = { 0, { 0, 0 } };
1636 boolean_t inactive_throttled = FALSE;
1637 boolean_t try_failed;
1638 mach_timespec_t ts;
1639 unsigned int msecs = 0;
1640 vm_object_t object;
1641 vm_object_t last_object_tried;
1642 uint32_t catch_up_count = 0;
1643 uint32_t inactive_reclaim_run;
1644 boolean_t exceeded_burst_throttle;
1645 boolean_t grab_anonymous = FALSE;
1646 boolean_t force_anonymous = FALSE;
1647 int anons_grabbed = 0;
1648 int page_prev_q_state = 0;
1649 boolean_t requeue_insert_first = FALSE;
1650 #if CONFIG_BACKGROUND_QUEUE
1651 boolean_t ignore_reference = FALSE;
1652 #endif
1653 #if CONFIG_SECLUDED_MEMORY
1654 boolean_t ignore_reference_secluded;
1655 #endif /* CONFIG_SECLUDED_MEMORY */
1656 int cache_evict_throttle = 0;
1657 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
1658 int force_purge = 0;
1659 #define DELAY_SPECULATIVE_AGE 1000
1660 int delay_speculative_age = 0;
1661 vm_object_t m_object = VM_OBJECT_NULL;
1662
1663 #if VM_PRESSURE_EVENTS
1664 vm_pressure_level_t pressure_level;
1665 #endif /* VM_PRESSURE_EVENTS */
1666
1667 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
1668 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1669 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
1670
1671 flow_control.state = FCS_IDLE;
1672 iq = &vm_pageout_queue_internal;
1673 eq = &vm_pageout_queue_external;
1674 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1675
1676
1677 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1678
1679
1680 vm_page_lock_queues();
1681 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
1682
1683 /*
1684 * Calculate the max number of referenced pages on the inactive
1685 * queue that we will reactivate.
1686 */
1687 reactivated_this_call = 0;
1688 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1689 vm_page_inactive_count);
1690 inactive_reclaim_run = 0;
1691
1692 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
1693
1694 /*
1695 * We want to gradually dribble pages from the active queue
1696 * to the inactive queue. If we let the inactive queue get
1697 * very small, and then suddenly dump many pages into it,
1698 * those pages won't get a sufficient chance to be referenced
1699 * before we start taking them from the inactive queue.
1700 *
1701 * We must limit the rate at which we send pages to the pagers
1702 * so that we don't tie up too many pages in the I/O queues.
1703 * We implement a throttling mechanism using the laundry count
1704 * to limit the number of pages outstanding to the default
1705 * and external pagers. We can bypass the throttles and look
1706 * for clean pages if the pageout queues don't drain in a timely
1707 * fashion since this may indicate that the pageout paths are
1708 * stalled waiting for memory, which only we can provide.
1709 */
1710
1711
1712 Restart:
1713
1714
1715 assert(delayed_unlock!=0);
1716
1717 /*
1718 * Recalculate vm_page_inactivate_target.
1719 */
1720 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1721 vm_page_inactive_count +
1722 vm_page_speculative_count);
1723
1724 vm_page_anonymous_min = vm_page_inactive_target / 20;
1725
1726
1727 /*
1728 * don't want to wake the pageout_scan thread up every time we fall below
1729 * the targets... set a low water mark at 0.25% below the target
1730 */
1731 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1732
1733 if (vm_page_speculative_percentage > 50)
1734 vm_page_speculative_percentage = 50;
1735 else if (vm_page_speculative_percentage <= 0)
1736 vm_page_speculative_percentage = 1;
1737
1738 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1739 vm_page_inactive_count);
1740
1741 object = NULL;
1742 last_object_tried = NULL;
1743 try_failed = FALSE;
1744
1745 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1746 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1747 else
1748 catch_up_count = 0;
1749
1750 for (;;) {
1751 vm_page_t m;
1752
1753 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1754
1755 #if CONFIG_SECLUDED_MEMORY
1756 if (vm_page_secluded_count > vm_page_secluded_target &&
1757 object != NULL) {
1758 vm_object_unlock(object);
1759 object = NULL;
1760 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1761 }
1762
1763 /*
1764 * Deal with secluded_q overflow.
1765 */
1766 if (vm_page_secluded_count > vm_page_secluded_target &&
1767 secluded_aging_policy == SECLUDED_AGING_FIFO) {
1768 unsigned int secluded_overflow;
1769 vm_page_t secluded_page;
1770
1771 /*
1772 * SECLUDED_AGING_FIFO:
1773 * No aging, just reclaim the excess pages
1774 * at the tail of the secluded queue.
1775 * We're reclaiming pages and we're not hogging
1776 * any global lock, so no need for throttling.
1777 */
1778
1779 secluded_overflow = (vm_page_secluded_count -
1780 vm_page_secluded_target);
1781 /* transfer to free queue */
1782 vm_page_unlock_queues();
1783 while (secluded_overflow--) {
1784 secluded_page = vm_page_grab_secluded();
1785 if (secluded_page == VM_PAGE_NULL) {
1786 break;
1787 }
1788 assert(secluded_page->busy);
1789 assert(secluded_page->pageq.next == 0 &&
1790 secluded_page->pageq.prev == 0);
1791
1792 secluded_page->snext = local_freeq;
1793 local_freeq = secluded_page;
1794 local_freed++;
1795 secluded_page = VM_PAGE_NULL;
1796 }
1797 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1798 secluded_aging_policy == SECLUDED_AGING_ALONG_ACTIVE) {
1799 unsigned int secluded_overflow;
1800 vm_page_t secluded_page;
1801
1802 /*
1803 * SECLUDED_AGING_ALONG_ACTIVE:
1804 * There might be free pages at the tail of the
1805 * secluded queue:
1806 * just move them to the free queue (in batches).
1807 * There can also be an excessive number of "inuse"
1808 * pages:
1809 * we age them by resetting their "referenced" bit and
1810 * moving them to the inactive queue. Their trip
1811 * through the secluded queue was equivalent to a trip
1812 * through the active queue.
1813 *
1814 * We're holding the page queue lock, so we need
1815 * to throttle and give someone else a chance to
1816 * grab that lock if needed.
1817 *
1818 * We're also limiting the number of secluded "inuse"
1819 * pages that get moved to the inactive queue, using
1820 * the same "active_burst_count" method we use when
1821 * balancing the active and inactive queues, because
1822 * there can be a large number
1823 * of extra "inuse" pages and handling them gets in the
1824 * way of actually reclaiming memory.
1825 */
1826
1827 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1828 vm_page_secluded_count_inuse);
1829 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT;
1830 delayed_unlock = 1;
1831 secluded_overflow = (vm_page_secluded_count -
1832 vm_page_secluded_target);
1833 while (secluded_overflow-- > 0 &&
1834 vm_page_secluded_count > vm_page_secluded_target) {
1835 assert((vm_page_secluded_count_free +
1836 vm_page_secluded_count_inuse) ==
1837 vm_page_secluded_count);
1838 vm_page_queue_remove_first(&vm_page_queue_secluded,
1839 secluded_page,
1840 vm_page_t,
1841 pageq);
1842 assert(secluded_page->vm_page_q_state ==
1843 VM_PAGE_ON_SECLUDED_Q);
1844 VM_PAGE_ZERO_PAGEQ_ENTRY(secluded_page);
1845 secluded_page->vm_page_q_state = VM_PAGE_NOT_ON_Q;
1846 vm_page_secluded_count--;
1847 assert(!secluded_page->fictitious);
1848 assert(!VM_PAGE_WIRED(secluded_page));
1849 if (secluded_page->vm_page_object == 0) {
1850 /* transfer to free queue */
1851 assert(secluded_page->busy);
1852 vm_page_secluded_count_free--;
1853 secluded_page->snext = local_freeq;
1854 local_freeq = secluded_page;
1855 local_freed++;
1856 } else {
1857 vm_page_secluded_count_inuse--;
1858 /* transfer to head of inactive queue */
1859 pmap_clear_refmod_options(
1860 VM_PAGE_GET_PHYS_PAGE(secluded_page),
1861 VM_MEM_REFERENCED,
1862 PMAP_OPTIONS_NOFLUSH,
1863 (void *)NULL);
1864 vm_page_enqueue_inactive(secluded_page,
1865 FALSE);
1866 if (active_burst_count-- == 0) {
1867 vm_pageout_secluded_burst_count++;
1868 break;
1869 }
1870 }
1871 secluded_page = VM_PAGE_NULL;
1872 if (delayed_unlock++ > delayed_unlock_limit) {
1873 if (local_freeq) {
1874 vm_page_unlock_queues();
1875 VM_DEBUG_EVENT(
1876 vm_pageout_freelist,
1877 VM_PAGEOUT_FREELIST,
1878 DBG_FUNC_START,
1879 vm_page_free_count,
1880 local_freed,
1881 delayed_unlock_limit,
1882 1);
1883 vm_page_free_list(local_freeq,
1884 TRUE);
1885 VM_DEBUG_EVENT(
1886 vm_pageout_freelist,
1887 VM_PAGEOUT_FREELIST,
1888 DBG_FUNC_END,
1889 vm_page_free_count,
1890 0, 0, 1);
1891 local_freeq = NULL;
1892 local_freed = 0;
1893 vm_page_lock_queues();
1894 } else {
1895 lck_mtx_yield(&vm_page_queue_lock);
1896 }
1897 delayed_unlock = 1;
1898 }
1899 }
1900 delayed_unlock = 1;
1901 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1902 secluded_aging_policy == SECLUDED_AGING_AFTER_INACTIVE) {
1903 /*
1904 * SECLUDED_AGING_AFTER_INACTIVE:
1905 * No balancing needed at this point: when we get to
1906 * the "choose a victim" part below, we'll consider the
1907 * extra secluded pages before any inactive page.
1908 */
1909 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1910 secluded_aging_policy == SECLUDED_AGING_BEFORE_ACTIVE) {
1911 unsigned int secluded_overflow;
1912 vm_page_t secluded_page;
1913
1914 /*
1915 * SECLUDED_AGING_BEFORE_ACTIVE:
1916 * Excess secluded pages go to the active queue and
1917 * will later go to the inactive queue.
1918 */
1919 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1920 vm_page_secluded_count_inuse);
1921 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT;
1922 delayed_unlock = 1;
1923 secluded_overflow = (vm_page_secluded_count -
1924 vm_page_secluded_target);
1925 while (secluded_overflow-- > 0 &&
1926 vm_page_secluded_count > vm_page_secluded_target) {
1927 assert((vm_page_secluded_count_free +
1928 vm_page_secluded_count_inuse) ==
1929 vm_page_secluded_count);
1930 vm_page_queue_remove_first(&vm_page_queue_secluded,
1931 secluded_page,
1932 vm_page_t,
1933 pageq);
1934 assert(secluded_page->vm_page_q_state ==
1935 VM_PAGE_ON_SECLUDED_Q);
1936 VM_PAGE_ZERO_PAGEQ_ENTRY(secluded_page);
1937 secluded_page->vm_page_q_state = VM_PAGE_NOT_ON_Q;
1938 vm_page_secluded_count--;
1939 assert(!secluded_page->fictitious);
1940 assert(!VM_PAGE_WIRED(secluded_page));
1941 if (secluded_page->vm_page_object == 0) {
1942 /* transfer to free queue */
1943 assert(secluded_page->busy);
1944 vm_page_secluded_count_free--;
1945 secluded_page->snext = local_freeq;
1946 local_freeq = secluded_page;
1947 local_freed++;
1948 } else {
1949 vm_page_secluded_count_inuse--;
1950 /* transfer to head of active queue */
1951 vm_page_enqueue_active(secluded_page,
1952 FALSE);
1953 if (active_burst_count-- == 0) {
1954 vm_pageout_secluded_burst_count++;
1955 break;
1956 }
1957 }
1958 secluded_page = VM_PAGE_NULL;
1959 if (delayed_unlock++ > delayed_unlock_limit) {
1960 if (local_freeq) {
1961 vm_page_unlock_queues();
1962 VM_DEBUG_EVENT(
1963 vm_pageout_freelist,
1964 VM_PAGEOUT_FREELIST,
1965 DBG_FUNC_START,
1966 vm_page_free_count,
1967 local_freed,
1968 delayed_unlock_limit,
1969 1);
1970 vm_page_free_list(local_freeq,
1971 TRUE);
1972 VM_DEBUG_EVENT(
1973 vm_pageout_freelist,
1974 VM_PAGEOUT_FREELIST,
1975 DBG_FUNC_END,
1976 vm_page_free_count,
1977 0, 0, 1);
1978 local_freeq = NULL;
1979 local_freed = 0;
1980 vm_page_lock_queues();
1981 } else {
1982 lck_mtx_yield(&vm_page_queue_lock);
1983 }
1984 delayed_unlock = 1;
1985 }
1986 }
1987 delayed_unlock = 1;
1988 } else if (vm_page_secluded_count > vm_page_secluded_target) {
1989 panic("unsupported secluded_aging_policy %d\n",
1990 secluded_aging_policy);
1991 }
1992 if (local_freeq) {
1993 vm_page_unlock_queues();
1994 VM_DEBUG_EVENT(vm_pageout_freelist,
1995 VM_PAGEOUT_FREELIST,
1996 DBG_FUNC_START,
1997 vm_page_free_count,
1998 local_freed,
1999 0,
2000 0);
2001 vm_page_free_list(local_freeq, TRUE);
2002 VM_DEBUG_EVENT(vm_pageout_freelist,
2003 VM_PAGEOUT_FREELIST,
2004 DBG_FUNC_END,
2005 vm_page_free_count, 0, 0, 0);
2006 local_freeq = NULL;
2007 local_freed = 0;
2008 vm_page_lock_queues();
2009 }
2010 #endif /* CONFIG_SECLUDED_MEMORY */
2011
2012 assert(delayed_unlock);
2013
2014 if (vm_upl_wait_for_pages < 0)
2015 vm_upl_wait_for_pages = 0;
2016
2017 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
2018
2019 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX)
2020 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
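/*
 * Condensed sketch of the batching pattern this limit drives (debug
 * events omitted): pages bound for the free list are chained on
 * local_freeq and only flushed once delayed_unlock exceeds the limit,
 * so the page-queues lock is dropped periodically instead of per page.
 *
 *	if (delayed_unlock++ > delayed_unlock_limit) {
 *		if (local_freeq) {
 *			vm_page_unlock_queues();
 *			vm_page_free_list(local_freeq, TRUE);
 *			local_freeq = NULL;
 *			local_freed = 0;
 *			vm_page_lock_queues();
 *		} else {
 *			lck_mtx_yield(&vm_page_queue_lock);
 *		}
 *		delayed_unlock = 1;
 *	}
 */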
2021
2022 /*
2023 * Move pages from active to inactive if we're below the target
2024 */
2025 /* if we are trying to make clean, we need to make sure we actually have inactive - mj */
2026 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
2027 goto done_moving_active_pages;
2028
2029 if (object != NULL) {
2030 vm_object_unlock(object);
2031 object = NULL;
2032 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2033 }
2034 /*
2035 * Don't sweep through active queue more than the throttle
2036 * which should be kept relatively low
2037 */
2038 active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
2039
2040 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
2041 vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
2042
2043 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
2044 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2045 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2046 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START);
2047
2048
2049 while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) {
2050
2051 vm_pageout_active++;
2052
2053 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2054
2055 assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
2056 assert(!m->laundry);
2057 assert(VM_PAGE_OBJECT(m) != kernel_object);
2058 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2059
2060 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2061
2062 /*
2063 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2064 *
2065 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2066 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2067 * new reference happens. If no further references happen on the page after that remote TLB flushes,
2068 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2069 * by pageout_scan, which is just fine since the last reference would have happened quite far
2070 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2071 * have happened before we moved the page
2072 */
2073 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2074
2075 /*
2076 * The page might be absent or busy,
2077 * but vm_page_deactivate can handle that.
2078 * FALSE indicates that we don't want a H/W clear reference
2079 */
2080 vm_page_deactivate_internal(m, FALSE);
2081
2082 if (delayed_unlock++ > delayed_unlock_limit) {
2083
2084 if (local_freeq) {
2085 vm_page_unlock_queues();
2086
2087 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2088 vm_page_free_count, local_freed, delayed_unlock_limit, 1);
2089
2090 vm_page_free_list(local_freeq, TRUE);
2091
2092 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2093 vm_page_free_count, 0, 0, 1);
2094
2095 local_freeq = NULL;
2096 local_freed = 0;
2097 vm_page_lock_queues();
2098 } else {
2099 lck_mtx_yield(&vm_page_queue_lock);
2100 }
2101
2102 delayed_unlock = 1;
2103
2104 /*
2105 * continue the while loop processing
2106 * the active queue... need to hold
2107 * the page queues lock
2108 */
2109 }
2110 }
2111
2112 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
2113 vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
2114 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END);
2115
2116 /**********************************************************************
2117 * above this point we're playing with the active queue
2118 * below this point we're playing with the throttling mechanisms
2119 * and the inactive queue
2120 **********************************************************************/
2121
2122 done_moving_active_pages:
2123
2124 #if CONFIG_BACKGROUND_QUEUE
2125 if ((vm_page_free_count + local_freed >= vm_page_free_target) &&
2126 ((vm_page_background_mode < VM_PAGE_BG_LEVEL_2) || (vm_page_background_count <= vm_page_background_target)))
2127 #else
2128 if (vm_page_free_count + local_freed >= vm_page_free_target)
2129 #endif
2130 {
2131 if (object != NULL) {
2132 vm_object_unlock(object);
2133 object = NULL;
2134 }
2135 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2136
2137 vm_page_unlock_queues();
2138
2139 if (local_freeq) {
2140
2141 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2142 vm_page_free_count, local_freed, delayed_unlock_limit, 2);
2143
2144 vm_page_free_list(local_freeq, TRUE);
2145
2146 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2147 vm_page_free_count, local_freed, 0, 2);
2148
2149 local_freeq = NULL;
2150 local_freed = 0;
2151 }
2152 vm_consider_waking_compactor_swapper();
2153
2154 vm_page_lock_queues();
2155
2156 /*
2157 * make sure the pageout I/O threads are running
2158 * throttled in case there are still requests
2159 * in the laundry... since we have met our targets
2160 * we don't need the laundry to be cleaned in a timely
2161 * fashion... so let's avoid interfering with foreground
2162 * activity
2163 */
2164 vm_pageout_adjust_io_throttles(iq, eq, TRUE);
2165
2166 /*
2167 * recalculate vm_page_inactivate_target
2168 */
2169 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2170 vm_page_inactive_count +
2171 vm_page_speculative_count);
2172 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
2173 !vm_page_queue_empty(&vm_page_queue_active)) {
2174 /*
2175 * inactive target still not met... keep going
2176 * until we get the queues balanced...
2177 */
2178 continue;
2179 }
2180 lck_mtx_lock(&vm_page_queue_free_lock);
2181
2182 if ((vm_page_free_count >= vm_page_free_target) &&
2183 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2184 /*
2185 * done - we have met our target *and*
2186 * there is no one waiting for a page.
2187 */
2188 return_from_scan:
2189 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2190
2191 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
2192 vm_pageout_inactive, vm_pageout_inactive_used, 0, 0);
2193 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
2194 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2195 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2196
2197 return;
2198 }
2199 lck_mtx_unlock(&vm_page_queue_free_lock);
2200 }
2201
2202 /*
2203 * Before anything, we check if we have any ripe volatile
2204 * objects around. If so, try to purge the first object.
2205 * If the purge fails, fall through to reclaim a page instead.
2206 * If the purge succeeds, go back to the top and reevaluate
2207 * the new memory situation.
2208 */
2209
2210 assert(available_for_purge >= 0);
2211 force_purge = 0; /* no force-purging */
2212
2213 #if VM_PRESSURE_EVENTS
2214 pressure_level = memorystatus_vm_pressure_level;
2215
2216 if (pressure_level > kVMPressureNormal) {
2217
2218 if (pressure_level >= kVMPressureCritical) {
2219 force_purge = memorystatus_purge_on_critical;
2220 } else if (pressure_level >= kVMPressureUrgent) {
2221 force_purge = memorystatus_purge_on_urgent;
2222 } else if (pressure_level >= kVMPressureWarning) {
2223 force_purge = memorystatus_purge_on_warning;
2224 }
2225 }
2226 #endif /* VM_PRESSURE_EVENTS */
2227
2228 if (available_for_purge || force_purge) {
2229
2230 if (object != NULL) {
2231 vm_object_unlock(object);
2232 object = NULL;
2233 }
2234
2235 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2236
2237 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2238 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
2239 vm_pageout_purged_objects++;
2240 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
2241 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2242 continue;
2243 }
2244 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2245 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2246 }
2247
2248 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
2249 /*
2250 * try to pull pages from the aging bins...
2251 * see vm_page.h for an explanation of how
2252 * this mechanism works
2253 */
2254 struct vm_speculative_age_q *aq;
2255 boolean_t can_steal = FALSE;
2256 int num_scanned_queues;
2257
2258 aq = &vm_page_queue_speculative[speculative_steal_index];
2259
2260 num_scanned_queues = 0;
2261 while (vm_page_queue_empty(&aq->age_q) &&
2262 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2263
2264 speculative_steal_index++;
2265
2266 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
2267 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2268
2269 aq = &vm_page_queue_speculative[speculative_steal_index];
2270 }
2271
2272 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2273 /*
2274 * XXX We've scanned all the speculative
2275 * queues but still haven't found one
2276 * that is not empty, even though
2277 * vm_page_speculative_count is not 0.
2278 *
2279 * report the anomaly...
2280 */
2281 printf("vm_pageout_scan: "
2282 "all speculative queues empty "
2283 "but count=%d. Re-adjusting.\n",
2284 vm_page_speculative_count);
2285 if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
2286 vm_page_speculative_count_drift_max = vm_page_speculative_count;
2287 vm_page_speculative_count_drifts++;
2288 #if DEVELOPMENT || DEBUG
2289 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2290 #endif /* DEVELOPMENT || DEBUG */
2291 /* readjust... */
2292 vm_page_speculative_count = 0;
2293 /* ... and continue */
2294 continue;
2295 }
2296
2297 if (vm_page_speculative_count > vm_page_speculative_target)
2298 can_steal = TRUE;
2299 else {
2300 if (!delay_speculative_age) {
2301 mach_timespec_t ts_fully_aged;
2302
2303 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
2304 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
2305 * 1000 * NSEC_PER_USEC;
2306
2307 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2308
2309 clock_sec_t sec;
2310 clock_nsec_t nsec;
2311 clock_get_system_nanotime(&sec, &nsec);
2312 ts.tv_sec = (unsigned int) sec;
2313 ts.tv_nsec = nsec;
2314
2315 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
2316 can_steal = TRUE;
2317 else
2318 delay_speculative_age++;
2319 } else {
2320 delay_speculative_age++;
2321 if (delay_speculative_age == DELAY_SPECULATIVE_AGE)
2322 delay_speculative_age = 0;
2323 }
2324 }
2325 if (can_steal == TRUE)
2326 vm_page_speculate_ageit(aq);
2327 }
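/*
 * Worked example for the ts_fully_aged math above, using purely
 * illustrative values (VM_PAGE_MAX_SPECULATIVE_AGE_Q == 10,
 * vm_page_speculative_q_age_ms == 333):
 *
 *	total_ms = 10 * 333 = 3330
 *	tv_sec   = 3330 / 1000 = 3
 *	tv_nsec  = (3330 % 1000) * 1000 * NSEC_PER_USEC = 330000000
 *
 * so a speculative bin becomes stealable roughly 3.33 seconds after
 * its age_ts timestamp.
 */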
2328 #if CONFIG_BACKGROUND_QUEUE
2329 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 &&
2330 ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target)))
2331 #else
2332 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0)
2333 #endif
2334 {
2335 int pages_evicted;
2336
2337 if (object != NULL) {
2338 vm_object_unlock(object);
2339 object = NULL;
2340 }
2341 pages_evicted = vm_object_cache_evict(100, 10);
2342
2343 if (pages_evicted) {
2344
2345 vm_pageout_cache_evicted += pages_evicted;
2346
2347 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2348 vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
2349 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2350
2351 /*
2352 * we just freed up to 100 pages,
2353 * so go back to the top of the main loop
2354 * and re-evaluate the memory situation
2355 */
2356 continue;
2357 } else
2358 cache_evict_throttle = 100;
2359 }
2360 if (cache_evict_throttle)
2361 cache_evict_throttle--;
2362
2363 #if CONFIG_JETSAM
2364 /*
2365 * don't let the filecache_min fall below 15% of available memory
2366 * on systems with an active compressor that isn't nearing its
2367 * limits w/r to accepting new data
2368 *
2369 * on systems w/o the compressor/swapper, the filecache is always
2370 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2371 * since most (if not all) of the anonymous pages are in the
2372 * throttled queue (which isn't counted as available) which
2373 * effectively disables this filter
2374 */
2375 if (vm_compressor_low_on_space())
2376 vm_page_filecache_min = 0;
2377 else
2378 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 7);
2379 #else
2380 /*
2381 * don't let the filecache_min fall below 33% of available memory...
2382 */
2383 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
2384 #endif
2385 if (vm_page_free_count < (vm_page_free_reserved / 4))
2386 vm_page_filecache_min = 0;
2387
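/*
 * Illustrative arithmetic (hypothetical page counts): with 700000
 * pages of AVAILABLE_NON_COMPRESSED_MEMORY the jetsam build keeps
 * vm_page_filecache_min at 700000 / 7 = 100000 pages (roughly the 15%
 * cited above), while the non-jetsam build keeps 700000 / 3 = 233333
 * pages (~33%), unless free memory has dropped below a quarter of the
 * reserve, in which case the floor is removed entirely.
 */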
2388 exceeded_burst_throttle = FALSE;
2389 /*
2390 * Sometimes we have to pause:
2391 * 1) No inactive pages - nothing to do.
2392 * 2) Loop control - no acceptable pages found on the inactive queue
2393 * within the last vm_pageout_burst_inactive_throttle iterations
2394 * 3) Flow control - default pageout queue is full
2395 */
2396 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2397 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2398 vm_page_queue_empty(&sq->age_q)) {
2399 vm_pageout_scan_empty_throttle++;
2400 msecs = vm_pageout_empty_wait;
2401 goto vm_pageout_scan_delay;
2402
2403 } else if (inactive_burst_count >=
2404 MIN(vm_pageout_burst_inactive_throttle,
2405 (vm_page_inactive_count +
2406 vm_page_speculative_count))) {
2407 vm_pageout_scan_burst_throttle++;
2408 msecs = vm_pageout_burst_wait;
2409
2410 exceeded_burst_throttle = TRUE;
2411 goto vm_pageout_scan_delay;
2412
2413 } else if (vm_page_free_count > (vm_page_free_reserved / 4) &&
2414 VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
2415 vm_pageout_scan_swap_throttle++;
2416 msecs = vm_pageout_swap_wait;
2417 goto vm_pageout_scan_delay;
2418
2419 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2420 VM_DYNAMIC_PAGING_ENABLED()) {
2421 clock_sec_t sec;
2422 clock_nsec_t nsec;
2423
2424 switch (flow_control.state) {
2425
2426 case FCS_IDLE:
2427 if ((vm_page_free_count + local_freed) < vm_page_free_target) {
2428
2429 if (object != NULL) {
2430 vm_object_unlock(object);
2431 object = NULL;
2432 }
2433 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2434
2435 vm_page_unlock_queues();
2436
2437 if (local_freeq) {
2438
2439 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2440 vm_page_free_count, local_freed, delayed_unlock_limit, 3);
2441
2442 vm_page_free_list(local_freeq, TRUE);
2443
2444 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2445 vm_page_free_count, local_freed, 0, 3);
2446
2447 local_freeq = NULL;
2448 local_freed = 0;
2449 }
2450 thread_yield_internal(1);
2451
2452 vm_page_lock_queues();
2453
2454 if (!VM_PAGE_Q_THROTTLED(iq)) {
2455 vm_pageout_scan_yield_unthrottled++;
2456 continue;
2457 }
2458 if (vm_page_pageable_external_count > vm_page_filecache_min &&
2459 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2460 anons_grabbed = ANONS_GRABBED_LIMIT;
2461 vm_pageout_scan_throttle_deferred++;
2462 goto consider_inactive;
2463 }
2464 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count)
2465 continue;
2466 }
2467 reset_deadlock_timer:
2468 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
2469 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2470 clock_get_system_nanotime(&sec, &nsec);
2471 flow_control.ts.tv_sec = (unsigned int) sec;
2472 flow_control.ts.tv_nsec = nsec;
2473 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
2474
2475 flow_control.state = FCS_DELAYED;
2476 msecs = vm_pageout_deadlock_wait;
2477
2478 break;
2479
2480 case FCS_DELAYED:
2481 clock_get_system_nanotime(&sec, &nsec);
2482 ts.tv_sec = (unsigned int) sec;
2483 ts.tv_nsec = nsec;
2484
2485 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
2486 /*
2487 * the pageout thread for the default pager is potentially
2488 * deadlocked since the
2489 * default pager queue has been throttled for more than the
2490 * allowable time... we need to move some clean pages or dirty
2491 * pages belonging to the external pagers if they aren't throttled
2492 * vm_page_free_wanted represents the number of threads currently
2493 * blocked waiting for pages... we'll move one page for each of
2494 * these plus a fixed amount to break the logjam... once we're done
2495 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2496 * with a new timeout target since we have no way of knowing
2497 * whether we've broken the deadlock except through observation
2498 * of the queue associated with the default pager... we need to
2499 * stop moving pages and allow the system to run to see what
2500 * state it settles into.
2501 */
2502 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
2503 vm_pageout_scan_deadlock_detected++;
2504 flow_control.state = FCS_DEADLOCK_DETECTED;
2505 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2506 goto consider_inactive;
2507 }
2508 /*
2509 * just resniff instead of trying
2510 * to compute a new delay time... we're going to be
2511 * awakened immediately upon a laundry completion,
2512 * so we won't wait any longer than necessary
2513 */
2514 msecs = vm_pageout_idle_wait;
2515 break;
2516
2517 case FCS_DEADLOCK_DETECTED:
2518 if (vm_pageout_deadlock_target)
2519 goto consider_inactive;
2520 goto reset_deadlock_timer;
2521
2522 }
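/*
 * Summary of the flow-control state machine driven by the switch
 * above (states are the real FCS_* values, the notes are editorial):
 *
 *	FCS_IDLE              queue just became throttled: arm a deadlock
 *	                      timer of vm_pageout_deadlock_wait ms and
 *	                      move to FCS_DELAYED
 *	FCS_DELAYED           timer expired with the queue still
 *	                      throttled: assume a deadlock, set a relief
 *	                      target and move to FCS_DEADLOCK_DETECTED;
 *	                      otherwise just resniff after
 *	                      vm_pageout_idle_wait ms
 *	FCS_DEADLOCK_DETECTED keep stealing pages until the relief target
 *	                      is met, then re-arm the timer
 */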
2523 vm_pageout_scan_delay:
2524 if (object != NULL) {
2525 vm_object_unlock(object);
2526 object = NULL;
2527 }
2528 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2529
2530 vm_page_unlock_queues();
2531
2532 if (local_freeq) {
2533
2534 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2535 vm_page_free_count, local_freed, delayed_unlock_limit, 3);
2536
2537 vm_page_free_list(local_freeq, TRUE);
2538
2539 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2540 vm_page_free_count, local_freed, 0, 3);
2541
2542 local_freeq = NULL;
2543 local_freed = 0;
2544 }
2545 vm_consider_waking_compactor_swapper();
2546
2547 vm_page_lock_queues();
2548
2549 if (flow_control.state == FCS_DELAYED &&
2550 !VM_PAGE_Q_THROTTLED(iq)) {
2551 flow_control.state = FCS_IDLE;
2552 goto consider_inactive;
2553 }
2554
2555 if (vm_page_free_count >= vm_page_free_target) {
2556 /*
2557 * we're here because
2558 * 1) someone else freed up some pages while we had
2559 * the queues unlocked above
2560 * and we've hit one of the 3 conditions that
2561 * cause us to pause the pageout scan thread
2562 *
2563 * since we already have enough free pages,
2564 * let's avoid stalling and return normally
2565 *
2566 * before we return, make sure the pageout I/O threads
2567 * are running throttled in case there are still requests
2568 * in the laundry... since we have enough free pages
2569 * we don't need the laundry to be cleaned in a timely
2570 * fashion... so let's avoid interfering with foreground
2571 * activity
2572 *
2573 * we don't want to hold vm_page_queue_free_lock when
2574 * calling vm_pageout_adjust_io_throttles (since it
2575 * may cause other locks to be taken), we do the initial
2576 * check outside of the lock. Once we take the lock,
2577 * we recheck the condition since it may have changed.
2578 * if it has, no problem, we will make the threads
2579 * non-throttled before actually blocking
2580 */
2581 vm_pageout_adjust_io_throttles(iq, eq, TRUE);
2582 }
2583 lck_mtx_lock(&vm_page_queue_free_lock);
2584
2585 if (vm_page_free_count >= vm_page_free_target &&
2586 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2587 goto return_from_scan;
2588 }
2589 lck_mtx_unlock(&vm_page_queue_free_lock);
2590
2591 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2592 /*
2593 * we're most likely about to block due to one of
2594 * the 3 conditions that cause vm_pageout_scan to
2595 * not be able to make forward progress w/r
2596 * to providing new pages to the free queue,
2597 * so unthrottle the I/O threads in case we
2598 * have laundry to be cleaned... it needs
2599 * to be completed ASAP.
2600 *
2601 * even if we don't block, we want the io threads
2602 * running unthrottled since the sum of free +
2603 * clean pages is still under our free target
2604 */
2605 vm_pageout_adjust_io_throttles(iq, eq, FALSE);
2606 }
2607 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2608 /*
2609 * if we get here we're below our free target and
2610 * we're stalling due to a full laundry queue or
2611 * we don't have any inactive pages other than
2612 * those in the clean queue...
2613 * however, we have pages on the clean queue that
2614 * can be moved to the free queue, so let's not
2615 * stall the pageout scan
2616 */
2617 flow_control.state = FCS_IDLE;
2618 goto consider_inactive;
2619 }
2620 VM_CHECK_MEMORYSTATUS;
2621
2622 if (flow_control.state != FCS_IDLE)
2623 vm_pageout_scan_throttle++;
2624 iq->pgo_throttled = TRUE;
2625
2626 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
2627 counter(c_vm_pageout_scan_block++);
2628
2629 vm_page_unlock_queues();
2630
2631 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2632
2633 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2634 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2635 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2636
2637 thread_block(THREAD_CONTINUE_NULL);
2638
2639 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2640 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2641 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2642
2643 vm_page_lock_queues();
2644 delayed_unlock = 1;
2645
2646 iq->pgo_throttled = FALSE;
2647
2648 if (loop_count >= vm_page_inactive_count)
2649 loop_count = 0;
2650 inactive_burst_count = 0;
2651
2652 goto Restart;
2653 /*NOTREACHED*/
2654 }
2655
2656
2657 flow_control.state = FCS_IDLE;
2658 consider_inactive:
2659 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
2660 vm_pageout_inactive_external_forced_reactivate_limit);
2661 loop_count++;
2662 inactive_burst_count++;
2663 vm_pageout_inactive++;
2664
2665
2666 /*
2667 * Choose a victim.
2668 */
2669 while (1) {
2670 uint32_t inactive_external_count;
2671
2672 #if CONFIG_BACKGROUND_QUEUE
2673 ignore_reference = FALSE;
2674 #endif /* CONFIG_BACKGROUND_QUEUE */
2675
2676 m = NULL;
2677 m_object = VM_OBJECT_NULL;
2678
2679 if (VM_DYNAMIC_PAGING_ENABLED()) {
2680 assert(vm_page_throttled_count == 0);
2681 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2682 }
2683
2684
2685 #if CONFIG_SECLUDED_MEMORY
2686 if ((secluded_aging_policy ==
2687 SECLUDED_AGING_AFTER_INACTIVE) &&
2688 vm_page_secluded_count > vm_page_secluded_target) {
2689 /*
2690 * SECLUDED_AGING_AFTER_INACTIVE:
2691 * Secluded pages have already been aged
2692 * through the active and inactive queues, and
2693 * we now have too many of them, so let's
2694 * balance that queue by considering reclaiming
2695 * the oldest page in the secluded queue.
2696 */
2697 assert(!vm_page_queue_empty(&vm_page_queue_secluded));
2698 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_secluded);
2699 if (m->vm_page_object == 0) {
2700 /*
2701 * It's already a free page:
2702 * just move it to a free queue.
2703 */
2704 vm_page_queues_remove(m, TRUE);
2705 assert(m->busy);
2706 assert(m->pageq.next == 0);
2707 assert(m->pageq.prev == 0);
2708 m->snext = local_freeq;
2709 local_freeq = m;
2710 local_freed++;
2711 goto done_with_inactivepage;
2712 }
2713 /*
2714 * Not a free page: we've found our next
2715 * "victim".
2716 */
2717 break;
2718 }
2719 #endif /* CONFIG_SECLUDED_MEMORY */
2720
2721 #if CONFIG_BACKGROUND_QUEUE
2722 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2723 vm_object_t bg_m_object = NULL;
2724
2725 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2726
2727 bg_m_object = VM_PAGE_OBJECT(m);
2728
2729 if (!VM_PAGE_PAGEABLE(m)) {
2730 /*
2731 * This page is on the background queue
2732 * but not on a pageable queue. This is
2733 * likely a transient state and whoever
2734 * took it out of its pageable queue
2735 * will likely put it back on a pageable
2736 * queue soon but we can't deal with it
2737 * at this point, so let's ignore this
2738 * page.
2739 */
2740 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2741 ignore_reference = TRUE;
2742
2743 if (bg_m_object->internal)
2744 vm_pageout_considered_bq_internal++;
2745 else
2746 vm_pageout_considered_bq_external++;
2747
2748 assert(VM_PAGE_PAGEABLE(m));
2749 break;
2750 }
2751 }
2752 #endif
2753
2754 /*
2755 * The most eligible pages are ones we paged in speculatively,
2756 * but which have not yet been touched.
2757 */
2758 if (!vm_page_queue_empty(&sq->age_q) && force_anonymous == FALSE) {
2759 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2760
2761 assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2762
2763 break;
2764 }
2765 /*
2766 * Try a clean-queue inactive page.
2767 */
2768 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2769 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2770
2771 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2772
2773 break;
2774 }
2775
2776 grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2777 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2778
2779 if ((vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) ||
2780 ((inactive_external_count < vm_page_anonymous_count) && (inactive_external_count < (vm_page_pageable_external_count / 3)))) {
2781 grab_anonymous = TRUE;
2782 anons_grabbed = 0;
2783 }
2784 #if CONFIG_JETSAM
2785 /* If the file-backed pool has accumulated
2786 * significantly more pages than the jetsam
2787 * threshold, prefer to reclaim those
2788 * inline to minimise compute overhead of reclaiming
2789 * anonymous pages.
2790 * This calculation does not account for the CPU local
2791 * external page queues, as those are expected to be
2792 * much smaller relative to the global pools.
2793 */
2794 if (grab_anonymous) {
2795 if (vm_page_pageable_external_count >
2796 vm_page_filecache_min) {
2797 if ((vm_page_pageable_external_count *
2798 vm_pageout_memorystatus_fb_factor_dr) >
2799 (memorystatus_available_pages_critical *
2800 vm_pageout_memorystatus_fb_factor_nr)) {
2801 grab_anonymous = FALSE;
2802 #if DEVELOPMENT || DEBUG
2803 vm_grab_anon_overrides++;
2804 #endif
2805 }
2806 }
2807 #if DEVELOPMENT || DEBUG
2808 if (grab_anonymous) {
2809 vm_grab_anon_nops++;
2810
2811 }
2812 #endif
2813 }
2814 #endif /* CONFIG_JETSAM */
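/*
 * Worked example of the override above, with purely hypothetical
 * numbers and assuming the file cache is already above
 * vm_page_filecache_min: if vm_page_pageable_external_count == 120000,
 * memorystatus_available_pages_critical == 5000 and both fb_factors
 * are 1, then 120000 * 1 > 5000 * 1, so grab_anonymous is forced back
 * to FALSE and the file cache is trimmed before any anonymous page is
 * compressed.
 */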
2815
2816 if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2817
2818 if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) {
2819 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2820
2821 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2822 anons_grabbed = 0;
2823
2824 if (vm_page_pageable_external_count < vm_page_filecache_min) {
2825 if ((++reactivated_this_call % 100))
2826 goto must_activate_page;
2827 /*
2828 * steal 1% of the file backed pages even if
2829 * we are under the limit that has been set
2830 * for a healthy filecache
2831 */
2832 }
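/*
 * Note on the modulo test above: (++reactivated_this_call % 100) is
 * non-zero for 99 of every 100 candidates, which are sent to
 * must_activate_page; only the 100th falls through to the break below
 * and is stolen, hence the "1%" figure in the comment.
 */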
2833 break;
2834 }
2835 }
2836 if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) {
2837 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2838
2839 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2840 anons_grabbed++;
2841
2842 break;
2843 }
2844
2845 /*
2846 * if we've gotten here, we have no victim page.
2847 * if making clean, free the local freed list and return.
2848 * if making free, check to see if we've finished balancing the queues
2849 * yet; if we haven't, just continue, else panic
2850 */
2851 vm_page_unlock_queues();
2852
2853 if (object != NULL) {
2854 vm_object_unlock(object);
2855 object = NULL;
2856 }
2857 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2858
2859 if (local_freeq) {
2860 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2861 vm_page_free_count, local_freed, delayed_unlock_limit, 5);
2862
2863 vm_page_free_list(local_freeq, TRUE);
2864
2865 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2866 vm_page_free_count, local_freed, 0, 5);
2867
2868 local_freeq = NULL;
2869 local_freed = 0;
2870 }
2871 vm_page_lock_queues();
2872 delayed_unlock = 1;
2873
2874 force_anonymous = FALSE;
2875
2876 if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
2877 goto Restart;
2878
2879 if (!vm_page_queue_empty(&sq->age_q))
2880 goto Restart;
2881
2882 panic("vm_pageout: no victim");
2883
2884 /* NOTREACHED */
2885 }
2886 m_object = VM_PAGE_OBJECT(m);
2887 force_anonymous = FALSE;
2888
2889 page_prev_q_state = m->vm_page_q_state;
2890 requeue_insert_first = FALSE;
2891 /*
2892 * we just found this page on one of our queues...
2893 * it can't also be on the pageout queue, so safe
2894 * to call vm_page_queues_remove
2895 */
2896 vm_page_queues_remove(m, TRUE);
2897
2898 assert(!m->laundry);
2899 assert(!m->private);
2900 assert(!m->fictitious);
2901 assert(m_object != kernel_object);
2902 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2903
2904
2905 if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q &&
2906 page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)
2907 vm_pageout_stats[vm_pageout_stat_now].considered++;
2908
2909 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2910
2911 /*
2912 * check to see if we currently are working
2913 * with the same object... if so, we've
2914 * already got the lock
2915 */
2916 if (m_object != object) {
2917 /*
2918 * the object associated with candidate page is
2919 * different from the one we were just working
2920 * with... dump the lock if we still own it
2921 */
2922 if (object != NULL) {
2923 vm_object_unlock(object);
2924 object = NULL;
2925 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2926 }
2927 /*
2928 * Try to lock object; since we've already got the
2929 * page queues lock, we can only 'try' for this one.
2930 * if the 'try' fails, we need to do a mutex_pause
2931 * to allow the owner of the object lock a chance to
2932 * run... otherwise, we're likely to trip over this
2933 * object in the same state as we work our way through
2934 * the queue... clumps of pages associated with the same
2935 * object are fairly typical on the inactive and active queues
2936 */
2937 if (!vm_object_lock_try_scan(m_object)) {
2938 vm_page_t m_want = NULL;
2939
2940 vm_pageout_inactive_nolock++;
2941
2942 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
2943 vm_pageout_cleaned_nolock++;
2944
2945 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
2946 requeue_insert_first = TRUE;
2947
2948 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2949 m->reference = FALSE;
2950
2951 /*
2952 * m->object must be stable since we hold the page queues lock...
2953 * we can update the scan_collisions field sans the object lock
2954 * since it is a separate field and this is the only spot that does
2955 * a read-modify-write operation and it is never executed concurrently...
2956 * we can asynchronously set this field to 0 when creating a UPL, so it
2957 * is possible for the value to be a bit non-deterministic, but that's ok
2958 * since it's only used as a hint
2959 */
2960 m_object->scan_collisions = 1;
2961
2962 if ( !vm_page_queue_empty(&sq->age_q) )
2963 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2964 else if ( !vm_page_queue_empty(&vm_page_queue_cleaned))
2965 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2966 else if ( !vm_page_queue_empty(&vm_page_queue_inactive) &&
2967 (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)))
2968 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2969 else if ( !vm_page_queue_empty(&vm_page_queue_anonymous))
2970 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2971
2972 /*
2973 * this is the next object we're going to be interested in
2974 * try to make sure its available after the mutex_yield
2975 * returns control
2976 */
2977 if (m_want)
2978 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2979
2980 /*
2981 * force us to dump any collected free pages
2982 * and to pause before moving on
2983 */
2984 try_failed = TRUE;
2985
2986 goto requeue_page;
2987 }
2988 object = m_object;
2989 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2990
2991 try_failed = FALSE;
2992 }
2993 assert(m_object == object);
2994 assert(VM_PAGE_OBJECT(m) == m_object);
2995
2996 if (catch_up_count)
2997 catch_up_count--;
2998
2999 if (m->busy) {
3000 if (m->encrypted_cleaning) {
3001 /*
3002 * ENCRYPTED SWAP:
3003 * if this page has already been picked up as
3004 * part of a page-out cluster, it will be busy
3005 * because it is being encrypted (see
3006 * vm_object_upl_request()). But we still
3007 * want to demote it from "clean-in-place"
3008 * (aka "adjacent") to "clean-and-free" (aka
3009 * "target"), so let's ignore its "busy" bit
3010 * here and proceed to check for "cleaning" a
3011 * little bit below...
3012 *
3013 * CAUTION CAUTION:
3014 * A "busy" page should still be left alone for
3015 * most purposes, so we have to be very careful
3016 * not to process that page too much.
3017 */
3018 assert(m->cleaning);
3019 goto consider_inactive_page;
3020 }
3021
3022 /*
3023 * Somebody is already playing with this page.
3024 * Put it back on the appropriate queue
3025 *
3026 */
3027 vm_pageout_inactive_busy++;
3028
3029 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3030 vm_pageout_cleaned_busy++;
3031
3032 requeue_page:
3033 if (requeue_insert_first)
3034 vm_page_enqueue_inactive(m, TRUE);
3035 else
3036 vm_page_enqueue_inactive(m, FALSE);
3037 #if CONFIG_BACKGROUND_QUEUE
3038 if (ignore_reference == TRUE) {
3039 if (m_object->internal)
3040 vm_pageout_rejected_bq_internal++;
3041 else
3042 vm_pageout_rejected_bq_external++;
3043 }
3044 #endif
3045 goto done_with_inactivepage;
3046 }
3047
3048
3049 /*
3050 * If it's absent, in error or the object is no longer alive,
3051 * we can reclaim the page... in the no longer alive case,
3052 * there are 2 states the page can be in that preclude us
3053 * from reclaiming it - busy or cleaning - that we've already
3054 * dealt with
3055 */
3056 if (m->absent || m->error || !object->alive) {
3057
3058 if (m->absent)
3059 vm_pageout_inactive_absent++;
3060 else if (!object->alive)
3061 vm_pageout_inactive_notalive++;
3062 else
3063 vm_pageout_inactive_error++;
3064 reclaim_page:
3065 if (vm_pageout_deadlock_target) {
3066 vm_pageout_scan_inactive_throttle_success++;
3067 vm_pageout_deadlock_target--;
3068 }
3069
3070 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3071
3072 if (object->internal) {
3073 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3074 } else {
3075 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3076 }
3077 assert(!m->cleaning);
3078 assert(!m->laundry);
3079
3080 m->busy = TRUE;
3081
3082 /*
3083 * remove page from object here since we're already
3084 * behind the object lock... defer the rest of the work
3085 * we'd normally do in vm_page_free_prepare_object
3086 * until 'vm_page_free_list' is called
3087 */
3088 if (m->tabled)
3089 vm_page_remove(m, TRUE);
3090
3091 assert(m->pageq.next == 0 && m->pageq.prev == 0);
3092 m->snext = local_freeq;
3093 local_freeq = m;
3094 local_freed++;
3095
3096 #if CONFIG_SECLUDED_MEMORY
3097 if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q)
3098 vm_pageout_freed_from_secluded++;
3099 #endif /* CONFIG_SECLUDED_MEMORY */
3100 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3101 vm_pageout_freed_from_speculative++;
3102 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3103 vm_pageout_freed_from_cleaned++;
3104 else
3105 vm_pageout_freed_from_inactive_clean++;
3106
3107 if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q &&
3108 page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)
3109 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
3110
3111 inactive_burst_count = 0;
3112 goto done_with_inactivepage;
3113 }
3114 /*
3115 * If the object is empty, the page must be reclaimed even
3116 * if dirty or used.
3117 * If the page belongs to a volatile object, we stick it back
3118 * on.
3119 */
3120 if (object->copy == VM_OBJECT_NULL) {
3121 if (object->purgable == VM_PURGABLE_EMPTY) {
3122 if (m->pmapped == TRUE) {
3123 /* unmap the page */
3124 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3125 if (refmod_state & VM_MEM_MODIFIED) {
3126 SET_PAGE_DIRTY(m, FALSE);
3127 }
3128 }
3129 if (m->dirty || m->precious) {
3130 /* we saved the cost of cleaning this page ! */
3131 vm_page_purged_count++;
3132 }
3133 goto reclaim_page;
3134 }
3135
3136 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3137 /*
3138 * With the VM compressor, the cost of
3139 * reclaiming a page is much lower (no I/O),
3140 * so if we find a "volatile" page, it's better
3141 * to let it get compressed rather than letting
3142 * it occupy a full page until it gets purged.
3143 * So no need to check for "volatile" here.
3144 */
3145 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3146 /*
3147 * Avoid cleaning a "volatile" page which might
3148 * be purged soon.
3149 */
3150
3151 /* if it's wired, we can't put it on our queue */
3152 assert(!VM_PAGE_WIRED(m));
3153
3154 /* just stick it back on! */
3155 reactivated_this_call++;
3156
3157 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3158 vm_pageout_cleaned_volatile_reactivated++;
3159
3160 goto reactivate_page;
3161 }
3162 }
3163
3164 consider_inactive_page:
3165 if (m->busy) {
3166 /*
3167 * CAUTION CAUTION:
3168 * A "busy" page should always be left alone, except...
3169 */
3170 if (m->cleaning && m->encrypted_cleaning) {
3171 /*
3172 * ENCRYPTED_SWAP:
3173 * We could get here with a "busy" page
3174 * if it's being encrypted during a
3175 * "clean-in-place" operation. We'll deal
3176 * with it right away by testing if it has been
3177 * referenced and either reactivating it or
3178 * promoting it from "clean-in-place" to
3179 * "clean-and-free".
3180 */
3181 } else {
3182 panic("\"busy\" page considered for pageout\n");
3183 }
3184 }
3185
3186 /*
3187 * If it's being used, reactivate.
3188 * (Fictitious pages are either busy or absent.)
3189 * First, update the reference and dirty bits
3190 * to make sure the page is unreferenced.
3191 */
3192 refmod_state = -1;
3193
3194 if (m->reference == FALSE && m->pmapped == TRUE) {
3195 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3196
3197 if (refmod_state & VM_MEM_REFERENCED)
3198 m->reference = TRUE;
3199 if (refmod_state & VM_MEM_MODIFIED) {
3200 SET_PAGE_DIRTY(m, FALSE);
3201 }
3202 }
3203
3204 /*
3205 * if (m->cleaning && !m->free_when_done)
3206 * If already cleaning this page in place and it hasn't
3207 * been recently referenced, just pull off the queue.
3208 * We can leave the page mapped, and upl_commit_range
3209 * will put it on the clean queue.
3210 *
3211 * note: if m->encrypted_cleaning == TRUE, then
3212 * m->cleaning == TRUE
3213 * and we'll handle it here
3214 *
3215 * if (m->free_when_done && !m->cleaning)
3216 * an msync INVALIDATE is in progress...
3217 * this page has been marked for destruction
3218 * after it has been cleaned,
3219 * but not yet gathered into a UPL
3220 * where 'cleaning' will be set...
3221 * just leave it off the paging queues
3222 *
3223 * if (m->free_when_done && m->cleaning)
3224 * an msync INVALIDATE is in progress
3225 * and the UPL has already gathered this page...
3226 * just leave it off the paging queues
3227 */
3228
3229 /*
3230 * page with m->free_when_done and still on the queues means that an
3231 * MS_INVALIDATE is in progress on this page... leave it alone
3232 */
3233 if (m->free_when_done) {
3234 goto done_with_inactivepage;
3235 }
3236
3237 /* if cleaning, reactivate if referenced. otherwise, just pull off queue */
3238 if (m->cleaning) {
3239 if (m->reference == TRUE) {
3240 reactivated_this_call++;
3241 goto reactivate_page;
3242 } else {
3243 goto done_with_inactivepage;
3244 }
3245 }
3246
3247 if (m->reference || m->dirty) {
3248 /* deal with a rogue "reusable" page */
3249 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3250 }
3251
3252 #if CONFIG_SECLUDED_MEMORY
3253 if (secluded_for_filecache &&
3254 vm_page_secluded_target > 0 &&
3255 m_object->eligible_for_secluded &&
3256 secluded_aging_policy == SECLUDED_AGING_FIFO) {
3257 /*
3258 * SECLUDED_AGING_FIFO:
3259 * This victim page is eligible for the secluded pool
3260 * and we're not aging secluded pages, so let's not
3261 * reactivate it if it's been re-referenced.
3262 * Later on, we'll move it to the secluded queue
3263 * instead of freeing it.
3264 */
3265 ignore_reference_secluded = TRUE;
3266 } else {
3267 ignore_reference_secluded = FALSE;
3268 }
3269 #endif /* CONFIG_SECLUDED_MEMORY */
3270
3271 if (!m->no_cache &&
3272 #if CONFIG_BACKGROUND_QUEUE
3273 ignore_reference == FALSE &&
3274 #endif
3275 #if CONFIG_SECLUDED_MEMORY
3276 ignore_reference_secluded == FALSE &&
3277 #endif /* CONFIG_SECLUDED_MEMORY */
3278 (m->reference ||
3279 (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) {
3280 /*
3281 * The page we pulled off the inactive list has
3282 * been referenced. It is possible for other
3283 * processors to be touching pages faster than we
3284 * can clear the referenced bit and traverse the
3285 * inactive queue, so we limit the number of
3286 * reactivations.
3287 */
3288 if (++reactivated_this_call >= reactivate_limit) {
3289 vm_pageout_reactivation_limit_exceeded++;
3290 } else if (catch_up_count) {
3291 vm_pageout_catch_ups++;
3292 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3293 vm_pageout_inactive_force_reclaim++;
3294 } else {
3295 uint32_t isinuse;
3296
3297 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3298 vm_pageout_cleaned_reference_reactivated++;
3299
3300 reactivate_page:
3301 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
3302 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3303 /*
3304 * no explicit mappings of this object exist
3305 * and it's not open via the filesystem
3306 */
3307 vm_page_deactivate(m);
3308 vm_pageout_inactive_deactivated++;
3309 } else {
3310 must_activate_page:
3311 /*
3312 * The page was/is being used, so put back on active list.
3313 */
3314 vm_page_activate(m);
3315 VM_STAT_INCR(reactivations);
3316 inactive_burst_count = 0;
3317 }
3318 #if CONFIG_BACKGROUND_QUEUE
3319 if (ignore_reference == TRUE) {
3320 if (m_object->internal)
3321 vm_pageout_rejected_bq_internal++;
3322 else
3323 vm_pageout_rejected_bq_external++;
3324 }
3325 #endif
3326 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3327 vm_pageout_cleaned_reactivated++;
3328 #if CONFIG_SECLUDED_MEMORY
3329 if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q)
3330 vm_pageout_secluded_reactivated++;
3331 #endif /* CONFIG_SECLUDED_MEMORY */
3332
3333 vm_pageout_inactive_used++;
3334
3335 goto done_with_inactivepage;
3336 }
3337 /*
3338 * Make sure we call pmap_get_refmod() if it
3339 * wasn't already called just above, to update
3340 * the dirty bit.
3341 */
3342 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
3343 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3344 if (refmod_state & VM_MEM_MODIFIED) {
3345 SET_PAGE_DIRTY(m, FALSE);
3346 }
3347 }
3348 }
3349
3350 XPR(XPR_VM_PAGEOUT,
3351 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
3352 object, m->offset, m, 0,0);
3353
3354 /*
3355 * we've got a candidate page to steal...
3356 *
3357 * m->dirty is up to date courtesy of the
3358 * preceding check for m->reference... if
3359 * we get here, then m->reference had to be
3360 * FALSE (or possibly "reactivate_limit" was
3361 * exceeded), but in either case we called
3362 * pmap_get_refmod() and updated both
3363 * m->reference and m->dirty
3364 *
3365 * if it's dirty or precious we need to
3366 * see if the target queue is throttled
3367 * if it is, we need to skip over it by moving it back
3368 * to the end of the inactive queue
3369 */
3370
3371 inactive_throttled = FALSE;
3372
3373 if (m->dirty || m->precious) {
3374 if (object->internal) {
3375 if (VM_PAGE_Q_THROTTLED(iq))
3376 inactive_throttled = TRUE;
3377 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3378 inactive_throttled = TRUE;
3379 }
3380 }
3381 throttle_inactive:
3382 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3383 object->internal && m->dirty &&
3384 (object->purgable == VM_PURGABLE_DENY ||
3385 object->purgable == VM_PURGABLE_NONVOLATILE ||
3386 object->purgable == VM_PURGABLE_VOLATILE)) {
3387 vm_page_check_pageable_safe(m);
3388 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3389 vm_page_queue_enter(&vm_page_queue_throttled, m,
3390 vm_page_t, pageq);
3391 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
3392 vm_page_throttled_count++;
3393
3394 vm_pageout_scan_reclaimed_throttled++;
3395
3396 inactive_burst_count = 0;
3397 goto done_with_inactivepage;
3398 }
3399 if (inactive_throttled == TRUE) {
3400
3401 if (object->internal == FALSE) {
3402 /*
3403 * we need to break up the following potential deadlock case...
3404 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
3405 * b) The thread doing the writing is waiting for pages while holding the truncate lock
3406 * c) Most of the pages in the inactive queue belong to this file.
3407 *
3408 * we are potentially in this deadlock because...
3409 * a) the external pageout queue is throttled
3410 * b) we're done with the active queue and moved on to the inactive queue
3411 * c) we've got a dirty external page
3412 *
3413 * since we don't know the reason for the external pageout queue being throttled we
3414 * must suspect that we are deadlocked, so move the current page onto the active queue
3415 * in an effort to cause a page from the active queue to 'age' to the inactive queue
3416 *
3417 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
3418 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
3419 * pool the next time we select a victim page... if we can make enough new free pages,
3420 * the deadlock will break, the external pageout queue will empty and it will no longer
3421 * be throttled
3422 *
3423 * if we have jetsam configured, keep a count of the pages reactivated this way so
3424 * that we can try to find clean pages in the active/inactive queues before
3425 * deciding to jetsam a process
3426 */
3427 vm_pageout_scan_inactive_throttled_external++;
3428
3429 vm_page_check_pageable_safe(m);
3430 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3431 vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
3432 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
3433 vm_page_active_count++;
3434 vm_page_pageable_external_count++;
3435
3436 vm_pageout_adjust_io_throttles(iq, eq, FALSE);
3437
3438 #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
3439 vm_pageout_inactive_external_forced_reactivate_limit--;
3440
3441 if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
3442 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3443 /*
3444 * Possible deadlock scenario so request jetsam action
3445 */
3446 assert(object);
3447 vm_object_unlock(object);
3448 object = VM_OBJECT_NULL;
3449 vm_page_unlock_queues();
3450
3451 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
3452 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
3453
3454 /* Kill first suitable process */
3455 if (memorystatus_kill_on_VM_page_shortage(FALSE) == FALSE) {
3456 panic("vm_pageout_scan: Jetsam request failed\n");
3457 }
3458
3459 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, 0, 0, 0, 0);
3460
3461 vm_pageout_inactive_external_forced_jetsam_count++;
3462 vm_page_lock_queues();
3463 delayed_unlock = 1;
3464 }
3465 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
3466 force_anonymous = TRUE;
3467 #endif
3468 inactive_burst_count = 0;
3469 goto done_with_inactivepage;
3470 } else {
3471 vm_pageout_scan_inactive_throttled_internal++;
3472
3473 goto must_activate_page;
3474 }
3475 }
3476
3477 /*
3478 * we've got a page that we can steal...
3479 * eliminate all mappings and make sure
3480 * we have the up-to-date modified state
3481 *
3482 * if we need to do a pmap_disconnect then we
3483 * need to re-evaluate m->dirty since the pmap_disconnect
3484 * provides the true state atomically... the
3485 * page was still mapped up to the pmap_disconnect
3486 * and may have been dirtied at the last microsecond
3487 *
3488 * Note that if 'pmapped' is FALSE then the page is not
3489 * and has not been in any map, so there is no point calling
3490 * pmap_disconnect(). m->dirty could have been set in anticipation
3491 * of likely usage of the page.
3492 */
3493 if (m->pmapped == TRUE) {
3494 int pmap_options;
3495
3496 /*
3497 * Don't count this page as going into the compressor
3498 * if any of these are true:
3499 * 1) compressed pager isn't enabled
3500 * 2) Freezer enabled device with compressed pager
3501 * backend (exclusive use) i.e. most of the VM system
3502 * (including vm_pageout_scan) has no knowledge of
3503 * the compressor
3504 * 3) This page belongs to a file and hence will not be
3505 * sent into the compressor
3506 */
3507 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3508 object->internal == FALSE) {
3509 pmap_options = 0;
3510 } else if (m->dirty || m->precious) {
3511 /*
3512 * VM knows that this page is dirty (or
3513 * precious) and needs to be compressed
3514 * rather than freed.
3515 * Tell the pmap layer to count this page
3516 * as "compressed".
3517 */
3518 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3519 } else {
3520 /*
3521 * VM does not know if the page needs to
3522 * be preserved but the pmap layer might tell
3523 * us if any mapping has "modified" it.
3524 * Let the pmap layer count this page
3525 * as compressed if and only if it has been
3526 * modified.
3527 */
3528 pmap_options =
3529 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3530 }
3531 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3532 pmap_options,
3533 NULL);
3534 if (refmod_state & VM_MEM_MODIFIED) {
3535 SET_PAGE_DIRTY(m, FALSE);
3536 }
3537 }
3538 /*
3539 * reset our count of pages that have been reclaimed
3540 * since the last page was 'stolen'
3541 */
3542 inactive_reclaim_run = 0;
3543
3544 /*
3545 * If it's clean and not precious, we can free the page.
3546 */
3547 if (!m->dirty && !m->precious) {
3548
3549 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
3550 vm_pageout_speculative_clean++;
3551 else {
3552 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
3553 vm_pageout_inactive_anonymous++;
3554 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
3555 vm_pageout_cleaned_reclaimed++;
3556
3557 vm_pageout_inactive_clean++;
3558 }
3559
3560 #if CONFIG_SECLUDED_MEMORY
3561 if (secluded_for_filecache &&
3562 vm_page_secluded_target > 0 &&
3563 !m->fictitious &&
3564 m_object->eligible_for_secluded &&
3565 num_tasks_can_use_secluded_mem == 0 &&
3566 (secluded_aging_policy == SECLUDED_AGING_FIFO ||
3567 ((secluded_aging_policy ==
3568 SECLUDED_AGING_AFTER_INACTIVE) &&
3569 (page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)))) {
3570 assert(page_prev_q_state != VM_PAGE_ON_SECLUDED_Q);
3571 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3572 LCK_MTX_ASSERT(&vm_page_queue_lock,
3573 LCK_MTX_ASSERT_OWNED);
3574 vm_page_queue_enter(&vm_page_queue_secluded,
3575 m,
3576 vm_page_t,
3577 pageq);
3578 m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q;
3579 vm_object_unlock(m_object);
3580 object = VM_OBJECT_NULL;
3581 vm_page_secluded_count++;
3582 vm_page_secluded_count_inuse++;
3583 assert(!m_object->internal);
3584 // vm_page_pageable_external_count++;
3585 m = VM_PAGE_NULL;
3586 goto done_with_inactivepage;
3587 }
3588 #endif /* CONFIG_SECLUDED_MEMORY */
3589
3590 /*
3591 * OK, at this point we have found a page we are going to free.
3592 */
3593 #if CONFIG_PHANTOM_CACHE
3594 if (!object->internal)
3595 vm_phantom_cache_add_ghost(m);
3596 #endif
3597 goto reclaim_page;
3598 }
3599
3600 /*
3601 * The page may have been dirtied since the last check
3602 * for a throttled target queue (which may have been skipped
3603 * if the page was clean then). With the dirty page
3604 * disconnected here, we can make one final check.
3605 */
3606 if (object->internal) {
3607 if (VM_PAGE_Q_THROTTLED(iq))
3608 inactive_throttled = TRUE;
3609 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3610 inactive_throttled = TRUE;
3611 }
3612
3613 if (inactive_throttled == TRUE)
3614 goto throttle_inactive;
3615
3616 #if VM_PRESSURE_EVENTS
3617 #if CONFIG_JETSAM
3618
3619 /*
3620 * If Jetsam is enabled, then the sending
3621 * of memory pressure notifications is handled
3622 * from the same thread that takes care of high-water
3623 * and other jetsams i.e. the memorystatus_thread.
3624 */
3625
3626 #else /* CONFIG_JETSAM */
3627
3628 vm_pressure_response();
3629
3630 #endif /* CONFIG_JETSAM */
3631 #endif /* VM_PRESSURE_EVENTS */
3632
3633 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
3634 vm_pageout_inactive_anonymous++;
3635 if (object->internal)
3636 vm_pageout_inactive_dirty_internal++;
3637 else
3638 vm_pageout_inactive_dirty_external++;
3639
3640 /*
3641 * do NOT set the pageout bit!
3642 * sure, we might need free pages, but this page is going to take time to become free
3643 * anyway, so we may as well put it on the clean queue first and take it from there later
3644 * if necessary. that way, we'll ensure we don't free up too much. -mj
3645 */
3646 vm_pageout_cluster(m, FALSE, FALSE);
3647
3648 done_with_inactivepage:
3649
3650 if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
3651 boolean_t need_delay = TRUE;
3652
3653 if (object != NULL) {
3654 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3655 vm_object_unlock(object);
3656 object = NULL;
3657 }
3658 vm_page_unlock_queues();
3659
3660 if (local_freeq) {
3661
3662 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
3663 vm_page_free_count, local_freed, delayed_unlock_limit, 4);
3664
3665 vm_page_free_list(local_freeq, TRUE);
3666
3667 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
3668 vm_page_free_count, local_freed, 0, 4);
3669
3670 local_freeq = NULL;
3671 local_freed = 0;
3672 need_delay = FALSE;
3673 }
3674 vm_consider_waking_compactor_swapper();
3675
3676 vm_page_lock_queues();
3677
3678 if (need_delay == TRUE)
3679 lck_mtx_yield(&vm_page_queue_lock);
3680
3681 delayed_unlock = 1;
3682 }
3683 vm_pageout_considered_page++;
3684
3685 /*
3686 * back to top of pageout scan loop
3687 */
3688 }
3689 }
3690
3691
3692 int vm_page_free_count_init;
3693
3694 void
3695 vm_page_free_reserve(
3696 int pages)
3697 {
3698 int free_after_reserve;
3699
3700 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3701
3702 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT))
3703 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3704 else
3705 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3706
3707 } else {
3708 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT)
3709 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3710 else
3711 vm_page_free_reserved += pages;
3712 }
3713 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
3714
3715 vm_page_free_min = vm_page_free_reserved +
3716 VM_PAGE_FREE_MIN(free_after_reserve);
3717
3718 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
3719 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3720
3721 vm_page_free_target = vm_page_free_reserved +
3722 VM_PAGE_FREE_TARGET(free_after_reserve);
3723
3724 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
3725 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3726
3727 if (vm_page_free_target < vm_page_free_min + 5)
3728 vm_page_free_target = vm_page_free_min + 5;
3729
3730 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3731 }
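/*
 * Illustrative numbers only (the real values come from the
 * VM_PAGE_FREE_MIN / VM_PAGE_FREE_TARGET macros and the boot-time
 * vm_page_free_count_init): if the computations above produced
 * vm_page_free_min = 1500 and vm_page_free_target = 2000, the code
 * guarantees free_target >= free_min + 5 and sets
 * vm_page_throttle_limit = 2000 - (2000 / 2) = 1000, i.e. about
 * half of the free target.
 */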
3732
3733 /*
3734 * vm_pageout is the high level pageout daemon.
3735 */
3736
3737 void
3738 vm_pageout_continue(void)
3739 {
3740 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3741 vm_pageout_scan_event_counter++;
3742
3743 lck_mtx_lock(&vm_page_queue_free_lock);
3744 vm_pageout_running = TRUE;
3745 lck_mtx_unlock(&vm_page_queue_free_lock);
3746
3747 vm_pageout_scan();
3748 /*
3749 * we hold both the vm_page_queue_free_lock
3750 * and the vm_page_queues_lock at this point
3751 */
3752 assert(vm_page_free_wanted == 0);
3753 assert(vm_page_free_wanted_privileged == 0);
3754 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3755
3756 vm_pageout_running = FALSE;
3757 if (vm_pageout_waiter) {
3758 vm_pageout_waiter = FALSE;
3759 thread_wakeup((event_t)&vm_pageout_waiter);
3760 }
3761
3762 lck_mtx_unlock(&vm_page_queue_free_lock);
3763 vm_page_unlock_queues();
3764
3765 counter(c_vm_pageout_block++);
3766 thread_block((thread_continue_t)vm_pageout_continue);
3767 /*NOTREACHED*/
3768 }
3769
3770 kern_return_t
3771 vm_pageout_wait(uint64_t deadline)
3772 {
3773 kern_return_t kr;
3774
3775 lck_mtx_lock(&vm_page_queue_free_lock);
3776 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) {
3777 vm_pageout_waiter = TRUE;
3778 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3779 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3780 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3781 kr = KERN_OPERATION_TIMED_OUT;
3782 }
3783 }
3784 lck_mtx_unlock(&vm_page_queue_free_lock);
3785
3786 return (kr);
3787 }
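#if 0
/*
 * Illustrative sketch only (not part of this file): a hypothetical
 * caller that waits up to 100ms for the pageout daemon to go idle.
 * clock_interval_to_deadline() is used here on the assumption that the
 * caller starts from a relative timeout; vm_pageout_wait() itself takes
 * an absolute deadline in mach absolute time.
 */
static kern_return_t
example_wait_for_pageout_idle(void)
{
	uint64_t deadline;

	/* convert "100 milliseconds from now" into an absolute deadline */
	clock_interval_to_deadline(100, NSEC_PER_MSEC, &deadline);

	/* returns KERN_OPERATION_TIMED_OUT if pageout is still running */
	return vm_pageout_wait(deadline);
}
#endif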
3788
3789
3790 static void
3791 vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3792 {
3793 vm_page_t m = NULL;
3794 vm_object_t object;
3795 vm_object_offset_t offset;
3796 memory_object_t pager;
3797
3798
3799 if (vm_pageout_internal_iothread != THREAD_NULL)
3800 current_thread()->options &= ~TH_OPT_VMPRIV;
3801
3802 vm_page_lockspin_queues();
3803
3804 while ( !vm_page_queue_empty(&q->pgo_pending) ) {
3805
3806 q->pgo_busy = TRUE;
3807 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3808
3809 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3810 VM_PAGE_CHECK(m);
3811 /*
3812 * grab a snapshot of the object and offset this
3813 * page is tabled in so that we can relookup this
3814 * page after we've taken the object lock - these
3815 * fields are stable while we hold the page queues lock
3816 * but as soon as we drop it, there is nothing to keep
3817 * this page in this object... we hold an activity_in_progress
3818 * on this object which will keep it from terminating
3819 */
3820 object = VM_PAGE_OBJECT(m);
3821 offset = m->offset;
3822
3823 if (object->object_slid) {
3824 panic("slid page %p not allowed on this path\n", m);
3825 }
3826 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3827 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3828
3829 vm_page_unlock_queues();
3830
3831 vm_object_lock(object);
3832
3833 m = vm_page_lookup(object, offset);
3834
3835 if (m == NULL ||
3836 m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
3837 /*
3838 * it's either the same page that someone else has
3839 * started cleaning (or it's finished cleaning or
3840 * been put back on the pageout queue), or
3841 * the page has been freed or we have found a
3842 * new page at this offset... in all of these cases
3843 * we merely need to release the activity_in_progress
3844 * we took when we put the page on the pageout queue
3845 */
3846 vm_object_activity_end(object);
3847 vm_object_unlock(object);
3848
3849 vm_page_lockspin_queues();
3850 continue;
3851 }
3852 pager = object->pager;
3853
3854 if (pager == MEMORY_OBJECT_NULL) {
3855 /*
3856 * This pager has been destroyed by either
3857 * memory_object_destroy or vm_object_destroy, and
3858 * so there is nowhere for the page to go.
3859 */
3860 if (m->free_when_done) {
3861 /*
3862 * Just free the page... VM_PAGE_FREE takes
3863 * care of cleaning up all the state...
3864 * including doing the vm_pageout_throttle_up
3865 */
3866 VM_PAGE_FREE(m);
3867 } else {
3868 vm_page_lockspin_queues();
3869
3870 vm_pageout_throttle_up(m);
3871 vm_page_activate(m);
3872
3873 vm_page_unlock_queues();
3874
3875 /*
3876 * And we are done with it.
3877 */
3878 }
3879 vm_object_activity_end(object);
3880 vm_object_unlock(object);
3881
3882 vm_page_lockspin_queues();
3883 continue;
3884 }
3885 #if 0
3886 /*
3887 * we don't hold the page queue lock
3888 * so this check isn't safe to make
3889 */
3890 VM_PAGE_CHECK(m);
3891 #endif
3892 /*
3893 * give back the activity_in_progress reference we
3894 * took when we queued up this page and replace it
3895 * with a paging_in_progress reference that will
3896 * also keep the paging offset from changing and
3897 * prevent the object from terminating
3898 */
3899 vm_object_activity_end(object);
3900 vm_object_paging_begin(object);
3901 vm_object_unlock(object);
3902
3903 /*
3904 * Send the data to the pager.
3905 * any pageout clustering happens there
3906 */
3907 memory_object_data_return(pager,
3908 m->offset + object->paging_offset,
3909 PAGE_SIZE,
3910 NULL,
3911 NULL,
3912 FALSE,
3913 FALSE,
3914 0);
3915
3916 vm_object_lock(object);
3917 vm_object_paging_end(object);
3918 vm_object_unlock(object);
3919
3920 vm_pageout_io_throttle();
3921
3922 vm_page_lockspin_queues();
3923 }
3924 q->pgo_busy = FALSE;
3925 q->pgo_idle = TRUE;
3926
3927 assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3928 vm_page_unlock_queues();
3929
3930 thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3931 /*NOTREACHED*/
3932 }
3933
3934
3935 uint32_t vm_compressor_failed;
3936
3937 #define MAX_FREE_BATCH 32
3938 uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3939 * this thread.
3940 */
3941 uint64_t vm_compressor_thread_runtime;
3942
3943 static void
3944 vm_pageout_iothread_internal_continue(struct cq *cq)
3945 {
3946 struct vm_pageout_queue *q;
3947 vm_page_t m = NULL;
3948 boolean_t pgo_draining;
3949 vm_page_t local_q;
3950 int local_cnt;
3951 vm_page_t local_freeq = NULL;
3952 int local_freed = 0;
3953 int local_batch_size;
3954
3955
3956 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3957
3958 q = cq->q;
3959 local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 2);
3960
3961 #if RECORD_THE_COMPRESSED_DATA
3962 if (q->pgo_laundry)
3963 c_compressed_record_init();
3964 #endif
3965 while (TRUE) {
3966 int pages_left_on_q = 0;
3967
3968 local_cnt = 0;
3969 local_q = NULL;
3970
3971 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3972
3973 vm_page_lock_queues();
3974
3975 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3976
3977 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
3978
3979 while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
3980
3981 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3982 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3983 VM_PAGE_CHECK(m);
3984
3985 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3986 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3987 m->laundry = FALSE;
3988
3989 m->snext = local_q;
3990 local_q = m;
3991 local_cnt++;
3992 }
3993 if (local_q == NULL)
3994 break;
3995
3996 q->pgo_busy = TRUE;
3997
3998 if ((pgo_draining = q->pgo_draining) == FALSE) {
3999 vm_pageout_throttle_up_batch(q, local_cnt);
4000 pages_left_on_q = q->pgo_laundry;
4001 } else
4002 pages_left_on_q = q->pgo_laundry - local_cnt;
4003
4004 vm_page_unlock_queues();
4005
4006 #if !RECORD_THE_COMPRESSED_DATA
4007 if (pages_left_on_q >= local_batch_size && cq->id < (vm_compressor_thread_count - 1))
4008 thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
4009 #endif
4010 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4011
4012 while (local_q) {
4013
4014 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4015
4016 m = local_q;
4017 local_q = m->snext;
4018 m->snext = NULL;
4019
4020 if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) {
4021
4022 m->snext = local_freeq;
4023 local_freeq = m;
4024 local_freed++;
4025
4026 if (local_freed >= MAX_FREE_BATCH) {
4027
4028 vm_page_free_list(local_freeq, TRUE);
4029 local_freeq = NULL;
4030 local_freed = 0;
4031 }
4032 }
4033 #if !CONFIG_JETSAM
4034 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4035 kern_return_t wait_result;
4036 int need_wakeup = 0;
4037
4038 if (local_freeq) {
4039 vm_page_free_list(local_freeq, TRUE);
4040
4041 local_freeq = NULL;
4042 local_freed = 0;
4043
4044 continue;
4045 }
4046 lck_mtx_lock_spin(&vm_page_queue_free_lock);
4047
4048 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4049
4050 if (vm_page_free_wanted_privileged++ == 0)
4051 need_wakeup = 1;
4052 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4053
4054 lck_mtx_unlock(&vm_page_queue_free_lock);
4055
4056 if (need_wakeup)
4057 thread_wakeup((event_t)&vm_page_free_wanted);
4058
4059 if (wait_result == THREAD_WAITING)
4060
4061 thread_block(THREAD_CONTINUE_NULL);
4062 } else
4063 lck_mtx_unlock(&vm_page_queue_free_lock);
4064 }
4065 #endif
4066 }
4067 if (local_freeq) {
4068 vm_page_free_list(local_freeq, TRUE);
4069
4070 local_freeq = NULL;
4071 local_freed = 0;
4072 }
4073 if (pgo_draining == TRUE) {
4074 vm_page_lockspin_queues();
4075 vm_pageout_throttle_up_batch(q, local_cnt);
4076 vm_page_unlock_queues();
4077 }
4078 }
4079 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4080
4081 /*
4082 * queue lock is held and our q is empty
4083 */
4084 q->pgo_busy = FALSE;
4085 q->pgo_idle = TRUE;
4086
4087 assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
4088 vm_page_unlock_queues();
4089
4090 if (__improbable(vm_compressor_time_thread)) {
4091 vm_compressor_thread_runtime = thread_get_runtime_self();
4092 }
4093
4094 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4095
4096 thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4097 /*NOTREACHED*/
4098 }
4099
4100
4101
4102 static void
4103 vm_pageout_immediate(vm_page_t m, boolean_t object_locked_by_caller)
4104 {
4105 assert(vm_pageout_immediate_scratch_buf);
4106
4107 if (vm_pageout_compress_page(&vm_pageout_immediate_chead, vm_pageout_immediate_scratch_buf, m, object_locked_by_caller) == KERN_SUCCESS) {
4108
4109 vm_page_free_prepare_object(m, TRUE);
4110 vm_page_release(m, TRUE);
4111 }
4112 }
4113
4114
4115 kern_return_t
4116 vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, boolean_t object_locked_by_caller)
4117 {
4118 vm_object_t object;
4119 memory_object_t pager;
4120 int compressed_count_delta;
4121 kern_return_t retval;
4122
4123 object = VM_PAGE_OBJECT(m);
4124
4125 if (object->object_slid) {
4126 panic("slid page %p not allowed on this path\n", m);
4127 }
4128 assert(!m->free_when_done);
4129 assert(!m->laundry);
4130
4131 pager = object->pager;
4132
4133 if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)) {
4134
4135 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4136
4137 vm_object_lock(object);
4138
4139 /*
4140 * If there is no memory object for the page, create
4141 * one and hand it to the compression pager.
4142 */
4143
4144 if (!object->pager_initialized)
4145 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4146 if (!object->pager_initialized)
4147 vm_object_compressor_pager_create(object);
4148
4149 pager = object->pager;
4150
4151 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4152 /*
4153 * Still no pager for the object,
4154 * or the pager has been destroyed.
4155 * Reactivate the page.
4156 *
4157 * Should only happen if there is no
4158 * compression pager
4159 */
4160 PAGE_WAKEUP_DONE(m);
4161
4162 vm_page_lockspin_queues();
4163 vm_page_activate(m);
4164 vm_pageout_dirty_no_pager++;
4165 vm_page_unlock_queues();
4166
4167 /*
4168 * And we are done with it.
4169 */
4170 vm_object_activity_end(object);
4171 vm_object_unlock(object);
4172
4173 return KERN_FAILURE;
4174 }
4175 vm_object_unlock(object);
4176
4177 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4178 }
4179 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4180
4181 if (object_locked_by_caller == FALSE)
4182 assert(object->activity_in_progress > 0);
4183
4184 retval = vm_compressor_pager_put(
4185 pager,
4186 m->offset + object->paging_offset,
4187 VM_PAGE_GET_PHYS_PAGE(m),
4188 current_chead,
4189 scratch_buf,
4190 &compressed_count_delta);
4191
4192 if (object_locked_by_caller == FALSE) {
4193 vm_object_lock(object);
4194
4195 assert(object->activity_in_progress > 0);
4196 assert(VM_PAGE_OBJECT(m) == object);
4197 }
4198
4199 vm_compressor_pager_count(pager,
4200 compressed_count_delta,
4201 FALSE, /* shared_lock */
4202 object);
4203
4204 assert( !VM_PAGE_WIRED(m));
4205
4206 if (retval == KERN_SUCCESS) {
4207 /*
4208 * If the object is purgeable, its owner's
4209 * purgeable ledgers will be updated in
4210 * vm_page_remove() but the page still
4211 * contributes to the owner's memory footprint,
4212 * so account for it as such.
4213 */
4214 if (object->purgable != VM_PURGABLE_DENY &&
4215 object->vo_purgeable_owner != NULL) {
4216 /* one more compressed purgeable page */
4217 vm_purgeable_compressed_update(object,
4218 +1);
4219 }
4220 VM_STAT_INCR(compressions);
4221
4222 if (m->tabled)
4223 vm_page_remove(m, TRUE);
4224
4225 } else {
4226 PAGE_WAKEUP_DONE(m);
4227
4228 vm_page_lockspin_queues();
4229
4230 vm_page_activate(m);
4231 vm_compressor_failed++;
4232
4233 vm_page_unlock_queues();
4234 }
4235 if (object_locked_by_caller == FALSE) {
4236 vm_object_activity_end(object);
4237 vm_object_unlock(object);
4238 }
4239 return retval;
4240 }
4241
4242
4243 static void
4244 vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4245 {
4246 uint32_t policy;
4247 boolean_t set_iq = FALSE;
4248 boolean_t set_eq = FALSE;
4249
4250 if (hibernate_cleaning_in_progress == TRUE)
4251 req_lowpriority = FALSE;
4252
4253 if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority)
4254 set_eq = TRUE;
4255
4256 if (set_iq == TRUE || set_eq == TRUE) {
4257
4258 vm_page_unlock_queues();
4259
4260 if (req_lowpriority == TRUE) {
4261 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4262 DTRACE_VM(laundrythrottle);
4263 } else {
4264 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4265 DTRACE_VM(laundryunthrottle);
4266 }
4267 if (set_iq == TRUE) {
4268 proc_set_thread_policy_with_tid(kernel_task, iq->pgo_tid,
4269 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4270
4271 iq->pgo_lowpriority = req_lowpriority;
4272 }
4273 if (set_eq == TRUE) {
4274 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4275 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4276
4277 eq->pgo_lowpriority = req_lowpriority;
4278 }
4279 vm_page_lock_queues();
4280 }
4281 }
4282
4283
4284 static void
4285 vm_pageout_iothread_external(void)
4286 {
4287 thread_t self = current_thread();
4288
4289 self->options |= TH_OPT_VMPRIV;
4290
4291 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4292
4293 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4294 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4295
4296 vm_page_lock_queues();
4297
4298 vm_pageout_queue_external.pgo_tid = self->thread_id;
4299 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4300 vm_pageout_queue_external.pgo_inited = TRUE;
4301
4302 vm_page_unlock_queues();
4303
4304 vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
4305
4306 /*NOTREACHED*/
4307 }
4308
4309
4310 static void
4311 vm_pageout_iothread_internal(struct cq *cq)
4312 {
4313 thread_t self = current_thread();
4314
4315 self->options |= TH_OPT_VMPRIV;
4316
4317 vm_page_lock_queues();
4318
4319 vm_pageout_queue_internal.pgo_tid = self->thread_id;
4320 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4321 vm_pageout_queue_internal.pgo_inited = TRUE;
4322
4323 vm_page_unlock_queues();
4324
4325 if (vm_restricted_to_single_processor == TRUE)
4326 thread_vm_bind_group_add();
4327
4328 vm_pageout_iothread_internal_continue(cq);
4329
4330 /*NOTREACHED*/
4331 }
4332
4333 kern_return_t
4334 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4335 {
4336 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
4337 return KERN_SUCCESS;
4338 } else {
4339 return KERN_FAILURE; /* Already set */
4340 }
4341 }
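#if 0
/*
 * Illustrative sketch only: how a client (e.g. the BSD buffer cache)
 * might register its reclaim callback once at boot.  The callback name
 * 'example_buffer_cache_gc' is hypothetical; the compare-and-swap above
 * means only the first registration succeeds and later attempts return
 * KERN_FAILURE.
 */
extern boolean_t example_buffer_cache_gc(int all);	/* assumed callback */

static void
example_register_cleanup_callout(void)
{
	if (vm_set_buffer_cleanup_callout(example_buffer_cache_gc) != KERN_SUCCESS)
		printf("buffer cleanup callout was already registered\n");
}
#endif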
4342
4343 extern boolean_t memorystatus_manual_testing_on;
4344 extern unsigned int memorystatus_level;
4345
4346
4347 #if VM_PRESSURE_EVENTS
4348
4349 boolean_t vm_pressure_events_enabled = FALSE;
4350
4351 void
4352 vm_pressure_response(void)
4353 {
4354
4355 vm_pressure_level_t old_level = kVMPressureNormal;
4356 int new_level = -1;
4357 unsigned int total_pages;
4358 uint64_t available_memory = 0;
4359
4360 if (vm_pressure_events_enabled == FALSE)
4361 return;
4362
4363
4364 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4365
4366
4367 total_pages = (unsigned int) atop_64(max_mem);
4368 #if CONFIG_SECLUDED_MEMORY
4369 total_pages -= vm_page_secluded_count;
4370 #endif /* CONFIG_SECLUDED_MEMORY */
4371 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4372
4373 if (memorystatus_manual_testing_on) {
4374 return;
4375 }
4376
4377 old_level = memorystatus_vm_pressure_level;
4378
4379 switch (memorystatus_vm_pressure_level) {
4380
4381 case kVMPressureNormal:
4382 {
4383 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4384 new_level = kVMPressureCritical;
4385 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4386 new_level = kVMPressureWarning;
4387 }
4388 break;
4389 }
4390
4391 case kVMPressureWarning:
4392 case kVMPressureUrgent:
4393 {
4394 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4395 new_level = kVMPressureNormal;
4396 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4397 new_level = kVMPressureCritical;
4398 }
4399 break;
4400 }
4401
4402 case kVMPressureCritical:
4403 {
4404 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4405 new_level = kVMPressureNormal;
4406 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4407 new_level = kVMPressureWarning;
4408 }
4409 break;
4410 }
4411
4412 default:
4413 return;
4414 }
4415
4416 if (new_level != -1) {
4417 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4418
4419 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != new_level)) {
4420 if (vm_pressure_thread_running == FALSE) {
4421 thread_wakeup(&vm_pressure_thread);
4422 }
4423
4424 if (old_level != new_level) {
4425 thread_wakeup(&vm_pressure_changed);
4426 }
4427 }
4428 }
4429
4430 }
4431 #endif /* VM_PRESSURE_EVENTS */
4432
4433 kern_return_t
4434 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) {
4435
4436 #if !VM_PRESSURE_EVENTS
4437
4438 return KERN_FAILURE;
4439
4440 #else /* VM_PRESSURE_EVENTS */
4441
4442 kern_return_t kr = KERN_SUCCESS;
4443
4444 if (pressure_level != NULL) {
4445
4446 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4447
4448 if (wait_for_pressure == TRUE) {
4449 wait_result_t wr = 0;
4450
4451 while (old_level == *pressure_level) {
4452 wr = assert_wait((event_t) &vm_pressure_changed,
4453 THREAD_INTERRUPTIBLE);
4454 if (wr == THREAD_WAITING) {
4455 wr = thread_block(THREAD_CONTINUE_NULL);
4456 }
4457 if (wr == THREAD_INTERRUPTED) {
4458 return KERN_ABORTED;
4459 }
4460 if (wr == THREAD_AWAKENED) {
4461
4462 old_level = memorystatus_vm_pressure_level;
4463
4464 if (old_level != *pressure_level) {
4465 break;
4466 }
4467 }
4468 }
4469 }
4470
4471 *pressure_level = old_level;
4472 kr = KERN_SUCCESS;
4473 } else {
4474 kr = KERN_INVALID_ARGUMENT;
4475 }
4476
4477 return kr;
4478 #endif /* VM_PRESSURE_EVENTS */
4479 }
4480
4481 #if VM_PRESSURE_EVENTS
4482 void
4483 vm_pressure_thread(void) {
4484 static boolean_t thread_initialized = FALSE;
4485
4486 if (thread_initialized == TRUE) {
4487 vm_pressure_thread_running = TRUE;
4488 consider_vm_pressure_events();
4489 vm_pressure_thread_running = FALSE;
4490 }
4491
4492 thread_initialized = TRUE;
4493 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4494 thread_block((thread_continue_t)vm_pressure_thread);
4495 }
4496 #endif /* VM_PRESSURE_EVENTS */
4497
4498
4499 uint32_t vm_pageout_considered_page_last = 0;
4500
4501 /*
4502 * called once per-second via "compute_averages"
4503 */
4504 void
4505 compute_pageout_gc_throttle(__unused void *arg)
4506 {
4507 if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
4508
4509 vm_pageout_considered_page_last = vm_pageout_considered_page;
4510
4511 thread_wakeup((event_t) &vm_pageout_garbage_collect);
4512 }
4513 }
4514
4515
4516 static void
4517 vm_pageout_garbage_collect(int collect)
4518 {
4519
4520 if (collect) {
4521 boolean_t buf_large_zfree = FALSE;
4522 boolean_t first_try = TRUE;
4523
4524 stack_collect();
4525
4526 consider_machine_collect();
4527 m_drain();
4528
4529 do {
4530 if (consider_buffer_cache_collect != NULL) {
4531 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4532 }
4533 if (first_try == TRUE || buf_large_zfree == TRUE) {
4534 /*
4535 * consider_zone_gc should be last, because the other operations
4536 * might return memory to zones.
4537 */
4538 consider_zone_gc();
4539 }
4540 first_try = FALSE;
4541
4542 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4543
4544 consider_machine_adjust();
4545 }
4546 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
4547
4548 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
4549 /*NOTREACHED*/
4550 }
4551
4552
4553 #if VM_PAGE_BUCKETS_CHECK
4554 #if VM_PAGE_FAKE_BUCKETS
4555 extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4556 #endif /* VM_PAGE_FAKE_BUCKETS */
4557 #endif /* VM_PAGE_BUCKETS_CHECK */
4558
4559
4560 #define FBDP_TEST_COLLAPSE_COMPRESSOR 0
4561 #define FBDP_TEST_WIRE_AND_EXTRACT 0
4562 #define FBDP_TEST_PAGE_WIRE_OVERFLOW 0
4563
4564 #if FBDP_TEST_COLLAPSE_COMPRESSOR
4565 extern boolean_t vm_object_collapse_compressor_allowed;
4566 #include <IOKit/IOLib.h>
4567 #endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */
4568
4569 #if FBDP_TEST_WIRE_AND_EXTRACT
4570 extern ledger_template_t task_ledger_template;
4571 #include <mach/mach_vm.h>
4572 extern ppnum_t vm_map_get_phys_page(vm_map_t map,
4573 vm_offset_t offset);
4574 #endif /* FBDP_TEST_WIRE_AND_EXTRACT */
4575
4576
4577 void
4578 vm_set_restrictions()
4579 {
4580 host_basic_info_data_t hinfo;
4581 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
4582
4583 #define BSD_HOST 1
4584 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
4585
4586 assert(hinfo.max_cpus > 0);
4587
4588 if (hinfo.max_cpus <= 3) {
4589 /*
4590 * on systems with a limited number of CPUs, bind the
4591 * 4 major threads that can free memory and that tend to use
4592 * a fair bit of CPU under pressured conditions to a single processor.
4593 * This ensures that these threads don't hog all of the available CPUs
4594 * (important for camera launch), while allowing them to run independently
4595 * w/r to locks... the 4 threads are
4596 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4597 * vm_compressor_swap_trigger_thread (minor and major compactions),
4598 * memorystatus_thread (jetsams).
4599 *
4600 * the first time the thread is run, it is responsible for checking the
4601 * state of vm_restricted_to_single_processor, and if TRUE it calls
4602 * thread_bind_master... someday this should be replaced with a group
4603 * scheduling mechanism and KPI.
4604 */
4605 vm_restricted_to_single_processor = TRUE;
4606 }
4607 }
4608
4609
4610 void
4611 vm_pageout(void)
4612 {
4613 thread_t self = current_thread();
4614 thread_t thread;
4615 kern_return_t result;
4616 spl_t s;
4617
4618 /*
4619 * Set thread privileges.
4620 */
4621 s = splsched();
4622
4623 thread_lock(self);
4624 self->options |= TH_OPT_VMPRIV;
4625 sched_set_thread_base_priority(self, BASEPRI_PREEMPT - 1);
4626 thread_unlock(self);
4627
4628 if (!self->reserved_stack)
4629 self->reserved_stack = self->kernel_stack;
4630
4631 if (vm_restricted_to_single_processor == TRUE)
4632 thread_vm_bind_group_add();
4633
4634 splx(s);
4635
4636 /*
4637 * Initialize some paging parameters.
4638 */
4639
4640 if (vm_pageout_swap_wait == 0)
4641 vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
4642
4643 if (vm_pageout_idle_wait == 0)
4644 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
4645
4646 if (vm_pageout_burst_wait == 0)
4647 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
4648
4649 if (vm_pageout_empty_wait == 0)
4650 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
4651
4652 if (vm_pageout_deadlock_wait == 0)
4653 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
4654
4655 if (vm_pageout_deadlock_relief == 0)
4656 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
4657
4658 if (vm_pageout_inactive_relief == 0)
4659 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
4660
4661 if (vm_pageout_burst_active_throttle == 0)
4662 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
4663
4664 if (vm_pageout_burst_inactive_throttle == 0)
4665 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
4666
4667 /*
4668 * Set kernel task to low backing store privileged
4669 * status
4670 */
4671 task_lock(kernel_task);
4672 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
4673 task_unlock(kernel_task);
4674
4675 vm_page_free_count_init = vm_page_free_count;
4676
4677 /*
4678 * even if we've already called vm_page_free_reserve
4679 * call it again here to ensure that the targets are
4680 * accurately calculated (it uses vm_page_free_count_init)
4681 * calling it with an arg of 0 will not change the reserve
4682 * but will re-calculate free_min and free_target
4683 */
4684 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
4685 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
4686 } else
4687 vm_page_free_reserve(0);
4688
4689
4690 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
4691 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
4692 vm_pageout_queue_external.pgo_laundry = 0;
4693 vm_pageout_queue_external.pgo_idle = FALSE;
4694 vm_pageout_queue_external.pgo_busy = FALSE;
4695 vm_pageout_queue_external.pgo_throttled = FALSE;
4696 vm_pageout_queue_external.pgo_draining = FALSE;
4697 vm_pageout_queue_external.pgo_lowpriority = FALSE;
4698 vm_pageout_queue_external.pgo_tid = -1;
4699 vm_pageout_queue_external.pgo_inited = FALSE;
4700
4701 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
4702 vm_pageout_queue_internal.pgo_maxlaundry = 0;
4703 vm_pageout_queue_internal.pgo_laundry = 0;
4704 vm_pageout_queue_internal.pgo_idle = FALSE;
4705 vm_pageout_queue_internal.pgo_busy = FALSE;
4706 vm_pageout_queue_internal.pgo_throttled = FALSE;
4707 vm_pageout_queue_internal.pgo_draining = FALSE;
4708 vm_pageout_queue_internal.pgo_lowpriority = FALSE;
4709 vm_pageout_queue_internal.pgo_tid = -1;
4710 vm_pageout_queue_internal.pgo_inited = FALSE;
4711
4712 /* internal pageout thread started when default pager registered first time */
4713 /* external pageout and garbage collection threads started here */
4714
4715 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
4716 BASEPRI_PREEMPT - 1,
4717 &vm_pageout_external_iothread);
4718 if (result != KERN_SUCCESS)
4719 panic("vm_pageout_iothread_external: create failed");
4720
4721 thread_deallocate(vm_pageout_external_iothread);
4722
4723 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
4724 BASEPRI_DEFAULT,
4725 &thread);
4726 if (result != KERN_SUCCESS)
4727 panic("vm_pageout_garbage_collect: create failed");
4728
4729 thread_deallocate(thread);
4730
4731 #if VM_PRESSURE_EVENTS
4732 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
4733 BASEPRI_DEFAULT,
4734 &thread);
4735
4736 if (result != KERN_SUCCESS)
4737 panic("vm_pressure_thread: create failed");
4738
4739 thread_deallocate(thread);
4740 #endif
4741
4742 vm_object_reaper_init();
4743
4744
4745 bzero(&vm_config, sizeof(vm_config));
4746
4747 switch(vm_compressor_mode) {
4748
4749 case VM_PAGER_DEFAULT:
4750 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4751
4752 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4753 vm_config.compressor_is_present = TRUE;
4754 vm_config.swap_is_present = TRUE;
4755 vm_config.compressor_is_active = TRUE;
4756 vm_config.swap_is_active = TRUE;
4757 break;
4758
4759 case VM_PAGER_COMPRESSOR_NO_SWAP:
4760 vm_config.compressor_is_present = TRUE;
4761 vm_config.swap_is_present = TRUE;
4762 vm_config.compressor_is_active = TRUE;
4763 break;
4764
4765 case VM_PAGER_FREEZER_DEFAULT:
4766 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4767
4768 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4769 vm_config.compressor_is_present = TRUE;
4770 vm_config.swap_is_present = TRUE;
4771 break;
4772
4773 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4774 vm_config.compressor_is_present = TRUE;
4775 vm_config.swap_is_present = TRUE;
4776 vm_config.compressor_is_active = TRUE;
4777 vm_config.freezer_swap_is_active = TRUE;
4778 break;
4779
4780 case VM_PAGER_NOT_CONFIGURED:
4781 break;
4782
4783 default:
4784 printf("unknown compressor mode - %x\n", vm_compressor_mode);
4785 break;
4786 }
4787 if (VM_CONFIG_COMPRESSOR_IS_PRESENT)
4788 vm_compressor_pager_init();
4789
4790 #if VM_PRESSURE_EVENTS
4791 vm_pressure_events_enabled = TRUE;
4792 #endif /* VM_PRESSURE_EVENTS */
4793
4794 #if CONFIG_PHANTOM_CACHE
4795 vm_phantom_cache_init();
4796 #endif
4797 #if VM_PAGE_BUCKETS_CHECK
4798 #if VM_PAGE_FAKE_BUCKETS
4799 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
4800 (uint64_t) vm_page_fake_buckets_start,
4801 (uint64_t) vm_page_fake_buckets_end);
4802 pmap_protect(kernel_pmap,
4803 vm_page_fake_buckets_start,
4804 vm_page_fake_buckets_end,
4805 VM_PROT_READ);
4806 // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
4807 #endif /* VM_PAGE_FAKE_BUCKETS */
4808 #endif /* VM_PAGE_BUCKETS_CHECK */
4809
4810 #if VM_OBJECT_TRACKING
4811 vm_object_tracking_init();
4812 #endif /* VM_OBJECT_TRACKING */
4813
4814
4815 #if FBDP_TEST_COLLAPSE_COMPRESSOR
4816 vm_object_size_t backing_size, top_size;
4817 vm_object_t backing_object, top_object;
4818 vm_map_offset_t backing_offset, top_offset;
4819 unsigned char *backing_address, *top_address;
4820 kern_return_t kr;
4821
4822 printf("FBDP_TEST_COLLAPSE_COMPRESSOR:\n");
4823
4824 /* create backing object */
4825 backing_size = 15 * PAGE_SIZE;
4826 backing_object = vm_object_allocate(backing_size);
4827 assert(backing_object != VM_OBJECT_NULL);
4828 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: created backing object %p\n",
4829 backing_object);
4830 /* map backing object */
4831 backing_offset = 0;
4832 kr = vm_map_enter(kernel_map, &backing_offset, backing_size, 0,
4833 VM_FLAGS_ANYWHERE, backing_object, 0, FALSE,
4834 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
4835 assert(kr == KERN_SUCCESS);
4836 backing_address = (unsigned char *) backing_offset;
4837 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4838 "mapped backing object %p at 0x%llx\n",
4839 backing_object, (uint64_t) backing_offset);
4840 /* populate with pages to be compressed in backing object */
4841 backing_address[0x1*PAGE_SIZE] = 0xB1;
4842 backing_address[0x4*PAGE_SIZE] = 0xB4;
4843 backing_address[0x7*PAGE_SIZE] = 0xB7;
4844 backing_address[0xa*PAGE_SIZE] = 0xBA;
4845 backing_address[0xd*PAGE_SIZE] = 0xBD;
4846 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4847 "populated pages to be compressed in "
4848 "backing_object %p\n", backing_object);
4849 /* compress backing object */
4850 vm_object_pageout(backing_object);
4851 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: compressing backing_object %p\n",
4852 backing_object);
4853 /* wait for all the pages to be gone */
4854 while (*(volatile int *)&backing_object->resident_page_count != 0)
4855 IODelay(10);
4856 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: backing_object %p compressed\n",
4857 backing_object);
4858 /* populate with pages to be resident in backing object */
4859 backing_address[0x0*PAGE_SIZE] = 0xB0;
4860 backing_address[0x3*PAGE_SIZE] = 0xB3;
4861 backing_address[0x6*PAGE_SIZE] = 0xB6;
4862 backing_address[0x9*PAGE_SIZE] = 0xB9;
4863 backing_address[0xc*PAGE_SIZE] = 0xBC;
4864 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4865 "populated pages to be resident in "
4866 "backing_object %p\n", backing_object);
4867 /* leave the other pages absent */
4868 /* mess with the paging_offset of the backing_object */
4869 assert(backing_object->paging_offset == 0);
4870 backing_object->paging_offset = 0x3000;
4871
4872 /* create top object */
4873 top_size = 9 * PAGE_SIZE;
4874 top_object = vm_object_allocate(top_size);
4875 assert(top_object != VM_OBJECT_NULL);
4876 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: created top object %p\n",
4877 top_object);
4878 /* map top object */
4879 top_offset = 0;
4880 kr = vm_map_enter(kernel_map, &top_offset, top_size, 0,
4881 VM_FLAGS_ANYWHERE, top_object, 0, FALSE,
4882 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
4883 assert(kr == KERN_SUCCESS);
4884 top_address = (unsigned char *) top_offset;
4885 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4886 "mapped top object %p at 0x%llx\n",
4887 top_object, (uint64_t) top_offset);
4888 /* populate with pages to be compressed in top object */
4889 top_address[0x3*PAGE_SIZE] = 0xA3;
4890 top_address[0x4*PAGE_SIZE] = 0xA4;
4891 top_address[0x5*PAGE_SIZE] = 0xA5;
4892 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4893 "populated pages to be compressed in "
4894 "top_object %p\n", top_object);
4895 /* compress top object */
4896 vm_object_pageout(top_object);
4897 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: compressing top_object %p\n",
4898 top_object);
4899 /* wait for all the pages to be gone */
4900 while (top_object->resident_page_count != 0);
4901 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: top_object %p compressed\n",
4902 top_object);
4903 /* populate with pages to be resident in top object */
4904 top_address[0x0*PAGE_SIZE] = 0xA0;
4905 top_address[0x1*PAGE_SIZE] = 0xA1;
4906 top_address[0x2*PAGE_SIZE] = 0xA2;
4907 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4908 "populated pages to be resident in "
4909 "top_object %p\n", top_object);
4910 /* leave the other pages absent */
4911
4912 /* link the 2 objects */
4913 vm_object_reference(backing_object);
4914 top_object->shadow = backing_object;
4915 top_object->vo_shadow_offset = 0x3000;
4916 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: linked %p and %p\n",
4917 top_object, backing_object);
4918
4919 /* unmap backing object */
4920 vm_map_remove(kernel_map,
4921 backing_offset,
4922 backing_offset + backing_size,
4923 0);
4924 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4925 "unmapped backing_object %p [0x%llx:0x%llx]\n",
4926 backing_object,
4927 (uint64_t) backing_offset,
4928 (uint64_t) (backing_offset + backing_size));
4929
4930 /* collapse */
4931 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: collapsing %p\n", top_object);
4932 vm_object_lock(top_object);
4933 vm_object_collapse(top_object, 0, FALSE);
4934 vm_object_unlock(top_object);
4935 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: collapsed %p\n", top_object);
4936
4937 /* did it work? */
4938 if (top_object->shadow != VM_OBJECT_NULL) {
4939 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: not collapsed\n");
4940 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4941 if (vm_object_collapse_compressor_allowed) {
4942 panic("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4943 }
4944 } else {
4945 /* check the contents of the mapping */
4946 unsigned char expect[9] =
4947 { 0xA0, 0xA1, 0xA2, /* resident in top */
4948 0xA3, 0xA4, 0xA5, /* compressed in top */
4949 0xB9, /* resident in backing + shadow_offset */
4950 0xBD, /* compressed in backing + shadow_offset + paging_offset */
4951 0x00 }; /* absent in both */
4952 unsigned char actual[9];
4953 unsigned int i, errors;
4954
4955 errors = 0;
4956 for (i = 0; i < sizeof (actual); i++) {
4957 actual[i] = (unsigned char) top_address[i*PAGE_SIZE];
4958 if (actual[i] != expect[i]) {
4959 errors++;
4960 }
4961 }
4962 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4963 "actual [%x %x %x %x %x %x %x %x %x] "
4964 "expect [%x %x %x %x %x %x %x %x %x] "
4965 "%d errors\n",
4966 actual[0], actual[1], actual[2], actual[3],
4967 actual[4], actual[5], actual[6], actual[7],
4968 actual[8],
4969 expect[0], expect[1], expect[2], expect[3],
4970 expect[4], expect[5], expect[6], expect[7],
4971 expect[8],
4972 errors);
4973 if (errors) {
4974 panic("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4975 } else {
4976 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: PASS\n");
4977 }
4978 }
4979 #endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */
4980
4981 #if FBDP_TEST_WIRE_AND_EXTRACT
4982 ledger_t ledger;
4983 vm_map_t user_map, wire_map;
4984 mach_vm_address_t user_addr, wire_addr;
4985 mach_vm_size_t user_size, wire_size;
4986 mach_vm_offset_t cur_offset;
4987 vm_prot_t cur_prot, max_prot;
4988 ppnum_t user_ppnum, wire_ppnum;
4989 kern_return_t kr;
4990
4991 ledger = ledger_instantiate(task_ledger_template,
4992 LEDGER_CREATE_ACTIVE_ENTRIES);
4993 user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT),
4994 0x100000000ULL,
4995 0x200000000ULL,
4996 TRUE);
4997 wire_map = vm_map_create(NULL,
4998 0x100000000ULL,
4999 0x200000000ULL,
5000 TRUE);
5001 user_addr = 0;
5002 user_size = 0x10000;
5003 kr = mach_vm_allocate(user_map,
5004 &user_addr,
5005 user_size,
5006 VM_FLAGS_ANYWHERE);
5007 assert(kr == KERN_SUCCESS);
5008 wire_addr = 0;
5009 wire_size = user_size;
5010 kr = mach_vm_remap(wire_map,
5011 &wire_addr,
5012 wire_size,
5013 0,
5014 VM_FLAGS_ANYWHERE,
5015 user_map,
5016 user_addr,
5017 FALSE,
5018 &cur_prot,
5019 &max_prot,
5020 VM_INHERIT_NONE);
5021 assert(kr == KERN_SUCCESS);
5022 for (cur_offset = 0;
5023 cur_offset < wire_size;
5024 cur_offset += PAGE_SIZE) {
5025 kr = vm_map_wire_and_extract(wire_map,
5026 wire_addr + cur_offset,
5027 VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK),
5028 TRUE,
5029 &wire_ppnum);
5030 assert(kr == KERN_SUCCESS);
5031 user_ppnum = vm_map_get_phys_page(user_map,
5032 user_addr + cur_offset);
5033 printf("FBDP_TEST_WIRE_AND_EXTRACT: kr=0x%x "
5034 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
5035 kr,
5036 user_map, user_addr + cur_offset, user_ppnum,
5037 wire_map, wire_addr + cur_offset, wire_ppnum);
5038 if (kr != KERN_SUCCESS ||
5039 wire_ppnum == 0 ||
5040 wire_ppnum != user_ppnum) {
5041 panic("FBDP_TEST_WIRE_AND_EXTRACT: FAIL\n");
5042 }
5043 }
5044 cur_offset -= PAGE_SIZE;
5045 kr = vm_map_wire_and_extract(wire_map,
5046 wire_addr + cur_offset,
5047 VM_PROT_DEFAULT,
5048 TRUE,
5049 &wire_ppnum);
5050 assert(kr == KERN_SUCCESS);
5051 printf("FBDP_TEST_WIRE_AND_EXTRACT: re-wire kr=0x%x "
5052 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
5053 kr,
5054 user_map, user_addr + cur_offset, user_ppnum,
5055 wire_map, wire_addr + cur_offset, wire_ppnum);
5056 if (kr != KERN_SUCCESS ||
5057 wire_ppnum == 0 ||
5058 wire_ppnum != user_ppnum) {
5059 panic("FBDP_TEST_WIRE_AND_EXTRACT: FAIL\n");
5060 }
5061
5062 printf("FBDP_TEST_WIRE_AND_EXTRACT: PASS\n");
5063 #endif /* FBDP_TEST_WIRE_AND_EXTRACT */
5064
5065 #if FBDP_TEST_PAGE_WIRE_OVERFLOW
5066 vm_object_t fbdp_object;
5067 vm_page_t fbdp_page;
5068
5069 printf("FBDP_TEST_PAGE_WIRE_OVERFLOW: starting...\n");
5070
5071 fbdp_object = vm_object_allocate(PAGE_SIZE);
5072 vm_object_lock(fbdp_object);
5073 fbdp_page = vm_page_alloc(fbdp_object, 0x0);
5074 vm_page_lock_queues();
5075 do {
5076 vm_page_wire(fbdp_page, 1, FALSE);
5077 } while (fbdp_page->wire_count != 0);
5078 vm_page_unlock_queues();
5079 vm_object_unlock(fbdp_object);
5080 panic("FBDP(%p,%p): wire_count overflow not detected\n",
5081 fbdp_object, fbdp_page);
5082 #endif /* FBDP_TEST_PAGE_WIRE_OVERFLOW */
5083
5084 vm_pageout_continue();
5085
5086 /*
5087 * Unreached code!
5088 *
5089 * The vm_pageout_continue() call above never returns, so the code below is never
5090 * executed. We take advantage of this to declare several DTrace VM related probe
5091 * points that our kernel doesn't have an analog for. These are probe points that
5092 * exist in Solaris and are in the DTrace documentation, so people may have written
5093 * scripts that use them. Declaring the probe points here means their scripts will
5094 * compile and execute which we want for portability of the scripts, but since this
5095 * section of code is never reached, the probe points will simply never fire. Yes,
5096 * this is basically a hack. The problem is the DTrace probe points were chosen with
5097 * Solaris specific VM events in mind, not portability to different VM implementations.
5098 */
5099
5100 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5101 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5102 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5103 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5104 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5105 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5106 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5107 /*NOTREACHED*/
5108 }
5109
5110
5111
5112 int vm_compressor_thread_count = 2;
5113
5114 kern_return_t
5115 vm_pageout_internal_start(void)
5116 {
5117 kern_return_t result;
5118 int i;
5119 host_basic_info_data_t hinfo;
5120
5121 assert (VM_CONFIG_COMPRESSOR_IS_PRESENT);
5122
5123 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5124 #define BSD_HOST 1
5125 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5126
5127 assert(hinfo.max_cpus > 0);
5128
5129 if (vm_compressor_thread_count >= hinfo.max_cpus)
5130 vm_compressor_thread_count = hinfo.max_cpus - 1;
5131 if (vm_compressor_thread_count <= 0)
5132 vm_compressor_thread_count = 1;
5133 else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
5134 vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5135
5136 if (vm_compressor_immediate_preferred == TRUE) {
5137 vm_pageout_immediate_chead = NULL;
5138 vm_pageout_immediate_scratch_buf = kalloc(vm_compressor_get_encode_scratch_size());
5139
5140 vm_compressor_thread_count = 1;
5141 }
5142
5143 vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5144
5145 for (i = 0; i < vm_compressor_thread_count; i++) {
5146 ciq[i].id = i;
5147 ciq[i].q = &vm_pageout_queue_internal;
5148 ciq[i].current_chead = NULL;
5149 ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE);
5150
5151 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
5152
5153 if (result == KERN_SUCCESS)
5154 thread_deallocate(vm_pageout_internal_iothread);
5155 else
5156 break;
5157 }
5158 return result;
5159 }
5160
5161 #if CONFIG_IOSCHED
5162 /*
5163 * To support I/O Expedite for compressed files we mark the upls with special flags.
5164 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5165 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5166 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5167 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5168 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5169 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5170 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5171 * unless the real I/O upl is being destroyed).
5172 */
5173
5174
5175 static void
5176 upl_set_decmp_info(upl_t upl, upl_t src_upl)
5177 {
5178 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5179
5180 upl_lock(src_upl);
5181 if (src_upl->decmp_io_upl) {
5182 /*
5183 * If there is already an alive real I/O UPL, ignore this new UPL.
5184 * This case should rarely happen and even if it does, it just means
5185 * that we might issue a spurious expedite which the driver is expected
5186 * to handle.
5187 */
5188 upl_unlock(src_upl);
5189 return;
5190 }
5191 src_upl->decmp_io_upl = (void *)upl;
5192 src_upl->ref_count++;
5193
5194 upl->flags |= UPL_DECMP_REAL_IO;
5195 upl->decmp_io_upl = (void *)src_upl;
5196 upl_unlock(src_upl);
5197 }
5198 #endif /* CONFIG_IOSCHED */
5199
5200 #if UPL_DEBUG
5201 int upl_debug_enabled = 1;
5202 #else
5203 int upl_debug_enabled = 0;
5204 #endif
5205
5206 static upl_t
5207 upl_create(int type, int flags, upl_size_t size)
5208 {
5209 upl_t upl;
5210 vm_size_t page_field_size = 0;
5211 int upl_flags = 0;
5212 vm_size_t upl_size = sizeof(struct upl);
5213
5214 size = round_page_32(size);
5215
5216 if (type & UPL_CREATE_LITE) {
5217 page_field_size = (atop(size) + 7) >> 3;
5218 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5219
5220 upl_flags |= UPL_LITE;
5221 }
5222 if (type & UPL_CREATE_INTERNAL) {
5223 upl_size += sizeof(struct upl_page_info) * atop(size);
5224
5225 upl_flags |= UPL_INTERNAL;
5226 }
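	/*
	 * Worked example (illustrative, assuming 4KB pages): for a 64KB
	 * request with UPL_CREATE_LITE | UPL_CREATE_INTERNAL, atop(size) is 16,
	 * so page_field_size = (16 + 7) >> 3 = 2, rounded up to 4 bytes, and
	 * upl_size grows by 16 * sizeof(struct upl_page_info).  The single
	 * kalloc() below therefore covers the upl header, the page info array
	 * and the lite bitmap in one allocation.
	 */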
5227 upl = (upl_t)kalloc(upl_size + page_field_size);
5228
5229 if (page_field_size)
5230 bzero((char *)upl + upl_size, page_field_size);
5231
5232 upl->flags = upl_flags | flags;
5233 upl->kaddr = (vm_offset_t)0;
5234 upl->size = 0;
5235 upl->map_object = NULL;
5236 upl->ref_count = 1;
5237 upl->ext_ref_count = 0;
5238 upl->highest_page = 0;
5239 upl_lock_init(upl);
5240 upl->vector_upl = NULL;
5241 upl->associated_upl = NULL;
5242 #if CONFIG_IOSCHED
5243 if (type & UPL_CREATE_IO_TRACKING) {
5244 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5245 }
5246
5247 upl->upl_reprio_info = 0;
5248 upl->decmp_io_upl = 0;
5249 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5250 /* Only support expedite on internal UPLs */
5251 thread_t curthread = current_thread();
5252 upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size));
5253 bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size)));
5254 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5255 if (curthread->decmp_upl != NULL)
5256 upl_set_decmp_info(upl, curthread->decmp_upl);
5257 }
5258 #endif
5259 #if CONFIG_IOSCHED || UPL_DEBUG
5260 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5261 upl->upl_creator = current_thread();
5262 upl->uplq.next = 0;
5263 upl->uplq.prev = 0;
5264 upl->flags |= UPL_TRACKED_BY_OBJECT;
5265 }
5266 #endif
5267
5268 #if UPL_DEBUG
5269 upl->ubc_alias1 = 0;
5270 upl->ubc_alias2 = 0;
5271
5272 upl->upl_state = 0;
5273 upl->upl_commit_index = 0;
5274 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
5275
5276 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5277 #endif /* UPL_DEBUG */
5278
5279 return(upl);
5280 }
5281
5282 static void
5283 upl_destroy(upl_t upl)
5284 {
5285 int page_field_size; /* bit field in word size buf */
5286 int size;
5287
5288 if (upl->ext_ref_count) {
5289 panic("upl(%p) ext_ref_count", upl);
5290 }
5291
5292 #if CONFIG_IOSCHED
5293 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5294 upl_t src_upl;
5295 src_upl = upl->decmp_io_upl;
5296 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5297 upl_lock(src_upl);
5298 src_upl->decmp_io_upl = NULL;
5299 upl_unlock(src_upl);
5300 upl_deallocate(src_upl);
5301 }
5302 #endif /* CONFIG_IOSCHED */
5303
5304 #if CONFIG_IOSCHED || UPL_DEBUG
5305 if ((upl->flags & UPL_TRACKED_BY_OBJECT) && !(upl->flags & UPL_VECTOR)) {
5306 vm_object_t object;
5307
5308 if (upl->flags & UPL_SHADOWED) {
5309 object = upl->map_object->shadow;
5310 } else {
5311 object = upl->map_object;
5312 }
5313
5314 vm_object_lock(object);
5315 queue_remove(&object->uplq, upl, upl_t, uplq);
5316 vm_object_activity_end(object);
5317 vm_object_collapse(object, 0, TRUE);
5318 vm_object_unlock(object);
5319 }
5320 #endif
5321 /*
5322 * drop a reference on the map_object whether or
5323 * not a pageout object is inserted
5324 */
5325 if (upl->flags & UPL_SHADOWED)
5326 vm_object_deallocate(upl->map_object);
5327
5328 if (upl->flags & UPL_DEVICE_MEMORY)
5329 size = PAGE_SIZE;
5330 else
5331 size = upl->size;
5332 page_field_size = 0;
5333
5334 if (upl->flags & UPL_LITE) {
5335 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
5336 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5337 }
5338 upl_lock_destroy(upl);
5339 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
5340
5341 #if CONFIG_IOSCHED
5342 if (upl->flags & UPL_EXPEDITE_SUPPORTED)
5343 kfree(upl->upl_reprio_info, sizeof(uint64_t) * (size/PAGE_SIZE));
5344 #endif
5345
5346 if (upl->flags & UPL_INTERNAL) {
5347 kfree(upl,
5348 sizeof(struct upl) +
5349 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
5350 + page_field_size);
5351 } else {
5352 kfree(upl, sizeof(struct upl) + page_field_size);
5353 }
5354 }
5355
5356 void
5357 upl_deallocate(upl_t upl)
5358 {
5359 upl_lock(upl);
5360 if (--upl->ref_count == 0) {
5361 if(vector_upl_is_valid(upl))
5362 vector_upl_deallocate(upl);
5363 upl_unlock(upl);
5364 upl_destroy(upl);
5365 }
5366 else
5367 upl_unlock(upl);
5368 }
5369
5370 #if CONFIG_IOSCHED
5371 void
5372 upl_mark_decmp(upl_t upl)
5373 {
5374 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5375 upl->flags |= UPL_DECMP_REQ;
5376 upl->upl_creator->decmp_upl = (void *)upl;
5377 }
5378 }
5379
5380 void
5381 upl_unmark_decmp(upl_t upl)
5382 {
5383 if(upl && (upl->flags & UPL_DECMP_REQ)) {
5384 upl->upl_creator->decmp_upl = NULL;
5385 }
5386 }
5387
5388 #endif /* CONFIG_IOSCHED */
5389
5390 #define VM_PAGE_Q_BACKING_UP(q) \
5391 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5392
5393 boolean_t must_throttle_writes(void);
5394
5395 boolean_t
5396 must_throttle_writes()
5397 {
5398 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5399 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10)
5400 return (TRUE);
5401
5402 return (FALSE);
5403 }
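/*
 * Worked example of the thresholds above (the laundry limit value is
 * illustrative): with pgo_maxlaundry = 128, VM_PAGE_Q_BACKING_UP() is true
 * once pgo_laundry >= (128 * 8) / 10 = 102, i.e. the external pageout queue
 * is roughly 80% full.  must_throttle_writes() additionally requires that
 * pageable external pages exceed 60% of AVAILABLE_NON_COMPRESSED_MEMORY
 * before asking writers to back off.
 */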
5404
5405
5406 #if DEVELOPMENT || DEBUG
5407 /*
5408 * Statistics about UPL enforcement of copy-on-write obligations.
5409 */
5410 unsigned long upl_cow = 0;
5411 unsigned long upl_cow_again = 0;
5412 unsigned long upl_cow_pages = 0;
5413 unsigned long upl_cow_again_pages = 0;
5414
5415 unsigned long iopl_cow = 0;
5416 unsigned long iopl_cow_pages = 0;
5417 #endif
5418
5419 /*
5420 * Routine: vm_object_upl_request
5421 * Purpose:
5422 * Cause the population of a portion of a vm_object.
5423 * Depending on the nature of the request, the pages
5424 * returned may contain valid data or be uninitialized.
5425 * A page list structure, listing the physical pages,
5426 * will be returned upon request.
5427 * This function is called by the file system or any other
5428 * supplier of backing store to a pager.
5429 * IMPORTANT NOTE: The caller must still respect the relationship
5430 * between the vm_object and its backing memory object. The
5431 * caller MUST NOT substitute changes in the backing file
5432 * without first doing a memory_object_lock_request on the
5433 * target range unless it is known that the pages are not
5434 * shared with another entity at the pager level.
5435 * Copy_in_to:
5436 * if a page list structure is present
5437 * return the mapped physical pages; where a
5438 * page is not present, return an uninitialized
5439 * one. If the no_sync bit is turned on, don't
5440 * call the pager unlock to synchronize with other
5441 * possible copies of the page. Leave pages busy
5442 * in the original object, if a page list structure
5443 * was specified. When a commit of the page list
5444 * pages is done, the dirty bit will be set for each one.
5445 * Copy_out_from:
5446 * If a page list structure is present, return
5447 * all mapped pages. Where a page does not exist
5448 * map a zero filled one. Leave pages busy in
5449 * the original object. If a page list structure
5450 * is not specified, this call is a no-op.
5451 *
5452 * Note: access of default pager objects has a rather interesting
5453 * twist. The caller of this routine, presumably the file system
5454 * page cache handling code, will never actually make a request
5455 * against a default pager backed object. Only the default
5456 * pager will make requests on backing store related vm_objects.
5457 * In this way the default pager can maintain the relationship
5458 * between backing store files (abstract memory objects) and
5459 * the vm_objects (cache objects) they support.
5460 *
5461 */
5462
5463 __private_extern__ kern_return_t
5464 vm_object_upl_request(
5465 vm_object_t object,
5466 vm_object_offset_t offset,
5467 upl_size_t size,
5468 upl_t *upl_ptr,
5469 upl_page_info_array_t user_page_list,
5470 unsigned int *page_list_count,
5471 upl_control_flags_t cntrl_flags)
5472 {
5473 vm_page_t dst_page = VM_PAGE_NULL;
5474 vm_object_offset_t dst_offset;
5475 upl_size_t xfer_size;
5476 unsigned int size_in_pages;
5477 boolean_t dirty;
5478 boolean_t hw_dirty;
5479 upl_t upl = NULL;
5480 unsigned int entry;
5481 #if MACH_CLUSTER_STATS
5482 boolean_t encountered_lrp = FALSE;
5483 #endif
5484 vm_page_t alias_page = NULL;
5485 int refmod_state = 0;
5486 wpl_array_t lite_list = NULL;
5487 vm_object_t last_copy_object;
5488 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5489 struct vm_page_delayed_work *dwp;
5490 int dw_count;
5491 int dw_limit;
5492 int io_tracking_flag = 0;
5493 int grab_options;
5494 ppnum_t phys_page;
5495
5496 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5497 /*
5498 * For forward compatibility's sake,
5499 * reject any unknown flag.
5500 */
5501 return KERN_INVALID_VALUE;
5502 }
5503 if ( (!object->internal) && (object->paging_offset != 0) )
5504 panic("vm_object_upl_request: external object with non-zero paging offset\n");
5505 if (object->phys_contiguous)
5506 panic("vm_object_upl_request: contiguous object specified\n");
5507
5508
5509 if (size > MAX_UPL_SIZE_BYTES)
5510 size = MAX_UPL_SIZE_BYTES;
5511
5512 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
5513 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5514
5515 #if CONFIG_IOSCHED || UPL_DEBUG
5516 if (object->io_tracking || upl_debug_enabled)
5517 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5518 #endif
5519 #if CONFIG_IOSCHED
5520 if (object->io_tracking)
5521 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5522 #endif
5523
5524 if (cntrl_flags & UPL_SET_INTERNAL) {
5525 if (cntrl_flags & UPL_SET_LITE) {
5526
5527 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5528
5529 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5530 lite_list = (wpl_array_t)
5531 (((uintptr_t)user_page_list) +
5532 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5533 if (size == 0) {
5534 user_page_list = NULL;
5535 lite_list = NULL;
5536 }
5537 } else {
5538 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5539
5540 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5541 if (size == 0) {
5542 user_page_list = NULL;
5543 }
5544 }
5545 } else {
5546 if (cntrl_flags & UPL_SET_LITE) {
5547
5548 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5549
5550 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5551 if (size == 0) {
5552 lite_list = NULL;
5553 }
5554 } else {
5555 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5556 }
5557 }
5558 *upl_ptr = upl;
5559
5560 if (user_page_list)
5561 user_page_list[0].device = FALSE;
5562
5563 if (cntrl_flags & UPL_SET_LITE) {
5564 upl->map_object = object;
5565 } else {
5566 upl->map_object = vm_object_allocate(size);
5567 /*
5568 * No need to lock the new object: nobody else knows
5569 * about it yet, so it's all ours so far.
5570 */
5571 upl->map_object->shadow = object;
5572 upl->map_object->pageout = TRUE;
5573 upl->map_object->can_persist = FALSE;
5574 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5575 upl->map_object->vo_shadow_offset = offset;
5576 upl->map_object->wimg_bits = object->wimg_bits;
5577
5578 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5579
5580 upl->flags |= UPL_SHADOWED;
5581 }
5582 /*
5583 * ENCRYPTED SWAP:
5584 * Just mark the UPL as "encrypted" here.
5585 * We'll actually encrypt the pages later,
5586 * in upl_encrypt(), when the caller has
5587 * selected which pages need to go to swap.
5588 */
5589 if (cntrl_flags & UPL_ENCRYPT)
5590 upl->flags |= UPL_ENCRYPTED;
5591
5592 if (cntrl_flags & UPL_FOR_PAGEOUT)
5593 upl->flags |= UPL_PAGEOUT;
5594
5595 vm_object_lock(object);
5596 vm_object_activity_begin(object);
5597
5598 grab_options = 0;
5599 #if CONFIG_SECLUDED_MEMORY
5600 if (object->can_grab_secluded) {
5601 grab_options |= VM_PAGE_GRAB_SECLUDED;
5602 }
5603 #endif /* CONFIG_SECLUDED_MEMORY */
5604
5605 /*
5606 * we can lock in the paging_offset once paging_in_progress is set
5607 */
5608 upl->size = size;
5609 upl->offset = offset + object->paging_offset;
5610
5611 #if CONFIG_IOSCHED || UPL_DEBUG
5612 if (object->io_tracking || upl_debug_enabled) {
5613 vm_object_activity_begin(object);
5614 queue_enter(&object->uplq, upl, upl_t, uplq);
5615 }
5616 #endif
5617 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
5618 /*
5619 * Honor copy-on-write obligations
5620 *
5621 * The caller is gathering these pages and
5622 * might modify their contents. We need to
5623 * make sure that the copy object has its own
5624 * private copies of these pages before we let
5625 * the caller modify them.
5626 */
5627 vm_object_update(object,
5628 offset,
5629 size,
5630 NULL,
5631 NULL,
5632 FALSE, /* should_return */
5633 MEMORY_OBJECT_COPY_SYNC,
5634 VM_PROT_NO_CHANGE);
5635 #if DEVELOPMENT || DEBUG
5636 upl_cow++;
5637 upl_cow_pages += size >> PAGE_SHIFT;
5638 #endif
5639 }
5640 /*
5641 * remember which copy object we synchronized with
5642 */
5643 last_copy_object = object->copy;
5644 entry = 0;
5645
5646 xfer_size = size;
5647 dst_offset = offset;
5648 size_in_pages = size / PAGE_SIZE;
5649
5650 dwp = &dw_array[0];
5651 dw_count = 0;
5652 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5653
5654 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5655 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT))
5656 object->scan_collisions = 0;
5657
5658 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5659 boolean_t isSSD = FALSE;
5660
5661 vnode_pager_get_isSSD(object->pager, &isSSD);
5662 vm_object_unlock(object);
5663
5664 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5665
5666 if (isSSD == TRUE)
5667 delay(1000 * size_in_pages);
5668 else
5669 delay(5000 * size_in_pages);
5670 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5671
5672 vm_object_lock(object);
5673 }
5674
5675 while (xfer_size) {
5676
5677 dwp->dw_mask = 0;
5678
5679 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5680 vm_object_unlock(object);
5681 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5682 vm_object_lock(object);
5683 }
5684 if (cntrl_flags & UPL_COPYOUT_FROM) {
5685 upl->flags |= UPL_PAGE_SYNC_DONE;
5686
5687 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5688 dst_page->fictitious ||
5689 dst_page->absent ||
5690 dst_page->error ||
5691 dst_page->cleaning ||
5692 (VM_PAGE_WIRED(dst_page))) {
5693
5694 if (user_page_list)
5695 user_page_list[entry].phys_addr = 0;
5696
5697 goto try_next_page;
5698 }
5699 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5700
5701 /*
5702 * grab this up front...
5703 * a high percentage of the time we're going to
5704 * need the hardware modification state a bit later
5705 * anyway... so we can eliminate an extra call into
5706 * the pmap layer by grabbing it here and recording it
5707 */
5708 if (dst_page->pmapped)
5709 refmod_state = pmap_get_refmod(phys_page);
5710 else
5711 refmod_state = 0;
5712
5713 if ( (refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
5714 /*
5715 * page is on inactive list and referenced...
5716 * reactivate it now... this gets it out of the
5717 * way of vm_pageout_scan which would have to
5718 * reactivate it upon tripping over it
5719 */
5720 dwp->dw_mask |= DW_vm_page_activate;
5721 }
5722 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5723 /*
5724 * we're only asking for DIRTY pages to be returned
5725 */
5726 if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
5727 /*
5728 * if we were the page stolen by vm_pageout_scan to be
5729 * cleaned (as opposed to a buddy being clustered in),
5730 * or this request is not being driven by a PAGEOUT cluster,
5731 * then we only need to check for the page being dirty or
5732 * precious to decide whether to return it
5733 */
5734 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
5735 goto check_busy;
5736 goto dont_return;
5737 }
5738 /*
5739 * this is a request for a PAGEOUT cluster and this page
5740 * is merely along for the ride as a 'buddy'... not only
5741 * does it have to be dirty to be returned, but it also
5742 * can't have been referenced recently...
5743 */
5744 if ( (hibernate_cleaning_in_progress == TRUE ||
5745 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) ||
5746 (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5747 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
5748 goto check_busy;
5749 }
5750 dont_return:
5751 /*
5752 * if we reach here, we're not to return
5753 * the page... go on to the next one
5754 */
5755 if (dst_page->laundry == TRUE) {
5756 /*
5757 * if we get here, the page is not 'cleaning' (filtered out above).
5758 * since it has been referenced, remove it from the laundry
5759 * so we don't pay the cost of an I/O to clean a page
5760 * we're just going to take back
5761 */
5762 vm_page_lockspin_queues();
5763
5764 vm_pageout_steal_laundry(dst_page, TRUE);
5765 vm_page_activate(dst_page);
5766
5767 vm_page_unlock_queues();
5768 }
5769 if (user_page_list)
5770 user_page_list[entry].phys_addr = 0;
5771
5772 goto try_next_page;
5773 }
5774 check_busy:
5775 if (dst_page->busy) {
5776 if (cntrl_flags & UPL_NOBLOCK) {
5777 if (user_page_list)
5778 user_page_list[entry].phys_addr = 0;
5779 dwp->dw_mask = 0;
5780
5781 goto try_next_page;
5782 }
5783 /*
5784 * someone else is playing with the
5785 * page. We will have to wait.
5786 */
5787 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5788
5789 continue;
5790 }
5791 /*
5792 * ENCRYPTED SWAP:
5793 * The caller is gathering this page and might
5794 * access its contents later on. Decrypt the
5795 * page before adding it to the UPL, so that
5796 * the caller never sees encrypted data.
5797 */
5798 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
5799 int was_busy;
5800
5801 /*
5802 * save the current state of busy;
5803 * mark page as busy while decrypt
5804 * is in progress since it will drop
5805 * the object lock...
5806 */
5807 was_busy = dst_page->busy;
5808 dst_page->busy = TRUE;
5809
5810 vm_page_decrypt(dst_page, 0);
5811 vm_page_decrypt_for_upl_counter++;
5812 /*
5813 * restore to original busy state
5814 */
5815 dst_page->busy = was_busy;
5816 }
5817 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5818
5819 vm_page_lockspin_queues();
5820
5821 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
5822 /*
5823 * we've buddied up a page for a clustered pageout
5824 * that has already been moved to the pageout
5825 * queue by pageout_scan... we need to remove
5826 * it from the queue and drop the laundry count
5827 * on that queue
5828 */
5829 vm_pageout_throttle_up(dst_page);
5830 }
5831 vm_page_unlock_queues();
5832 }
5833 #if MACH_CLUSTER_STATS
5834 /*
5835 * pageout statistics gathering. count
5836 * all the pages we will page out that
5837 * were not counted in the initial
5838 * vm_pageout_scan work
5839 */
5840 if (dst_page->pageout)
5841 encountered_lrp = TRUE;
5842 if ((dst_page->dirty || (object->internal && dst_page->precious))) {
5843 if (encountered_lrp)
5844 CLUSTER_STAT(pages_at_higher_offsets++;)
5845 else
5846 CLUSTER_STAT(pages_at_lower_offsets++;)
5847 }
5848 #endif
5849 hw_dirty = refmod_state & VM_MEM_MODIFIED;
5850 dirty = hw_dirty ? TRUE : dst_page->dirty;
5851
5852 if (phys_page > upl->highest_page)
5853 upl->highest_page = phys_page;
5854
5855 assert (!pmap_is_noencrypt(phys_page));
5856
5857 if (cntrl_flags & UPL_SET_LITE) {
5858 unsigned int pg_num;
5859
5860 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5861 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5862 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
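/*
 * Worked example of the lite-list indexing above (illustrative):
 * for pg_num = 70, pg_num >> 5 = 2 selects the third 32-bit word of
 * lite_list and (pg_num & 31) = 6 selects bit 6 within that word, so
 * each word of the bitmap covers 32 pages of the UPL.
 */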
5863
5864 if (hw_dirty)
5865 pmap_clear_modify(phys_page);
5866
5867 /*
5868 * Mark original page as cleaning
5869 * in place.
5870 */
5871 dst_page->cleaning = TRUE;
5872 dst_page->precious = FALSE;
5873 } else {
5874 /*
5875 * use pageclean setup, it is more
5876 * convenient even for the pageout
5877 * cases here
5878 */
5879 vm_object_lock(upl->map_object);
5880 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5881 vm_object_unlock(upl->map_object);
5882
5883 alias_page->absent = FALSE;
5884 alias_page = NULL;
5885 }
5886 if (dirty) {
5887 SET_PAGE_DIRTY(dst_page, FALSE);
5888 } else {
5889 dst_page->dirty = FALSE;
5890 }
5891
5892 if (!dirty)
5893 dst_page->precious = TRUE;
5894
5895 if ( (cntrl_flags & UPL_ENCRYPT) ) {
5896 /*
5897 * ENCRYPTED SWAP:
5898 * We want to deny access to the target page
5899 * because its contents are about to be
5900 * encrypted and the user would be very
5901 * confused to see encrypted data instead
5902 * of their data.
5903 * We also set "encrypted_cleaning" to allow
5904 * vm_pageout_scan() to demote that page
5905 * from "adjacent/clean-in-place" to
5906 * "target/clean-and-free" if it bumps into
5907 * this page during its scanning while we're
5908 * still processing this cluster.
5909 */
5910 dst_page->busy = TRUE;
5911 dst_page->encrypted_cleaning = TRUE;
5912 }
5913 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
5914 if ( !VM_PAGE_WIRED(dst_page))
5915 dst_page->free_when_done = TRUE;
5916 }
5917 } else {
5918 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
5919 /*
5920 * Honor copy-on-write obligations
5921 *
5922 * The copy object has changed since we
5923 * last synchronized for copy-on-write.
5924 * Another copy object might have been
5925 * inserted while we released the object's
5926 * lock. Since someone could have seen the
5927 * original contents of the remaining pages
5928 * through that new object, we have to
5929 * synchronize with it again for the remaining
5930 * pages only. The previous pages are "busy"
5931 * so they can not be seen through the new
5932 * mapping. The new mapping will see our
5933 * upcoming changes for those previous pages,
5934 * but that's OK since they couldn't see what
5935 * was there before. It's just a race anyway
5936 * and there's no guarantee of consistency or
5937 * atomicity. We just don't want new mappings
5938 * to see both the *before* and *after* pages.
5939 */
5940 if (object->copy != VM_OBJECT_NULL) {
5941 vm_object_update(
5942 object,
5943 dst_offset,/* current offset */
5944 xfer_size, /* remaining size */
5945 NULL,
5946 NULL,
5947 FALSE, /* should_return */
5948 MEMORY_OBJECT_COPY_SYNC,
5949 VM_PROT_NO_CHANGE);
5950
5951 #if DEVELOPMENT || DEBUG
5952 upl_cow_again++;
5953 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
5954 #endif
5955 }
5956 /*
5957 * remember the copy object we synced with
5958 */
5959 last_copy_object = object->copy;
5960 }
5961 dst_page = vm_page_lookup(object, dst_offset);
5962
5963 if (dst_page != VM_PAGE_NULL) {
5964
5965 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
5966 /*
5967 * skip over pages already present in the cache
5968 */
5969 if (user_page_list)
5970 user_page_list[entry].phys_addr = 0;
5971
5972 goto try_next_page;
5973 }
5974 if (dst_page->fictitious) {
5975 panic("need corner case for fictitious page");
5976 }
5977
5978 if (dst_page->busy || dst_page->cleaning) {
5979 /*
5980 * someone else is playing with the
5981 * page. We will have to wait.
5982 */
5983 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5984
5985 continue;
5986 }
5987 if (dst_page->laundry)
5988 vm_pageout_steal_laundry(dst_page, FALSE);
5989 } else {
5990 if (object->private) {
5991 /*
5992 * This is a nasty wrinkle for users
5993 * of upl who encounter device or
5994 * private memory; however, it is
5995 * unavoidable: only a fault can
5996 * resolve the actual backing
5997 * physical page by asking the
5998 * backing device.
5999 */
6000 if (user_page_list)
6001 user_page_list[entry].phys_addr = 0;
6002
6003 goto try_next_page;
6004 }
6005 if (object->scan_collisions) {
6006 /*
6007 * the pageout_scan thread is trying to steal
6008 * pages from this object, but has run into our
6009 * lock... grab 2 pages from the head of the object...
6010 * the first is freed on behalf of pageout_scan, the
6011 * 2nd is for our own use... we use vm_object_page_grab
6012 * in both cases to avoid taking pages from the free
6013 * list since we are under memory pressure and our
6014 * lock on this object is getting in the way of
6015 * relieving it
6016 */
6017 dst_page = vm_object_page_grab(object);
6018
6019 if (dst_page != VM_PAGE_NULL)
6020 vm_page_release(dst_page,
6021 FALSE);
6022
6023 dst_page = vm_object_page_grab(object);
6024 }
6025 if (dst_page == VM_PAGE_NULL) {
6026 /*
6027 * need to allocate a page
6028 */
6029 dst_page = vm_page_grab_options(grab_options);
6030 }
6031 if (dst_page == VM_PAGE_NULL) {
6032 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6033 /*
6034 * we don't want to stall waiting for pages to come onto the free list
6035 * while we're already holding absent pages in this UPL
6036 * the caller will deal with the empty slots
6037 */
6038 if (user_page_list)
6039 user_page_list[entry].phys_addr = 0;
6040
6041 goto try_next_page;
6042 }
6043 /*
6044 * no pages available... wait
6045 * then try again for the same
6046 * offset...
6047 */
6048 vm_object_unlock(object);
6049
6050 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6051
6052 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6053
6054 VM_PAGE_WAIT();
6055 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6056
6057 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6058
6059 vm_object_lock(object);
6060
6061 continue;
6062 }
6063 vm_page_insert(dst_page, object, dst_offset);
6064
6065 dst_page->absent = TRUE;
6066 dst_page->busy = FALSE;
6067
6068 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6069 /*
6070 * if UPL_RET_ONLY_ABSENT was specified,
6071 * then we're definitely setting up a
6072 * UPL for a clustered read/pagein
6073 * operation... mark the pages as clustered
6074 * so upl_commit_range can put them on the
6075 * speculative list
6076 */
6077 dst_page->clustered = TRUE;
6078
6079 if ( !(cntrl_flags & UPL_FILE_IO))
6080 VM_STAT_INCR(pageins);
6081 }
6082 }
6083 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6084
6085 /*
6086 * ENCRYPTED SWAP:
6087 */
6088 if (cntrl_flags & UPL_ENCRYPT) {
6089 /*
6090 * The page is going to be encrypted when we
6091 * get it from the pager, so mark it so.
6092 */
6093 dst_page->encrypted = TRUE;
6094 } else {
6095 /*
6096 * Otherwise, the page will not contain
6097 * encrypted data.
6098 */
6099 dst_page->encrypted = FALSE;
6100 }
6101 dst_page->overwriting = TRUE;
6102
6103 if (dst_page->pmapped) {
6104 if ( !(cntrl_flags & UPL_FILE_IO))
6105 /*
6106 * eliminate all mappings from the
6107 * original object and its progeny
6108 */
6109 refmod_state = pmap_disconnect(phys_page);
6110 else
6111 refmod_state = pmap_get_refmod(phys_page);
6112 } else
6113 refmod_state = 0;
6114
6115 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6116 dirty = hw_dirty ? TRUE : dst_page->dirty;
6117
6118 if (cntrl_flags & UPL_SET_LITE) {
6119 unsigned int pg_num;
6120
6121 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
6122 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
6123 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
6124
6125 if (hw_dirty)
6126 pmap_clear_modify(phys_page);
6127
6128 /*
6129 * Mark original page as cleaning
6130 * in place.
6131 */
6132 dst_page->cleaning = TRUE;
6133 dst_page->precious = FALSE;
6134 } else {
6135 /*
6136 * use pageclean setup, it is more
6137 * convenient even for the pageout
6138 * cases here
6139 */
6140 vm_object_lock(upl->map_object);
6141 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6142 vm_object_unlock(upl->map_object);
6143
6144 alias_page->absent = FALSE;
6145 alias_page = NULL;
6146 }
6147
6148 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6149 upl->flags &= ~UPL_CLEAR_DIRTY;
6150 upl->flags |= UPL_SET_DIRTY;
6151 dirty = TRUE;
6153 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6154 /*
6155 * clean in place for read implies
6156 * that a write will be done on all
6157 * the pages that are dirty before
6158 * a upl commit is done. The caller
6159 * is obligated to preserve the
6160 * contents of all pages marked dirty
6161 */
6162 upl->flags |= UPL_CLEAR_DIRTY;
6163 }
6164 dst_page->dirty = dirty;
6165
6166 if (!dirty)
6167 dst_page->precious = TRUE;
6168
6169 if ( !VM_PAGE_WIRED(dst_page)) {
6170 /*
6171 * deny access to the target page while
6172 * it is being worked on
6173 */
6174 dst_page->busy = TRUE;
6175 } else
6176 dwp->dw_mask |= DW_vm_page_wire;
6177
6178 /*
6179 * We might be about to satisfy a fault which has been
6180 * requested. So no need for the "restart" bit.
6181 */
6182 dst_page->restart = FALSE;
6183 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6184 /*
6185 * expect the page to be used
6186 */
6187 dwp->dw_mask |= DW_set_reference;
6188 }
6189 if (cntrl_flags & UPL_PRECIOUS) {
6190 if (object->internal) {
6191 SET_PAGE_DIRTY(dst_page, FALSE);
6192 dst_page->precious = FALSE;
6193 } else {
6194 dst_page->precious = TRUE;
6195 }
6196 } else {
6197 dst_page->precious = FALSE;
6198 }
6199 }
6200 if (dst_page->busy)
6201 upl->flags |= UPL_HAS_BUSY;
6202
6203 if (phys_page > upl->highest_page)
6204 upl->highest_page = phys_page;
6205 assert (!pmap_is_noencrypt(phys_page));
6206 if (user_page_list) {
6207 user_page_list[entry].phys_addr = phys_page;
6208 user_page_list[entry].free_when_done = dst_page->free_when_done;
6209 user_page_list[entry].absent = dst_page->absent;
6210 user_page_list[entry].dirty = dst_page->dirty;
6211 user_page_list[entry].precious = dst_page->precious;
6212 user_page_list[entry].device = FALSE;
6213 user_page_list[entry].needed = FALSE;
6214 if (dst_page->clustered == TRUE)
6215 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6216 else
6217 user_page_list[entry].speculative = FALSE;
6218 user_page_list[entry].cs_validated = dst_page->cs_validated;
6219 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
6220 user_page_list[entry].cs_nx = dst_page->cs_nx;
6221 user_page_list[entry].mark = FALSE;
6222 }
6223 /*
6224 * if UPL_RET_ONLY_ABSENT is set, then
6225 * we are working with a fresh page and we've
6226 * just set the clustered flag on it to
6227 * indicate that it was dragged in as part of a
6228 * speculative cluster... so leave it alone
6229 */
6230 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6231 /*
6232 * someone is explicitly grabbing this page...
6233 * update clustered and speculative state
6234 *
6235 */
6236 if (dst_page->clustered)
6237 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6238 }
6239 try_next_page:
6240 if (dwp->dw_mask) {
6241 if (dwp->dw_mask & DW_vm_page_activate)
6242 VM_STAT_INCR(reactivations);
6243
6244 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6245
6246 if (dw_count >= dw_limit) {
6247 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
6248
6249 dwp = &dw_array[0];
6250 dw_count = 0;
6251 }
6252 }
6253 entry++;
6254 dst_offset += PAGE_SIZE_64;
6255 xfer_size -= PAGE_SIZE;
6256 }
6257 if (dw_count)
6258 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
6259
6260 if (alias_page != NULL) {
6261 VM_PAGE_FREE(alias_page);
6262 }
6263
6264 if (page_list_count != NULL) {
6265 if (upl->flags & UPL_INTERNAL)
6266 *page_list_count = 0;
6267 else if (*page_list_count > entry)
6268 *page_list_count = entry;
6269 }
6270 #if UPL_DEBUG
6271 upl->upl_state = 1;
6272 #endif
6273 vm_object_unlock(object);
6274
6275 return KERN_SUCCESS;
6276 }
6277
6278 /*
6279 * Routine: vm_object_super_upl_request
6280 * Purpose:
6281 * Cause the population of a portion of a vm_object
6282 * in much the same way as memory_object_upl_request.
6283 * Depending on the nature of the request, the pages
6284 * returned may contain valid data or be uninitialized.
6285 * However, the region may be expanded up to the super
6286 * cluster size provided.
6287 */
6288
6289 __private_extern__ kern_return_t
6290 vm_object_super_upl_request(
6291 vm_object_t object,
6292 vm_object_offset_t offset,
6293 upl_size_t size,
6294 upl_size_t super_cluster,
6295 upl_t *upl,
6296 upl_page_info_t *user_page_list,
6297 unsigned int *page_list_count,
6298 upl_control_flags_t cntrl_flags)
6299 {
6300 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
6301 return KERN_FAILURE;
6302
6303 assert(object->paging_in_progress);
6304 offset = offset - object->paging_offset;
6305
6306 if (super_cluster > size) {
6307
6308 vm_object_offset_t base_offset;
6309 upl_size_t super_size;
6310 vm_object_size_t super_size_64;
6311
6312 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6313 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
6314 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6315 super_size = (upl_size_t) super_size_64;
6316 assert(super_size == super_size_64);
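/*
 * Worked example of the clipping above (illustrative values, assuming a
 * power-of-two super_cluster): with super_cluster = 0x100000, offset =
 * 0x126000 and size = 0x4000, base_offset = 0x126000 & ~0xFFFFF = 0x100000;
 * since offset + size stays below base_offset + super_cluster, super_size
 * remains one full cluster (0x100000).  A request straddling the cluster
 * boundary would double super_size, and in either case the result is
 * clipped so it does not extend past the end of the object.
 */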
6317
6318 if (offset > (base_offset + super_size)) {
6319 panic("vm_object_super_upl_request: Missed target pageout"
6320 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6321 offset, base_offset, super_size, super_cluster,
6322 size, object->paging_offset);
6323 }
6324 /*
6325 * apparently there is a case where the vm requests a
6326 * page to be written out whose offset is beyond the
6327 * object size
6328 */
6329 if ((offset + size) > (base_offset + super_size)) {
6330 super_size_64 = (offset + size) - base_offset;
6331 super_size = (upl_size_t) super_size_64;
6332 assert(super_size == super_size_64);
6333 }
6334
6335 offset = base_offset;
6336 size = super_size;
6337 }
6338 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
6339 }
6340
6341
6342 kern_return_t
6343 vm_map_create_upl(
6344 vm_map_t map,
6345 vm_map_address_t offset,
6346 upl_size_t *upl_size,
6347 upl_t *upl,
6348 upl_page_info_array_t page_list,
6349 unsigned int *count,
6350 upl_control_flags_t *flags)
6351 {
6352 vm_map_entry_t entry;
6353 upl_control_flags_t caller_flags;
6354 int force_data_sync;
6355 int sync_cow_data;
6356 vm_object_t local_object;
6357 vm_map_offset_t local_offset;
6358 vm_map_offset_t local_start;
6359 kern_return_t ret;
6360
6361 assert(page_aligned(offset));
6362
6363 caller_flags = *flags;
6364
6365 if (caller_flags & ~UPL_VALID_FLAGS) {
6366 /*
6367 * For forward compatibility's sake,
6368 * reject any unknown flag.
6369 */
6370 return KERN_INVALID_VALUE;
6371 }
6372 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6373 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6374
6375 if (upl == NULL)
6376 return KERN_INVALID_ARGUMENT;
6377
6378 REDISCOVER_ENTRY:
6379 vm_map_lock_read(map);
6380
6381 if (!vm_map_lookup_entry(map, offset, &entry)) {
6382 vm_map_unlock_read(map);
6383 return KERN_FAILURE;
6384 }
6385
6386 if ((entry->vme_end - offset) < *upl_size) {
6387 *upl_size = (upl_size_t) (entry->vme_end - offset);
6388 assert(*upl_size == entry->vme_end - offset);
6389 }
6390
6391 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6392 *flags = 0;
6393
6394 if (!entry->is_sub_map &&
6395 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6396 if (VME_OBJECT(entry)->private)
6397 *flags = UPL_DEV_MEMORY;
6398
6399 if (VME_OBJECT(entry)->phys_contiguous)
6400 *flags |= UPL_PHYS_CONTIG;
6401 }
6402 vm_map_unlock_read(map);
6403 return KERN_SUCCESS;
6404 }
6405
6406 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6407 !VME_OBJECT(entry)->phys_contiguous) {
6408 if (*upl_size > MAX_UPL_SIZE_BYTES)
6409 *upl_size = MAX_UPL_SIZE_BYTES;
6410 }
6411
6412 /*
6413 * Create an object if necessary.
6414 */
6415 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6416
6417 if (vm_map_lock_read_to_write(map))
6418 goto REDISCOVER_ENTRY;
6419
6420 VME_OBJECT_SET(entry,
6421 vm_object_allocate((vm_size_t)
6422 (entry->vme_end -
6423 entry->vme_start)));
6424 VME_OFFSET_SET(entry, 0);
6425
6426 vm_map_lock_write_to_read(map);
6427 }
6428
6429 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6430 !(entry->protection & VM_PROT_WRITE)) {
6431 vm_map_unlock_read(map);
6432 return KERN_PROTECTION_FAILURE;
6433 }
6434
6435
6436 local_object = VME_OBJECT(entry);
6437 assert(local_object != VM_OBJECT_NULL);
6438
6439 if (!entry->is_sub_map &&
6440 !entry->needs_copy &&
6441 *upl_size != 0 &&
6442 local_object->vo_size > *upl_size && /* partial UPL */
6443 entry->wired_count == 0 && /* No COW for entries that are wired */
6444 (map->pmap != kernel_pmap) && /* alias checks */
6445 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6446 ||
6447 (/* case 2 */
6448 local_object->internal &&
6449 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6450 local_object->ref_count > 1))) {
6451 vm_prot_t prot;
6452
6453 /*
6454 * Case 1:
6455 * Set up the targeted range for copy-on-write to avoid
6456 * applying true_share/copy_delay to the entire object.
6457 *
6458 * Case 2:
6459 * This map entry covers only part of an internal
6460 * object. There could be other map entries covering
6461 * other areas of this object and some of these map
6462 * entries could be marked as "needs_copy", which
6463 * assumes that the object is COPY_SYMMETRIC.
6464 * To avoid marking this object as COPY_DELAY and
6465 * "true_share", let's shadow it and mark the new
6466 * (smaller) object as "true_share" and COPY_DELAY.
6467 */
6468
6469 if (vm_map_lock_read_to_write(map)) {
6470 goto REDISCOVER_ENTRY;
6471 }
6472 vm_map_lock_assert_exclusive(map);
6473 assert(VME_OBJECT(entry) == local_object);
6474
6475 vm_map_clip_start(map,
6476 entry,
6477 vm_map_trunc_page(offset,
6478 VM_MAP_PAGE_MASK(map)));
6479 vm_map_clip_end(map,
6480 entry,
6481 vm_map_round_page(offset + *upl_size,
6482 VM_MAP_PAGE_MASK(map)));
6483 if ((entry->vme_end - offset) < *upl_size) {
6484 *upl_size = (upl_size_t) (entry->vme_end - offset);
6485 assert(*upl_size == entry->vme_end - offset);
6486 }
6487
6488 prot = entry->protection & ~VM_PROT_WRITE;
6489 if (override_nx(map, VME_ALIAS(entry)) && prot)
6490 prot |= VM_PROT_EXECUTE;
6491 vm_object_pmap_protect(local_object,
6492 VME_OFFSET(entry),
6493 entry->vme_end - entry->vme_start,
6494 ((entry->is_shared ||
6495 map->mapped_in_other_pmaps)
6496 ? PMAP_NULL
6497 : map->pmap),
6498 entry->vme_start,
6499 prot);
6500
6501 assert(entry->wired_count == 0);
6502
6503 /*
6504 * Lock the VM object and re-check its status: if it's mapped
6505 * in another address space, we could still be racing with
6506 * another thread holding that other VM map exclusively.
6507 */
6508 vm_object_lock(local_object);
6509 if (local_object->true_share) {
6510 /* object is already in proper state: no COW needed */
6511 assert(local_object->copy_strategy !=
6512 MEMORY_OBJECT_COPY_SYMMETRIC);
6513 } else {
6514 /* not true_share: ask for copy-on-write below */
6515 assert(local_object->copy_strategy ==
6516 MEMORY_OBJECT_COPY_SYMMETRIC);
6517 entry->needs_copy = TRUE;
6518 }
6519 vm_object_unlock(local_object);
6520
6521 vm_map_lock_write_to_read(map);
6522 }
6523
6524 if (entry->needs_copy) {
6525 /*
6526 * Honor copy-on-write for COPY_SYMMETRIC
6527 * strategy.
6528 */
6529 vm_map_t local_map;
6530 vm_object_t object;
6531 vm_object_offset_t new_offset;
6532 vm_prot_t prot;
6533 boolean_t wired;
6534 vm_map_version_t version;
6535 vm_map_t real_map;
6536 vm_prot_t fault_type;
6537
6538 local_map = map;
6539
6540 if (caller_flags & UPL_COPYOUT_FROM) {
6541 fault_type = VM_PROT_READ | VM_PROT_COPY;
6542 vm_counters.create_upl_extra_cow++;
6543 vm_counters.create_upl_extra_cow_pages +=
6544 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6545 } else {
6546 fault_type = VM_PROT_WRITE;
6547 }
6548 if (vm_map_lookup_locked(&local_map,
6549 offset, fault_type,
6550 OBJECT_LOCK_EXCLUSIVE,
6551 &version, &object,
6552 &new_offset, &prot, &wired,
6553 NULL,
6554 &real_map) != KERN_SUCCESS) {
6555 if (fault_type == VM_PROT_WRITE) {
6556 vm_counters.create_upl_lookup_failure_write++;
6557 } else {
6558 vm_counters.create_upl_lookup_failure_copy++;
6559 }
6560 vm_map_unlock_read(local_map);
6561 return KERN_FAILURE;
6562 }
6563 if (real_map != map)
6564 vm_map_unlock(real_map);
6565 vm_map_unlock_read(local_map);
6566
6567 vm_object_unlock(object);
6568
6569 goto REDISCOVER_ENTRY;
6570 }
6571
6572 if (entry->is_sub_map) {
6573 vm_map_t submap;
6574
6575 submap = VME_SUBMAP(entry);
6576 local_start = entry->vme_start;
6577 local_offset = VME_OFFSET(entry);
6578
6579 vm_map_reference(submap);
6580 vm_map_unlock_read(map);
6581
6582 ret = vm_map_create_upl(submap,
6583 local_offset + (offset - local_start),
6584 upl_size, upl, page_list, count, flags);
6585 vm_map_deallocate(submap);
6586
6587 return ret;
6588 }
6589
6590 if (sync_cow_data &&
6591 (VME_OBJECT(entry)->shadow ||
6592 VME_OBJECT(entry)->copy)) {
6593 local_object = VME_OBJECT(entry);
6594 local_start = entry->vme_start;
6595 local_offset = VME_OFFSET(entry);
6596
6597 vm_object_reference(local_object);
6598 vm_map_unlock_read(map);
6599
6600 if (local_object->shadow && local_object->copy) {
6601 vm_object_lock_request(local_object->shadow,
6602 ((vm_object_offset_t)
6603 ((offset - local_start) +
6604 local_offset) +
6605 local_object->vo_shadow_offset),
6606 *upl_size, FALSE,
6607 MEMORY_OBJECT_DATA_SYNC,
6608 VM_PROT_NO_CHANGE);
6609 }
6610 sync_cow_data = FALSE;
6611 vm_object_deallocate(local_object);
6612
6613 goto REDISCOVER_ENTRY;
6614 }
6615 if (force_data_sync) {
6616 local_object = VME_OBJECT(entry);
6617 local_start = entry->vme_start;
6618 local_offset = VME_OFFSET(entry);
6619
6620 vm_object_reference(local_object);
6621 vm_map_unlock_read(map);
6622
6623 vm_object_lock_request(local_object,
6624 ((vm_object_offset_t)
6625 ((offset - local_start) +
6626 local_offset)),
6627 (vm_object_size_t)*upl_size,
6628 FALSE,
6629 MEMORY_OBJECT_DATA_SYNC,
6630 VM_PROT_NO_CHANGE);
6631
6632 force_data_sync = FALSE;
6633 vm_object_deallocate(local_object);
6634
6635 goto REDISCOVER_ENTRY;
6636 }
6637 if (VME_OBJECT(entry)->private)
6638 *flags = UPL_DEV_MEMORY;
6639 else
6640 *flags = 0;
6641
6642 if (VME_OBJECT(entry)->phys_contiguous)
6643 *flags |= UPL_PHYS_CONTIG;
6644
6645 local_object = VME_OBJECT(entry);
6646 local_offset = VME_OFFSET(entry);
6647 local_start = entry->vme_start;
6648
6649
6650 vm_object_lock(local_object);
6651
6652 /*
6653 * Ensure that this object is "true_share" and "copy_delay" now,
6654 * while we're still holding the VM map lock. After we unlock the map,
6655 * anything could happen to that mapping, including some copy-on-write
6656 * activity. We need to make sure that the IOPL will point at the
6657 * same memory as the mapping.
6658 */
6659 if (local_object->true_share) {
6660 assert(local_object->copy_strategy !=
6661 MEMORY_OBJECT_COPY_SYMMETRIC);
6662 } else if (local_object != kernel_object &&
6663 local_object != compressor_object &&
6664 !local_object->phys_contiguous) {
6665 #if VM_OBJECT_TRACKING_OP_TRUESHARE
6666 if (!local_object->true_share &&
6667 vm_object_tracking_inited) {
6668 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
6669 int num = 0;
6670 num = OSBacktrace(bt,
6671 VM_OBJECT_TRACKING_BTDEPTH);
6672 btlog_add_entry(vm_object_tracking_btlog,
6673 local_object,
6674 VM_OBJECT_TRACKING_OP_TRUESHARE,
6675 bt,
6676 num);
6677 }
6678 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
6679 local_object->true_share = TRUE;
6680 if (local_object->copy_strategy ==
6681 MEMORY_OBJECT_COPY_SYMMETRIC) {
6682 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6683 }
6684 }
6685
6686 vm_object_reference_locked(local_object);
6687 vm_object_unlock(local_object);
6688
6689 vm_map_unlock_read(map);
6690
6691 ret = vm_object_iopl_request(local_object,
6692 ((vm_object_offset_t)
6693 ((offset - local_start) + local_offset)),
6694 *upl_size,
6695 upl,
6696 page_list,
6697 count,
6698 caller_flags);
6699 vm_object_deallocate(local_object);
6700
6701 return ret;
6702 }
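/*
 * Minimal sketch of a typical caller sequence for the routines in this file
 * (illustrative only, never compiled; real callers pass additional flags,
 * supply a real page list and check every return value; "map" and "offset"
 * stand in for the caller's map and start address):
 */
#if 0
	upl_t			upl;
	upl_size_t		upl_size = PAGE_SIZE;
	unsigned int		count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	upl_control_flags_t	flags = UPL_SET_INTERNAL | UPL_SET_LITE;
	vm_map_offset_t		kaddr;
	boolean_t		empty;

	if (vm_map_create_upl(map, offset, &upl_size, &upl, NULL, &count, &flags) == KERN_SUCCESS) {
		vm_map_enter_upl(kernel_map, upl, &kaddr);	/* adds a mapping reference */
		/* ... access the pages through kaddr ... */
		vm_map_remove_upl(kernel_map, upl);		/* drops the mapping reference */
		upl_commit_range(upl, 0, upl_size, 0, NULL, 0, &empty);
		upl_deallocate(upl);				/* drops the creation reference */
	}
#endif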
6703
6704 /*
6705 * Internal routine to enter a UPL into a VM map.
6706 *
6707 * JMM - This should just be doable through the standard
6708 * vm_map_enter() API.
6709 */
6710 kern_return_t
6711 vm_map_enter_upl(
6712 vm_map_t map,
6713 upl_t upl,
6714 vm_map_offset_t *dst_addr)
6715 {
6716 vm_map_size_t size;
6717 vm_object_offset_t offset;
6718 vm_map_offset_t addr;
6719 vm_page_t m;
6720 kern_return_t kr;
6721 int isVectorUPL = 0, curr_upl=0;
6722 upl_t vector_upl = NULL;
6723 vm_offset_t vector_upl_dst_addr = 0;
6724 vm_map_t vector_upl_submap = NULL;
6725 upl_offset_t subupl_offset = 0;
6726 upl_size_t subupl_size = 0;
6727
6728 if (upl == UPL_NULL)
6729 return KERN_INVALID_ARGUMENT;
6730
6731 if((isVectorUPL = vector_upl_is_valid(upl))) {
6732 int mapped=0,valid_upls=0;
6733 vector_upl = upl;
6734
6735 upl_lock(vector_upl);
6736 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6737 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6738 if(upl == NULL)
6739 continue;
6740 valid_upls++;
6741 if (UPL_PAGE_LIST_MAPPED & upl->flags)
6742 mapped++;
6743 }
6744
6745 if(mapped) {
6746 if(mapped != valid_upls)
6747 panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped\n", mapped, valid_upls);
6748 else {
6749 upl_unlock(vector_upl);
6750 return KERN_FAILURE;
6751 }
6752 }
6753
6754 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
6755 if( kr != KERN_SUCCESS )
6756 panic("Vector UPL submap allocation failed\n");
6757 map = vector_upl_submap;
6758 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
6759 curr_upl=0;
6760 }
6761 else
6762 upl_lock(upl);
6763
6764 process_upl_to_enter:
6765 if(isVectorUPL){
6766 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6767 *dst_addr = vector_upl_dst_addr;
6768 upl_unlock(vector_upl);
6769 return KERN_SUCCESS;
6770 }
6771 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
6772 if(upl == NULL)
6773 goto process_upl_to_enter;
6774
6775 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
6776 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
6777 } else {
6778 /*
6779 * check to see if already mapped
6780 */
6781 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6782 upl_unlock(upl);
6783 return KERN_FAILURE;
6784 }
6785 }
6786 if ((!(upl->flags & UPL_SHADOWED)) &&
6787 ((upl->flags & UPL_HAS_BUSY) ||
6788 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
6789
6790 vm_object_t object;
6791 vm_page_t alias_page;
6792 vm_object_offset_t new_offset;
6793 unsigned int pg_num;
6794 wpl_array_t lite_list;
6795
6796 if (upl->flags & UPL_INTERNAL) {
6797 lite_list = (wpl_array_t)
6798 ((((uintptr_t)upl) + sizeof(struct upl))
6799 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
6800 } else {
6801 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
6802 }
6803 object = upl->map_object;
6804 upl->map_object = vm_object_allocate(upl->size);
6805
6806 vm_object_lock(upl->map_object);
6807
6808 upl->map_object->shadow = object;
6809 upl->map_object->pageout = TRUE;
6810 upl->map_object->can_persist = FALSE;
6811 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6812 upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset;
6813 upl->map_object->wimg_bits = object->wimg_bits;
6814 offset = upl->map_object->vo_shadow_offset;
6815 new_offset = 0;
6816 size = upl->size;
6817
6818 upl->flags |= UPL_SHADOWED;
6819
6820 while (size) {
6821 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
6822 assert(pg_num == new_offset / PAGE_SIZE);
6823
6824 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
6825
6826 VM_PAGE_GRAB_FICTITIOUS(alias_page);
6827
6828 vm_object_lock(object);
6829
6830 m = vm_page_lookup(object, offset);
6831 if (m == VM_PAGE_NULL) {
6832 panic("vm_upl_map: page missing\n");
6833 }
6834
6835 /*
6836 * Convert the fictitious page to a private
6837 * shadow of the real page.
6838 */
6839 assert(alias_page->fictitious);
6840 alias_page->fictitious = FALSE;
6841 alias_page->private = TRUE;
6842 alias_page->free_when_done = TRUE;
6843 /*
6844 * since m is a page in the upl it must
6845 * already be wired or BUSY, so it's
6846 * safe to assign the underlying physical
6847 * page to the alias
6848 */
6849 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
6850
6851 vm_object_unlock(object);
6852
6853 vm_page_lockspin_queues();
6854 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
6855 vm_page_unlock_queues();
6856
6857 /*
6858 * ENCRYPTED SWAP:
6859 * The virtual page ("m") has to be wired in some way
6860 * here or its backing physical page could
6861 * be recycled at any time.
6862 * Assuming this is enforced by the caller, we can't
6863 * get an encrypted page here. Since the encryption
6864 * key depends on the VM page's "pager" object and
6865 * the "paging_offset", we couldn't handle 2 pageable
6866 * VM pages (with different pagers and paging_offsets)
6867 * sharing the same physical page: we could end up
6868 * encrypting with one key (via one VM page) and
6869 * decrypting with another key (via the alias VM page).
6870 */
6871 ASSERT_PAGE_DECRYPTED(m);
6872
6873 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
6874
6875 assert(!alias_page->wanted);
6876 alias_page->busy = FALSE;
6877 alias_page->absent = FALSE;
6878 }
6879 size -= PAGE_SIZE;
6880 offset += PAGE_SIZE_64;
6881 new_offset += PAGE_SIZE_64;
6882 }
6883 vm_object_unlock(upl->map_object);
6884 }
6885 if (upl->flags & UPL_SHADOWED)
6886 offset = 0;
6887 else
6888 offset = upl->offset - upl->map_object->paging_offset;
6889
6890 size = upl->size;
6891
6892 vm_object_reference(upl->map_object);
6893
6894 if(!isVectorUPL) {
6895 *dst_addr = 0;
6896 /*
6897 * NEED A UPL_MAP ALIAS
6898 */
6899 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
6900 VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK),
6901 upl->map_object, offset, FALSE,
6902 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6903
6904 if (kr != KERN_SUCCESS) {
6905 vm_object_deallocate(upl->map_object);
6906 upl_unlock(upl);
6907 return(kr);
6908 }
6909 }
6910 else {
6911 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
6912 VM_FLAGS_FIXED | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK),
6913 upl->map_object, offset, FALSE,
6914 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6915 if(kr)
6916 panic("vm_map_enter failed for a Vector UPL\n");
6917 }
6918 vm_object_lock(upl->map_object);
6919
6920 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
6921 m = vm_page_lookup(upl->map_object, offset);
6922
6923 if (m) {
6924 m->pmapped = TRUE;
6925
6926 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
6927 * but only in kernel space. If this was on a user map,
6928 * we'd have to set the wpmapped bit. */
6929 /* m->wpmapped = TRUE; */
6930 assert(map->pmap == kernel_pmap);
6931
6932 PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE);
6933 }
6934 offset += PAGE_SIZE_64;
6935 }
6936 vm_object_unlock(upl->map_object);
6937
6938 /*
6939 * hold a reference for the mapping
6940 */
6941 upl->ref_count++;
6942 upl->flags |= UPL_PAGE_LIST_MAPPED;
6943 upl->kaddr = (vm_offset_t) *dst_addr;
6944 assert(upl->kaddr == *dst_addr);
6945
6946 if(isVectorUPL)
6947 goto process_upl_to_enter;
6948
6949 upl_unlock(upl);
6950
6951 return KERN_SUCCESS;
6952 }
6953
6954 /*
6955 * Internal routine to remove a UPL mapping from a VM map.
6956 *
6957 * XXX - This should just be doable through a standard
6958 * vm_map_remove() operation. Otherwise, implicit clean-up
6959 * of the target map won't be able to correctly remove
6960 * these (and release the reference on the UPL). Having
6961 * to do this means we can't map these into user-space
6962 * maps yet.
6963 */
6964 kern_return_t
6965 vm_map_remove_upl(
6966 vm_map_t map,
6967 upl_t upl)
6968 {
6969 vm_address_t addr;
6970 upl_size_t size;
6971 int isVectorUPL = 0, curr_upl = 0;
6972 upl_t vector_upl = NULL;
6973
6974 if (upl == UPL_NULL)
6975 return KERN_INVALID_ARGUMENT;
6976
6977 if((isVectorUPL = vector_upl_is_valid(upl))) {
6978 int unmapped=0, valid_upls=0;
6979 vector_upl = upl;
6980 upl_lock(vector_upl);
6981 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6982 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6983 if(upl == NULL)
6984 continue;
6985 valid_upls++;
6986 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
6987 unmapped++;
6988 }
6989
6990 if(unmapped) {
6991 if(unmapped != valid_upls)
6992 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
6993 else {
6994 upl_unlock(vector_upl);
6995 return KERN_FAILURE;
6996 }
6997 }
6998 curr_upl=0;
6999 }
7000 else
7001 upl_lock(upl);
7002
7003 process_upl_to_remove:
7004 if(isVectorUPL) {
7005 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
7006 vm_map_t v_upl_submap;
7007 vm_offset_t v_upl_submap_dst_addr;
7008 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7009
7010 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
7011 vm_map_deallocate(v_upl_submap);
7012 upl_unlock(vector_upl);
7013 return KERN_SUCCESS;
7014 }
7015
7016 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7017 if(upl == NULL)
7018 goto process_upl_to_remove;
7019 }
7020
7021 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7022 addr = upl->kaddr;
7023 size = upl->size;
7024
7025 assert(upl->ref_count > 1);
7026 upl->ref_count--; /* removing mapping ref */
7027
7028 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7029 upl->kaddr = (vm_offset_t) 0;
7030
7031 if(!isVectorUPL) {
7032 upl_unlock(upl);
7033
7034 vm_map_remove(
7035 map,
7036 vm_map_trunc_page(addr,
7037 VM_MAP_PAGE_MASK(map)),
7038 vm_map_round_page(addr + size,
7039 VM_MAP_PAGE_MASK(map)),
7040 VM_MAP_NO_FLAGS);
7041
7042 return KERN_SUCCESS;
7043 }
7044 else {
7045 /*
7046 * If it's a Vectored UPL, we'll be removing the entire
7047 * submap anyway, so no need to remove individual UPL
7048 * element mappings from within the submap
7049 */
7050 goto process_upl_to_remove;
7051 }
7052 }
7053 upl_unlock(upl);
7054
7055 return KERN_FAILURE;
7056 }
7057
7058
7059 kern_return_t
7060 upl_commit_range(
7061 upl_t upl,
7062 upl_offset_t offset,
7063 upl_size_t size,
7064 int flags,
7065 upl_page_info_t *page_list,
7066 mach_msg_type_number_t count,
7067 boolean_t *empty)
7068 {
7069 upl_size_t xfer_size, subupl_size = size;
7070 vm_object_t shadow_object;
7071 vm_object_t object;
7072 vm_object_t m_object;
7073 vm_object_offset_t target_offset;
7074 upl_offset_t subupl_offset = offset;
7075 int entry;
7076 wpl_array_t lite_list;
7077 int occupied;
7078 int clear_refmod = 0;
7079 int pgpgout_count = 0;
7080 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7081 struct vm_page_delayed_work *dwp;
7082 int dw_count;
7083 int dw_limit;
7084 int isVectorUPL = 0;
7085 upl_t vector_upl = NULL;
7086 boolean_t should_be_throttled = FALSE;
7087
7088 vm_page_t nxt_page = VM_PAGE_NULL;
7089 int fast_path_possible = 0;
7090 int fast_path_full_commit = 0;
7091 int throttle_page = 0;
7092 int unwired_count = 0;
7093 int local_queue_count = 0;
7094 vm_page_t first_local, last_local;
7095
7096 *empty = FALSE;
7097
7098 if (upl == UPL_NULL)
7099 return KERN_INVALID_ARGUMENT;
7100
7101 if (count == 0)
7102 page_list = NULL;
7103
7104 if((isVectorUPL = vector_upl_is_valid(upl))) {
7105 vector_upl = upl;
7106 upl_lock(vector_upl);
7107 }
7108 else
7109 upl_lock(upl);
7110
7111 process_upl_to_commit:
7112
7113 if(isVectorUPL) {
7114 size = subupl_size;
7115 offset = subupl_offset;
7116 if(size == 0) {
7117 upl_unlock(vector_upl);
7118 return KERN_SUCCESS;
7119 }
7120 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7121 if(upl == NULL) {
7122 upl_unlock(vector_upl);
7123 return KERN_FAILURE;
7124 }
7125 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7126 subupl_size -= size;
7127 subupl_offset += size;
7128 }
7129
7130 #if UPL_DEBUG
7131 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7132 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7133
7134 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7135 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7136
7137 upl->upl_commit_index++;
7138 }
7139 #endif
7140 if (upl->flags & UPL_DEVICE_MEMORY)
7141 xfer_size = 0;
7142 else if ((offset + size) <= upl->size)
7143 xfer_size = size;
7144 else {
7145 if(!isVectorUPL)
7146 upl_unlock(upl);
7147 else {
7148 upl_unlock(vector_upl);
7149 }
7150 return KERN_FAILURE;
7151 }
7152 if (upl->flags & UPL_SET_DIRTY)
7153 flags |= UPL_COMMIT_SET_DIRTY;
7154 if (upl->flags & UPL_CLEAR_DIRTY)
7155 flags |= UPL_COMMIT_CLEAR_DIRTY;
7156
7157 if (upl->flags & UPL_INTERNAL)
7158 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7159 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
7160 else
7161 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
7162
7163 object = upl->map_object;
7164
7165 if (upl->flags & UPL_SHADOWED) {
7166 vm_object_lock(object);
7167 shadow_object = object->shadow;
7168 } else {
7169 shadow_object = object;
7170 }
7171 entry = offset/PAGE_SIZE;
7172 target_offset = (vm_object_offset_t)offset;
7173
7174 assert(!(target_offset & PAGE_MASK));
7175 assert(!(xfer_size & PAGE_MASK));
7176
7177 if (upl->flags & UPL_KERNEL_OBJECT)
7178 vm_object_lock_shared(shadow_object);
7179 else
7180 vm_object_lock(shadow_object);
7181
7182 if (upl->flags & UPL_ACCESS_BLOCKED) {
7183 assert(shadow_object->blocked_access);
7184 shadow_object->blocked_access = FALSE;
7185 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7186 }
7187
7188 if (shadow_object->code_signed) {
7189 /*
7190 * CODE SIGNING:
7191 * If the object is code-signed, do not let this UPL tell
7192 * us if the pages are valid or not. Let the pages be
7193 * validated by VM the normal way (when they get mapped or
7194 * copied).
7195 */
7196 flags &= ~UPL_COMMIT_CS_VALIDATED;
7197 }
7198 if (! page_list) {
7199 /*
7200 * No page list to get the code-signing info from !?
7201 */
7202 flags &= ~UPL_COMMIT_CS_VALIDATED;
7203 }
7204 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal)
7205 should_be_throttled = TRUE;
7206
7207 dwp = &dw_array[0];
7208 dw_count = 0;
7209 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7210
7211 if ((upl->flags & UPL_IO_WIRE) &&
7212 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7213 !isVectorUPL &&
7214 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7215 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7216
7217 if (!vm_page_queue_empty(&shadow_object->memq)) {
7218
7219 if (size == shadow_object->vo_size) {
7220 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7221 fast_path_full_commit = 1;
7222 }
7223 fast_path_possible = 1;
7224
7225 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7226 (shadow_object->purgable == VM_PURGABLE_DENY ||
7227 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7228 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7229 throttle_page = 1;
7230 }
7231 }
7232 }
7233 first_local = VM_PAGE_NULL;
7234 last_local = VM_PAGE_NULL;
7235
7236 while (xfer_size) {
7237 vm_page_t t, m;
7238
7239 dwp->dw_mask = 0;
7240 clear_refmod = 0;
7241
7242 m = VM_PAGE_NULL;
7243
7244 if (upl->flags & UPL_LITE) {
7245 unsigned int pg_num;
7246
7247 if (nxt_page != VM_PAGE_NULL) {
7248 m = nxt_page;
7249 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
7250 target_offset = m->offset;
7251 }
7252 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7253 assert(pg_num == target_offset/PAGE_SIZE);
7254
7255 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
7256 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
7257
7258 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
7259 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
7260 } else
7261 m = NULL;
7262 }
7263 if (upl->flags & UPL_SHADOWED) {
7264 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7265
7266 t->free_when_done = FALSE;
7267
7268 VM_PAGE_FREE(t);
7269
7270 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
7271 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7272 }
7273 }
7274 if (m == VM_PAGE_NULL)
7275 goto commit_next_page;
7276
7277 m_object = VM_PAGE_OBJECT(m);
7278
7279 if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7280 assert(m->busy);
7281
7282 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7283 goto commit_next_page;
7284 }
7285
7286 if (flags & UPL_COMMIT_CS_VALIDATED) {
7287 /*
7288 * CODE SIGNING:
7289 * Set the code signing bits according to
7290 * what the UPL says they should be.
7291 */
7292 m->cs_validated = page_list[entry].cs_validated;
7293 m->cs_tainted = page_list[entry].cs_tainted;
7294 m->cs_nx = page_list[entry].cs_nx;
7295 }
7296 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL)
7297 m->written_by_kernel = TRUE;
7298
7299 if (upl->flags & UPL_IO_WIRE) {
7300
7301 if (page_list)
7302 page_list[entry].phys_addr = 0;
7303
7304 if (flags & UPL_COMMIT_SET_DIRTY) {
7305 SET_PAGE_DIRTY(m, FALSE);
7306 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7307 m->dirty = FALSE;
7308
7309 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
7310 m->cs_validated && !m->cs_tainted) {
7311 /*
7312 * CODE SIGNING:
7313 * This page is no longer dirty
7314 * but could have been modified,
7315 * so it will need to be
7316 * re-validated.
7317 */
7318 if (m->slid) {
7319 panic("upl_commit_range(%p): page %p was slid\n",
7320 upl, m);
7321 }
7322 assert(!m->slid);
7323 m->cs_validated = FALSE;
7324 #if DEVELOPMENT || DEBUG
7325 vm_cs_validated_resets++;
7326 #endif
7327 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7328 }
7329 clear_refmod |= VM_MEM_MODIFIED;
7330 }
7331 if (upl->flags & UPL_ACCESS_BLOCKED) {
7332 /*
7333 * We blocked access to the pages in this UPL.
7334 * Clear the "busy" bit and wake up any waiter
7335 * for this page.
7336 */
7337 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7338 }
7339 if (fast_path_possible) {
7340 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7341 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7342 if (m->absent) {
7343 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
7344 assert(m->wire_count == 0);
7345 assert(m->busy);
7346
7347 m->absent = FALSE;
7348 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7349 } else {
7350 if (m->wire_count == 0)
7351 panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object);
7352 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
7353
7354 /*
7355 * XXX FBDP need to update some other
7356 * counters here (purgeable_wired_count)
7357 * (ledgers), ...
7358 */
7359 assert(m->wire_count > 0);
7360 m->wire_count--;
7361
7362 if (m->wire_count == 0) {
7363 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
7364 unwired_count++;
7365 }
7366 }
7367 if (m->wire_count == 0) {
7368 assert(m->pageq.next == 0 && m->pageq.prev == 0);
7369
7370 if (last_local == VM_PAGE_NULL) {
7371 assert(first_local == VM_PAGE_NULL);
7372
7373 last_local = m;
7374 first_local = m;
7375 } else {
7376 assert(first_local != VM_PAGE_NULL);
7377
7378 m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7379 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7380 first_local = m;
7381 }
7382 local_queue_count++;
7383
7384 if (throttle_page) {
7385 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
7386 } else {
7387 if (flags & UPL_COMMIT_INACTIVATE) {
7388 if (shadow_object->internal)
7389 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7390 else
7391 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7392 } else
7393 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
7394 }
7395 }
7396 } else {
7397 if (flags & UPL_COMMIT_INACTIVATE) {
7398 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7399 clear_refmod |= VM_MEM_REFERENCED;
7400 }
7401 if (m->absent) {
7402 if (flags & UPL_COMMIT_FREE_ABSENT)
7403 dwp->dw_mask |= DW_vm_page_free;
7404 else {
7405 m->absent = FALSE;
7406 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7407
7408 if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
7409 dwp->dw_mask |= DW_vm_page_activate;
7410 }
7411 } else
7412 dwp->dw_mask |= DW_vm_page_unwire;
7413 }
7414 goto commit_next_page;
7415 }
7416 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7417
7418 if (page_list)
7419 page_list[entry].phys_addr = 0;
7420
7421 /*
7422 * make sure to clear the hardware
7423 * modify or reference bits before
7424 * releasing the BUSY bit on this page
7425 * otherwise we risk losing a legitimate
7426 * change of state
7427 */
7428 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7429 m->dirty = FALSE;
7430
7431 clear_refmod |= VM_MEM_MODIFIED;
7432 }
7433 if (m->laundry)
7434 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7435
7436 if (VM_PAGE_WIRED(m))
7437 m->free_when_done = FALSE;
7438
7439 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
7440 m->cs_validated && !m->cs_tainted) {
7441 /*
7442 * CODE SIGNING:
7443 * This page is no longer dirty
7444 * but could have been modified,
7445 * so it will need to be
7446 * re-validated.
7447 */
7448 if (m->slid) {
7449 panic("upl_commit_range(%p): page %p was slid\n",
7450 upl, m);
7451 }
7452 assert(!m->slid);
7453 m->cs_validated = FALSE;
7454 #if DEVELOPMENT || DEBUG
7455 vm_cs_validated_resets++;
7456 #endif
7457 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7458 }
7459 if (m->overwriting) {
7460 /*
7461 * the (COPY_OUT_FROM == FALSE) request_page_list case
7462 */
7463 if (m->busy) {
7464 #if CONFIG_PHANTOM_CACHE
7465 if (m->absent && !m_object->internal)
7466 dwp->dw_mask |= DW_vm_phantom_cache_update;
7467 #endif
7468 m->absent = FALSE;
7469
7470 dwp->dw_mask |= DW_clear_busy;
7471 } else {
7472 /*
7473 * alternate (COPY_OUT_FROM == FALSE) page_list case
7474 * Occurs when the original page was wired
7475 * at the time of the list request
7476 */
7477 assert(VM_PAGE_WIRED(m));
7478
7479 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
7480 }
7481 m->overwriting = FALSE;
7482 }
7483 if (m->encrypted_cleaning == TRUE) {
7484 m->encrypted_cleaning = FALSE;
7485
7486 dwp->dw_mask |= DW_clear_busy | DW_PAGE_WAKEUP;
7487 }
7488 m->cleaning = FALSE;
7489
7490 if (m->free_when_done) {
7491 /*
7492 * With the clean queue enabled, UPL_PAGEOUT should
7493 * no longer set the pageout bit. Its pages now go
7494 * to the clean queue.
7495 */
7496 assert(!(flags & UPL_PAGEOUT));
7497 assert(!m_object->internal);
7498
7499 m->free_when_done = FALSE;
7500 #if MACH_CLUSTER_STATS
7501 if (m->wanted) vm_pageout_target_collisions++;
7502 #endif
7503 if ((flags & UPL_COMMIT_SET_DIRTY) ||
7504 (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
7505 /*
7506 * page was re-dirtied after we started
7507 * the pageout... reactivate it since
7508 * we don't know whether the on-disk
7509 * copy matches what is now in memory
7510 */
7511 SET_PAGE_DIRTY(m, FALSE);
7512
7513 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
7514
7515 if (upl->flags & UPL_PAGEOUT) {
7516 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
7517 VM_STAT_INCR(reactivations);
7518 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7519 }
7520 } else {
7521 /*
7522 * page has been successfully cleaned
7523 * go ahead and free it for other use
7524 */
7525 if (m_object->internal) {
7526 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7527 } else {
7528 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7529 }
7530 m->dirty = FALSE;
7531 m->busy = TRUE;
7532
7533 dwp->dw_mask |= DW_vm_page_free;
7534 }
7535 goto commit_next_page;
7536 }
7537 #if MACH_CLUSTER_STATS
7538 if (m->wpmapped)
7539 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
7540
7541 if (m->dirty) vm_pageout_cluster_dirtied++;
7542 else vm_pageout_cluster_cleaned++;
7543 if (m->wanted) vm_pageout_cluster_collisions++;
7544 #endif
7545 /*
7546 * It is a part of the semantics of COPYOUT_FROM
7547 * UPLs that a commit implies cache sync
7548 * between the vm page and the backing store;
7549 * this can be used to strip the precious bit
7550 * as well as clean
7551 */
7552 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
7553 m->precious = FALSE;
7554
7555 if (flags & UPL_COMMIT_SET_DIRTY) {
7556 SET_PAGE_DIRTY(m, FALSE);
7557 } else {
7558 m->dirty = FALSE;
7559 }
7560
7561 /* with the clean queue on, move *all* cleaned pages to the clean queue */
7562 if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
7563 pgpgout_count++;
7564
7565 VM_STAT_INCR(pageouts);
7566 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
7567
7568 dwp->dw_mask |= DW_enqueue_cleaned;
7569 vm_pageout_enqueued_cleaned_from_inactive_dirty++;
7570 } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) {
7571 /*
7572 * page coming back in from being 'frozen'...
7573 * it was dirty before it was frozen, so keep it dirty so
7574 * that vm_page_activate will notice that it really belongs
7575 * on the throttle queue and put it there
7576 */
7577 SET_PAGE_DIRTY(m, FALSE);
7578 dwp->dw_mask |= DW_vm_page_activate;
7579
7580 } else {
7581 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
7582 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7583 clear_refmod |= VM_MEM_REFERENCED;
7584 } else if ( !VM_PAGE_PAGEABLE(m)) {
7585
7586 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
7587 dwp->dw_mask |= DW_vm_page_speculate;
7588 else if (m->reference)
7589 dwp->dw_mask |= DW_vm_page_activate;
7590 else {
7591 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7592 clear_refmod |= VM_MEM_REFERENCED;
7593 }
7594 }
7595 }
7596 if (upl->flags & UPL_ACCESS_BLOCKED) {
7597 /*
7598 * We blocked access to the pages in this UPL.
7599 * Clear the "busy" bit on this page before we
7600 * wake up any waiter.
7601 */
7602 dwp->dw_mask |= DW_clear_busy;
7603 }
7604 /*
7605 * Wakeup any thread waiting for the page to be un-cleaning.
7606 */
7607 dwp->dw_mask |= DW_PAGE_WAKEUP;
7608
7609 commit_next_page:
7610 if (clear_refmod)
7611 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
7612
7613 target_offset += PAGE_SIZE_64;
7614 xfer_size -= PAGE_SIZE;
7615 entry++;
7616
7617 if (dwp->dw_mask) {
7618 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
7619 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
7620
7621 if (dw_count >= dw_limit) {
7622 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7623
7624 dwp = &dw_array[0];
7625 dw_count = 0;
7626 }
7627 } else {
7628 if (dwp->dw_mask & DW_clear_busy)
7629 m->busy = FALSE;
7630
7631 if (dwp->dw_mask & DW_PAGE_WAKEUP)
7632 PAGE_WAKEUP(m);
7633 }
7634 }
7635 }
7636 if (dw_count)
7637 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
7638
7639 if (fast_path_possible) {
7640
7641 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7642 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7643
7644 if (local_queue_count || unwired_count) {
7645
7646 if (local_queue_count) {
7647 vm_page_t first_target;
7648 vm_page_queue_head_t *target_queue;
7649
7650 if (throttle_page)
7651 target_queue = &vm_page_queue_throttled;
7652 else {
7653 if (flags & UPL_COMMIT_INACTIVATE) {
7654 if (shadow_object->internal)
7655 target_queue = &vm_page_queue_anonymous;
7656 else
7657 target_queue = &vm_page_queue_inactive;
7658 } else
7659 target_queue = &vm_page_queue_active;
7660 }
7661 /*
7662 * Transfer the entire local queue to a regular LRU page queue.
7663 */
7664 vm_page_lockspin_queues();
7665
7666 first_target = (vm_page_t) vm_page_queue_first(target_queue);
7667
7668 if (vm_page_queue_empty(target_queue))
7669 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7670 else
7671 first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
7672
7673 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7674 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
7675 last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
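/*
 * Pictorially, the assignments above splice the locally built
 * chain onto the head of the target queue:
 *
 *	before:	[target_queue] <-> first_target <-> ...
 *	after:	[target_queue] <-> first_local <-> ... <-> last_local <-> first_target <-> ...
 *
 * (when the target queue started out empty, target_queue->prev was
 * pointed at last_local above instead of first_target->pageq.prev)
 */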
7676
7677 /*
7678 * Adjust the global page counts.
7679 */
7680 if (throttle_page) {
7681 vm_page_throttled_count += local_queue_count;
7682 } else {
7683 if (flags & UPL_COMMIT_INACTIVATE) {
7684 if (shadow_object->internal)
7685 vm_page_anonymous_count += local_queue_count;
7686 vm_page_inactive_count += local_queue_count;
7687
7688 token_new_pagecount += local_queue_count;
7689 } else
7690 vm_page_active_count += local_queue_count;
7691
7692 if (shadow_object->internal)
7693 vm_page_pageable_internal_count += local_queue_count;
7694 else
7695 vm_page_pageable_external_count += local_queue_count;
7696 }
7697 } else {
7698 vm_page_lockspin_queues();
7699 }
7700 if (unwired_count) {
7701 vm_page_wire_count -= unwired_count;
7702 VM_CHECK_MEMORYSTATUS;
7703 }
7704 vm_page_unlock_queues();
7705
7706 shadow_object->wired_page_count -= unwired_count;
7707
7708 if (!shadow_object->wired_page_count) {
7709 VM_OBJECT_UNWIRED(shadow_object);
7710 }
7711 }
7712 }
7713 occupied = 1;
7714
7715 if (upl->flags & UPL_DEVICE_MEMORY) {
7716 occupied = 0;
7717 } else if (upl->flags & UPL_LITE) {
7718 int pg_num;
7719 int i;
7720
7721 occupied = 0;
7722
7723 if (!fast_path_full_commit) {
7724 pg_num = upl->size/PAGE_SIZE;
7725 pg_num = (pg_num + 31) >> 5;
7726
7727 for (i = 0; i < pg_num; i++) {
7728 if (lite_list[i] != 0) {
7729 occupied = 1;
7730 break;
7731 }
7732 }
7733 }
7734 } else {
7735 if (vm_page_queue_empty(&upl->map_object->memq))
7736 occupied = 0;
7737 }
7738 if (occupied == 0) {
7739 /*
7740 * If this UPL element belongs to a Vector UPL and is
7741 * empty, then this is the right function to deallocate
7742 * it. So go ahead and set the *empty variable. The flag
7743 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
7744 * should be considered relevant for the Vector UPL and not
7745 * the internal UPLs.
7746 */
7747 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
7748 *empty = TRUE;
7749
7750 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
7751 /*
7752 * this is not a paging object
7753 * so we need to drop the paging reference
7754 * that was taken when we created the UPL
7755 * against this object
7756 */
7757 vm_object_activity_end(shadow_object);
7758 vm_object_collapse(shadow_object, 0, TRUE);
7759 } else {
7760 /*
7761 * we donated the paging reference to
7762 * the map object... vm_pageout_object_terminate
7763 * will drop this reference
7764 */
7765 }
7766 }
7767 vm_object_unlock(shadow_object);
7768 if (object != shadow_object)
7769 vm_object_unlock(object);
7770
7771 if(!isVectorUPL)
7772 upl_unlock(upl);
7773 else {
7774 /*
7775 * If we completed our operations on an UPL that is
7776 * part of a Vectored UPL and if empty is TRUE, then
7777 * we should go ahead and deallocate this UPL element.
7778 * Then we check if this was the last of the UPL elements
7779 * within that Vectored UPL. If so, set empty to TRUE
7780 * so that in ubc_upl_commit_range or ubc_upl_commit, we
7781 * can go ahead and deallocate the Vector UPL too.
7782 */
7783 if(*empty==TRUE) {
7784 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
7785 upl_deallocate(upl);
7786 }
7787 goto process_upl_to_commit;
7788 }
7789 if (pgpgout_count) {
7790 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
7791 }
7792
7793 return KERN_SUCCESS;
7794 }
7795
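/*
 * Usage sketch (illustrative only; the variable names and flag choices
 * below are placeholders rather than a transcript of a real caller):
 * a client that has finished its I/O typically commits the whole UPL
 * and deallocates it once it drains, along the lines of
 *
 *	boolean_t	empty;
 *	kern_return_t	kr;
 *
 *	kr = upl_commit_range(upl, 0, upl->size,
 *			      UPL_COMMIT_FREE_ABSENT | UPL_COMMIT_CLEAR_DIRTY,
 *			      page_list, page_list_count, &empty);
 *	if (kr == KERN_SUCCESS && empty)
 *		upl_deallocate(upl);
 *
 * Other callers pass UPL_COMMIT_SET_DIRTY or UPL_COMMIT_INACTIVATE
 * instead, depending on how the pages were used; see the flag handling
 * above.
 */
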
7796 kern_return_t
7797 upl_abort_range(
7798 upl_t upl,
7799 upl_offset_t offset,
7800 upl_size_t size,
7801 int error,
7802 boolean_t *empty)
7803 {
7804 upl_page_info_t *user_page_list = NULL;
7805 upl_size_t xfer_size, subupl_size = size;
7806 vm_object_t shadow_object;
7807 vm_object_t object;
7808 vm_object_offset_t target_offset;
7809 upl_offset_t subupl_offset = offset;
7810 int entry;
7811 wpl_array_t lite_list;
7812 int occupied;
7813 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7814 struct vm_page_delayed_work *dwp;
7815 int dw_count;
7816 int dw_limit;
7817 int isVectorUPL = 0;
7818 upl_t vector_upl = NULL;
7819
7820 *empty = FALSE;
7821
7822 if (upl == UPL_NULL)
7823 return KERN_INVALID_ARGUMENT;
7824
7825 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
7826 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
7827
7828 if((isVectorUPL = vector_upl_is_valid(upl))) {
7829 vector_upl = upl;
7830 upl_lock(vector_upl);
7831 }
7832 else
7833 upl_lock(upl);
7834
7835 process_upl_to_abort:
7836 if(isVectorUPL) {
7837 size = subupl_size;
7838 offset = subupl_offset;
7839 if(size == 0) {
7840 upl_unlock(vector_upl);
7841 return KERN_SUCCESS;
7842 }
7843 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7844 if(upl == NULL) {
7845 upl_unlock(vector_upl);
7846 return KERN_FAILURE;
7847 }
7848 subupl_size -= size;
7849 subupl_offset += size;
7850 }
7851
7852 *empty = FALSE;
7853
7854 #if UPL_DEBUG
7855 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7856 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7857
7858 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7859 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7860 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
7861
7862 upl->upl_commit_index++;
7863 }
7864 #endif
7865 if (upl->flags & UPL_DEVICE_MEMORY)
7866 xfer_size = 0;
7867 else if ((offset + size) <= upl->size)
7868 xfer_size = size;
7869 else {
7870 if(!isVectorUPL)
7871 upl_unlock(upl);
7872 else {
7873 upl_unlock(vector_upl);
7874 }
7875
7876 return KERN_FAILURE;
7877 }
7878 if (upl->flags & UPL_INTERNAL) {
7879 lite_list = (wpl_array_t)
7880 ((((uintptr_t)upl) + sizeof(struct upl))
7881 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
7882
7883 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
7884 } else {
7885 lite_list = (wpl_array_t)
7886 (((uintptr_t)upl) + sizeof(struct upl));
7887 }
7888 object = upl->map_object;
7889
7890 if (upl->flags & UPL_SHADOWED) {
7891 vm_object_lock(object);
7892 shadow_object = object->shadow;
7893 } else
7894 shadow_object = object;
7895
7896 entry = offset/PAGE_SIZE;
7897 target_offset = (vm_object_offset_t)offset;
7898
7899 assert(!(target_offset & PAGE_MASK));
7900 assert(!(xfer_size & PAGE_MASK));
7901
7902 if (upl->flags & UPL_KERNEL_OBJECT)
7903 vm_object_lock_shared(shadow_object);
7904 else
7905 vm_object_lock(shadow_object);
7906
7907 if (upl->flags & UPL_ACCESS_BLOCKED) {
7908 assert(shadow_object->blocked_access);
7909 shadow_object->blocked_access = FALSE;
7910 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7911 }
7912
7913 dwp = &dw_array[0];
7914 dw_count = 0;
7915 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7916
7917 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
7918 panic("upl_abort_range: kernel_object being DUMPED");
7919
7920 while (xfer_size) {
7921 vm_page_t t, m;
7922 unsigned int pg_num;
7923 boolean_t needed;
7924
7925 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7926 assert(pg_num == target_offset/PAGE_SIZE);
7927
7928 needed = FALSE;
7929
7930 if (user_page_list)
7931 needed = user_page_list[pg_num].needed;
7932
7933 dwp->dw_mask = 0;
7934 m = VM_PAGE_NULL;
7935
7936 if (upl->flags & UPL_LITE) {
7937
7938 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
7939 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
7940
7941 if ( !(upl->flags & UPL_KERNEL_OBJECT))
7942 m = vm_page_lookup(shadow_object, target_offset +
7943 (upl->offset - shadow_object->paging_offset));
7944 }
7945 }
7946 if (upl->flags & UPL_SHADOWED) {
7947 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7948 t->free_when_done = FALSE;
7949
7950 VM_PAGE_FREE(t);
7951
7952 if (m == VM_PAGE_NULL)
7953 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7954 }
7955 }
7956 if ((upl->flags & UPL_KERNEL_OBJECT))
7957 goto abort_next_page;
7958
7959 if (m != VM_PAGE_NULL) {
7960
7961 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7962
7963 if (m->absent) {
7964 boolean_t must_free = TRUE;
7965
7966 /*
7967 * COPYOUT = FALSE case
7968 * check for error conditions which must
7969 * be passed back to the page's customer
7970 */
7971 if (error & UPL_ABORT_RESTART) {
7972 m->restart = TRUE;
7973 m->absent = FALSE;
7974 m->unusual = TRUE;
7975 must_free = FALSE;
7976 } else if (error & UPL_ABORT_UNAVAILABLE) {
7977 m->restart = FALSE;
7978 m->unusual = TRUE;
7979 must_free = FALSE;
7980 } else if (error & UPL_ABORT_ERROR) {
7981 m->restart = FALSE;
7982 m->absent = FALSE;
7983 m->error = TRUE;
7984 m->unusual = TRUE;
7985 must_free = FALSE;
7986 }
7987 if (m->clustered && needed == FALSE) {
7988 /*
7989 * This page was a part of a speculative
7990 * read-ahead initiated by the kernel
7991 * itself. No one is expecting this
7992 * page and no one will clean up its
7993 * error state if it ever becomes valid
7994 * in the future.
7995 * We have to free it here.
7996 */
7997 must_free = TRUE;
7998 }
7999
8000 /*
8001 * ENCRYPTED SWAP:
8002 * If the page was already encrypted,
8003 * we don't really need to decrypt it
8004 * now. It will get decrypted later,
8005 * on demand, as soon as someone needs
8006 * to access its contents.
8007 */
8008
8009 m->cleaning = FALSE;
8010 m->encrypted_cleaning = FALSE;
8011
8012 if (m->overwriting && !m->busy) {
8013 /*
8014 * this shouldn't happen since
8015 * this is an 'absent' page, but
8016 * it doesn't hurt to check for
8017 * the 'alternate' method of
8018 * stabilizing the page...
8019 * we will mark 'busy' to be cleared
8020 * in the following code which will
8021 * take care of the primary stabilization
8022 * method (i.e. setting 'busy' to TRUE)
8023 */
8024 dwp->dw_mask |= DW_vm_page_unwire;
8025 }
8026 m->overwriting = FALSE;
8027
8028 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8029
8030 if (must_free == TRUE)
8031 dwp->dw_mask |= DW_vm_page_free;
8032 else
8033 dwp->dw_mask |= DW_vm_page_activate;
8034 } else {
8035 /*
8036 * Handle the trusted pager throttle.
8037 */
8038 if (m->laundry)
8039 dwp->dw_mask |= DW_vm_pageout_throttle_up;
8040
8041 if (upl->flags & UPL_ACCESS_BLOCKED) {
8042 /*
8043 * We blocked access to the pages in this UPL.
8044 * Clear the "busy" bit and wake up any waiter
8045 * for this page.
8046 */
8047 dwp->dw_mask |= DW_clear_busy;
8048 }
8049 if (m->overwriting) {
8050 if (m->busy)
8051 dwp->dw_mask |= DW_clear_busy;
8052 else {
8053 /*
8054 * deal with the 'alternate' method
8055 * of stabilizing the page...
8056 * we will either free the page
8057 * or mark 'busy' to be cleared
8058 * in the following code which will
8059 * take care of the primary stabilization
8060 * method (i.e. setting 'busy' to TRUE)
8061 */
8062 dwp->dw_mask |= DW_vm_page_unwire;
8063 }
8064 m->overwriting = FALSE;
8065 }
8066 if (m->encrypted_cleaning == TRUE) {
8067 m->encrypted_cleaning = FALSE;
8068
8069 dwp->dw_mask |= DW_clear_busy;
8070 }
8071 m->free_when_done = FALSE;
8072 m->cleaning = FALSE;
8073
8074 if (error & UPL_ABORT_DUMP_PAGES) {
8075 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8076
8077 dwp->dw_mask |= DW_vm_page_free;
8078 } else {
8079 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8080 if (error & UPL_ABORT_REFERENCE) {
8081 /*
8082 * we've been told to explicitly
8083 * reference this page... for
8084 * file I/O, this is done by
8085 * implementing an LRU on the inactive q
8086 */
8087 dwp->dw_mask |= DW_vm_page_lru;
8088
8089 } else if ( !VM_PAGE_PAGEABLE(m))
8090 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8091 }
8092 dwp->dw_mask |= DW_PAGE_WAKEUP;
8093 }
8094 }
8095 }
8096 abort_next_page:
8097 target_offset += PAGE_SIZE_64;
8098 xfer_size -= PAGE_SIZE;
8099 entry++;
8100
8101 if (dwp->dw_mask) {
8102 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8103 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8104
8105 if (dw_count >= dw_limit) {
8106 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
8107
8108 dwp = &dw_array[0];
8109 dw_count = 0;
8110 }
8111 } else {
8112 if (dwp->dw_mask & DW_clear_busy)
8113 m->busy = FALSE;
8114
8115 if (dwp->dw_mask & DW_PAGE_WAKEUP)
8116 PAGE_WAKEUP(m);
8117 }
8118 }
8119 }
8120 if (dw_count)
8121 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
8122
8123 occupied = 1;
8124
8125 if (upl->flags & UPL_DEVICE_MEMORY) {
8126 occupied = 0;
8127 } else if (upl->flags & UPL_LITE) {
8128 int pg_num;
8129 int i;
8130
8131 pg_num = upl->size/PAGE_SIZE;
8132 pg_num = (pg_num + 31) >> 5;
8133 occupied = 0;
8134
8135 for (i = 0; i < pg_num; i++) {
8136 if (lite_list[i] != 0) {
8137 occupied = 1;
8138 break;
8139 }
8140 }
8141 } else {
8142 if (vm_page_queue_empty(&upl->map_object->memq))
8143 occupied = 0;
8144 }
8145 if (occupied == 0) {
8146 /*
8147 * If this UPL element belongs to a Vector UPL and is
8148 * empty, then this is the right function to deallocate
8149 * it. So go ahead and set the *empty variable. The flag
8150 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8151 * should be considered relevant for the Vector UPL and
8152 * not the internal UPLs.
8153 */
8154 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
8155 *empty = TRUE;
8156
8157 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8158 /*
8159 * this is not a paging object
8160 * so we need to drop the paging reference
8161 * that was taken when we created the UPL
8162 * against this object
8163 */
8164 vm_object_activity_end(shadow_object);
8165 vm_object_collapse(shadow_object, 0, TRUE);
8166 } else {
8167 /*
8168 * we donated the paging reference to
8169 * the map object... vm_pageout_object_terminate
8170 * will drop this reference
8171 */
8172 }
8173 }
8174 vm_object_unlock(shadow_object);
8175 if (object != shadow_object)
8176 vm_object_unlock(object);
8177
8178 if(!isVectorUPL)
8179 upl_unlock(upl);
8180 else {
8181 /*
8182 * If we completed our operations on an UPL that is
8183 * part of a Vectored UPL and if empty is TRUE, then
8184 * we should go ahead and deallocate this UPL element.
8185 * Then we check if this was the last of the UPL elements
8186 * within that Vectored UPL. If so, set empty to TRUE
8187 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8188 * can go ahead and deallocate the Vector UPL too.
8189 */
8190 if(*empty == TRUE) {
8191 *empty = vector_upl_set_subupl(vector_upl, upl,0);
8192 upl_deallocate(upl);
8193 }
8194 goto process_upl_to_abort;
8195 }
8196
8197 return KERN_SUCCESS;
8198 }
8199
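/*
 * Usage sketch (illustrative; names and error flags are placeholders):
 * a caller whose I/O failed typically aborts the whole range so the
 * pages are marked in error (or dumped outright with
 * UPL_ABORT_DUMP_PAGES), e.g.
 *
 *	boolean_t	empty;
 *
 *	(void) upl_abort_range(upl, 0, upl->size, UPL_ABORT_ERROR, &empty);
 *	if (empty)
 *		upl_deallocate(upl);
 */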
8200
8201 kern_return_t
8202 upl_abort(
8203 upl_t upl,
8204 int error)
8205 {
8206 boolean_t empty;
8207
8208 if (upl == UPL_NULL)
8209 return KERN_INVALID_ARGUMENT;
8210
8211 return upl_abort_range(upl, 0, upl->size, error, &empty);
8212 }
8213
8214
8215 /* an option on commit should be wire */
8216 kern_return_t
8217 upl_commit(
8218 upl_t upl,
8219 upl_page_info_t *page_list,
8220 mach_msg_type_number_t count)
8221 {
8222 boolean_t empty;
8223
8224 if (upl == UPL_NULL)
8225 return KERN_INVALID_ARGUMENT;
8226
8227 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
8228 }
8229
8230
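/*
 * iopl_valid_data() walks the pages backing an IO-wired UPL and
 * converts any busy+absent placeholder pages into valid, dirty, wired
 * pages, fixing up the object's wired_page_count and the global
 * vm_page_wire_count.  (Presumably invoked once the I/O that was going
 * to fill those absent pages -- e.g. one issued with UPL_NOZEROFILLIO --
 * has completed.)
 */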
8231 void
8232 iopl_valid_data(
8233 upl_t upl)
8234 {
8235 vm_object_t object;
8236 vm_offset_t offset;
8237 vm_page_t m, nxt_page = VM_PAGE_NULL;
8238 upl_size_t size;
8239 int wired_count = 0;
8240
8241 if (upl == NULL)
8242 panic("iopl_valid_data: NULL upl");
8243 if (vector_upl_is_valid(upl))
8244 panic("iopl_valid_data: vector upl");
8245 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
8246 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8247
8248 object = upl->map_object;
8249
8250 if (object == kernel_object || object == compressor_object)
8251 panic("iopl_valid_data: object == kernel or compressor");
8252
8253 if (object->purgable == VM_PURGABLE_VOLATILE ||
8254 object->purgable == VM_PURGABLE_EMPTY)
8255 panic("iopl_valid_data: object %p purgable %d",
8256 object, object->purgable);
8257
8258 size = upl->size;
8259
8260 vm_object_lock(object);
8261
8262 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE))
8263 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8264 else
8265 offset = 0 + upl->offset - object->paging_offset;
8266
8267 while (size) {
8268
8269 if (nxt_page != VM_PAGE_NULL) {
8270 m = nxt_page;
8271 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
8272 } else {
8273 m = vm_page_lookup(object, offset);
8274 offset += PAGE_SIZE;
8275
8276 if (m == VM_PAGE_NULL)
8277 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8278 }
8279 if (m->busy) {
8280 if (!m->absent)
8281 panic("iopl_valid_data: busy page w/o absent");
8282
8283 if (m->pageq.next || m->pageq.prev)
8284 panic("iopl_valid_data: busy+absent page on page queue");
8285 if (m->reusable) {
8286 panic("iopl_valid_data: %p is reusable", m);
8287 }
8288
8289 m->absent = FALSE;
8290 m->dirty = TRUE;
8291 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8292 assert(m->wire_count == 0);
8293 m->wire_count++;
8294 assert(m->wire_count);
8295 if (m->wire_count == 1) {
8296 m->vm_page_q_state = VM_PAGE_IS_WIRED;
8297 wired_count++;
8298 } else {
8299 panic("iopl_valid_data: %p already wired\n", m);
8300 }
8301
8302 PAGE_WAKEUP_DONE(m);
8303 }
8304 size -= PAGE_SIZE;
8305 }
8306 if (wired_count) {
8307
8308 if (!object->wired_page_count) {
8309 VM_OBJECT_WIRED(object);
8310 }
8311 object->wired_page_count += wired_count;
8312 assert(object->resident_page_count >= object->wired_page_count);
8313
8314 /* no need to adjust purgeable accounting for this object: */
8315 assert(object->purgable != VM_PURGABLE_VOLATILE);
8316 assert(object->purgable != VM_PURGABLE_EMPTY);
8317
8318 vm_page_lockspin_queues();
8319 vm_page_wire_count += wired_count;
8320 vm_page_unlock_queues();
8321 }
8322 vm_object_unlock(object);
8323 }
8324
8325 vm_tag_t
8326 iopl_set_tag(
8327 upl_t upl,
8328 vm_tag_t tag)
8329 {
8330 vm_object_t object;
8331 vm_tag_t prior_tag;
8332
8333 if (upl == NULL)
8334 panic("%s: NULL upl", __FUNCTION__);
8335 if (vector_upl_is_valid(upl))
8336 panic("%s: vector upl", __FUNCTION__);
8337 if (kernel_object == upl->map_object)
8338 return (tag);
8339 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
8340 return (tag);
8341
8342 object = upl->map_object;
8343 vm_object_lock(object);
8344
8345 prior_tag = object->wire_tag;
8346 object->wire_tag = tag;
8347 if (VM_KERN_MEMORY_NONE == prior_tag) prior_tag = tag;
8348 vm_object_unlock(object);
8349
8350 return (prior_tag);
8351 }
8352
8353
8354 void
8355 vm_object_set_pmap_cache_attr(
8356 vm_object_t object,
8357 upl_page_info_array_t user_page_list,
8358 unsigned int num_pages,
8359 boolean_t batch_pmap_op)
8360 {
8361 unsigned int cache_attr = 0;
8362
8363 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8364 assert(user_page_list);
8365 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8366 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8367 }
8368 }
8369
8370
8371 boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t);
8372 kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_object_offset_t *, int);
8373
8374
8375
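/*
 * vm_object_iopl_wire_full(): fast path used when every page of the
 * object is already resident -- walk the object's memq, wire each page
 * and record it in the UPL's lite_list / page list.  Returns FALSE as
 * soon as it meets a page it cannot handle (busy, absent, in error,
 * being cleaned, ...); the caller then falls back to the slow per-page
 * path, which skips the entries already recorded here.
 */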
8376 boolean_t
8377 vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8378 wpl_array_t lite_list, upl_control_flags_t cntrl_flags)
8379 {
8380 vm_page_t dst_page;
8381 vm_tag_t tag;
8382 unsigned int entry;
8383 int page_count;
8384 int delayed_unlock = 0;
8385 boolean_t retval = TRUE;
8386 ppnum_t phys_page;
8387
8388 vm_object_lock_assert_exclusive(object);
8389 assert(object->purgable != VM_PURGABLE_VOLATILE);
8390 assert(object->purgable != VM_PURGABLE_EMPTY);
8391 assert(object->pager == NULL);
8392 assert(object->copy == NULL);
8393 assert(object->shadow == NULL);
8394
8395 tag = UPL_MEMORY_TAG(cntrl_flags);
8396 page_count = object->resident_page_count;
8397 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8398
8399 vm_page_lock_queues();
8400
8401 while (page_count--) {
8402
8403 if (dst_page->busy ||
8404 dst_page->fictitious ||
8405 dst_page->absent ||
8406 dst_page->error ||
8407 dst_page->cleaning ||
8408 dst_page->restart ||
8409 dst_page->encrypted ||
8410 dst_page->laundry) {
8411 retval = FALSE;
8412 goto done;
8413 }
8414 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
8415 retval = FALSE;
8416 goto done;
8417 }
8418 dst_page->reference = TRUE;
8419
8420 vm_page_wire(dst_page, tag, FALSE);
8421
8422 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8423 SET_PAGE_DIRTY(dst_page, FALSE);
8424 }
8425 entry = (unsigned int)(dst_page->offset / PAGE_SIZE);
8426 assert(entry >= 0 && entry < object->resident_page_count);
8427 lite_list[entry>>5] |= 1 << (entry & 31);
8428
8429 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8430
8431 if (phys_page > upl->highest_page)
8432 upl->highest_page = phys_page;
8433
8434 if (user_page_list) {
8435 user_page_list[entry].phys_addr = phys_page;
8436 user_page_list[entry].absent = dst_page->absent;
8437 user_page_list[entry].dirty = dst_page->dirty;
8438 user_page_list[entry].free_when_done = dst_page->free_when_done;
8439 user_page_list[entry].precious = dst_page->precious;
8440 user_page_list[entry].device = FALSE;
8441 user_page_list[entry].speculative = FALSE;
8442 user_page_list[entry].cs_validated = FALSE;
8443 user_page_list[entry].cs_tainted = FALSE;
8444 user_page_list[entry].cs_nx = FALSE;
8445 user_page_list[entry].needed = FALSE;
8446 user_page_list[entry].mark = FALSE;
8447 }
8448 if (delayed_unlock++ > 256) {
8449 delayed_unlock = 0;
8450 lck_mtx_yield(&vm_page_queue_lock);
8451
8452 VM_CHECK_MEMORYSTATUS;
8453 }
8454 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq);
8455 }
8456 done:
8457 vm_page_unlock_queues();
8458
8459 VM_CHECK_MEMORYSTATUS;
8460
8461 return (retval);
8462 }
8463
8464
8465 kern_return_t
8466 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8467 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_object_offset_t *dst_offset, int page_count)
8468 {
8469 vm_page_t dst_page;
8470 vm_tag_t tag;
8471 boolean_t no_zero_fill = FALSE;
8472 int interruptible;
8473 int pages_wired = 0;
8474 int pages_inserted = 0;
8475 int entry = 0;
8476 uint64_t delayed_ledger_update = 0;
8477 kern_return_t ret = KERN_SUCCESS;
8478 int grab_options;
8479 ppnum_t phys_page;
8480
8481 vm_object_lock_assert_exclusive(object);
8482 assert(object->purgable != VM_PURGABLE_VOLATILE);
8483 assert(object->purgable != VM_PURGABLE_EMPTY);
8484 assert(object->pager == NULL);
8485 assert(object->copy == NULL);
8486 assert(object->shadow == NULL);
8487
8488 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8489 interruptible = THREAD_ABORTSAFE;
8490 else
8491 interruptible = THREAD_UNINT;
8492
8493 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
8494 no_zero_fill = TRUE;
8495
8496 tag = UPL_MEMORY_TAG(cntrl_flags);
8497
8498 grab_options = 0;
8499 #if CONFIG_SECLUDED_MEMORY
8500 if (object->can_grab_secluded) {
8501 grab_options |= VM_PAGE_GRAB_SECLUDED;
8502 }
8503 #endif /* CONFIG_SECLUDED_MEMORY */
8504
8505 while (page_count--) {
8506
8507 while ((dst_page = vm_page_grab_options(grab_options))
8508 == VM_PAGE_NULL) {
8509
8510 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8511
8512 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8513
8514 if (vm_page_wait(interruptible) == FALSE) {
8515 /*
8516 * interrupted case
8517 */
8518 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8519
8520 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8521
8522 ret = MACH_SEND_INTERRUPTED;
8523 goto done;
8524 }
8525 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8526
8527 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8528 }
8529 if (no_zero_fill == FALSE)
8530 vm_page_zero_fill(dst_page);
8531 else
8532 dst_page->absent = TRUE;
8533
8534 dst_page->reference = TRUE;
8535
8536 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8537 SET_PAGE_DIRTY(dst_page, FALSE);
8538 }
8539 if (dst_page->absent == FALSE) {
8540 assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8541 assert(dst_page->wire_count == 0);
8542 dst_page->wire_count++;
8543 dst_page->vm_page_q_state = VM_PAGE_IS_WIRED;
8544 assert(dst_page->wire_count);
8545 pages_wired++;
8546 PAGE_WAKEUP_DONE(dst_page);
8547 }
8548 pages_inserted++;
8549
8550 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8551
8552 lite_list[entry>>5] |= 1 << (entry & 31);
8553
8554 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8555
8556 if (phys_page > upl->highest_page)
8557 upl->highest_page = phys_page;
8558
8559 if (user_page_list) {
8560 user_page_list[entry].phys_addr = phys_page;
8561 user_page_list[entry].absent = dst_page->absent;
8562 user_page_list[entry].dirty = dst_page->dirty;
8563 user_page_list[entry].free_when_done = FALSE;
8564 user_page_list[entry].precious = FALSE;
8565 user_page_list[entry].device = FALSE;
8566 user_page_list[entry].speculative = FALSE;
8567 user_page_list[entry].cs_validated = FALSE;
8568 user_page_list[entry].cs_tainted = FALSE;
8569 user_page_list[entry].cs_nx = FALSE;
8570 user_page_list[entry].needed = FALSE;
8571 user_page_list[entry].mark = FALSE;
8572 }
8573 entry++;
8574 *dst_offset += PAGE_SIZE_64;
8575 }
8576 done:
8577 if (pages_wired) {
8578 vm_page_lockspin_queues();
8579 vm_page_wire_count += pages_wired;
8580 vm_page_unlock_queues();
8581 }
8582 if (pages_inserted) {
8583 if (object->internal) {
8584 OSAddAtomic(pages_inserted, &vm_page_internal_count);
8585 } else {
8586 OSAddAtomic(pages_inserted, &vm_page_external_count);
8587 }
8588 }
8589 if (delayed_ledger_update) {
8590 task_t owner;
8591
8592 owner = object->vo_purgeable_owner;
8593 assert(owner);
8594
8595 /* more non-volatile bytes */
8596 ledger_credit(owner->ledger,
8597 task_ledgers.purgeable_nonvolatile,
8598 delayed_ledger_update);
8599 /* more footprint */
8600 ledger_credit(owner->ledger,
8601 task_ledgers.phys_footprint,
8602 delayed_ledger_update);
8603 }
8604 return (ret);
8605 }
8606
8607
8608 unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
8609
8610
8611 kern_return_t
8612 vm_object_iopl_request(
8613 vm_object_t object,
8614 vm_object_offset_t offset,
8615 upl_size_t size,
8616 upl_t *upl_ptr,
8617 upl_page_info_array_t user_page_list,
8618 unsigned int *page_list_count,
8619 upl_control_flags_t cntrl_flags)
8620 {
8621 vm_page_t dst_page;
8622 vm_object_offset_t dst_offset;
8623 upl_size_t xfer_size;
8624 upl_t upl = NULL;
8625 unsigned int entry;
8626 wpl_array_t lite_list = NULL;
8627 int no_zero_fill = FALSE;
8628 unsigned int size_in_pages;
8629 u_int32_t psize;
8630 kern_return_t ret;
8631 vm_prot_t prot;
8632 struct vm_object_fault_info fault_info;
8633 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
8634 struct vm_page_delayed_work *dwp;
8635 int dw_count;
8636 int dw_limit;
8637 int dw_index;
8638 boolean_t caller_lookup;
8639 int io_tracking_flag = 0;
8640 int interruptible;
8641 ppnum_t phys_page;
8642
8643 boolean_t set_cache_attr_needed = FALSE;
8644 boolean_t free_wired_pages = FALSE;
8645 boolean_t fast_path_empty_req = FALSE;
8646 boolean_t fast_path_full_req = FALSE;
8647
8648 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8649 /*
8650 * For forward compatibility's sake,
8651 * reject any unknown flag.
8652 */
8653 return KERN_INVALID_VALUE;
8654 }
8655 if (vm_lopage_needed == FALSE)
8656 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8657
8658 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8659 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
8660 return KERN_INVALID_VALUE;
8661
8662 if (object->phys_contiguous) {
8663 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
8664 return KERN_INVALID_ADDRESS;
8665
8666 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
8667 return KERN_INVALID_ADDRESS;
8668 }
8669 }
8670
8671 if (cntrl_flags & UPL_ENCRYPT) {
8672 /*
8673 * ENCRYPTED SWAP:
8674 * The paging path doesn't use this interface,
8675 * so we don't support the UPL_ENCRYPT flag
8676 * here. We won't encrypt the pages.
8677 */
8678 assert(! (cntrl_flags & UPL_ENCRYPT));
8679 }
8680 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
8681 no_zero_fill = TRUE;
8682
8683 if (cntrl_flags & UPL_COPYOUT_FROM)
8684 prot = VM_PROT_READ;
8685 else
8686 prot = VM_PROT_READ | VM_PROT_WRITE;
8687
8688 if ((!object->internal) && (object->paging_offset != 0))
8689 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
8690
8691 #if CONFIG_IOSCHED || UPL_DEBUG
8692 if ((object->io_tracking && object != kernel_object) || upl_debug_enabled)
8693 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8694 #endif
8695
8696 #if CONFIG_IOSCHED
8697 if (object->io_tracking) {
8698 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8699 if (object != kernel_object)
8700 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8701 }
8702 #endif
8703
8704 if (object->phys_contiguous)
8705 psize = PAGE_SIZE;
8706 else
8707 psize = size;
8708
8709 if (cntrl_flags & UPL_SET_INTERNAL) {
8710 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8711
8712 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8713 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
8714 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
8715 if (size == 0) {
8716 user_page_list = NULL;
8717 lite_list = NULL;
8718 }
8719 } else {
8720 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
8721
8722 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
8723 if (size == 0) {
8724 lite_list = NULL;
8725 }
8726 }
8727 if (user_page_list)
8728 user_page_list[0].device = FALSE;
8729 *upl_ptr = upl;
8730
8731 upl->map_object = object;
8732 upl->size = size;
8733
8734 size_in_pages = size / PAGE_SIZE;
8735
8736 if (object == kernel_object &&
8737 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8738 upl->flags |= UPL_KERNEL_OBJECT;
8739 #if UPL_DEBUG
8740 vm_object_lock(object);
8741 #else
8742 vm_object_lock_shared(object);
8743 #endif
8744 } else {
8745 vm_object_lock(object);
8746 vm_object_activity_begin(object);
8747 }
8748 /*
8749 * paging in progress also protects the paging_offset
8750 */
8751 upl->offset = offset + object->paging_offset;
8752
8753 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8754 /*
8755 * The user requested that access to the pages in this UPL
8756 * be blocked until the UPL is committed or aborted.
8757 */
8758 upl->flags |= UPL_ACCESS_BLOCKED;
8759 }
8760
8761 #if CONFIG_IOSCHED || UPL_DEBUG
8762 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
8763 vm_object_activity_begin(object);
8764 queue_enter(&object->uplq, upl, upl_t, uplq);
8765 }
8766 #endif
8767
8768 if (object->phys_contiguous) {
8769
8770 if (upl->flags & UPL_ACCESS_BLOCKED) {
8771 assert(!object->blocked_access);
8772 object->blocked_access = TRUE;
8773 }
8774
8775 vm_object_unlock(object);
8776
8777 /*
8778 * don't need any shadow mappings for this one
8779 * since it is already I/O memory
8780 */
8781 upl->flags |= UPL_DEVICE_MEMORY;
8782
8783 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT);
8784
8785 if (user_page_list) {
8786 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT);
8787 user_page_list[0].device = TRUE;
8788 }
8789 if (page_list_count != NULL) {
8790 if (upl->flags & UPL_INTERNAL)
8791 *page_list_count = 0;
8792 else
8793 *page_list_count = 1;
8794 }
8795 return KERN_SUCCESS;
8796 }
8797 if (object != kernel_object && object != compressor_object) {
8798 /*
8799 * Protect user space from future COW operations
8800 */
8801 #if VM_OBJECT_TRACKING_OP_TRUESHARE
8802 if (!object->true_share &&
8803 vm_object_tracking_inited) {
8804 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
8805 int num = 0;
8806
8807 num = OSBacktrace(bt,
8808 VM_OBJECT_TRACKING_BTDEPTH);
8809 btlog_add_entry(vm_object_tracking_btlog,
8810 object,
8811 VM_OBJECT_TRACKING_OP_TRUESHARE,
8812 bt,
8813 num);
8814 }
8815 #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8816
8817 vm_object_lock_assert_exclusive(object);
8818 object->true_share = TRUE;
8819
8820 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
8821 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8822 }
8823
8824 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8825 object->copy != VM_OBJECT_NULL) {
8826 /*
8827 * Honor copy-on-write obligations
8828 *
8829 * The caller is gathering these pages and
8830 * might modify their contents. We need to
8831 * make sure that the copy object has its own
8832 * private copies of these pages before we let
8833 * the caller modify them.
8834 *
8835 * NOTE: someone else could map the original object
8836 * after we've done this copy-on-write here, and they
8837 * could then see an inconsistent picture of the memory
8838 * while it's being modified via the UPL. To prevent this,
8839 * we would have to block access to these pages until the
8840 * UPL is released. We could use the UPL_BLOCK_ACCESS
8841 * code path for that...
8842 */
8843 vm_object_update(object,
8844 offset,
8845 size,
8846 NULL,
8847 NULL,
8848 FALSE, /* should_return */
8849 MEMORY_OBJECT_COPY_SYNC,
8850 VM_PROT_NO_CHANGE);
8851 #if DEVELOPMENT || DEBUG
8852 iopl_cow++;
8853 iopl_cow_pages += size >> PAGE_SHIFT;
8854 #endif
8855 }
8856 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8857 object->purgable != VM_PURGABLE_VOLATILE &&
8858 object->purgable != VM_PURGABLE_EMPTY &&
8859 object->copy == NULL &&
8860 size == object->vo_size &&
8861 offset == 0 &&
8862 object->shadow == NULL &&
8863 object->pager == NULL)
8864 {
8865 if (object->resident_page_count == size_in_pages)
8866 {
8867 assert(object != compressor_object);
8868 assert(object != kernel_object);
8869 fast_path_full_req = TRUE;
8870 }
8871 else if (object->resident_page_count == 0)
8872 {
8873 assert(object != compressor_object);
8874 assert(object != kernel_object);
8875 fast_path_empty_req = TRUE;
8876 set_cache_attr_needed = TRUE;
8877 }
8878 }
8879
8880 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8881 interruptible = THREAD_ABORTSAFE;
8882 else
8883 interruptible = THREAD_UNINT;
8884
8885 entry = 0;
8886
8887 xfer_size = size;
8888 dst_offset = offset;
8889 dw_count = 0;
8890
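/*
 * Two fast paths may apply here: a fully resident object goes
 * through vm_object_iopl_wire_full(), which tries to wire every
 * resident page in one sweep and falls through to the slow
 * per-page loop below if it hits a page it can't handle; an
 * object with no resident pages at all goes through
 * vm_object_iopl_wire_empty(), whose failure unwinds via
 * return_err instead.
 */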
8891 if (fast_path_full_req) {
8892
8893 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags) == TRUE)
8894 goto finish;
8895 /*
8896 * we couldn't complete the processing of this request on the fast path
8897 * so fall through to the slow path and finish up
8898 */
8899
8900 } else if (fast_path_empty_req) {
8901
8902 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8903 ret = KERN_MEMORY_ERROR;
8904 goto return_err;
8905 }
8906 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, &dst_offset, size_in_pages);
8907
8908 if (ret) {
8909 free_wired_pages = TRUE;
8910 goto return_err;
8911 }
8912 goto finish;
8913 }
8914
8915 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8916 fault_info.user_tag = 0;
8917 fault_info.lo_offset = offset;
8918 fault_info.hi_offset = offset + xfer_size;
8919 fault_info.no_cache = FALSE;
8920 fault_info.stealth = FALSE;
8921 fault_info.io_sync = FALSE;
8922 fault_info.cs_bypass = FALSE;
8923 fault_info.mark_zf_absent = TRUE;
8924 fault_info.interruptible = interruptible;
8925 fault_info.batch_pmap_op = TRUE;
8926
8927 dwp = &dw_array[0];
8928 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8929
8930 while (xfer_size) {
8931 vm_fault_return_t result;
8932
8933 dwp->dw_mask = 0;
8934
8935 if (fast_path_full_req) {
8936 /*
8937 * if we get here, it means that we ran into a page
8938 * state we couldn't handle in the fast path and
8939 * bailed out to the slow path... since the order
8940 * we look at pages is different between the 2 paths,
8941 * the following check is needed to determine whether
8942 * this page was already processed in the fast path
8943 */
8944 if (lite_list[entry>>5] & (1 << (entry & 31)))
8945 goto skip_page;
8946 }
8947 dst_page = vm_page_lookup(object, dst_offset);
8948
8949 /*
8950 * ENCRYPTED SWAP:
8951 * If the page is encrypted, we need to decrypt it,
8952 * so force a soft page fault.
8953 */
8954 if (dst_page == VM_PAGE_NULL ||
8955 dst_page->busy ||
8956 dst_page->encrypted ||
8957 dst_page->error ||
8958 dst_page->restart ||
8959 dst_page->absent ||
8960 dst_page->fictitious) {
8961
8962 if (object == kernel_object)
8963 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
8964 if (object == compressor_object)
8965 panic("vm_object_iopl_request: missing/bad page in compressor object\n");
8966
8967 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8968 ret = KERN_MEMORY_ERROR;
8969 goto return_err;
8970 }
8971 set_cache_attr_needed = TRUE;
8972
8973 /*
8974 * We just looked up the page and the result remains valid
8975 * until the object lock is released, so send it to
8976 * vm_fault_page() (as "dst_page"), to avoid having to
8977 * look it up again there.
8978 */
8979 caller_lookup = TRUE;
8980
8981 do {
8982 vm_page_t top_page;
8983 kern_return_t error_code;
8984
8985 fault_info.cluster_size = xfer_size;
8986
8987 vm_object_paging_begin(object);
8988
8989 result = vm_fault_page(object, dst_offset,
8990 prot | VM_PROT_WRITE, FALSE,
8991 caller_lookup,
8992 &prot, &dst_page, &top_page,
8993 (int *)0,
8994 &error_code, no_zero_fill,
8995 FALSE, &fault_info);
8996
8997 /* our lookup is no longer valid at this point */
8998 caller_lookup = FALSE;
8999
9000 switch (result) {
9001
9002 case VM_FAULT_SUCCESS:
9003
9004 if ( !dst_page->absent) {
9005 PAGE_WAKEUP_DONE(dst_page);
9006 } else {
9007 /*
9008 * we only get back an absent page if we
9009 * requested that it not be zero-filled
9010 * because we are about to fill it via I/O
9011 *
9012 * absent pages should be left BUSY
9013 * to prevent them from being faulted
9014 * into an address space before we've
9015 * had a chance to complete the I/O on
9016 * them since they may contain info that
9017 * shouldn't be seen by the faulting task
9018 */
9019 }
9020 /*
9021 * Release paging references and
9022 * top-level placeholder page, if any.
9023 */
9024 if (top_page != VM_PAGE_NULL) {
9025 vm_object_t local_object;
9026
9027 local_object = VM_PAGE_OBJECT(top_page);
9028
9029 /*
9030 * comparing 2 packed pointers
9031 */
9032 if (top_page->vm_page_object != dst_page->vm_page_object) {
9033 vm_object_lock(local_object);
9034 VM_PAGE_FREE(top_page);
9035 vm_object_paging_end(local_object);
9036 vm_object_unlock(local_object);
9037 } else {
9038 VM_PAGE_FREE(top_page);
9039 vm_object_paging_end(local_object);
9040 }
9041 }
9042 vm_object_paging_end(object);
9043 break;
9044
9045 case VM_FAULT_RETRY:
9046 vm_object_lock(object);
9047 break;
9048
9049 case VM_FAULT_MEMORY_SHORTAGE:
9050 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9051
9052 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9053
9054 if (vm_page_wait(interruptible)) {
9055 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9056
9057 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9058 vm_object_lock(object);
9059
9060 break;
9061 }
9062 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9063
9064 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9065
9066 /* fall thru */
9067
9068 case VM_FAULT_INTERRUPTED:
9069 error_code = MACH_SEND_INTERRUPTED;
9070 case VM_FAULT_MEMORY_ERROR:
9071 memory_error:
9072 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9073
9074 vm_object_lock(object);
9075 goto return_err;
9076
9077 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9078 /* success but no page: fail */
9079 vm_object_paging_end(object);
9080 vm_object_unlock(object);
9081 goto memory_error;
9082
9083 default:
9084 panic("vm_object_iopl_request: unexpected error"
9085 " 0x%x from vm_fault_page()\n", result);
9086 }
9087 } while (result != VM_FAULT_SUCCESS);
9088
9089 }
9090 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9091
9092 if (upl->flags & UPL_KERNEL_OBJECT)
9093 goto record_phys_addr;
9094
9095 if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9096 dst_page->busy = TRUE;
9097 goto record_phys_addr;
9098 }
9099
9100 if (dst_page->cleaning) {
9101 /*
9102 * Someone else is cleaning this page in place.
9103 * In theory, we should be able to proceed and use this
9104 * page but they'll probably end up clearing the "busy"
9105 * bit on it in upl_commit_range() but they didn't set
9106 * it, so they would clear our "busy" bit and open
9107 * us to race conditions.
9108 * We'd better wait for the cleaning to complete and
9109 * then try again.
9110 */
9111 vm_object_iopl_request_sleep_for_cleaning++;
9112 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9113 continue;
9114 }
9115 if (dst_page->laundry)
9116 vm_pageout_steal_laundry(dst_page, FALSE);
9117
9118 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9119 phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
9120 vm_page_t low_page;
9121 int refmod;
9122
9123 /*
9124 * support devices that can't DMA above 32 bits
9125 * by substituting pages from a pool of low address
9126 * memory for any pages we find above the 4G mark...
9127 * we can't substitute if the page is already wired because
9128 * we don't know whether that physical address has been
9129 * handed out to some other 64 bit capable DMA device to use
9130 */
9131 if (VM_PAGE_WIRED(dst_page)) {
9132 ret = KERN_PROTECTION_FAILURE;
9133 goto return_err;
9134 }
9135 low_page = vm_page_grablo();
9136
9137 if (low_page == VM_PAGE_NULL) {
9138 ret = KERN_RESOURCE_SHORTAGE;
9139 goto return_err;
9140 }
9141 /*
9142 * from here until the vm_page_replace completes
9143 * we mustn't drop the object lock... we don't
9144 * want anyone refaulting this page in and using
9145 * it after we disconnect it... we want the fault
9146 * to find the new page being substituted.
9147 */
9148 if (dst_page->pmapped)
9149 refmod = pmap_disconnect(phys_page);
9150 else
9151 refmod = 0;
9152
9153 if (!dst_page->absent)
9154 vm_page_copy(dst_page, low_page);
9155
9156 low_page->reference = dst_page->reference;
9157 low_page->dirty = dst_page->dirty;
9158 low_page->absent = dst_page->absent;
9159
9160 if (refmod & VM_MEM_REFERENCED)
9161 low_page->reference = TRUE;
9162 if (refmod & VM_MEM_MODIFIED) {
9163 SET_PAGE_DIRTY(low_page, FALSE);
9164 }
9165
9166 vm_page_replace(low_page, object, dst_offset);
9167
9168 dst_page = low_page;
9169 /*
9170 * vm_page_grablo returned the page marked
9171 * BUSY... we don't need a PAGE_WAKEUP_DONE
9172 * here, because we've never dropped the object lock
9173 */
9174 if ( !dst_page->absent)
9175 dst_page->busy = FALSE;
9176
9177 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9178 }
9179 if ( !dst_page->busy)
9180 dwp->dw_mask |= DW_vm_page_wire;
9181
9182 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9183 /*
9184 * Mark the page "busy" to block any future page fault
9185 * on this page in addition to wiring it.
9186 * We'll also remove the mapping
9187 * of all these pages before leaving this routine.
9188 */
9189 assert(!dst_page->fictitious);
9190 dst_page->busy = TRUE;
9191 }
9192 /*
9193 * expect the page to be used
9194 * page queues lock must be held to set 'reference'
9195 */
9196 dwp->dw_mask |= DW_set_reference;
9197
9198 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9199 SET_PAGE_DIRTY(dst_page, TRUE);
9200 }
9201 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
9202 pmap_sync_page_attributes_phys(phys_page);
9203 dst_page->written_by_kernel = FALSE;
9204 }
9205
9206 record_phys_addr:
9207 if (dst_page->busy)
9208 upl->flags |= UPL_HAS_BUSY;
9209
9210 lite_list[entry>>5] |= 1 << (entry & 31);
9211
9212 if (phys_page > upl->highest_page)
9213 upl->highest_page = phys_page;
9214
9215 if (user_page_list) {
9216 user_page_list[entry].phys_addr = phys_page;
9217 user_page_list[entry].free_when_done = dst_page->free_when_done;
9218 user_page_list[entry].absent = dst_page->absent;
9219 user_page_list[entry].dirty = dst_page->dirty;
9220 user_page_list[entry].precious = dst_page->precious;
9221 user_page_list[entry].device = FALSE;
9222 user_page_list[entry].needed = FALSE;
9223 if (dst_page->clustered == TRUE)
9224 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9225 else
9226 user_page_list[entry].speculative = FALSE;
9227 user_page_list[entry].cs_validated = dst_page->cs_validated;
9228 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
9229 user_page_list[entry].cs_nx = dst_page->cs_nx;
9230 user_page_list[entry].mark = FALSE;
9231 }
9232 if (object != kernel_object && object != compressor_object) {
9233 /*
9234 * someone is explicitly grabbing this page...
9235 * update clustered and speculative state
9236 *
9237 */
9238 if (dst_page->clustered)
9239 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9240 }
9241 skip_page:
9242 entry++;
9243 dst_offset += PAGE_SIZE_64;
9244 xfer_size -= PAGE_SIZE;
9245
9246 if (dwp->dw_mask) {
9247 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9248
9249 if (dw_count >= dw_limit) {
9250 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
9251
9252 dwp = &dw_array[0];
9253 dw_count = 0;
9254 }
9255 }
9256 }
9257 assert(entry == size_in_pages);
9258
9259 if (dw_count)
9260 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
9261 finish:
9262 if (user_page_list && set_cache_attr_needed == TRUE)
9263 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9264
9265 if (page_list_count != NULL) {
9266 if (upl->flags & UPL_INTERNAL)
9267 *page_list_count = 0;
9268 else if (*page_list_count > size_in_pages)
9269 *page_list_count = size_in_pages;
9270 }
9271 vm_object_unlock(object);
9272
9273 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9274 /*
9275 * We've marked all the pages "busy" so that future
9276 * page faults will block.
9277 * Now remove the mapping for these pages, so that they
9278 * can't be accessed without causing a page fault.
9279 */
9280 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9281 PMAP_NULL, 0, VM_PROT_NONE);
9282 assert(!object->blocked_access);
9283 object->blocked_access = TRUE;
9284 }
9285
9286 return KERN_SUCCESS;
9287
9288 return_err:
9289 dw_index = 0;
9290
9291 for (; offset < dst_offset; offset += PAGE_SIZE) {
9292 boolean_t need_unwire;
9293
9294 dst_page = vm_page_lookup(object, offset);
9295
9296 if (dst_page == VM_PAGE_NULL)
9297 panic("vm_object_iopl_request: Wired page missing.\n");
9298
9299 /*
9300 * if we've already processed this page in an earlier
9301 * dw_do_work, we need to undo the wiring... we will
9302 * leave the dirty and reference bits on if they
9303 * were set, since we don't have a good way of knowing
9304 * what the previous state was and we won't get here
9305 * under any normal circumstances... we will always
9306 * clear BUSY and wakeup any waiters via vm_page_free
9307 * or PAGE_WAKEUP_DONE
9308 */
9309 need_unwire = TRUE;
9310
9311 if (dw_count) {
9312 if (dw_array[dw_index].dw_m == dst_page) {
9313 /*
9314 * still in the deferred work list
9315 * which means we haven't yet called
9316 * vm_page_wire on this page
9317 */
9318 need_unwire = FALSE;
9319
9320 dw_index++;
9321 dw_count--;
9322 }
9323 }
9324 vm_page_lock_queues();
9325
9326 if (dst_page->absent || free_wired_pages == TRUE) {
9327 vm_page_free(dst_page);
9328
9329 need_unwire = FALSE;
9330 } else {
9331 if (need_unwire == TRUE)
9332 vm_page_unwire(dst_page, TRUE);
9333
9334 PAGE_WAKEUP_DONE(dst_page);
9335 }
9336 vm_page_unlock_queues();
9337
9338 if (need_unwire == TRUE)
9339 VM_STAT_INCR(reactivations);
9340 }
9341 #if UPL_DEBUG
9342 upl->upl_state = 2;
9343 #endif
9344 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
9345 vm_object_activity_end(object);
9346 vm_object_collapse(object, 0, TRUE);
9347 }
9348 vm_object_unlock(object);
9349 upl_destroy(upl);
9350
9351 return ret;
9352 }
9353
9354 kern_return_t
9355 upl_transpose(
9356 upl_t upl1,
9357 upl_t upl2)
9358 {
9359 kern_return_t retval;
9360 boolean_t upls_locked;
9361 vm_object_t object1, object2;
9362
9363 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
9364 return KERN_INVALID_ARGUMENT;
9365 }
9366
9367 upls_locked = FALSE;
9368
9369 /*
9370 * Since we need to lock both UPLs at the same time,
9371 * avoid deadlocks by always taking locks in the same order.
9372 */
9373 if (upl1 < upl2) {
9374 upl_lock(upl1);
9375 upl_lock(upl2);
9376 } else {
9377 upl_lock(upl2);
9378 upl_lock(upl1);
9379 }
9380 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9381
9382 object1 = upl1->map_object;
9383 object2 = upl2->map_object;
9384
9385 if (upl1->offset != 0 || upl2->offset != 0 ||
9386 upl1->size != upl2->size) {
9387 /*
9388 * We deal only with full objects, not subsets.
9389 * That's because we exchange the entire backing store info
9390 * for the objects: pager, resident pages, etc... We can't do
9391 * only part of it.
9392 */
9393 retval = KERN_INVALID_VALUE;
9394 goto done;
9395 }
9396
9397 /*
9398 * Transpose the VM objects' backing store.
9399 */
9400 retval = vm_object_transpose(object1, object2,
9401 (vm_object_size_t) upl1->size);
9402
9403 if (retval == KERN_SUCCESS) {
9404 /*
9405 * Make each UPL point to the correct VM object, i.e. the
9406 * object holding the pages that the UPL refers to...
9407 */
9408 #if CONFIG_IOSCHED || UPL_DEBUG
9409 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9410 vm_object_lock(object1);
9411 vm_object_lock(object2);
9412 }
9413 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
9414 queue_remove(&object1->uplq, upl1, upl_t, uplq);
9415 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
9416 queue_remove(&object2->uplq, upl2, upl_t, uplq);
9417 #endif
9418 upl1->map_object = object2;
9419 upl2->map_object = object1;
9420
9421 #if CONFIG_IOSCHED || UPL_DEBUG
9422 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
9423 queue_enter(&object2->uplq, upl1, upl_t, uplq);
9424 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
9425 queue_enter(&object1->uplq, upl2, upl_t, uplq);
9426 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9427 vm_object_unlock(object2);
9428 vm_object_unlock(object1);
9429 }
9430 #endif
9431 }
9432
9433 done:
9434 /*
9435 * Cleanup.
9436 */
9437 if (upls_locked) {
9438 upl_unlock(upl1);
9439 upl_unlock(upl2);
9440 upls_locked = FALSE;
9441 }
9442
9443 return retval;
9444 }
9445
9446 void
9447 upl_range_needed(
9448 upl_t upl,
9449 int index,
9450 int count)
9451 {
9452 upl_page_info_t *user_page_list;
9453 int size_in_pages;
9454
9455 if ( !(upl->flags & UPL_INTERNAL) || count <= 0)
9456 return;
9457
9458 size_in_pages = upl->size / PAGE_SIZE;
9459
9460 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9461
9462 while (count-- && index < size_in_pages)
9463 user_page_list[index++].needed = TRUE;
9464 }
9465
9466
9467 /*
9468 * ENCRYPTED SWAP:
9469 *
9470 * Rationale: the user might have some encrypted data on disk (via
9471 * FileVault or any other mechanism). That data is then decrypted in
9472 * memory, which is safe as long as the machine is secure. But that
9473 * decrypted data in memory could be paged out to disk by the default
9474 * pager. The data would then be stored on disk in clear (not encrypted)
9475 * and it could be accessed by anyone who gets physical access to the
9476 * disk (if the laptop or the disk gets stolen for example). This weakens
9477 * the security offered by FileVault.
9478 *
9479 * Solution: the default pager will optionally request that all the
9480 * pages it gathers for pageout be encrypted, via the UPL interfaces,
9481 * before it sends this UPL to disk via the vnode_pageout() path.
9482 *
9483 * Notes:
9484 *
9485 * To avoid disrupting the VM LRU algorithms, we want to keep the
9486 * clean-in-place mechanisms, which allow us to send some extra pages to
9487 * swap (clustering) without actually removing them from the user's
9488 * address space. We don't want the user to unknowingly access encrypted
9489 * data, so we have to actually remove the encrypted pages from the page
9490 * table. When the user accesses the data, the hardware will fail to
9491 * locate the virtual page in its page table and will trigger a page
9492 * fault. We can then decrypt the page and enter it in the page table
9493 * again. Whenever we allow the user to access the contents of a page,
9494 * we have to make sure it's not encrypted.
9495 *
9496 *
9497 */
9498 /*
9499 * ENCRYPTED SWAP:
9500 * Reserve of virtual addresses in the kernel address space.
9501 * We need to map the physical pages in the kernel, so that we
9502 * can call the encryption/decryption routines with a kernel
9503 * virtual address. We keep this pool of pre-allocated kernel
9504 * virtual addresses so that we don't have to scan the kernel's
9505 * virtual address space each time we need to encrypt or decrypt
9506 * a physical page.
9507 * It would be nice to be able to encrypt and decrypt in physical
9508 * mode but that might not always be more efficient...
9509 */
9510 decl_simple_lock_data(,vm_paging_lock)
9511 #define VM_PAGING_NUM_PAGES 64
9512 vm_map_offset_t vm_paging_base_address = 0;
9513 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
9514 int vm_paging_max_index = 0;
9515 int vm_paging_page_waiter = 0;
9516 int vm_paging_page_waiter_total = 0;
9517 unsigned long vm_paging_no_kernel_page = 0;
9518 unsigned long vm_paging_objects_mapped = 0;
9519 unsigned long vm_paging_pages_mapped = 0;
9520 unsigned long vm_paging_objects_mapped_slow = 0;
9521 unsigned long vm_paging_pages_mapped_slow = 0;
9522
9523 void
9524 vm_paging_map_init(void)
9525 {
9526 kern_return_t kr;
9527 vm_map_offset_t page_map_offset;
9528 vm_map_entry_t map_entry;
9529
9530 assert(vm_paging_base_address == 0);
9531
9532 /*
9533 * Initialize our pool of pre-allocated kernel
9534 * virtual addresses.
9535 */
9536 page_map_offset = 0;
9537 kr = vm_map_find_space(kernel_map,
9538 &page_map_offset,
9539 VM_PAGING_NUM_PAGES * PAGE_SIZE,
9540 0,
9541 0,
9542 &map_entry);
9543 if (kr != KERN_SUCCESS) {
9544 panic("vm_paging_map_init: kernel_map full\n");
9545 }
9546 VME_OBJECT_SET(map_entry, kernel_object);
9547 VME_OFFSET_SET(map_entry, page_map_offset);
9548 map_entry->protection = VM_PROT_NONE;
9549 map_entry->max_protection = VM_PROT_NONE;
9550 map_entry->permanent = TRUE;
9551 vm_object_reference(kernel_object);
9552 vm_map_unlock(kernel_map);
9553
9554 assert(vm_paging_base_address == 0);
9555 vm_paging_base_address = page_map_offset;
9556 }
9557
9558 /*
9559 * ENCRYPTED SWAP:
9560 * vm_paging_map_object:
9561 * Maps part of a VM object's pages in the kernel
9562 * virtual address space, using the pre-allocated
9563 * kernel virtual addresses, if possible.
9564 * Context:
9565 * The VM object is locked. This lock will get
9566 * dropped and re-acquired though, so the caller
9567 * must make sure the VM object is kept alive
9568 * (by holding a VM map that has a reference
9569 * on it, for example, or taking an extra reference).
9570 * The page should also be kept busy to prevent
9571 * it from being reclaimed.
9572 */
9573 kern_return_t
9574 vm_paging_map_object(
9575 vm_page_t page,
9576 vm_object_t object,
9577 vm_object_offset_t offset,
9578 vm_prot_t protection,
9579 boolean_t can_unlock_object,
9580 vm_map_size_t *size, /* IN/OUT */
9581 vm_map_offset_t *address, /* OUT */
9582 boolean_t *need_unmap) /* OUT */
9583 {
9584 kern_return_t kr;
9585 vm_map_offset_t page_map_offset;
9586 vm_map_size_t map_size;
9587 vm_object_offset_t object_offset;
9588 int i;
9589
9590 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
9591 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9592 #if __x86_64__
9593 *address = (vm_map_offset_t)
9594 PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
9595 PAGE_SHIFT);
9596 *need_unmap = FALSE;
9597 return KERN_SUCCESS;
9598 #else
9599 #warning "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
9600 #endif
9601
9602 assert(page->busy);
9603 /*
9604 * Use one of the pre-allocated kernel virtual addresses
9605 * and just enter the VM page in the kernel address space
9606 * at that virtual address.
9607 */
9608 simple_lock(&vm_paging_lock);
9609
9610 /*
9611 * Try and find an available kernel virtual address
9612 * from our pre-allocated pool.
9613 */
9614 page_map_offset = 0;
9615 for (;;) {
9616 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9617 if (vm_paging_page_inuse[i] == FALSE) {
9618 page_map_offset =
9619 vm_paging_base_address +
9620 (i * PAGE_SIZE);
9621 break;
9622 }
9623 }
9624 if (page_map_offset != 0) {
9625 /* found a space to map our page ! */
9626 break;
9627 }
9628
9629 if (can_unlock_object) {
9630 /*
9631 * If we can afford to unlock the VM object,
9632 * let's take the slow path now...
9633 */
9634 break;
9635 }
9636 /*
9637 * We can't afford to unlock the VM object, so
9638 * let's wait for a space to become available...
9639 */
9640 vm_paging_page_waiter_total++;
9641 vm_paging_page_waiter++;
9642 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9643 if (kr == THREAD_WAITING) {
9644 simple_unlock(&vm_paging_lock);
9645 kr = thread_block(THREAD_CONTINUE_NULL);
9646 simple_lock(&vm_paging_lock);
9647 }
9648 vm_paging_page_waiter--;
9649 /* ... and try again */
9650 }
9651
9652 if (page_map_offset != 0) {
9653 /*
9654 * We found a kernel virtual address;
9655 * map the physical page to that virtual address.
9656 */
9657 if (i > vm_paging_max_index) {
9658 vm_paging_max_index = i;
9659 }
9660 vm_paging_page_inuse[i] = TRUE;
9661 simple_unlock(&vm_paging_lock);
9662
9663 page->pmapped = TRUE;
9664
9665 /*
9666 * Keep the VM object locked over the PMAP_ENTER
9667 * and the actual use of the page by the kernel,
9668 * or this pmap mapping might get undone by a
9669 * vm_object_pmap_protect() call...
9670 */
9671 PMAP_ENTER(kernel_pmap,
9672 page_map_offset,
9673 page,
9674 protection,
9675 VM_PROT_NONE,
9676 0,
9677 TRUE);
9678 vm_paging_objects_mapped++;
9679 vm_paging_pages_mapped++;
9680 *address = page_map_offset;
9681 *need_unmap = TRUE;
9682
9683 /* all done and mapped, ready to use ! */
9684 return KERN_SUCCESS;
9685 }
9686
9687 /*
9688 * We ran out of pre-allocated kernel virtual
9689 * addresses. Just map the page in the kernel
9690 * the slow and regular way.
9691 */
9692 vm_paging_no_kernel_page++;
9693 simple_unlock(&vm_paging_lock);
9694 }
9695
9696 if (! can_unlock_object) {
9697 *address = 0;
9698 *size = 0;
9699 *need_unmap = FALSE;
9700 return KERN_NOT_SUPPORTED;
9701 }
9702
9703 object_offset = vm_object_trunc_page(offset);
9704 map_size = vm_map_round_page(*size,
9705 VM_MAP_PAGE_MASK(kernel_map));
9706
9707 /*
9708 * Try and map the required range of the object
9709 * in the kernel_map
9710 */
9711
9712 vm_object_reference_locked(object); /* for the map entry */
9713 vm_object_unlock(object);
9714
9715 kr = vm_map_enter(kernel_map,
9716 address,
9717 map_size,
9718 0,
9719 VM_FLAGS_ANYWHERE,
9720 object,
9721 object_offset,
9722 FALSE,
9723 protection,
9724 VM_PROT_ALL,
9725 VM_INHERIT_NONE);
9726 if (kr != KERN_SUCCESS) {
9727 *address = 0;
9728 *size = 0;
9729 *need_unmap = FALSE;
9730 vm_object_deallocate(object); /* for the map entry */
9731 vm_object_lock(object);
9732 return kr;
9733 }
9734
9735 *size = map_size;
9736
9737 /*
9738 * Enter the mapped pages in the page table now.
9739 */
9740 vm_object_lock(object);
9741 /*
9742 * VM object must be kept locked from before PMAP_ENTER()
9743 * until after the kernel is done accessing the page(s).
9744 * Otherwise, the pmap mappings in the kernel could be
9745 * undone by a call to vm_object_pmap_protect().
9746 */
9747
9748 for (page_map_offset = 0;
9749 map_size != 0;
9750 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
9751
9752 page = vm_page_lookup(object, offset + page_map_offset);
9753 if (page == VM_PAGE_NULL) {
9754 printf("vm_paging_map_object: no page !?\n");
9755 vm_object_unlock(object);
9756 kr = vm_map_remove(kernel_map, *address, *size,
9757 VM_MAP_NO_FLAGS);
9758 assert(kr == KERN_SUCCESS);
9759 *address = 0;
9760 *size = 0;
9761 *need_unmap = FALSE;
9762 vm_object_lock(object);
9763 return KERN_MEMORY_ERROR;
9764 }
9765 page->pmapped = TRUE;
9766
9767 //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
9768 PMAP_ENTER(kernel_pmap,
9769 *address + page_map_offset,
9770 page,
9771 protection,
9772 VM_PROT_NONE,
9773 0,
9774 TRUE);
9775 }
9776
9777 vm_paging_objects_mapped_slow++;
9778 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
9779
9780 *need_unmap = TRUE;
9781
9782 return KERN_SUCCESS;
9783 }
9784
9785 /*
9786 * ENCRYPTED SWAP:
9787 * vm_paging_unmap_object:
9788 * Unmaps part of a VM object's pages from the kernel
9789 * virtual address space.
9790 * Context:
9791 * The VM object is locked. This lock will get
9792 * dropped and re-acquired though.
9793 */
9794 void
9795 vm_paging_unmap_object(
9796 vm_object_t object,
9797 vm_map_offset_t start,
9798 vm_map_offset_t end)
9799 {
9800 kern_return_t kr;
9801 int i;
9802
9803 if ((vm_paging_base_address == 0) ||
9804 (start < vm_paging_base_address) ||
9805 (end > (vm_paging_base_address
9806 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
9807 /*
9808 * We didn't use our pre-allocated pool of
9809 * kernel virtual address. Deallocate the
9810 * virtual memory.
9811 */
9812 if (object != VM_OBJECT_NULL) {
9813 vm_object_unlock(object);
9814 }
9815 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
9816 if (object != VM_OBJECT_NULL) {
9817 vm_object_lock(object);
9818 }
9819 assert(kr == KERN_SUCCESS);
9820 } else {
9821 /*
9822 * We used a kernel virtual address from our
9823 * pre-allocated pool. Put it back in the pool
9824 * for next time.
9825 */
9826 assert(end - start == PAGE_SIZE);
9827 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9828 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
9829
9830 /* undo the pmap mapping */
9831 pmap_remove(kernel_pmap, start, end);
9832
9833 simple_lock(&vm_paging_lock);
9834 vm_paging_page_inuse[i] = FALSE;
9835 if (vm_paging_page_waiter) {
9836 thread_wakeup(&vm_paging_page_waiter);
9837 }
9838 simple_unlock(&vm_paging_lock);
9839 }
9840 }
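/*
 * Illustrative sketch (not part of the original source): how a hypothetical
 * caller might use vm_paging_map_object()/vm_paging_unmap_object() per the
 * contract documented above -- the page is busy and its object is locked on
 * entry, *size is an IN/OUT parameter, and the mapping is torn down only when
 * need_unmap comes back TRUE. The helper name and the byte written through
 * the mapping are made up for illustration.
 */
#if 0	/* example only */
static void
example_touch_page(vm_object_t object, vm_page_t page)
{
	kern_return_t	kr;
	vm_map_size_t	size = PAGE_SIZE;	/* IN: one page */
	vm_map_offset_t	kva = 0;
	boolean_t	need_unmap = FALSE;

	/* object locked, page busy (see the context notes above) */
	kr = vm_paging_map_object(page, object, page->offset,
				  VM_PROT_READ | VM_PROT_WRITE,
				  FALSE,		/* can_unlock_object */
				  &size, &kva, &need_unmap);
	if (kr != KERN_SUCCESS)
		return;

	*(char *)(uintptr_t)kva = 0;		/* use the kernel mapping */

	if (need_unmap)
		vm_paging_unmap_object(object, kva, kva + size);
}
#endif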
9841
9842 #if ENCRYPTED_SWAP
9843 /*
9844 * Encryption data.
9845 * "iv" is the "initial vector". Ideally, we want to
9846 * have a different one for each page we encrypt, so that
9847 * crackers can't find encryption patterns too easily.
9848 */
9849 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
9850 boolean_t swap_crypt_ctx_initialized = FALSE;
9851 uint32_t swap_crypt_key[8]; /* big enough for a 256-bit key */
9852 aes_ctx swap_crypt_ctx;
9853 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
9854
9855 #if DEBUG
9856 boolean_t swap_crypt_ctx_tested = FALSE;
9857 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
9858 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
9859 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
9860 #endif /* DEBUG */
9861
9862 /*
9863 * Initialize the encryption context: key and key size.
9864 */
9865 void swap_crypt_ctx_initialize(void); /* forward */
9866 void
9867 swap_crypt_ctx_initialize(void)
9868 {
9869 unsigned int i;
9870
9871 /*
9872 * No need for locking to protect swap_crypt_ctx_initialized
9873 * because the first use of encryption will come from the
9874 * pageout thread (we won't pagein before there's been a pageout)
9875 * and there's only one pageout thread.
9876 */
9877 if (swap_crypt_ctx_initialized == FALSE) {
9878 for (i = 0;
9879 i < (sizeof (swap_crypt_key) /
9880 sizeof (swap_crypt_key[0]));
9881 i++) {
9882 swap_crypt_key[i] = random();
9883 }
9884 aes_encrypt_key((const unsigned char *) swap_crypt_key,
9885 SWAP_CRYPT_AES_KEY_SIZE,
9886 &swap_crypt_ctx.encrypt);
9887 aes_decrypt_key((const unsigned char *) swap_crypt_key,
9888 SWAP_CRYPT_AES_KEY_SIZE,
9889 &swap_crypt_ctx.decrypt);
9890 swap_crypt_ctx_initialized = TRUE;
9891 }
9892
9893 #if DEBUG
9894 /*
9895 * Validate the encryption algorithms.
9896 */
9897 if (swap_crypt_ctx_tested == FALSE) {
9898 /* initialize */
9899 for (i = 0; i < 4096; i++) {
9900 swap_crypt_test_page_ref[i] = (char) i;
9901 }
9902 /* encrypt */
9903 aes_encrypt_cbc(swap_crypt_test_page_ref,
9904 swap_crypt_null_iv,
9905 PAGE_SIZE / AES_BLOCK_SIZE,
9906 swap_crypt_test_page_encrypt,
9907 &swap_crypt_ctx.encrypt);
9908 /* decrypt */
9909 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
9910 swap_crypt_null_iv,
9911 PAGE_SIZE / AES_BLOCK_SIZE,
9912 swap_crypt_test_page_decrypt,
9913 &swap_crypt_ctx.decrypt);
9914 /* compare result with original */
9915 for (i = 0; i < 4096; i ++) {
9916 if (swap_crypt_test_page_decrypt[i] !=
9917 swap_crypt_test_page_ref[i]) {
9918 panic("encryption test failed");
9919 }
9920 }
9921
9922 /* encrypt again */
9923 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
9924 swap_crypt_null_iv,
9925 PAGE_SIZE / AES_BLOCK_SIZE,
9926 swap_crypt_test_page_decrypt,
9927 &swap_crypt_ctx.encrypt);
9928 /* decrypt in place */
9929 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
9930 swap_crypt_null_iv,
9931 PAGE_SIZE / AES_BLOCK_SIZE,
9932 swap_crypt_test_page_decrypt,
9933 &swap_crypt_ctx.decrypt);
9934 for (i = 0; i < 4096; i ++) {
9935 if (swap_crypt_test_page_decrypt[i] !=
9936 swap_crypt_test_page_ref[i]) {
9937 panic("in place encryption test failed");
9938 }
9939 }
9940
9941 swap_crypt_ctx_tested = TRUE;
9942 }
9943 #endif /* DEBUG */
9944 }
9945
9946 /*
9947 * ENCRYPTED SWAP:
9948 * vm_page_encrypt:
9949 * Encrypt the given page, for secure paging.
9950 * The page might already be mapped at kernel virtual
9951 * address "kernel_mapping_offset". Otherwise, we need
9952 * to map it.
9953 *
9954 * Context:
9955 * The page's object is locked, but this lock will be released
9956 * and re-acquired.
9957 * The page is busy and not accessible by users (not entered in any pmap).
9958 */
9959 void
9960 vm_page_encrypt(
9961 vm_page_t page,
9962 vm_map_offset_t kernel_mapping_offset)
9963 {
9964 kern_return_t kr;
9965 vm_map_size_t kernel_mapping_size;
9966 boolean_t kernel_mapping_needs_unmap;
9967 vm_offset_t kernel_vaddr;
9968 vm_object_t page_object;
9969 union {
9970 unsigned char aes_iv[AES_BLOCK_SIZE];
9971 struct {
9972 memory_object_t pager_object;
9973 vm_object_offset_t paging_offset;
9974 } vm;
9975 } encrypt_iv;
9976
9977 if (! vm_pages_encrypted) {
9978 vm_pages_encrypted = TRUE;
9979 }
9980
9981 assert(page->busy);
9982
9983 if (page->encrypted) {
9984 /*
9985 * Already encrypted: no need to do it again.
9986 */
9987 vm_page_encrypt_already_encrypted_counter++;
9988 return;
9989 }
9990 assert(page->dirty || page->precious);
9991
9992 ASSERT_PAGE_DECRYPTED(page);
9993
9994 page_object = VM_PAGE_OBJECT(page);
9995
9996 /*
9997 * Take a paging-in-progress reference to keep the object
9998 * alive even if we have to unlock it (in vm_paging_map_object()
9999 * for example)...
10000 */
10001 vm_object_paging_begin(page_object);
10002
10003 if (kernel_mapping_offset == 0) {
10004 /*
10005 * The page hasn't already been mapped in kernel space
10006 * by the caller. Map it now, so that we can access
10007 * its contents and encrypt them.
10008 */
10009 kernel_mapping_size = PAGE_SIZE;
10010 kernel_mapping_needs_unmap = FALSE;
10011 kr = vm_paging_map_object(page,
10012 page_object,
10013 page->offset,
10014 VM_PROT_READ | VM_PROT_WRITE,
10015 FALSE,
10016 &kernel_mapping_size,
10017 &kernel_mapping_offset,
10018 &kernel_mapping_needs_unmap);
10019 if (kr != KERN_SUCCESS) {
10020 panic("vm_page_encrypt: "
10021 "could not map page in kernel: 0x%x\n",
10022 kr);
10023 }
10024 } else {
10025 kernel_mapping_size = 0;
10026 kernel_mapping_needs_unmap = FALSE;
10027 }
10028 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10029
10030 if (swap_crypt_ctx_initialized == FALSE) {
10031 swap_crypt_ctx_initialize();
10032 }
10033 assert(swap_crypt_ctx_initialized);
10034
10035 /*
10036 * Prepare an "initial vector" for the encryption.
10037 * We use the "pager" and the "paging_offset" for that
10038 * page to obfuscate the encrypted data a bit more and
10039 * prevent crackers from finding patterns that they could
10040 * use to break the key.
10041 */
10042 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
10043 encrypt_iv.vm.pager_object = page_object->pager;
10044 encrypt_iv.vm.paging_offset =
10045 page_object->paging_offset + page->offset;
10046
10047 /* encrypt the "initial vector" */
10048 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
10049 swap_crypt_null_iv,
10050 1,
10051 &encrypt_iv.aes_iv[0],
10052 &swap_crypt_ctx.encrypt);
10053
10054 /*
10055 * Encrypt the page.
10056 */
10057 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
10058 &encrypt_iv.aes_iv[0],
10059 PAGE_SIZE / AES_BLOCK_SIZE,
10060 (unsigned char *) kernel_vaddr,
10061 &swap_crypt_ctx.encrypt);
10062
10063 vm_page_encrypt_counter++;
10064
10065 /*
10066 * Unmap the page from the kernel's address space,
10067 * if we had to map it ourselves. Otherwise, let
10068 * the caller undo the mapping if needed.
10069 */
10070 if (kernel_mapping_needs_unmap) {
10071 vm_paging_unmap_object(page_object,
10072 kernel_mapping_offset,
10073 kernel_mapping_offset + kernel_mapping_size);
10074 }
10075
10076 /*
10077 * Clear the "reference" and "modified" bits.
10078 * This should clean up any impact the encryption had
10079 * on them.
10080 * The page was kept busy and disconnected from all pmaps,
10081 * so it can't have been referenced or modified from user
10082 * space.
10083 * The software bits will be reset later after the I/O
10084 * has completed (in upl_commit_range()).
10085 */
10086 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_REFERENCED | VM_MEM_MODIFIED);
10087
10088 page->encrypted = TRUE;
10089
10090 vm_object_paging_end(page_object);
10091 }
10092
10093 /*
10094 * ENCRYPTED SWAP:
10095 * vm_page_decrypt:
10096 * Decrypt the given page.
10097 * The page might already be mapped at kernel virtual
10098 * address "kernel_mapping_offset". Otherwise, we need
10099 * to map it.
10100 *
10101 * Context:
10102 * The page's VM object is locked but will be unlocked and relocked.
10103 * The page is busy and not accessible by users (not entered in any pmap).
10104 */
10105 void
10106 vm_page_decrypt(
10107 vm_page_t page,
10108 vm_map_offset_t kernel_mapping_offset)
10109 {
10110 kern_return_t kr;
10111 vm_map_size_t kernel_mapping_size;
10112 vm_offset_t kernel_vaddr;
10113 boolean_t kernel_mapping_needs_unmap;
10114 vm_object_t page_object;
10115 union {
10116 unsigned char aes_iv[AES_BLOCK_SIZE];
10117 struct {
10118 memory_object_t pager_object;
10119 vm_object_offset_t paging_offset;
10120 } vm;
10121 } decrypt_iv;
10122 boolean_t was_dirty;
10123
10124 assert(page->busy);
10125 assert(page->encrypted);
10126
10127 page_object = VM_PAGE_OBJECT(page);
10128 was_dirty = page->dirty;
10129
10130 /*
10131 * Take a paging-in-progress reference to keep the object
10132 * alive even if we have to unlock it (in vm_paging_map_object()
10133 * for example)...
10134 */
10135 vm_object_paging_begin(page_object);
10136
10137 if (kernel_mapping_offset == 0) {
10138 /*
10139 * The page hasn't already been mapped in kernel space
10140 * by the caller. Map it now, so that we can access
10141 * its contents and decrypt them.
10142 */
10143 kernel_mapping_size = PAGE_SIZE;
10144 kernel_mapping_needs_unmap = FALSE;
10145 kr = vm_paging_map_object(page,
10146 page_object,
10147 page->offset,
10148 VM_PROT_READ | VM_PROT_WRITE,
10149 FALSE,
10150 &kernel_mapping_size,
10151 &kernel_mapping_offset,
10152 &kernel_mapping_needs_unmap);
10153 if (kr != KERN_SUCCESS) {
10154 panic("vm_page_decrypt: "
10155 "could not map page in kernel: 0x%x\n",
10156 kr);
10157 }
10158 } else {
10159 kernel_mapping_size = 0;
10160 kernel_mapping_needs_unmap = FALSE;
10161 }
10162 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10163
10164 assert(swap_crypt_ctx_initialized);
10165
10166 /*
10167 * Prepare an "initial vector" for the decryption.
10168 * It has to be the same as the "initial vector" we
10169 * used to encrypt that page.
10170 */
10171 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
10172 decrypt_iv.vm.pager_object = page_object->pager;
10173 decrypt_iv.vm.paging_offset =
10174 page_object->paging_offset + page->offset;
10175
10176 /* encrypt the "initial vector" */
10177 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
10178 swap_crypt_null_iv,
10179 1,
10180 &decrypt_iv.aes_iv[0],
10181 &swap_crypt_ctx.encrypt);
10182
10183 /*
10184 * Decrypt the page.
10185 */
10186 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
10187 &decrypt_iv.aes_iv[0],
10188 PAGE_SIZE / AES_BLOCK_SIZE,
10189 (unsigned char *) kernel_vaddr,
10190 &swap_crypt_ctx.decrypt);
10191 vm_page_decrypt_counter++;
10192
10193 /*
10194 * Unmap the page from the kernel's address space,
10195 * if we had to map it ourselves. Otherwise, let
10196 * the caller undo the mapping if needed.
10197 */
10198 if (kernel_mapping_needs_unmap) {
10199 vm_paging_unmap_object(page_object,
10200 kernel_vaddr,
10201 kernel_vaddr + PAGE_SIZE);
10202 }
10203
10204 if (was_dirty) {
10205 /*
10206 * The pager did not specify that the page would be
10207 * clean when it got paged in, so let's not clean it here
10208 * either.
10209 */
10210 } else {
10211 /*
10212 * After decryption, the page is actually still clean.
10213 * It was encrypted as part of paging, which "cleans"
10214 * the "dirty" pages.
10215 * No one could access it after it was encrypted
10216 * and the decryption doesn't count.
10217 */
10218 page->dirty = FALSE;
10219 assert (page->cs_validated == FALSE);
10220 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
10221 }
10222 page->encrypted = FALSE;
10223
10224 /*
10225 * We've just modified the page's contents via the data cache and part
10226 * of the new contents might still be in the cache and not yet in RAM.
10227 * Since the page is now available and might get gathered in a UPL to
10228 * be part of a DMA transfer from a driver that expects the memory to
10229 * be coherent at this point, we have to flush the data cache.
10230 */
10231 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(page));
10232 /*
10233 * Since the page is not mapped yet, some code might assume that it
10234 * doesn't need to invalidate the instruction cache when writing to
10235 * that page. That code relies on "pmapped" being FALSE, so that the
10236 * caches get synchronized when the page is first mapped.
10237 */
10238 assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
10239 page->pmapped = FALSE;
10240 page->wpmapped = FALSE;
10241
10242 vm_object_paging_end(page_object);
10243 }
10244
10245 #if DEVELOPMENT || DEBUG
10246 unsigned long upl_encrypt_upls = 0;
10247 unsigned long upl_encrypt_pages = 0;
10248 #endif
10249
10250 /*
10251 * ENCRYPTED SWAP:
10252 *
10253 * upl_encrypt:
10254 * Encrypts all the pages in the UPL, within the specified range.
10255 *
10256 */
10257 void
10258 upl_encrypt(
10259 upl_t upl,
10260 upl_offset_t crypt_offset,
10261 upl_size_t crypt_size)
10262 {
10263 upl_size_t upl_size, subupl_size=crypt_size;
10264 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
10265 vm_object_t upl_object;
10266 vm_object_offset_t upl_offset;
10267 vm_page_t page;
10268 vm_object_t shadow_object;
10269 vm_object_offset_t shadow_offset;
10270 vm_object_offset_t paging_offset;
10271 vm_object_offset_t base_offset;
10272 int isVectorUPL = 0;
10273 upl_t vector_upl = NULL;
10274
10275 if((isVectorUPL = vector_upl_is_valid(upl)))
10276 vector_upl = upl;
10277
10278 process_upl_to_encrypt:
10279 if(isVectorUPL) {
10280 crypt_size = subupl_size;
10281 crypt_offset = subupl_offset;
10282 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
10283 if(upl == NULL)
10284 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
10285 subupl_size -= crypt_size;
10286 subupl_offset += crypt_size;
10287 }
10288
10289 #if DEVELOPMENT || DEBUG
10290 upl_encrypt_upls++;
10291 upl_encrypt_pages += crypt_size / PAGE_SIZE;
10292 #endif
10293 upl_object = upl->map_object;
10294 upl_offset = upl->offset;
10295 upl_size = upl->size;
10296
10297 vm_object_lock(upl_object);
10298
10299 /*
10300 * Find the VM object that contains the actual pages.
10301 */
10302 if (upl_object->pageout) {
10303 shadow_object = upl_object->shadow;
10304 /*
10305 * The offset in the shadow object is actually also
10306 * accounted for in upl->offset. It possibly shouldn't be
10307 * this way, but for now don't account for it twice.
10308 */
10309 shadow_offset = 0;
10310 assert(upl_object->paging_offset == 0); /* XXX ? */
10311 vm_object_lock(shadow_object);
10312 } else {
10313 shadow_object = upl_object;
10314 shadow_offset = 0;
10315 }
10316
10317 paging_offset = shadow_object->paging_offset;
10318 vm_object_paging_begin(shadow_object);
10319
10320 if (shadow_object != upl_object)
10321 vm_object_unlock(upl_object);
10322
10323
10324 base_offset = shadow_offset;
10325 base_offset += upl_offset;
10326 base_offset += crypt_offset;
10327 base_offset -= paging_offset;
10328
10329 assert(crypt_offset + crypt_size <= upl_size);
10330
10331 for (offset_in_upl = 0;
10332 offset_in_upl < crypt_size;
10333 offset_in_upl += PAGE_SIZE) {
10334 page = vm_page_lookup(shadow_object,
10335 base_offset + offset_in_upl);
10336 if (page == VM_PAGE_NULL) {
10337 panic("upl_encrypt: "
10338 "no page for (obj=%p,off=0x%llx+0x%x)!\n",
10339 shadow_object,
10340 base_offset,
10341 offset_in_upl);
10342 }
10343 /*
10344 * Disconnect the page from all pmaps, so that nobody can
10345 * access it while it's encrypted. After that point, all
10346 * accesses to this page will cause a page fault and block
10347 * while the page is busy being encrypted. After the
10348 * encryption completes, any access will cause a
10349 * page fault and the page gets decrypted at that time.
10350 */
10351 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
10352 vm_page_encrypt(page, 0);
10353
10354 if (vm_object_lock_avoid(shadow_object)) {
10355 /*
10356 * Give vm_pageout_scan() a chance to convert more
10357 * pages from "clean-in-place" to "clean-and-free",
10358 * if it's interested in the same pages we selected
10359 * in this cluster.
10360 */
10361 vm_object_unlock(shadow_object);
10362 mutex_pause(2);
10363 vm_object_lock(shadow_object);
10364 }
10365 }
10366
10367 vm_object_paging_end(shadow_object);
10368 vm_object_unlock(shadow_object);
10369
10370 if(isVectorUPL && subupl_size)
10371 goto process_upl_to_encrypt;
10372 }
10373
10374 #else /* ENCRYPTED_SWAP */
10375 void
10376 upl_encrypt(
10377 __unused upl_t upl,
10378 __unused upl_offset_t crypt_offset,
10379 __unused upl_size_t crypt_size)
10380 {
10381 }
10382
10383 void
10384 vm_page_encrypt(
10385 __unused vm_page_t page,
10386 __unused vm_map_offset_t kernel_mapping_offset)
10387 {
10388 }
10389
10390 void
10391 vm_page_decrypt(
10392 __unused vm_page_t page,
10393 __unused vm_map_offset_t kernel_mapping_offset)
10394 {
10395 }
10396
10397 #endif /* ENCRYPTED_SWAP */
10398
10399 /*
10400 * page->object must be locked
10401 */
10402 void
10403 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10404 {
10405 if (!queues_locked) {
10406 vm_page_lockspin_queues();
10407 }
10408
10409 page->free_when_done = FALSE;
10410 /*
10411 * need to drop the laundry count...
10412 * we may also need to remove it
10413 * from the I/O paging queue...
10414 * vm_pageout_throttle_up handles both cases
10415 *
10416 * the laundry and pageout_queue flags are cleared...
10417 */
10418 vm_pageout_throttle_up(page);
10419
10420 vm_page_steal_pageout_page++;
10421
10422 if (!queues_locked) {
10423 vm_page_unlock_queues();
10424 }
10425 }
10426
10427 upl_t
10428 vector_upl_create(vm_offset_t upl_offset)
10429 {
10430 int vector_upl_size = sizeof(struct _vector_upl);
10431 int i=0;
10432 upl_t upl;
10433 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
10434
10435 upl = upl_create(0,UPL_VECTOR,0);
10436 upl->vector_upl = vector_upl;
10437 upl->offset = upl_offset;
10438 vector_upl->size = 0;
10439 vector_upl->offset = upl_offset;
10440 vector_upl->invalid_upls=0;
10441 vector_upl->num_upls=0;
10442 vector_upl->pagelist = NULL;
10443
10444 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
10445 vector_upl->upl_iostates[i].size = 0;
10446 vector_upl->upl_iostates[i].offset = 0;
10447
10448 }
10449 return upl;
10450 }
10451
10452 void
10453 vector_upl_deallocate(upl_t upl)
10454 {
10455 if(upl) {
10456 vector_upl_t vector_upl = upl->vector_upl;
10457 if(vector_upl) {
10458 if(vector_upl->invalid_upls != vector_upl->num_upls)
10459 panic("Deallocating non-empty Vectored UPL\n");
10460 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
10461 vector_upl->invalid_upls=0;
10462 vector_upl->num_upls = 0;
10463 vector_upl->pagelist = NULL;
10464 vector_upl->size = 0;
10465 vector_upl->offset = 0;
10466 kfree(vector_upl, sizeof(struct _vector_upl));
10467 vector_upl = (vector_upl_t)0xfeedfeed;
10468 }
10469 else
10470 panic("vector_upl_deallocate was passed a non-vectored upl\n");
10471 }
10472 else
10473 panic("vector_upl_deallocate was passed a NULL upl\n");
10474 }
10475
10476 boolean_t
10477 vector_upl_is_valid(upl_t upl)
10478 {
10479 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
10480 vector_upl_t vector_upl = upl->vector_upl;
10481 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef)
10482 return FALSE;
10483 else
10484 return TRUE;
10485 }
10486 return FALSE;
10487 }
10488
10489 boolean_t
10490 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
10491 {
10492 if(vector_upl_is_valid(upl)) {
10493 vector_upl_t vector_upl = upl->vector_upl;
10494
10495 if(vector_upl) {
10496 if(subupl) {
10497 if(io_size) {
10498 if(io_size < PAGE_SIZE)
10499 io_size = PAGE_SIZE;
10500 subupl->vector_upl = (void*)vector_upl;
10501 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10502 vector_upl->size += io_size;
10503 upl->size += io_size;
10504 }
10505 else {
10506 uint32_t i=0,invalid_upls=0;
10507 for(i = 0; i < vector_upl->num_upls; i++) {
10508 if(vector_upl->upl_elems[i] == subupl)
10509 break;
10510 }
10511 if(i == vector_upl->num_upls)
10512 panic("Trying to remove sub-upl when none exists");
10513
10514 vector_upl->upl_elems[i] = NULL;
10515 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
10516 if(invalid_upls == vector_upl->num_upls)
10517 return TRUE;
10518 else
10519 return FALSE;
10520 }
10521 }
10522 else
10523 panic("vector_upl_set_subupl was passed a NULL upl element\n");
10524 }
10525 else
10526 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
10527 }
10528 else
10529 panic("vector_upl_set_subupl was passed a NULL upl\n");
10530
10531 return FALSE;
10532 }
10533
10534 void
10535 vector_upl_set_pagelist(upl_t upl)
10536 {
10537 if(vector_upl_is_valid(upl)) {
10538 uint32_t i=0;
10539 vector_upl_t vector_upl = upl->vector_upl;
10540
10541 if(vector_upl) {
10542 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
10543
10544 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
10545
10546 for(i=0; i < vector_upl->num_upls; i++) {
10547 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
10548 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10549 pagelist_size += cur_upl_pagelist_size;
10550 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
10551 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10552 }
10553 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
10554 }
10555 else
10556 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
10557 }
10558 else
10559 panic("vector_upl_set_pagelist was passed a NULL upl\n");
10560
10561 }
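/*
 * Illustrative sketch (not part of the original source): how a hypothetical
 * caller might assemble a vectored UPL from two already-created sub-UPLs.
 * "subupl_a"/"subupl_b" and their sizes are assumed for illustration; the
 * iostate setter used below is defined later in this file.
 */
#if 0	/* example only */
static upl_t
example_build_vector_upl(upl_t subupl_a, upl_t subupl_b)
{
	upl_t	vupl;

	vupl = vector_upl_create(0);			/* vector offset 0 */

	/* attach the sub-UPLs; the vector grows by each io_size */
	vector_upl_set_subupl(vupl, subupl_a, subupl_a->size);
	vector_upl_set_subupl(vupl, subupl_b, subupl_b->size);

	/* record where each sub-UPL sits within the vector */
	vector_upl_set_iostate(vupl, subupl_a, 0, subupl_a->size);
	vector_upl_set_iostate(vupl, subupl_b, subupl_a->size, subupl_b->size);

	/* build the merged page list once all sub-UPLs are attached */
	vector_upl_set_pagelist(vupl);

	return vupl;
}
#endif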
10562
10563 upl_t
10564 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10565 {
10566 if(vector_upl_is_valid(upl)) {
10567 vector_upl_t vector_upl = upl->vector_upl;
10568 if(vector_upl) {
10569 if(index < vector_upl->num_upls)
10570 return vector_upl->upl_elems[index];
10571 }
10572 else
10573 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
10574 }
10575 return NULL;
10576 }
10577
10578 upl_t
10579 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10580 {
10581 if(vector_upl_is_valid(upl)) {
10582 uint32_t i=0;
10583 vector_upl_t vector_upl = upl->vector_upl;
10584
10585 if(vector_upl) {
10586 upl_t subupl = NULL;
10587 vector_upl_iostates_t subupl_state;
10588
10589 for(i=0; i < vector_upl->num_upls; i++) {
10590 subupl = vector_upl->upl_elems[i];
10591 subupl_state = vector_upl->upl_iostates[i];
10592 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10593 /* We could have been passed an offset/size pair that belongs
10594 * to an UPL element that has already been committed/aborted.
10595 * If so, return NULL.
10596 */
10597 if(subupl == NULL)
10598 return NULL;
10599 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10600 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10601 if(*upl_size > subupl_state.size)
10602 *upl_size = subupl_state.size;
10603 }
10604 if(*upl_offset >= subupl_state.offset)
10605 *upl_offset -= subupl_state.offset;
10606 else if(i)
10607 panic("Vector UPL offset miscalculation\n");
10608 return subupl;
10609 }
10610 }
10611 }
10612 else
10613 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
10614 }
10615 return NULL;
10616 }
10617
10618 void
10619 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10620 {
10621 *v_upl_submap = NULL;
10622
10623 if(vector_upl_is_valid(upl)) {
10624 vector_upl_t vector_upl = upl->vector_upl;
10625 if(vector_upl) {
10626 *v_upl_submap = vector_upl->submap;
10627 *submap_dst_addr = vector_upl->submap_dst_addr;
10628 }
10629 else
10630 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10631 }
10632 else
10633 panic("vector_upl_get_submap was passed a null UPL\n");
10634 }
10635
10636 void
10637 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10638 {
10639 if(vector_upl_is_valid(upl)) {
10640 vector_upl_t vector_upl = upl->vector_upl;
10641 if(vector_upl) {
10642 vector_upl->submap = submap;
10643 vector_upl->submap_dst_addr = submap_dst_addr;
10644 }
10645 else
10646 panic("vector_upl_set_submap was passed a non-vectored UPL\n");
10647 }
10648 else
10649 panic("vector_upl_set_submap was passed a NULL UPL\n");
10650 }
10651
10652 void
10653 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10654 {
10655 if(vector_upl_is_valid(upl)) {
10656 uint32_t i = 0;
10657 vector_upl_t vector_upl = upl->vector_upl;
10658
10659 if(vector_upl) {
10660 for(i = 0; i < vector_upl->num_upls; i++) {
10661 if(vector_upl->upl_elems[i] == subupl)
10662 break;
10663 }
10664
10665 if(i == vector_upl->num_upls)
10666 panic("setting sub-upl iostate when none exists");
10667
10668 vector_upl->upl_iostates[i].offset = offset;
10669 if(size < PAGE_SIZE)
10670 size = PAGE_SIZE;
10671 vector_upl->upl_iostates[i].size = size;
10672 }
10673 else
10674 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
10675 }
10676 else
10677 panic("vector_upl_set_iostate was passed a NULL UPL\n");
10678 }
10679
10680 void
10681 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10682 {
10683 if(vector_upl_is_valid(upl)) {
10684 uint32_t i = 0;
10685 vector_upl_t vector_upl = upl->vector_upl;
10686
10687 if(vector_upl) {
10688 for(i = 0; i < vector_upl->num_upls; i++) {
10689 if(vector_upl->upl_elems[i] == subupl)
10690 break;
10691 }
10692
10693 if(i == vector_upl->num_upls)
10694 panic("getting sub-upl iostate when none exists");
10695
10696 *offset = vector_upl->upl_iostates[i].offset;
10697 *size = vector_upl->upl_iostates[i].size;
10698 }
10699 else
10700 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
10701 }
10702 else
10703 panic("vector_upl_get_iostate was passed a NULL UPL\n");
10704 }
10705
10706 void
10707 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10708 {
10709 if(vector_upl_is_valid(upl)) {
10710 vector_upl_t vector_upl = upl->vector_upl;
10711 if(vector_upl) {
10712 if(index < vector_upl->num_upls) {
10713 *offset = vector_upl->upl_iostates[index].offset;
10714 *size = vector_upl->upl_iostates[index].size;
10715 }
10716 else
10717 *offset = *size = 0;
10718 }
10719 else
10720 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
10721 }
10722 else
10723 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
10724 }
10725
10726 upl_page_info_t *
10727 upl_get_internal_vectorupl_pagelist(upl_t upl)
10728 {
10729 return ((vector_upl_t)(upl->vector_upl))->pagelist;
10730 }
10731
10732 void *
10733 upl_get_internal_vectorupl(upl_t upl)
10734 {
10735 return upl->vector_upl;
10736 }
10737
10738 vm_size_t
10739 upl_get_internal_pagelist_offset(void)
10740 {
10741 return sizeof(struct upl);
10742 }
10743
10744 void
10745 upl_clear_dirty(
10746 upl_t upl,
10747 boolean_t value)
10748 {
10749 if (value) {
10750 upl->flags |= UPL_CLEAR_DIRTY;
10751 } else {
10752 upl->flags &= ~UPL_CLEAR_DIRTY;
10753 }
10754 }
10755
10756 void
10757 upl_set_referenced(
10758 upl_t upl,
10759 boolean_t value)
10760 {
10761 upl_lock(upl);
10762 if (value) {
10763 upl->ext_ref_count++;
10764 } else {
10765 if (!upl->ext_ref_count) {
10766 panic("upl_set_referenced not %p\n", upl);
10767 }
10768 upl->ext_ref_count--;
10769 }
10770 upl_unlock(upl);
10771 }
10772
10773 #if CONFIG_IOSCHED
10774 void
10775 upl_set_blkno(
10776 upl_t upl,
10777 vm_offset_t upl_offset,
10778 int io_size,
10779 int64_t blkno)
10780 {
10781 int i,j;
10782 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
10783 return;
10784
10785 assert(upl->upl_reprio_info != 0);
10786 for(i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10787 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10788 }
10789 }
10790 #endif
10791
10792 boolean_t
10793 vm_page_is_slideable(vm_page_t m)
10794 {
10795 boolean_t result = FALSE;
10796 vm_shared_region_slide_info_t si;
10797 vm_object_t m_object;
10798
10799 m_object = VM_PAGE_OBJECT(m);
10800
10801 vm_object_lock_assert_held(m_object);
10802
10803 /* make sure our page belongs to the one object allowed to do this */
10804 if (!m_object->object_slid) {
10805 goto done;
10806 }
10807
10808 si = m_object->vo_slide_info;
10809 if (si == NULL) {
10810 goto done;
10811 }
10812
10813 if(!m->slid && (si->start <= m->offset && si->end > m->offset)) {
10814 result = TRUE;
10815 }
10816
10817 done:
10818 return result;
10819 }
10820
10821 int vm_page_slide_counter = 0;
10822 int vm_page_slide_errors = 0;
10823 kern_return_t
10824 vm_page_slide(
10825 vm_page_t page,
10826 vm_map_offset_t kernel_mapping_offset)
10827 {
10828 kern_return_t kr;
10829 vm_map_size_t kernel_mapping_size;
10830 boolean_t kernel_mapping_needs_unmap;
10831 vm_offset_t kernel_vaddr;
10832 uint32_t pageIndex;
10833 uint32_t slide_chunk;
10834 vm_object_t page_object;
10835
10836 page_object = VM_PAGE_OBJECT(page);
10837
10838 assert(!page->slid);
10839 assert(page_object->object_slid);
10840 vm_object_lock_assert_exclusive(page_object);
10841
10842 if (page->error)
10843 return KERN_FAILURE;
10844
10845 /*
10846 * Take a paging-in-progress reference to keep the object
10847 * alive even if we have to unlock it (in vm_paging_map_object()
10848 * for example)...
10849 */
10850 vm_object_paging_begin(page_object);
10851
10852 if (kernel_mapping_offset == 0) {
10853 /*
10854 * The page hasn't already been mapped in kernel space
10855 * by the caller. Map it now, so that we can access
10856 * its contents and slide them.
10857 */
10858 kernel_mapping_size = PAGE_SIZE;
10859 kernel_mapping_needs_unmap = FALSE;
10860 kr = vm_paging_map_object(page,
10861 page_object,
10862 page->offset,
10863 VM_PROT_READ | VM_PROT_WRITE,
10864 FALSE,
10865 &kernel_mapping_size,
10866 &kernel_mapping_offset,
10867 &kernel_mapping_needs_unmap);
10868 if (kr != KERN_SUCCESS) {
10869 panic("vm_page_slide: "
10870 "could not map page in kernel: 0x%x\n",
10871 kr);
10872 }
10873 } else {
10874 kernel_mapping_size = 0;
10875 kernel_mapping_needs_unmap = FALSE;
10876 }
10877 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10878
10879 /*
10880 * Slide the pointers on the page.
10881 */
10882
10883 /*assert that slide_file_info.start/end are page-aligned?*/
10884
10885 assert(!page->slid);
10886 assert(page_object->object_slid);
10887
10888 pageIndex = (uint32_t)((page->offset -
10889 page_object->vo_slide_info->start) /
10890 PAGE_SIZE_FOR_SR_SLIDE);
10891 for (slide_chunk = 0;
10892 slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE;
10893 slide_chunk++) {
10894 kr = vm_shared_region_slide_page(page_object->vo_slide_info,
10895 (kernel_vaddr +
10896 (slide_chunk *
10897 PAGE_SIZE_FOR_SR_SLIDE)),
10898 (pageIndex + slide_chunk));
10899 if (kr != KERN_SUCCESS) {
10900 break;
10901 }
10902 }
10903
10904 vm_page_slide_counter++;
10905
10906 /*
10907 * Unmap the page from the kernel's address space,
10908 */
10909 if (kernel_mapping_needs_unmap) {
10910 vm_paging_unmap_object(page_object,
10911 kernel_vaddr,
10912 kernel_vaddr + PAGE_SIZE);
10913 }
10914
10915 page->dirty = FALSE;
10916 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
10917
10918 if (kr != KERN_SUCCESS || cs_debug > 1) {
10919 printf("vm_page_slide(%p): "
10920 "obj %p off 0x%llx mobj %p moff 0x%llx\n",
10921 page,
10922 page_object, page->offset,
10923 page_object->pager,
10924 page->offset + page_object->paging_offset);
10925 }
10926
10927 if (kr == KERN_SUCCESS) {
10928 page->slid = TRUE;
10929 } else {
10930 page->error = TRUE;
10931 vm_page_slide_errors++;
10932 }
10933
10934 vm_object_paging_end(page_object);
10935
10936 return kr;
10937 }
10938
10939 void inline memoryshot(unsigned int event, unsigned int control)
10940 {
10941 if (vm_debug_events) {
10942 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10943 vm_page_active_count, vm_page_inactive_count,
10944 vm_page_free_count, vm_page_speculative_count,
10945 vm_page_throttled_count);
10946 } else {
10947 (void) event;
10948 (void) control;
10949 }
10950
10951 }
10952
10953 #ifdef MACH_BSD
10954
10955 boolean_t upl_device_page(upl_page_info_t *upl)
10956 {
10957 return(UPL_DEVICE_PAGE(upl));
10958 }
10959 boolean_t upl_page_present(upl_page_info_t *upl, int index)
10960 {
10961 return(UPL_PAGE_PRESENT(upl, index));
10962 }
10963 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
10964 {
10965 return(UPL_SPECULATIVE_PAGE(upl, index));
10966 }
10967 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
10968 {
10969 return(UPL_DIRTY_PAGE(upl, index));
10970 }
10971 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
10972 {
10973 return(UPL_VALID_PAGE(upl, index));
10974 }
10975 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
10976 {
10977 return(UPL_PHYS_PAGE(upl, index));
10978 }
10979
10980 void upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10981 {
10982 upl[index].mark = v;
10983 }
10984
10985 boolean_t upl_page_get_mark(upl_page_info_t *upl, int index)
10986 {
10987 return upl[index].mark;
10988 }
10989
10990 void
10991 vm_countdirtypages(void)
10992 {
10993 vm_page_t m;
10994 int dpages;
10995 int pgopages;
10996 int precpages;
10997
10998
10999 dpages=0;
11000 pgopages=0;
11001 precpages=0;
11002
11003 vm_page_lock_queues();
11004 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
11005 do {
11006 if (m ==(vm_page_t )0) break;
11007
11008 if(m->dirty) dpages++;
11009 if(m->free_when_done) pgopages++;
11010 if(m->precious) precpages++;
11011
11012 assert(VM_PAGE_OBJECT(m) != kernel_object);
11013 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11014 if (m ==(vm_page_t )0) break;
11015
11016 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
11017 vm_page_unlock_queues();
11018
11019 vm_page_lock_queues();
11020 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
11021 do {
11022 if (m ==(vm_page_t )0) break;
11023
11024 dpages++;
11025 assert(m->dirty);
11026 assert(!m->free_when_done);
11027 assert(VM_PAGE_OBJECT(m) != kernel_object);
11028 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11029 if (m ==(vm_page_t )0) break;
11030
11031 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
11032 vm_page_unlock_queues();
11033
11034 vm_page_lock_queues();
11035 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
11036 do {
11037 if (m ==(vm_page_t )0) break;
11038
11039 if(m->dirty) dpages++;
11040 if(m->free_when_done) pgopages++;
11041 if(m->precious) precpages++;
11042
11043 assert(VM_PAGE_OBJECT(m) != kernel_object);
11044 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11045 if (m ==(vm_page_t )0) break;
11046
11047 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
11048 vm_page_unlock_queues();
11049
11050 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
11051
11052 dpages=0;
11053 pgopages=0;
11054 precpages=0;
11055
11056 vm_page_lock_queues();
11057 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
11058
11059 do {
11060 if(m == (vm_page_t )0) break;
11061 if(m->dirty) dpages++;
11062 if(m->free_when_done) pgopages++;
11063 if(m->precious) precpages++;
11064
11065 assert(VM_PAGE_OBJECT(m) != kernel_object);
11066 m = (vm_page_t) vm_page_queue_next(&m->pageq);
11067 if(m == (vm_page_t )0) break;
11068
11069 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
11070 vm_page_unlock_queues();
11071
11072 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
11073
11074 }
11075 #endif /* MACH_BSD */
11076
11077 ppnum_t upl_get_highest_page(
11078 upl_t upl)
11079 {
11080 return upl->highest_page;
11081 }
11082
11083 upl_size_t upl_get_size(
11084 upl_t upl)
11085 {
11086 return upl->size;
11087 }
11088
11089 upl_t upl_associated_upl(upl_t upl)
11090 {
11091 return upl->associated_upl;
11092 }
11093
11094 void upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11095 {
11096 upl->associated_upl = associated_upl;
11097 }
11098
11099 struct vnode * upl_lookup_vnode(upl_t upl)
11100 {
11101 if (!upl->map_object->internal)
11102 return vnode_pager_lookup_vnode(upl->map_object->pager);
11103 else
11104 return NULL;
11105 }
11106
11107 #if UPL_DEBUG
11108 kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11109 {
11110 upl->ubc_alias1 = alias1;
11111 upl->ubc_alias2 = alias2;
11112 return KERN_SUCCESS;
11113 }
11114 int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11115 {
11116 if(al)
11117 *al = upl->ubc_alias1;
11118 if(al2)
11119 *al2 = upl->ubc_alias2;
11120 return KERN_SUCCESS;
11121 }
11122 #endif /* UPL_DEBUG */
11123
11124 #if VM_PRESSURE_EVENTS
11125 /*
11126 * Upward trajectory.
11127 */
11128 extern boolean_t vm_compressor_low_on_space(void);
11129
11130 boolean_t
11131 VM_PRESSURE_NORMAL_TO_WARNING(void) {
11132
11133 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11134
11135 /* Available pages below our threshold */
11136 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11137 /* No frozen processes to kill */
11138 if (memorystatus_frozen_count == 0) {
11139 /* Not enough suspended processes available. */
11140 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11141 return TRUE;
11142 }
11143 }
11144 }
11145 return FALSE;
11146
11147 } else {
11148 return ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0);
11149 }
11150 }
11151
11152 boolean_t
11153 VM_PRESSURE_WARNING_TO_CRITICAL(void) {
11154
11155 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11156
11157 /* Available pages below our threshold */
11158 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11159 return TRUE;
11160 }
11161 return FALSE;
11162 } else {
11163 return (vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
11164 }
11165 }
11166
11167 /*
11168 * Downward trajectory.
11169 */
11170 boolean_t
11171 VM_PRESSURE_WARNING_TO_NORMAL(void) {
11172
11173 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11174
11175 /* Available pages above our threshold */
11176 unsigned int target_threshold = memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100);
11177 if (memorystatus_available_pages > target_threshold) {
11178 return TRUE;
11179 }
11180 return FALSE;
11181 } else {
11182 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0);
11183 }
11184 }
11185
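/*
 * Worked example (illustrative figure, not from the source): with
 * memorystatus_available_pages_pressure at, say, 4000 pages, the downward
 * transition above requires
 *     target_threshold = 4000 + (15 * 4000) / 100 = 4600 pages
 * to be available before reporting NORMAL again, i.e. the system must
 * recover 15% past the pressure threshold -- a hysteresis band that keeps
 * the pressure state from flapping right at the boundary.
 * VM_PRESSURE_CRITICAL_TO_WARNING below applies the same 15% band to
 * memorystatus_available_pages_critical.
 */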
11186 boolean_t
11187 VM_PRESSURE_CRITICAL_TO_WARNING(void) {
11188
11189 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11190
11191 /* Available pages above our threshold */
11192 unsigned int target_threshold = memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100);
11193 if (memorystatus_available_pages > target_threshold) {
11194 return TRUE;
11195 }
11196 return FALSE;
11197 } else {
11198 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
11199 }
11200 }
11201 #endif /* VM_PRESSURE_EVENTS */
11202