osfmk/vm/vm_pageout.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm/vm_pageout.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *      Date:   1985
  62  *
  63  *      The proverbial page-out daemon.
  64  */
  65
  66 #include <stdint.h>
  67
  68 #include <debug.h>
  69 #include <mach_pagemap.h>
  70 #include <mach_cluster_stats.h>
  71 #include <mach_kdb.h>
  72 #include <advisory_pageout.h>
  73
  74 #include <mach/mach_types.h>
  75 #include <mach/memory_object.h>
  76 #include <mach/memory_object_default.h>
  77 #include <mach/memory_object_control_server.h>
  78 #include <mach/mach_host_server.h>
  79 #include <mach/upl.h>
  80 #include <mach/vm_map.h>
  81 #include <mach/vm_param.h>
  82 #include <mach/vm_statistics.h>
  83 #include <mach/sdt.h>
  84
  85 #include <kern/kern_types.h>
  86 #include <kern/counters.h>
  87 #include <kern/host_statistics.h>
  88 #include <kern/machine.h>
  89 #include <kern/misc_protos.h>
  90 #include <kern/thread.h>
  91 #include <kern/xpr.h>
  92 #include <kern/kalloc.h>
  93
  94 #include <machine/vm_tuning.h>
  95
  96 #if CONFIG_EMBEDDED
  97 #include <sys/kern_memorystatus.h>
  98 #endif
  99
 100 #include <vm/pmap.h>
 101 #include <vm/vm_fault.h>
 102 #include <vm/vm_map.h>
 103 #include <vm/vm_object.h>
 104 #include <vm/vm_page.h>
 105 #include <vm/vm_pageout.h>
 106 #include <vm/vm_protos.h> /* must be last */
 107 #include <vm/memory_object.h>
 108 #include <vm/vm_purgeable_internal.h>
 109
 110 /*
 111  * ENCRYPTED SWAP:
 112  */
 113 #include <../bsd/crypto/aes/aes.h>
 114
 115
 116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
 117 #ifdef  CONFIG_EMBEDDED
 118 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  2048
 119 #else
 120 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
 121 #endif
 122 #endif
 123
 124 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
 125 #ifdef  CONFIG_EMBEDDED
 126 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
 127 #else
 128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
 129 #endif
 130 #endif
 131
 132 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
 133 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
 134 #endif
 135
 136 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
 137 #define VM_PAGEOUT_INACTIVE_RELIEF 50   /* minimum number of pages to move to the inactive q */
 138 #endif
 139
 140 #ifndef VM_PAGE_LAUNDRY_MAX
 141 #define VM_PAGE_LAUNDRY_MAX     16UL    /* maximum pageouts on a given pageout queue */
 142 #endif  /* VM_PAGEOUT_LAUNDRY_MAX */
 143
 144 #ifndef VM_PAGEOUT_BURST_WAIT
 145 #define VM_PAGEOUT_BURST_WAIT   30      /* milliseconds per page */
 146 #endif  /* VM_PAGEOUT_BURST_WAIT */
 147
 148 #ifndef VM_PAGEOUT_EMPTY_WAIT
 149 #define VM_PAGEOUT_EMPTY_WAIT   200     /* milliseconds */
 150 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
 151
 152 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
 153 #define VM_PAGEOUT_DEADLOCK_WAIT        300     /* milliseconds */
 154 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
 155
 156 #ifndef VM_PAGEOUT_IDLE_WAIT
 157 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
 158 #endif  /* VM_PAGEOUT_IDLE_WAIT */
 159
 160 #ifndef VM_PAGE_SPECULATIVE_TARGET
 161 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
 162 #endif /* VM_PAGE_SPECULATIVE_TARGET */
 163
 164 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
 165 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
 166 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
 167
 168
 169 /*
 170  *      To obtain a reasonable LRU approximation, the inactive queue
 171  *      needs to be large enough to give pages on it a chance to be
 172  *      referenced a second time.  This macro defines the fraction
 173  *      of active+inactive pages that should be inactive.
 174  *      The pageout daemon uses it to update vm_page_inactive_target.
 175  *
 176  *      If vm_page_free_count falls below vm_page_free_target and
 177  *      vm_page_inactive_count is below vm_page_inactive_target,
 178  *      then the pageout daemon starts running.
 179  */
 180
 181 #ifndef VM_PAGE_INACTIVE_TARGET
 182 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 3)
 183 #endif  /* VM_PAGE_INACTIVE_TARGET */
 184
 185 /*
 186  *      Once the pageout daemon starts running, it keeps going
 187  *      until vm_page_free_count meets or exceeds vm_page_free_target.
 188  */
 189
 190 #ifndef VM_PAGE_FREE_TARGET
 191 #ifdef  CONFIG_EMBEDDED
 192 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
 193 #else
 194 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
 195 #endif
 196 #endif  /* VM_PAGE_FREE_TARGET */
 197
 198 /*
 199  *      The pageout daemon always starts running once vm_page_free_count
 200  *      falls below vm_page_free_min.
 201  */
 202
 203 #ifndef VM_PAGE_FREE_MIN
 204 #ifdef  CONFIG_EMBEDDED
 205 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
 206 #else
 207 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
 208 #endif
 209 #endif  /* VM_PAGE_FREE_MIN */
 210
 211 #define VM_PAGE_FREE_MIN_LIMIT          1500
 212 #define VM_PAGE_FREE_TARGET_LIMIT       2000
 213
 214
 215 /*
 216  *      When vm_page_free_count falls below vm_page_free_reserved,
 217  *      only vm-privileged threads can allocate pages.  vm-privilege
 218  *      allows the pageout daemon and default pager (and any other
 219  *      associated threads needed for default pageout) to continue
 220  *      operation by dipping into the reserved pool of pages.
 221  */
 222
 223 #ifndef VM_PAGE_FREE_RESERVED
 224 #define VM_PAGE_FREE_RESERVED(n)        \
 225         ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
 226 #endif  /* VM_PAGE_FREE_RESERVED */
 227
 228 /*
 229  *      When we dequeue pages from the inactive list, they are
 230  *      reactivated (ie, put back on the active queue) if referenced.
 231  *      However, it is possible to starve the free list if other
 232  *      processors are referencing pages faster than we can turn off
 233  *      the referenced bit.  So we limit the number of reactivations
 234  *      we will make per call of vm_pageout_scan().
 235  */
 236 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
 237 #ifndef VM_PAGE_REACTIVATE_LIMIT
 238 #ifdef  CONFIG_EMBEDDED
 239 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
 240 #else
 241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
 242 #endif
 243 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
 244 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       100
 245
 246
 247 /*
 248  * must hold the page queues lock to
 249  * manipulate this structure
 250  */
 251 struct vm_pageout_queue {
 252         queue_head_t    pgo_pending;    /* laundry pages to be processed by pager's iothread */
 253         unsigned int    pgo_laundry;    /* current count of laundry pages on queue or in flight */
 254         unsigned int    pgo_maxlaundry;
 255
 256         unsigned int    pgo_idle:1,     /* iothread is blocked waiting for work to do */
 257                         pgo_busy:1,     /* iothread is currently processing request from pgo_pending */
 258                         pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
 259                         :0;
 260 };
 261
 262 #define VM_PAGE_Q_THROTTLED(q)          \
 263         ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
 264
 265
 266 /*
 267  * Exported variable used to broadcast the activation of the pageout scan
 268  * Working Set uses this to throttle its use of pmap removes.  In this
 269  * way, code which runs within memory in an uncontested context does
 270  * not keep encountering soft faults.
 271  */
 272
 273 unsigned int    vm_pageout_scan_event_counter = 0;
 274
 275 /*
 276  * Forward declarations for internal routines.
 277  */
 278
 279 static void vm_pageout_garbage_collect(int);
 280 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
 281 static void vm_pageout_iothread_external(void);
 282 static void vm_pageout_iothread_internal(void);
 283 static void vm_pageout_queue_steal(vm_page_t);
 284
 285 extern void vm_pageout_continue(void);
 286 extern void vm_pageout_scan(void);
 287
 288 static thread_t vm_pageout_external_iothread = THREAD_NULL;
 289 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
 290
 291 unsigned int vm_pageout_reserved_internal = 0;
 292 unsigned int vm_pageout_reserved_really = 0;
 293
 294 unsigned int vm_pageout_idle_wait = 0;          /* milliseconds */
 295 unsigned int vm_pageout_empty_wait = 0;         /* milliseconds */
 296 unsigned int vm_pageout_burst_wait = 0;         /* milliseconds */
 297 unsigned int vm_pageout_deadlock_wait = 0;      /* milliseconds */
 298 unsigned int vm_pageout_deadlock_relief = 0;
 299 unsigned int vm_pageout_inactive_relief = 0;
 300 unsigned int vm_pageout_burst_active_throttle = 0;
 301 unsigned int vm_pageout_burst_inactive_throttle = 0;
 302
 303 /*
 304  *      Protection against zero fill flushing live working sets derived
 305  *      from existing backing store and files
 306  */
 307 unsigned int vm_accellerate_zf_pageout_trigger = 400;
 308 unsigned int zf_queue_min_count = 100;
 309 unsigned int vm_zf_count = 0;
 310 unsigned int vm_zf_queue_count = 0;
 311
 312 /*
 313  *      These variables record the pageout daemon's actions:
 314  *      how many pages it looks at and what happens to those pages.
 315  *      No locking needed because only one thread modifies the variables.
 316  */
 317
 318 unsigned int vm_pageout_active = 0;             /* debugging */
 319 unsigned int vm_pageout_inactive = 0;           /* debugging */
 320 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
 321 unsigned int vm_pageout_inactive_forced = 0;    /* debugging */
 322 unsigned int vm_pageout_inactive_nolock = 0;    /* debugging */
 323 unsigned int vm_pageout_inactive_avoid = 0;     /* debugging */
 324 unsigned int vm_pageout_inactive_busy = 0;      /* debugging */
 325 unsigned int vm_pageout_inactive_absent = 0;    /* debugging */
 326 unsigned int vm_pageout_inactive_used = 0;      /* debugging */
 327 unsigned int vm_pageout_inactive_clean = 0;     /* debugging */
 328 unsigned int vm_pageout_inactive_dirty = 0;     /* debugging */
 329 unsigned int vm_pageout_dirty_no_pager = 0;     /* debugging */
 330 unsigned int vm_pageout_purged_objects = 0;     /* debugging */
 331 unsigned int vm_stat_discard = 0;               /* debugging */
 332 unsigned int vm_stat_discard_sent = 0;          /* debugging */
 333 unsigned int vm_stat_discard_failure = 0;       /* debugging */
 334 unsigned int vm_stat_discard_throttle = 0;      /* debugging */
 335 unsigned int vm_pageout_reactivation_limit_exceeded = 0;        /* debugging */
 336 unsigned int vm_pageout_catch_ups = 0;                          /* debugging */
 337 unsigned int vm_pageout_inactive_force_reclaim = 0;     /* debugging */
 338
 339 unsigned int vm_pageout_scan_active_throttled = 0;
 340 unsigned int vm_pageout_scan_inactive_throttled = 0;
 341 unsigned int vm_pageout_scan_throttle = 0;                      /* debugging */
 342 unsigned int vm_pageout_scan_burst_throttle = 0;                /* debugging */
 343 unsigned int vm_pageout_scan_empty_throttle = 0;                /* debugging */
 344 unsigned int vm_pageout_scan_deadlock_detected = 0;             /* debugging */
 345 unsigned int vm_pageout_scan_active_throttle_success = 0;       /* debugging */
 346 unsigned int vm_pageout_scan_inactive_throttle_success = 0;     /* debugging */
 347 /*
 348  * Backing store throttle when BS is exhausted
 349  */
 350 unsigned int    vm_backing_store_low = 0;
 351
 352 unsigned int vm_pageout_out_of_line  = 0;
 353 unsigned int vm_pageout_in_place  = 0;
 354
 355 /*
 356  * ENCRYPTED SWAP:
 357  * counters and statistics...
 358  */
 359 unsigned long vm_page_decrypt_counter = 0;
 360 unsigned long vm_page_decrypt_for_upl_counter = 0;
 361 unsigned long vm_page_encrypt_counter = 0;
 362 unsigned long vm_page_encrypt_abort_counter = 0;
 363 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
 364 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
 365
 366 struct  vm_pageout_queue vm_pageout_queue_internal;
 367 struct  vm_pageout_queue vm_pageout_queue_external;
 368
 369 unsigned int vm_page_speculative_target = 0;
 370
 371 vm_object_t     vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 372
 373 unsigned long vm_cs_validated_resets = 0;
 374
 375 /*
 376  *      Routine:        vm_backing_store_disable
 377  *      Purpose:
 378  *              Suspend non-privileged threads wishing to extend
 379  *              backing store when we are low on backing store
 380  *              (Synchronized by caller)
 381  */
 382 void
 383 vm_backing_store_disable(
 384         boolean_t       disable)
 385 {
 386         if(disable) {
 387                 vm_backing_store_low = 1;
 388         } else {
 389                 if(vm_backing_store_low) {
 390                         vm_backing_store_low = 0;
 391                         thread_wakeup((event_t) &vm_backing_store_low);
 392                 }
 393         }
 394 }
 395
 396
 397 #if MACH_CLUSTER_STATS
 398 unsigned long vm_pageout_cluster_dirtied = 0;
 399 unsigned long vm_pageout_cluster_cleaned = 0;
 400 unsigned long vm_pageout_cluster_collisions = 0;
 401 unsigned long vm_pageout_cluster_clusters = 0;
 402 unsigned long vm_pageout_cluster_conversions = 0;
 403 unsigned long vm_pageout_target_collisions = 0;
 404 unsigned long vm_pageout_target_page_dirtied = 0;
 405 unsigned long vm_pageout_target_page_freed = 0;
 406 #define CLUSTER_STAT(clause)    clause
 407 #else   /* MACH_CLUSTER_STATS */
 408 #define CLUSTER_STAT(clause)
 409 #endif  /* MACH_CLUSTER_STATS */
 410
 411 /*
 412  *      Routine:        vm_pageout_object_terminate
 413  *      Purpose:
 414  *              Destroy the pageout_object, and perform all of the
 415  *              required cleanup actions.
 416  *
 417  *      In/Out conditions:
 418  *              The object must be locked, and will be returned locked.
 419  */
 420 void
 421 vm_pageout_object_terminate(
 422         vm_object_t     object)
 423 {
 424         vm_object_t     shadow_object;
 425
 426         /*
 427          * Deal with the deallocation (last reference) of a pageout object
 428          * (used for cleaning-in-place) by dropping the paging references/
 429          * freeing pages in the original object.
 430          */
 431
 432         assert(object->pageout);
 433         shadow_object = object->shadow;
 434         vm_object_lock(shadow_object);
 435
 436         while (!queue_empty(&object->memq)) {
 437                 vm_page_t               p, m;
 438                 vm_object_offset_t      offset;
 439
 440                 p = (vm_page_t) queue_first(&object->memq);
 441
 442                 assert(p->private);
 443                 assert(p->pageout);
 444                 p->pageout = FALSE;
 445                 assert(!p->cleaning);
 446
 447                 offset = p->offset;
 448                 VM_PAGE_FREE(p);
 449                 p = VM_PAGE_NULL;
 450
 451                 m = vm_page_lookup(shadow_object,
 452                         offset + object->shadow_offset);
 453
 454                 if(m == VM_PAGE_NULL)
 455                         continue;
 456                 assert(m->cleaning);
 457                 /* used as a trigger on upl_commit etc to recognize the */
 458                 /* pageout daemon's subseqent desire to pageout a cleaning */
 459                 /* page.  When the bit is on the upl commit code will   */
 460                 /* respect the pageout bit in the target page over the  */
 461                 /* caller's page list indication */
 462                 m->dump_cleaning = FALSE;
 463
 464                 assert((m->dirty) || (m->precious) ||
 465                                 (m->busy && m->cleaning));
 466
 467                 /*
 468                  * Handle the trusted pager throttle.
 469                  * Also decrement the burst throttle (if external).
 470                  */
 471                 vm_page_lock_queues();
 472                 if (m->laundry) {
 473                         vm_pageout_throttle_up(m);
 474                 }
 475
 476                 /*
 477                  * Handle the "target" page(s). These pages are to be freed if
 478                  * successfully cleaned. Target pages are always busy, and are
 479                  * wired exactly once. The initial target pages are not mapped,
 480                  * (so cannot be referenced or modified) but converted target
 481                  * pages may have been modified between the selection as an
 482                  * adjacent page and conversion to a target.
 483                  */
 484                 if (m->pageout) {
 485                         assert(m->busy);
 486                         assert(m->wire_count == 1);
 487                         m->cleaning = FALSE;
 488                         m->encrypted_cleaning = FALSE;
 489                         m->pageout = FALSE;
 490 #if MACH_CLUSTER_STATS
 491                         if (m->wanted) vm_pageout_target_collisions++;
 492 #endif
 493                         /*
 494                          * Revoke all access to the page. Since the object is
 495                          * locked, and the page is busy, this prevents the page
 496                          * from being dirtied after the pmap_disconnect() call
 497                          * returns.
 498                          *
 499                          * Since the page is left "dirty" but "not modifed", we
 500                          * can detect whether the page was redirtied during
 501                          * pageout by checking the modify state.
 502                          */
 503                         if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
 504                               m->dirty = TRUE;
 505                         else
 506                               m->dirty = FALSE;
 507
 508                         if (m->dirty) {
 509                                 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
 510                                 vm_page_unwire(m);/* reactivates */
 511                                 VM_STAT_INCR(reactivations);
 512                                 PAGE_WAKEUP_DONE(m);
 513                         } else {
 514                                 CLUSTER_STAT(vm_pageout_target_page_freed++;)
 515                                 vm_page_free(m);/* clears busy, etc. */
 516                         }
 517                         vm_page_unlock_queues();
 518                         continue;
 519                 }
 520                 /*
 521                  * Handle the "adjacent" pages. These pages were cleaned in
 522                  * place, and should be left alone.
 523                  * If prep_pin_count is nonzero, then someone is using the
 524                  * page, so make it active.
 525                  */
 526                 if (!m->active && !m->inactive && !m->throttled && !m->private) {
 527                         if (m->reference)
 528                                 vm_page_activate(m);
 529                         else
 530                                 vm_page_deactivate(m);
 531                 }
 532                 if((m->busy) && (m->cleaning)) {
 533
 534                         /* the request_page_list case, (COPY_OUT_FROM FALSE) */
 535                         m->busy = FALSE;
 536
 537                         /* We do not re-set m->dirty ! */
 538                         /* The page was busy so no extraneous activity     */
 539                         /* could have occurred. COPY_INTO is a read into the */
 540                         /* new pages. CLEAN_IN_PLACE does actually write   */
 541                         /* out the pages but handling outside of this code */
 542                         /* will take care of resetting dirty. We clear the */
 543                         /* modify however for the Programmed I/O case.     */
 544                         pmap_clear_modify(m->phys_page);
 545
 546                         m->absent = FALSE;
 547                         m->overwriting = FALSE;
 548                 } else if (m->overwriting) {
 549                         /* alternate request page list, write to page_list */
 550                         /* case.  Occurs when the original page was wired  */
 551                         /* at the time of the list request */
 552                         assert(m->wire_count != 0);
 553                         vm_page_unwire(m);/* reactivates */
 554                         m->overwriting = FALSE;
 555                 } else {
 556                 /*
 557                  * Set the dirty state according to whether or not the page was
 558                  * modified during the pageout. Note that we purposefully do
 559                  * NOT call pmap_clear_modify since the page is still mapped.
 560                  * If the page were to be dirtied between the 2 calls, this
 561                  * this fact would be lost. This code is only necessary to
 562                  * maintain statistics, since the pmap module is always
 563                  * consulted if m->dirty is false.
 564                  */
 565 #if MACH_CLUSTER_STATS
 566                         m->dirty = pmap_is_modified(m->phys_page);
 567
 568                         if (m->dirty)   vm_pageout_cluster_dirtied++;
 569                         else            vm_pageout_cluster_cleaned++;
 570                         if (m->wanted)  vm_pageout_cluster_collisions++;
 571 #else
 572                         m->dirty = 0;
 573 #endif
 574                 }
 575                 m->cleaning = FALSE;
 576                 m->encrypted_cleaning = FALSE;
 577
 578                 /*
 579                  * Wakeup any thread waiting for the page to be un-cleaning.
 580                  */
 581                 PAGE_WAKEUP(m);
 582                 vm_page_unlock_queues();
 583         }
 584         /*
 585          * Account for the paging reference taken in vm_paging_object_allocate.
 586          */
 587         vm_object_paging_end(shadow_object);
 588         vm_object_unlock(shadow_object);
 589
 590         assert(object->ref_count == 0);
 591         assert(object->paging_in_progress == 0);
 592         assert(object->resident_page_count == 0);
 593         return;
 594 }
 595
 596 /*
 597  * Routine:     vm_pageclean_setup
 598  *
 599  * Purpose:     setup a page to be cleaned (made non-dirty), but not
 600  *              necessarily flushed from the VM page cache.
 601  *              This is accomplished by cleaning in place.
 602  *
 603  *              The page must not be busy, and the object and page
 604  *              queues must be locked.
 605  *
 606  */
 607 void
 608 vm_pageclean_setup(
 609         vm_page_t               m,
 610         vm_page_t               new_m,
 611         vm_object_t             new_object,
 612         vm_object_offset_t      new_offset)
 613 {
 614         assert(!m->busy);
 615 #if 0
 616         assert(!m->cleaning);
 617 #endif
 618
 619         XPR(XPR_VM_PAGEOUT,
 620     "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
 621                 (integer_t)m->object, m->offset, (integer_t)m,
 622                 (integer_t)new_m, new_offset);
 623
 624         pmap_clear_modify(m->phys_page);
 625
 626         /*
 627          * Mark original page as cleaning in place.
 628          */
 629         m->cleaning = TRUE;
 630         m->dirty = TRUE;
 631         m->precious = FALSE;
 632
 633         /*
 634          * Convert the fictitious page to a private shadow of
 635          * the real page.
 636          */
 637         assert(new_m->fictitious);
 638         assert(new_m->phys_page == vm_page_fictitious_addr);
 639         new_m->fictitious = FALSE;
 640         new_m->private = TRUE;
 641         new_m->pageout = TRUE;
 642         new_m->phys_page = m->phys_page;
 643         vm_page_wire(new_m);
 644
 645         vm_page_insert(new_m, new_object, new_offset);
 646         assert(!new_m->wanted);
 647         new_m->busy = FALSE;
 648 }
 649
 650 /*
 651  *      Routine:        vm_pageout_initialize_page
 652  *      Purpose:
 653  *              Causes the specified page to be initialized in
 654  *              the appropriate memory object. This routine is used to push
 655  *              pages into a copy-object when they are modified in the
 656  *              permanent object.
 657  *
 658  *              The page is moved to a temporary object and paged out.
 659  *
 660  *      In/out conditions:
 661  *              The page in question must not be on any pageout queues.
 662  *              The object to which it belongs must be locked.
 663  *              The page must be busy, but not hold a paging reference.
 664  *
 665  *      Implementation:
 666  *              Move this page to a completely new object.
 667  */
 668 void
 669 vm_pageout_initialize_page(
 670         vm_page_t       m)
 671 {
 672         vm_object_t             object;
 673         vm_object_offset_t      paging_offset;
 674         vm_page_t               holding_page;
 675         memory_object_t         pager;
 676
 677         XPR(XPR_VM_PAGEOUT,
 678                 "vm_pageout_initialize_page, page 0x%X\n",
 679                 (integer_t)m, 0, 0, 0, 0);
 680         assert(m->busy);
 681
 682         /*
 683          *      Verify that we really want to clean this page
 684          */
 685         assert(!m->absent);
 686         assert(!m->error);
 687         assert(m->dirty);
 688
 689         /*
 690          *      Create a paging reference to let us play with the object.
 691          */
 692         object = m->object;
 693         paging_offset = m->offset + object->paging_offset;
 694
 695         if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
 696                 VM_PAGE_FREE(m);
 697                 panic("reservation without pageout?"); /* alan */
 698                 vm_object_unlock(object);
 699
 700                 return;
 701         }
 702
 703         /*
 704          * If there's no pager, then we can't clean the page.  This should
 705          * never happen since this should be a copy object and therefore not
 706          * an external object, so the pager should always be there.
 707          */
 708
 709         pager = object->pager;
 710
 711         if (pager == MEMORY_OBJECT_NULL) {
 712                 VM_PAGE_FREE(m);
 713                 panic("missing pager for copy object");
 714                 return;
 715         }
 716
 717         /* set the page for future call to vm_fault_list_request */
 718         vm_object_paging_begin(object);
 719         holding_page = NULL;
 720         vm_page_lock_queues();
 721         pmap_clear_modify(m->phys_page);
 722         m->dirty = TRUE;
 723         m->busy = TRUE;
 724         m->list_req_pending = TRUE;
 725         m->cleaning = TRUE;
 726         m->pageout = TRUE;
 727         vm_page_wire(m);
 728         vm_page_unlock_queues();
 729         vm_object_unlock(object);
 730
 731         /*
 732          *      Write the data to its pager.
 733          *      Note that the data is passed by naming the new object,
 734          *      not a virtual address; the pager interface has been
 735          *      manipulated to use the "internal memory" data type.
 736          *      [The object reference from its allocation is donated
 737          *      to the eventual recipient.]
 738          */
 739         memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
 740
 741         vm_object_lock(object);
 742         vm_object_paging_end(object);
 743 }
 744
 745 #if     MACH_CLUSTER_STATS
 746 #define MAXCLUSTERPAGES 16
 747 struct {
 748         unsigned long pages_in_cluster;
 749         unsigned long pages_at_higher_offsets;
 750         unsigned long pages_at_lower_offsets;
 751 } cluster_stats[MAXCLUSTERPAGES];
 752 #endif  /* MACH_CLUSTER_STATS */
 753
 754
 755 /*
 756  * vm_pageout_cluster:
 757  *
 758  * Given a page, queue it to the appropriate I/O thread,
 759  * which will page it out and attempt to clean adjacent pages
 760  * in the same operation.
 761  *
 762  * The page must be busy, and the object and queues locked. We will take a
 763  * paging reference to prevent deallocation or collapse when we
 764  * release the object lock back at the call site.  The I/O thread
 765  * is responsible for consuming this reference
 766  *
 767  * The page must not be on any pageout queue.
 768  */
 769
 770 void
 771 vm_pageout_cluster(vm_page_t m)
 772 {
 773         vm_object_t     object = m->object;
 774         struct          vm_pageout_queue *q;
 775
 776
 777         XPR(XPR_VM_PAGEOUT,
 778                 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
 779                 (integer_t)object, m->offset, (integer_t)m, 0, 0);
 780
 781         /*
 782          * Only a certain kind of page is appreciated here.
 783          */
 784         assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
 785         assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
 786         assert(!m->throttled);
 787
 788         /*
 789          * protect the object from collapse -
 790          * locking in the object's paging_offset.
 791          */
 792         vm_object_paging_begin(object);
 793
 794         /*
 795          * set the page for future call to vm_fault_list_request
 796          * page should already be marked busy
 797          */
 798         vm_page_wire(m);
 799         m->list_req_pending = TRUE;
 800         m->cleaning = TRUE;
 801         m->pageout = TRUE;
 802         m->laundry = TRUE;
 803
 804         if (object->internal == TRUE)
 805                 q = &vm_pageout_queue_internal;
 806         else
 807                 q = &vm_pageout_queue_external;
 808         q->pgo_laundry++;
 809
 810         m->pageout_queue = TRUE;
 811         queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
 812
 813         if (q->pgo_idle == TRUE) {
 814                 q->pgo_idle = FALSE;
 815                 thread_wakeup((event_t) &q->pgo_pending);
 816         }
 817 }
 818
 819
 820 unsigned long vm_pageout_throttle_up_count = 0;
 821
 822 /*
 823  * A page is back from laundry.  See if there are some pages waiting to
 824  * go to laundry and if we can let some of them go now.
 825  *
 826  * Object and page queues must be locked.
 827  */
 828 void
 829 vm_pageout_throttle_up(
 830         vm_page_t       m)
 831 {
 832         struct vm_pageout_queue *q;
 833
 834         vm_pageout_throttle_up_count++;
 835
 836         assert(m->laundry);
 837         assert(m->object != VM_OBJECT_NULL);
 838         assert(m->object != kernel_object);
 839
 840         if (m->object->internal == TRUE)
 841                 q = &vm_pageout_queue_internal;
 842         else
 843                 q = &vm_pageout_queue_external;
 844
 845         m->laundry = FALSE;
 846         q->pgo_laundry--;
 847
 848         if (q->pgo_throttled == TRUE) {
 849                 q->pgo_throttled = FALSE;
 850                 thread_wakeup((event_t) &q->pgo_laundry);
 851         }
 852 }
 853
 854
 855 /*
 856  *      vm_pageout_scan does the dirty work for the pageout daemon.
 857  *      It returns with vm_page_queue_free_lock held and
 858  *      vm_page_free_wanted == 0.
 859  */
 860
 861 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  (3 * MAX_UPL_TRANSFER)
 862
 863 #define FCS_IDLE                0
 864 #define FCS_DELAYED             1
 865 #define FCS_DEADLOCK_DETECTED   2
 866
 867 struct flow_control {
 868         int             state;
 869         mach_timespec_t ts;
 870 };
 871
 872 void
 873 vm_pageout_scan(void)
 874 {
 875         unsigned int loop_count = 0;
 876         unsigned int inactive_burst_count = 0;
 877         unsigned int active_burst_count = 0;
 878         unsigned int reactivated_this_call;
 879         unsigned int reactivate_limit;
 880         vm_page_t   local_freeq = NULL;
 881         int         local_freed = 0;
 882         int         delayed_unlock;
 883         int         need_internal_inactive = 0;
 884         int         refmod_state = 0;
 885         int     vm_pageout_deadlock_target = 0;
 886         struct  vm_pageout_queue *iq;
 887         struct  vm_pageout_queue *eq;
 888         struct  vm_speculative_age_q *sq;
 889         struct  flow_control    flow_control;
 890         boolean_t inactive_throttled = FALSE;
 891         boolean_t try_failed;
 892         mach_timespec_t         ts;
 893         unsigned int msecs = 0;
 894         vm_object_t     object;
 895         vm_object_t     last_object_tried;
 896         int     zf_ratio;
 897         int     zf_run_count;
 898         uint32_t        catch_up_count = 0;
 899         uint32_t        inactive_reclaim_run;
 900         boolean_t       forced_reclaim;
 901
 902         flow_control.state = FCS_IDLE;
 903         iq = &vm_pageout_queue_internal;
 904         eq = &vm_pageout_queue_external;
 905         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
 906
 907
 908         XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
 909
 910
 911         vm_page_lock_queues();
 912         delayed_unlock = 1;     /* must be nonzero if Qs are locked, 0 if unlocked */
 913
 914         /*
 915          *      Calculate the max number of referenced pages on the inactive
 916          *      queue that we will reactivate.
 917          */
 918         reactivated_this_call = 0;
 919         reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
 920                                                     vm_page_inactive_count);
 921         inactive_reclaim_run = 0;
 922
 923
 924 /*???*/ /*
 925          *      We want to gradually dribble pages from the active queue
 926          *      to the inactive queue.  If we let the inactive queue get
 927          *      very small, and then suddenly dump many pages into it,
 928          *      those pages won't get a sufficient chance to be referenced
 929          *      before we start taking them from the inactive queue.
 930          *
 931          *      We must limit the rate at which we send pages to the pagers.
 932          *      data_write messages consume memory, for message buffers and
 933          *      for map-copy objects.  If we get too far ahead of the pagers,
 934          *      we can potentially run out of memory.
 935          *
 936          *      We can use the laundry count to limit directly the number
 937          *      of pages outstanding to the default pager.  A similar
 938          *      strategy for external pagers doesn't work, because
 939          *      external pagers don't have to deallocate the pages sent them,
 940          *      and because we might have to send pages to external pagers
 941          *      even if they aren't processing writes.  So we also
 942          *      use a burst count to limit writes to external pagers.
 943          *
 944          *      When memory is very tight, we can't rely on external pagers to
 945          *      clean pages.  They probably aren't running, because they
 946          *      aren't vm-privileged.  If we kept sending dirty pages to them,
 947          *      we could exhaust the free list.
 948          */
 949
 950
 951 Restart:
 952         assert(delayed_unlock!=0);
 953
 954         /*
 955          *      A page is "zero-filled" if it was not paged in from somewhere,
 956          *      and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
 957          *      Recalculate the zero-filled page ratio.  We use this to apportion
 958          *      victimized pages between the normal and zero-filled inactive
 959          *      queues according to their relative abundance in memory.  Thus if a task
 960          *      is flooding memory with zf pages, we begin to hunt them down.
 961          *      It would be better to throttle greedy tasks at a higher level,
 962          *      but at the moment mach vm cannot do this.
 963          */
 964         {
 965                 uint32_t  total  = vm_page_active_count + vm_page_inactive_count;
 966                 uint32_t  normal = total - vm_zf_count;
 967
 968                 /* zf_ratio is the number of zf pages we victimize per normal page */
 969
 970                 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
 971                         zf_ratio = 0;
 972                 else if ((vm_zf_count <= normal) || (normal == 0))
 973                         zf_ratio = 1;
 974                 else
 975                         zf_ratio = vm_zf_count / normal;
 976
 977                 zf_run_count = 0;
 978         }
 979
 980         /*
 981          *      Recalculate vm_page_inactivate_target.
 982          */
 983         vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
 984                                                           vm_page_inactive_count +
 985                                                           vm_page_speculative_count);
 986         /*
 987          * don't want to wake the pageout_scan thread up everytime we fall below
 988          * the targets... set a low water mark at 0.25% below the target
 989          */
 990         vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
 991
 992         vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
 993                                                                 vm_page_inactive_count);
 994         object = NULL;
 995         last_object_tried = NULL;
 996         try_failed = FALSE;
 997
 998         if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
 999                 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1000         else
1001                 catch_up_count = 0;
1002
1003         for (;;) {
1004                 vm_page_t m;
1005
1006                 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1007
1008                 if (delayed_unlock == 0) {
1009                         vm_page_lock_queues();
1010                         delayed_unlock = 1;
1011                 }
1012
1013                 /*
1014                  *      Don't sweep through active queue more than the throttle
1015                  *      which should be kept relatively low
1016                  */
1017                 active_burst_count = vm_pageout_burst_active_throttle;
1018
1019                 /*
1020                  *      Move pages from active to inactive.
1021                  */
1022                 if (need_internal_inactive == 0 && (vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1023                         goto done_moving_active_pages;
1024
1025                 while (!queue_empty(&vm_page_queue_active) &&
1026                        (need_internal_inactive || active_burst_count)) {
1027
1028                         if (active_burst_count)
1029                                active_burst_count--;
1030
1031                         vm_pageout_active++;
1032
1033                         m = (vm_page_t) queue_first(&vm_page_queue_active);
1034
1035                         assert(m->active && !m->inactive);
1036                         assert(!m->laundry);
1037                         assert(m->object != kernel_object);
1038                         assert(m->phys_page != vm_page_guard_addr);
1039
1040                         DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1041
1042                         /*
1043                          * Try to lock object; since we've already got the
1044                          * page queues lock, we can only 'try' for this one.
1045                          * if the 'try' fails, we need to do a mutex_pause
1046                          * to allow the owner of the object lock a chance to
1047                          * run... otherwise, we're likely to trip over this
1048                          * object in the same state as we work our way through
1049                          * the queue... clumps of pages associated with the same
1050                          * object are fairly typical on the inactive and active queues
1051                          */
1052                         if (m->object != object) {
1053                                 if (object != NULL) {
1054                                         vm_object_unlock(object);
1055                                         object = NULL;
1056                                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1057                                 }
1058                                 if (!vm_object_lock_try_scan(m->object)) {
1059                                         /*
1060                                          * move page to end of active queue and continue
1061                                          */
1062                                         queue_remove(&vm_page_queue_active, m,
1063                                                      vm_page_t, pageq);
1064                                         queue_enter(&vm_page_queue_active, m,
1065                                                     vm_page_t, pageq);
1066
1067                                         try_failed = TRUE;
1068
1069                                         m = (vm_page_t) queue_first(&vm_page_queue_active);
1070                                         /*
1071                                          * this is the next object we're going to be interested in
1072                                          * try to make sure its available after the mutex_yield
1073                                          * returns control
1074                                          */
1075                                         vm_pageout_scan_wants_object = m->object;
1076
1077                                         goto done_with_activepage;
1078                                 }
1079                                 object = m->object;
1080
1081                                 try_failed = FALSE;
1082                         }
1083
1084                         /*
1085                          * if the page is BUSY, then we pull it
1086                          * off the active queue and leave it alone.
1087                          * when BUSY is cleared, it will get stuck
1088                          * back on the appropriate queue
1089                          */
1090                         if (m->busy) {
1091                                 queue_remove(&vm_page_queue_active, m,
1092                                              vm_page_t, pageq);
1093                                 m->pageq.next = NULL;
1094                                 m->pageq.prev = NULL;
1095
1096                                 if (!m->fictitious)
1097                                         vm_page_active_count--;
1098                                 m->active = FALSE;
1099
1100                                 goto done_with_activepage;
1101                         }
1102
1103                         /*
1104                          *      Deactivate the page while holding the object
1105                          *      locked, so we know the page is still not busy.
1106                          *      This should prevent races between pmap_enter
1107                          *      and pmap_clear_reference.  The page might be
1108                          *      absent or fictitious, but vm_page_deactivate
1109                          *      can handle that.
1110                          */
1111                         vm_page_deactivate(m);
1112
1113                         if (need_internal_inactive) {
1114                                 vm_pageout_scan_active_throttle_success++;
1115                                 need_internal_inactive--;
1116                         }
1117 done_with_activepage:
1118                         if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1119
1120                                 if (object != NULL) {
1121                                         vm_object_unlock(object);
1122                                         object = NULL;
1123                                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1124                                 }
1125                                 if (local_freeq) {
1126                                         vm_page_free_list(local_freeq);
1127
1128                                         local_freeq = NULL;
1129                                         local_freed = 0;
1130                                 }
1131                                 mutex_yield(&vm_page_queue_lock);
1132
1133                                 delayed_unlock = 1;
1134
1135                                 /*
1136                                  * continue the while loop processing
1137                                  * the active queue... need to hold
1138                                  * the page queues lock
1139                                  */
1140                         }
1141                 }
1142
1143
1144
1145                 /**********************************************************************
1146                  * above this point we're playing with the active queue
1147                  * below this point we're playing with the throttling mechanisms
1148                  * and the inactive queue
1149                  **********************************************************************/
1150
1151 done_moving_active_pages:
1152
1153                 /*
1154                  *      We are done if we have met our target *and*
1155                  *      nobody is still waiting for a page.
1156                  */
1157                 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1158                         if (object != NULL) {
1159                                 vm_object_unlock(object);
1160                                 object = NULL;
1161                         }
1162                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1163
1164                         if (local_freeq) {
1165                                 vm_page_free_list(local_freeq);
1166
1167                                 local_freeq = NULL;
1168                                 local_freed = 0;
1169                         }
1170                         /*
1171                          * inactive target still not met... keep going
1172                          * until we get the queues balanced
1173                          */
1174                         if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1175                             !queue_empty(&vm_page_queue_active))
1176                                 continue;
1177
1178                         mutex_lock(&vm_page_queue_free_lock);
1179
1180                         if ((vm_page_free_count >= vm_page_free_target) &&
1181                             (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1182
1183                                 vm_page_unlock_queues();
1184
1185                                 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1186
1187                                 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1188
1189                                 return;
1190                         }
1191                         mutex_unlock(&vm_page_queue_free_lock);
1192                 }
1193                 /*
1194                  * Before anything, we check if we have any ripe volatile objects around.
1195                  * If so, purge the first and see what it gives us.
1196                  */
1197                 assert (available_for_purge>=0);
1198                 if (available_for_purge)
1199                 {
1200                         if (object != NULL) {
1201                                 vm_object_unlock(object);
1202                                 object = NULL;
1203                         }
1204                         vm_purgeable_object_purge_one();
1205                         continue;
1206                 }
1207
1208                 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1209                         /*
1210                          * try to pull pages from the aging bins
1211                          * see vm_page.h for an explanation of how
1212                          * this mechanism works
1213                          */
1214                         struct vm_speculative_age_q     *aq;
1215                         mach_timespec_t ts_fully_aged;
1216                         boolean_t       can_steal = FALSE;
1217
1218                         aq = &vm_page_queue_speculative[speculative_steal_index];
1219
1220                         while (queue_empty(&aq->age_q)) {
1221
1222                                 speculative_steal_index++;
1223
1224                                 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1225                                         speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1226
1227                                 aq = &vm_page_queue_speculative[speculative_steal_index];
1228                         }
1229                         if (vm_page_speculative_count > vm_page_speculative_target)
1230                                 can_steal = TRUE;
1231                         else {
1232                                 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1233                                 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1234                                                       * 1000 * NSEC_PER_USEC;
1235
1236                                 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1237
1238                                 clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
1239
1240                                 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1241                                         can_steal = TRUE;
1242                         }
1243                         if (can_steal == TRUE)
1244                                 vm_page_speculate_ageit(aq);
1245                 }
1246
1247                 /*
1248                  * Sometimes we have to pause:
1249                  *      1) No inactive pages - nothing to do.
1250                  *      2) Flow control - default pageout queue is full
1251                  *      3) Loop control - no acceptable pages found on the inactive queue
1252                  *         within the last vm_pageout_burst_inactive_throttle iterations
1253                  */
1254                 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1255                     (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1256                         vm_pageout_scan_empty_throttle++;
1257                         msecs = vm_pageout_empty_wait;
1258                         goto vm_pageout_scan_delay;
1259
1260                 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1261                         vm_pageout_scan_burst_throttle++;
1262                         msecs = vm_pageout_burst_wait;
1263                         goto vm_pageout_scan_delay;
1264
1265                 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1266
1267                         switch (flow_control.state) {
1268
1269                         case FCS_IDLE:
1270 reset_deadlock_timer:
1271                                 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1272                                 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1273                                 clock_get_system_nanotime(&flow_control.ts.tv_sec,
1274                                                           (unsigned *)&flow_control.ts.tv_nsec);
1275                                 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1276
1277                                 flow_control.state = FCS_DELAYED;
1278                                 msecs = vm_pageout_deadlock_wait;
1279
1280                                 break;
1281
1282                         case FCS_DELAYED:
1283                                 clock_get_system_nanotime(&ts.tv_sec,
1284                                                           (unsigned *)&ts.tv_nsec);
1285
1286                                 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1287                                         /*
1288                                          * the pageout thread for the default pager is potentially
1289                                          * deadlocked since the
1290                                          * default pager queue has been throttled for more than the
1291                                          * allowable time... we need to move some clean pages or dirty
1292                                          * pages belonging to the external pagers if they aren't throttled
1293                                          * vm_page_free_wanted represents the number of threads currently
1294                                          * blocked waiting for pages... we'll move one page for each of
1295                                          * these plus a fixed amount to break the logjam... once we're done
1296                                          * moving this number of pages, we'll re-enter the FSC_DELAYED state
1297                                          * with a new timeout target since we have no way of knowing
1298                                          * whether we've broken the deadlock except through observation
1299                                          * of the queue associated with the default pager... we need to
1300                                          * stop moving pages and allow the system to run to see what
1301                                          * state it settles into.
1302                                          */
1303                                         vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1304                                         vm_pageout_scan_deadlock_detected++;
1305                                         flow_control.state = FCS_DEADLOCK_DETECTED;
1306
1307                                         thread_wakeup((event_t) &vm_pageout_garbage_collect);
1308                                         goto consider_inactive;
1309                                 }
1310                                 /*
1311                                  * just resniff instead of trying
1312                                  * to compute a new delay time... we're going to be
1313                                  * awakened immediately upon a laundry completion,
1314                                  * so we won't wait any longer than necessary
1315                                  */
1316                                 msecs = vm_pageout_idle_wait;
1317                                 break;
1318
1319                         case FCS_DEADLOCK_DETECTED:
1320                                 if (vm_pageout_deadlock_target)
1321                                         goto consider_inactive;
1322                                 goto reset_deadlock_timer;
1323
1324                         }
1325                         vm_pageout_scan_throttle++;
1326                         iq->pgo_throttled = TRUE;
1327 vm_pageout_scan_delay:
1328                         if (object != NULL) {
1329                                 vm_object_unlock(object);
1330                                 object = NULL;
1331                         }
1332                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1333
1334                         if (local_freeq) {
1335                                 vm_page_free_list(local_freeq);
1336
1337                                 local_freeq = NULL;
1338                                 local_freed = 0;
1339                         }
1340 #if CONFIG_EMBEDDED
1341                         {
1342                         int percent_avail;
1343
1344                         /*
1345                          * Decide if we need to send a memory status notification.
1346                          */
1347                         percent_avail =
1348                                 (vm_page_active_count + vm_page_inactive_count +
1349                                  vm_page_speculative_count + vm_page_free_count +
1350                                  (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1351                                 atop_64(max_mem);
1352                         if (percent_avail >= (kern_memorystatus_level + 5) ||
1353                             percent_avail <= (kern_memorystatus_level - 5)) {
1354                                 kern_memorystatus_level = percent_avail;
1355                                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1356                         }
1357                         }
1358 #endif
1359                         assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1360
1361                         counter(c_vm_pageout_scan_block++);
1362
1363                         vm_page_unlock_queues();
1364
1365                         assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1366
1367                         thread_block(THREAD_CONTINUE_NULL);
1368
1369                         vm_page_lock_queues();
1370                         delayed_unlock = 1;
1371
1372                         iq->pgo_throttled = FALSE;
1373
1374                         if (loop_count >= vm_page_inactive_count)
1375                                 loop_count = 0;
1376                         inactive_burst_count = 0;
1377
1378                         goto Restart;
1379                         /*NOTREACHED*/
1380                 }
1381
1382
1383                 flow_control.state = FCS_IDLE;
1384 consider_inactive:
1385                 loop_count++;
1386                 inactive_burst_count++;
1387                 vm_pageout_inactive++;
1388
1389                 /* Choose a victim. */
1390
1391                 while (1) {
1392                         m = NULL;
1393
1394                         /*
1395                          * the most eligible pages are ones that were throttled because the
1396                          * pager wasn't ready at the time.  If a pager is ready now,
1397                          * see if one of these is useful.
1398                          */
1399                         if (!VM_PAGE_Q_THROTTLED(iq) && !queue_empty(&vm_page_queue_throttled)) {
1400                                 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
1401                                 break;
1402                         }
1403
1404                         /*
1405                          * The second most eligible pages are ones we paged in speculatively,
1406                          * but which have not yet been touched.
1407                          */
1408                         if ( !queue_empty(&sq->age_q) ) {
1409                                 m = (vm_page_t) queue_first(&sq->age_q);
1410                                 break;
1411                         }
1412                         /*
1413                          * Time for a zero-filled inactive page?
1414                          */
1415                         if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1416                              queue_empty(&vm_page_queue_inactive)) {
1417                                 if ( !queue_empty(&vm_page_queue_zf) ) {
1418                                         m = (vm_page_t) queue_first(&vm_page_queue_zf);
1419                                         zf_run_count++;
1420                                         break;
1421                                 }
1422                         }
1423                         /*
1424                          * It's either a normal inactive page or nothing.
1425                          */
1426                         if ( !queue_empty(&vm_page_queue_inactive) ) {
1427                                 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1428                                 zf_run_count = 0;
1429                                 break;
1430                         }
1431
1432                         panic("vm_pageout: no victim");
1433                 }
1434
1435                 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1436                 assert(!m->laundry);
1437                 assert(m->object != kernel_object);
1438                 assert(m->phys_page != vm_page_guard_addr);
1439
1440                 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1441
1442                 /*
1443                  * check to see if we currently are working
1444                  * with the same object... if so, we've
1445                  * already got the lock
1446                  */
1447                 if (m->object != object) {
1448                         /*
1449                          * the object associated with candidate page is
1450                          * different from the one we were just working
1451                          * with... dump the lock if we still own it
1452                          */
1453                         if (object != NULL) {
1454                                 vm_object_unlock(object);
1455                                 object = NULL;
1456                                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1457                         }
1458                         /*
1459                          * Try to lock object; since we've alread got the
1460                          * page queues lock, we can only 'try' for this one.
1461                          * if the 'try' fails, we need to do a mutex_pause
1462                          * to allow the owner of the object lock a chance to
1463                          * run... otherwise, we're likely to trip over this
1464                          * object in the same state as we work our way through
1465                          * the queue... clumps of pages associated with the same
1466                          * object are fairly typical on the inactive and active queues
1467                          */
1468                         if (!vm_object_lock_try_scan(m->object)) {
1469                                 /*
1470                                  *      Move page to end and continue.
1471                                  *      Don't re-issue ticket
1472                                  */
1473                                 if (m->zero_fill) {
1474                                         queue_remove(&vm_page_queue_zf, m,
1475                                                      vm_page_t, pageq);
1476                                         queue_enter(&vm_page_queue_zf, m,
1477                                                     vm_page_t, pageq);
1478                                 } else if (m->speculative) {
1479                                         remque(&m->pageq);
1480                                         m->speculative = FALSE;
1481                                         vm_page_speculative_count--;
1482
1483                                         /*
1484                                          * move to the tail of the inactive queue
1485                                          * to get it out of the way... the speculative
1486                                          * queue is generally too small to depend
1487                                          * on there being enough pages from other
1488                                          * objects to make cycling it back on the
1489                                          * same queue a winning proposition
1490                                          */
1491                                         queue_enter(&vm_page_queue_inactive, m,
1492                                                     vm_page_t, pageq);
1493                                         m->inactive = TRUE;
1494                                         vm_page_inactive_count++;
1495                                         token_new_pagecount++;
1496                                 }  else if (m->throttled) {
1497                                         queue_remove(&vm_page_queue_throttled, m,
1498                                                      vm_page_t, pageq);
1499                                         m->throttled = FALSE;
1500                                         vm_page_throttled_count--;
1501
1502                                         /*
1503                                          * not throttled any more, so can stick
1504                                          * it on the inactive queue.
1505                                          */
1506                                         queue_enter(&vm_page_queue_inactive, m,
1507                                                     vm_page_t, pageq);
1508                                         m->inactive = TRUE;
1509                                         vm_page_inactive_count++;
1510                                         token_new_pagecount++;
1511                                 } else {
1512                                         queue_remove(&vm_page_queue_inactive, m,
1513                                                      vm_page_t, pageq);
1514 #if MACH_ASSERT
1515                                         vm_page_inactive_count--;       /* balance for purgeable queue asserts */
1516 #endif
1517                                         vm_purgeable_q_advance_all();
1518
1519                                         queue_enter(&vm_page_queue_inactive, m,
1520                                                     vm_page_t, pageq);
1521 #if MACH_ASSERT
1522                                         vm_page_inactive_count++;       /* balance for purgeable queue asserts */
1523 #endif
1524                                         token_new_pagecount++;
1525                                 }
1526                                 pmap_clear_reference(m->phys_page);
1527                                 m->reference = FALSE;
1528
1529                                 vm_pageout_inactive_nolock++;
1530
1531                                 if ( !queue_empty(&sq->age_q) )
1532                                         m = (vm_page_t) queue_first(&sq->age_q);
1533                                 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1534                                           queue_empty(&vm_page_queue_inactive)) {
1535                                         if ( !queue_empty(&vm_page_queue_zf) )
1536                                                 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1537                                 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1538                                         m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1539                                 }
1540                                 /*
1541                                  * this is the next object we're going to be interested in
1542                                  * try to make sure its available after the mutex_yield
1543                                  * returns control
1544                                  */
1545                                 vm_pageout_scan_wants_object = m->object;
1546
1547                                 /*
1548                                  * force us to dump any collected free pages
1549                                  * and to pause before moving on
1550                                  */
1551                                 try_failed = TRUE;
1552
1553                                 goto done_with_inactivepage;
1554                         }
1555                         object = m->object;
1556                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1557
1558                         try_failed = FALSE;
1559                 }
1560
1561                 /*
1562                  *      Paging out pages of external objects which
1563                  *      are currently being created must be avoided.
1564                  *      The pager may claim for memory, thus leading to a
1565                  *      possible dead lock between it and the pageout thread,
1566                  *      if such pages are finally chosen. The remaining assumption
1567                  *      is that there will finally be enough available pages in the
1568                  *      inactive pool to page out in order to satisfy all memory
1569                  *      claimed by the thread which concurrently creates the pager.
1570                  */
1571                 if (!object->pager_initialized && object->pager_created) {
1572                         /*
1573                          *      Move page to end and continue, hoping that
1574                          *      there will be enough other inactive pages to
1575                          *      page out so that the thread which currently
1576                          *      initializes the pager will succeed.
1577                          *      Don't re-grant the ticket, the page should
1578                          *      pulled from the queue and paged out whenever
1579                          *      one of its logically adjacent fellows is
1580                          *      targeted.
1581                          *
1582                          *      Pages found on the speculative list can never be
1583                          *      in this state... they always have a pager associated
1584                          *      with them.
1585                          */
1586                         assert(!m->speculative);
1587
1588                         if (m->zero_fill) {
1589                                 queue_remove(&vm_page_queue_zf, m,
1590                                              vm_page_t, pageq);
1591                                 queue_enter(&vm_page_queue_zf, m,
1592                                             vm_page_t, pageq);
1593                         } else {
1594                                 queue_remove(&vm_page_queue_inactive, m,
1595                                              vm_page_t, pageq);
1596 #if MACH_ASSERT
1597                                 vm_page_inactive_count--;       /* balance for purgeable queue asserts */
1598 #endif
1599                                 vm_purgeable_q_advance_all();
1600
1601                                 queue_enter(&vm_page_queue_inactive, m,
1602                                             vm_page_t, pageq);
1603 #if MACH_ASSERT
1604                                 vm_page_inactive_count++;       /* balance for purgeable queue asserts */
1605 #endif
1606                                 token_new_pagecount++;
1607                         }
1608                         vm_pageout_inactive_avoid++;
1609
1610                         goto done_with_inactivepage;
1611                 }
1612                 /*
1613                  *      Remove the page from its list.
1614                  */
1615                 if (m->speculative) {
1616                         remque(&m->pageq);
1617                         m->speculative = FALSE;
1618                         vm_page_speculative_count--;
1619                 } else if (m->throttled) {
1620                         queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1621                         m->throttled = FALSE;
1622                         vm_page_throttled_count--;
1623                 } else {
1624                         if (m->zero_fill) {
1625                                 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1626                                 vm_zf_queue_count--;
1627                         } else {
1628                                 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1629                         }
1630                         m->inactive = FALSE;
1631                         if (!m->fictitious)
1632                                 vm_page_inactive_count--;
1633                                 vm_purgeable_q_advance_all();
1634                 }
1635
1636                 /* If the object is empty, the page must be reclaimed even if dirty or used. */
1637                 /* If the page belongs to a volatile object, we stick it back on. */
1638                 if (object->copy == VM_OBJECT_NULL) {
1639                         if(object->purgable == VM_PURGABLE_EMPTY && !m->cleaning) {
1640                                 m->busy = TRUE;
1641                                 if (m->pmapped == TRUE) {
1642                                         /* unmap the page */
1643                                         refmod_state = pmap_disconnect(m->phys_page);
1644                                         if (refmod_state & VM_MEM_MODIFIED) {
1645                                                 m->dirty = TRUE;
1646                                         }
1647                                 }
1648                                 if (m->dirty || m->precious) {
1649                                         /* we saved the cost of cleaning this page ! */
1650                                         vm_page_purged_count++;
1651                                 }
1652                                 goto reclaim_page;
1653                         }
1654                         if (object->purgable == VM_PURGABLE_VOLATILE) {
1655                                 /* if it's wired, we can't put it on our queue */
1656                                 assert(m->wire_count == 0);
1657                                 /* just stick it back on! */
1658                                 goto reactivate_page;
1659                         }
1660                 }
1661                 m->pageq.next = NULL;
1662                 m->pageq.prev = NULL;
1663
1664                 if ( !m->fictitious && catch_up_count)
1665                         catch_up_count--;
1666
1667                 /*
1668                  * ENCRYPTED SWAP:
1669                  * if this page has already been picked up as part of a
1670                  * page-out cluster, it will be busy because it is being
1671                  * encrypted (see vm_object_upl_request()).  But we still
1672                  * want to demote it from "clean-in-place" (aka "adjacent")
1673                  * to "clean-and-free" (aka "target"), so let's ignore its
1674                  * "busy" bit here and proceed to check for "cleaning" a
1675                  * little bit below...
1676                  */
1677                 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1678                         /*
1679                          *      Somebody is already playing with this page.
1680                          *      Leave it off the pageout queues.
1681                          *
1682                          */
1683                         vm_pageout_inactive_busy++;
1684
1685                         goto done_with_inactivepage;
1686                 }
1687
1688                 /*
1689                  *      If it's absent or in error, we can reclaim the page.
1690                  */
1691
1692                 if (m->absent || m->error) {
1693                         vm_pageout_inactive_absent++;
1694 reclaim_page:
1695                         if (vm_pageout_deadlock_target) {
1696                                 vm_pageout_scan_inactive_throttle_success++;
1697                                 vm_pageout_deadlock_target--;
1698                         }
1699
1700                         DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1701
1702                         if (m->object->internal) {
1703                                 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1704                         } else {
1705                                 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1706                         }
1707
1708                         vm_page_free_prepare(m);
1709
1710                         assert(m->pageq.next == NULL &&
1711                                m->pageq.prev == NULL);
1712                         m->pageq.next = (queue_entry_t)local_freeq;
1713                         local_freeq = m;
1714                         local_freed++;
1715
1716                         inactive_burst_count = 0;
1717
1718                         goto done_with_inactivepage;
1719                 }
1720
1721                 assert(!m->private);
1722                 assert(!m->fictitious);
1723
1724                 /*
1725                  *      If already cleaning this page in place, convert from
1726                  *      "adjacent" to "target". We can leave the page mapped,
1727                  *      and vm_pageout_object_terminate will determine whether
1728                  *      to free or reactivate.
1729                  */
1730
1731                 if (m->cleaning) {
1732                         m->busy = TRUE;
1733                         m->pageout = TRUE;
1734                         m->dump_cleaning = TRUE;
1735                         vm_page_wire(m);
1736
1737                         CLUSTER_STAT(vm_pageout_cluster_conversions++);
1738
1739                         inactive_burst_count = 0;
1740
1741                         goto done_with_inactivepage;
1742                 }
1743
1744                 /*
1745                  *      If it's being used, reactivate.
1746                  *      (Fictitious pages are either busy or absent.)
1747                  *      First, update the reference and dirty bits
1748                  *      to make sure the page is unreferenced.
1749                  */
1750                 refmod_state = -1;
1751
1752                 if (m->reference == FALSE && m->pmapped == TRUE) {
1753                         refmod_state = pmap_get_refmod(m->phys_page);
1754
1755                         if (refmod_state & VM_MEM_REFERENCED)
1756                                 m->reference = TRUE;
1757                         if (refmod_state & VM_MEM_MODIFIED)
1758                                 m->dirty = TRUE;
1759                 }
1760                 if (m->reference && !m->no_cache) {
1761                         /*
1762                          * The page we pulled off the inactive list has
1763                          * been referenced.  It is possible for other
1764                          * processors to be touching pages faster than we
1765                          * can clear the referenced bit and traverse the
1766                          * inactive queue, so we limit the number of
1767                          * reactivations.
1768                          */
1769                         if (++reactivated_this_call >= reactivate_limit) {
1770                                 vm_pageout_reactivation_limit_exceeded++;
1771                         } else if (catch_up_count) {
1772                                 vm_pageout_catch_ups++;
1773                         } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
1774                                 vm_pageout_inactive_force_reclaim++;
1775                         } else {
1776                                 /*
1777                                  * The page was being used, so put back on active list.
1778                                  */
1779 reactivate_page:
1780                                 vm_page_activate(m);
1781                                 VM_STAT_INCR(reactivations);
1782
1783                                 vm_pageout_inactive_used++;
1784                                 inactive_burst_count = 0;
1785
1786                                 goto done_with_inactivepage;
1787                         }
1788                         /*
1789                          * Make sure we call pmap_get_refmod() if it
1790                          * wasn't already called just above, to update
1791                          * the dirty bit.
1792                          */
1793                         if ((refmod_state == -1) && !m->dirty && m->pmapped) {
1794                                 refmod_state = pmap_get_refmod(m->phys_page);
1795                                 if (refmod_state & VM_MEM_MODIFIED)
1796                                         m->dirty = TRUE;
1797                         }
1798                         forced_reclaim = TRUE;
1799                 } else {
1800                         forced_reclaim = FALSE;
1801                 }
1802
1803                 XPR(XPR_VM_PAGEOUT,
1804                 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1805                 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1806
1807                 /*
1808                  * we've got a candidate page to steal...
1809                  *
1810                  * m->dirty is up to date courtesy of the
1811                  * preceding check for m->reference... if
1812                  * we get here, then m->reference had to be
1813                  * FALSE (or possibly "reactivate_limit" was
1814                  * exceeded), but in either case we called
1815                  * pmap_get_refmod() and updated both
1816                  * m->reference and m->dirty
1817                  *
1818                  * if it's dirty or precious we need to
1819                  * see if the target queue is throtttled
1820                  * it if is, we need to skip over it by moving it back
1821                  * to the end of the inactive queue
1822                  */
1823                 inactive_throttled = FALSE;
1824
1825                 if (m->dirty || m->precious) {
1826                         if (object->internal) {
1827                                 if (VM_PAGE_Q_THROTTLED(iq))
1828                                         inactive_throttled = TRUE;
1829                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
1830                                 inactive_throttled = TRUE;
1831                         }
1832                 }
1833                 if (inactive_throttled == TRUE) {
1834 throttle_inactive:
1835                         if (!IP_VALID(memory_manager_default) &&
1836                                 object->internal &&
1837                                 (object->purgable == VM_PURGABLE_DENY ||
1838                                  object->purgable == VM_PURGABLE_NONVOLATILE ||
1839                                  object->purgable == VM_PURGABLE_VOLATILE )) {
1840                                 queue_enter(&vm_page_queue_throttled, m,
1841                                             vm_page_t, pageq);
1842                                 m->throttled = TRUE;
1843                                 vm_page_throttled_count++;
1844                         } else {
1845                                 if (m->zero_fill) {
1846                                         queue_enter(&vm_page_queue_zf, m,
1847                                                     vm_page_t, pageq);
1848                                         vm_zf_queue_count++;
1849                                 } else
1850                                         queue_enter(&vm_page_queue_inactive, m,
1851                                                     vm_page_t, pageq);
1852                                 m->inactive = TRUE;
1853                                 if (!m->fictitious) {
1854                                         vm_page_inactive_count++;
1855                                         token_new_pagecount++;
1856                                 }
1857                         }
1858                         vm_pageout_scan_inactive_throttled++;
1859                         goto done_with_inactivepage;
1860                 }
1861
1862                 /*
1863                  * we've got a page that we can steal...
1864                  * eliminate all mappings and make sure
1865                  * we have the up-to-date modified state
1866                  * first take the page BUSY, so that no new
1867                  * mappings can be made
1868                  */
1869                 m->busy = TRUE;
1870
1871                 /*
1872                  * if we need to do a pmap_disconnect then we
1873                  * need to re-evaluate m->dirty since the pmap_disconnect
1874                  * provides the true state atomically... the
1875                  * page was still mapped up to the pmap_disconnect
1876                  * and may have been dirtied at the last microsecond
1877                  *
1878                  * we also check for the page being referenced 'late'
1879                  * if it was, we first need to do a WAKEUP_DONE on it
1880                  * since we already set m->busy = TRUE, before
1881                  * going off to reactivate it
1882                  *
1883                  * Note that if 'pmapped' is FALSE then the page is not
1884                  * and has not been in any map, so there is no point calling
1885                  * pmap_disconnect().  m->dirty and/or m->reference could
1886                  * have been set in anticipation of likely usage of the page.
1887                  */
1888                 if (m->pmapped == TRUE) {
1889                         refmod_state = pmap_disconnect(m->phys_page);
1890
1891                         if (refmod_state & VM_MEM_MODIFIED)
1892                                 m->dirty = TRUE;
1893                         if (refmod_state & VM_MEM_REFERENCED) {
1894
1895                                 /* If m->reference is already set, this page must have
1896                                  * already failed the reactivate_limit test, so don't
1897                                  * bump the counts twice.
1898                                  */
1899                                 if ( ! m->reference ) {
1900                                         m->reference = TRUE;
1901                                         if (forced_reclaim ||
1902                                             ++reactivated_this_call >= reactivate_limit)
1903                                                 vm_pageout_reactivation_limit_exceeded++;
1904                                         else {
1905                                                 PAGE_WAKEUP_DONE(m);
1906                                                 goto reactivate_page;
1907                                         }
1908                                 }
1909                         }
1910                 }
1911                 /*
1912                  * reset our count of pages that have been reclaimed
1913                  * since the last page was 'stolen'
1914                  */
1915                 inactive_reclaim_run = 0;
1916
1917                 /*
1918                  *      If it's clean and not precious, we can free the page.
1919                  */
1920                 if (!m->dirty && !m->precious) {
1921                         vm_pageout_inactive_clean++;
1922                         goto reclaim_page;
1923                 }
1924
1925                 /*
1926                  * The page may have been dirtied since the last check
1927                  * for a throttled target queue (which may have been skipped
1928                  * if the page was clean then).  With the dirty page
1929                  * disconnected here, we can make one final check.
1930                  */
1931                 {
1932                         boolean_t disconnect_throttled = FALSE;
1933                         if (object->internal) {
1934                                 if (VM_PAGE_Q_THROTTLED(iq))
1935                                         disconnect_throttled = TRUE;
1936                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
1937                                 disconnect_throttled = TRUE;
1938                         }
1939
1940                         if (disconnect_throttled == TRUE) {
1941                                 PAGE_WAKEUP_DONE(m);
1942                                 goto throttle_inactive;
1943                         }
1944                 }
1945
1946                 vm_pageout_cluster(m);
1947
1948                 vm_pageout_inactive_dirty++;
1949
1950                 inactive_burst_count = 0;
1951
1952 done_with_inactivepage:
1953                 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1954
1955                         if (object != NULL) {
1956                                 vm_object_unlock(object);
1957                                 object = NULL;
1958                                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1959                         }
1960                         if (local_freeq) {
1961                                 vm_page_free_list(local_freeq);
1962
1963                                 local_freeq = NULL;
1964                                 local_freed = 0;
1965                         }
1966                         mutex_yield(&vm_page_queue_lock);
1967
1968                         delayed_unlock = 1;
1969                 }
1970                 /*
1971                  * back to top of pageout scan loop
1972                  */
1973         }
1974 }
1975
1976
1977 int vm_page_free_count_init;
1978
1979 void
1980 vm_page_free_reserve(
1981         int pages)
1982 {
1983         int             free_after_reserve;
1984
1985         vm_page_free_reserved += pages;
1986
1987         free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1988
1989         vm_page_free_min = vm_page_free_reserved +
1990                 VM_PAGE_FREE_MIN(free_after_reserve);
1991
1992         if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
1993                 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
1994
1995         vm_page_free_target = vm_page_free_reserved +
1996                 VM_PAGE_FREE_TARGET(free_after_reserve);
1997
1998         if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
1999                 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2000
2001         if (vm_page_free_target < vm_page_free_min + 5)
2002                 vm_page_free_target = vm_page_free_min + 5;
2003
2004 }
2005
2006 /*
2007  *      vm_pageout is the high level pageout daemon.
2008  */
2009
2010 void
2011 vm_pageout_continue(void)
2012 {
2013         DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2014         vm_pageout_scan_event_counter++;
2015         vm_pageout_scan();
2016         /* we hold vm_page_queue_free_lock now */
2017         assert(vm_page_free_wanted == 0);
2018         assert(vm_page_free_wanted_privileged == 0);
2019         assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2020         mutex_unlock(&vm_page_queue_free_lock);
2021
2022         counter(c_vm_pageout_block++);
2023         thread_block((thread_continue_t)vm_pageout_continue);
2024         /*NOTREACHED*/
2025 }
2026
2027
2028 /*
2029  * must be called with the
2030  * queues and object locks held
2031  */
2032 static void
2033 vm_pageout_queue_steal(vm_page_t m)
2034 {
2035         struct vm_pageout_queue *q;
2036
2037         if (m->object->internal == TRUE)
2038                 q = &vm_pageout_queue_internal;
2039         else
2040                 q = &vm_pageout_queue_external;
2041
2042         m->laundry = FALSE;
2043         m->pageout_queue = FALSE;
2044         queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
2045
2046         m->pageq.next = NULL;
2047         m->pageq.prev = NULL;
2048
2049         vm_object_paging_end(m->object);
2050
2051         q->pgo_laundry--;
2052 }
2053
2054
2055 #ifdef FAKE_DEADLOCK
2056
2057 #define FAKE_COUNT      5000
2058
2059 int internal_count = 0;
2060 int fake_deadlock = 0;
2061
2062 #endif
2063
2064 static void
2065 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2066 {
2067         vm_page_t       m = NULL;
2068         vm_object_t     object;
2069         boolean_t       need_wakeup;
2070         memory_object_t pager;
2071         thread_t        self = current_thread();
2072
2073         if ((vm_pageout_internal_iothread != THREAD_NULL)
2074             && (self == vm_pageout_external_iothread )
2075             && (self->options & TH_OPT_VMPRIV))
2076                 self->options &= ~TH_OPT_VMPRIV;
2077
2078         vm_page_lockspin_queues();
2079
2080         while ( !queue_empty(&q->pgo_pending) ) {
2081
2082                    q->pgo_busy = TRUE;
2083                    queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2084                    m->pageout_queue = FALSE;
2085                    vm_page_unlock_queues();
2086
2087                    m->pageq.next = NULL;
2088                    m->pageq.prev = NULL;
2089 #ifdef FAKE_DEADLOCK
2090                    if (q == &vm_pageout_queue_internal) {
2091                            vm_offset_t addr;
2092                            int  pg_count;
2093
2094                            internal_count++;
2095
2096                            if ((internal_count == FAKE_COUNT)) {
2097
2098                                    pg_count = vm_page_free_count + vm_page_free_reserved;
2099
2100                                    if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2101                                            kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2102                                    }
2103                                    internal_count = 0;
2104                                    fake_deadlock++;
2105                            }
2106                    }
2107 #endif
2108                    object = m->object;
2109
2110                    vm_object_lock(object);
2111
2112                    if (!object->pager_initialized) {
2113
2114                            /*
2115                             *   If there is no memory object for the page, create
2116                             *   one and hand it to the default pager.
2117                             */
2118
2119                            if (!object->pager_initialized)
2120                                    vm_object_collapse(object,
2121                                                       (vm_object_offset_t) 0,
2122                                                       TRUE);
2123                            if (!object->pager_initialized)
2124                                    vm_object_pager_create(object);
2125                            if (!object->pager_initialized) {
2126                                    /*
2127                                     *   Still no pager for the object.
2128                                     *   Reactivate the page.
2129                                     *
2130                                     *   Should only happen if there is no
2131                                     *   default pager.
2132                                     */
2133                                    m->list_req_pending = FALSE;
2134                                    m->cleaning = FALSE;
2135                                    m->pageout = FALSE;
2136
2137                                    vm_page_lockspin_queues();
2138                                    vm_page_unwire(m);
2139                                    vm_pageout_throttle_up(m);
2140                                    vm_pageout_dirty_no_pager++;
2141                                    vm_page_activate(m);
2142                                    vm_page_unlock_queues();
2143
2144                                    /*
2145                                     *   And we are done with it.
2146                                     */
2147                                    PAGE_WAKEUP_DONE(m);
2148
2149                                    vm_object_paging_end(object);
2150                                    vm_object_unlock(object);
2151
2152                                    vm_page_lockspin_queues();
2153                                    continue;
2154                            }
2155                    }
2156                    pager = object->pager;
2157                    if (pager == MEMORY_OBJECT_NULL) {
2158                            /*
2159                             * This pager has been destroyed by either
2160                             * memory_object_destroy or vm_object_destroy, and
2161                             * so there is nowhere for the page to go.
2162                             * Just free the page... VM_PAGE_FREE takes
2163                             * care of cleaning up all the state...
2164                             * including doing the vm_pageout_throttle_up
2165                             */
2166
2167                            VM_PAGE_FREE(m);
2168
2169                            vm_object_paging_end(object);
2170                            vm_object_unlock(object);
2171
2172                            vm_page_lockspin_queues();
2173                            continue;
2174                    }
2175                    vm_object_unlock(object);
2176                    /*
2177                     * we expect the paging_in_progress reference to have
2178                     * already been taken on the object before it was added
2179                     * to the appropriate pageout I/O queue... this will
2180                     * keep the object from being terminated and/or the
2181                     * paging_offset from changing until the I/O has
2182                     * completed... therefore no need to lock the object to
2183                     * pull the paging_offset from it.
2184                     *
2185                     * Send the data to the pager.
2186                     * any pageout clustering happens there
2187                     */
2188                    memory_object_data_return(pager,
2189                                              m->offset + object->paging_offset,
2190                                              PAGE_SIZE,
2191                                              NULL,
2192                                              NULL,
2193                                              FALSE,
2194                                              FALSE,
2195                                              0);
2196
2197                    vm_object_lock(object);
2198                    vm_object_paging_end(object);
2199                    vm_object_unlock(object);
2200
2201                    vm_page_lockspin_queues();
2202         }
2203         assert_wait((event_t) q, THREAD_UNINT);
2204
2205
2206         if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2207                 q->pgo_throttled = FALSE;
2208                 need_wakeup = TRUE;
2209         } else
2210                 need_wakeup = FALSE;
2211
2212         q->pgo_busy = FALSE;
2213         q->pgo_idle = TRUE;
2214         vm_page_unlock_queues();
2215
2216         if (need_wakeup == TRUE)
2217                 thread_wakeup((event_t) &q->pgo_laundry);
2218
2219         thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2220         /*NOTREACHED*/
2221 }
2222
2223
2224 static void
2225 vm_pageout_iothread_external(void)
2226 {
2227         thread_t        self = current_thread();
2228
2229         self->options |= TH_OPT_VMPRIV;
2230
2231         vm_pageout_iothread_continue(&vm_pageout_queue_external);
2232         /*NOTREACHED*/
2233 }
2234
2235
2236 static void
2237 vm_pageout_iothread_internal(void)
2238 {
2239         thread_t        self = current_thread();
2240
2241         self->options |= TH_OPT_VMPRIV;
2242
2243         vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2244         /*NOTREACHED*/
2245 }
2246
2247 static void
2248 vm_pageout_garbage_collect(int collect)
2249 {
2250         if (collect) {
2251                 stack_collect();
2252
2253                 /*
2254                  * consider_zone_gc should be last, because the other operations
2255                  * might return memory to zones.
2256                  */
2257                 consider_machine_collect();
2258                 consider_zone_gc();
2259
2260                 consider_machine_adjust();
2261         }
2262
2263         assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2264
2265         thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2266         /*NOTREACHED*/
2267 }
2268
2269
2270
2271 void
2272 vm_pageout(void)
2273 {
2274         thread_t        self = current_thread();
2275         thread_t        thread;
2276         kern_return_t   result;
2277         spl_t           s;
2278
2279         /*
2280          * Set thread privileges.
2281          */
2282         s = splsched();
2283         thread_lock(self);
2284         self->priority = BASEPRI_PREEMPT - 1;
2285         set_sched_pri(self, self->priority);
2286         thread_unlock(self);
2287
2288         if (!self->reserved_stack)
2289                 self->reserved_stack = self->kernel_stack;
2290
2291         splx(s);
2292
2293         /*
2294          *      Initialize some paging parameters.
2295          */
2296
2297         if (vm_pageout_idle_wait == 0)
2298                 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2299
2300         if (vm_pageout_burst_wait == 0)
2301                 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2302
2303         if (vm_pageout_empty_wait == 0)
2304                 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2305
2306         if (vm_pageout_deadlock_wait == 0)
2307                 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2308
2309         if (vm_pageout_deadlock_relief == 0)
2310                 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2311
2312         if (vm_pageout_inactive_relief == 0)
2313                 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2314
2315         if (vm_pageout_burst_active_throttle == 0)
2316                 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2317
2318         if (vm_pageout_burst_inactive_throttle == 0)
2319                 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2320
2321         /*
2322          * Set kernel task to low backing store privileged
2323          * status
2324          */
2325         task_lock(kernel_task);
2326         kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2327         task_unlock(kernel_task);
2328
2329         vm_page_free_count_init = vm_page_free_count;
2330
2331         /*
2332          * even if we've already called vm_page_free_reserve
2333          * call it again here to insure that the targets are
2334          * accurately calculated (it uses vm_page_free_count_init)
2335          * calling it with an arg of 0 will not change the reserve
2336          * but will re-calculate free_min and free_target
2337          */
2338         if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2339                 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2340         } else
2341                 vm_page_free_reserve(0);
2342
2343
2344         queue_init(&vm_pageout_queue_external.pgo_pending);
2345         vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2346         vm_pageout_queue_external.pgo_laundry = 0;
2347         vm_pageout_queue_external.pgo_idle = FALSE;
2348         vm_pageout_queue_external.pgo_busy = FALSE;
2349         vm_pageout_queue_external.pgo_throttled = FALSE;
2350
2351         queue_init(&vm_pageout_queue_internal.pgo_pending);
2352         vm_pageout_queue_internal.pgo_maxlaundry = 0;
2353         vm_pageout_queue_internal.pgo_laundry = 0;
2354         vm_pageout_queue_internal.pgo_idle = FALSE;
2355         vm_pageout_queue_internal.pgo_busy = FALSE;
2356         vm_pageout_queue_internal.pgo_throttled = FALSE;
2357
2358
2359         /* internal pageout thread started when default pager registered first time */
2360         /* external pageout and garbage collection threads started here */
2361
2362         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2363                                               BASEPRI_PREEMPT - 1,
2364                                               &vm_pageout_external_iothread);
2365         if (result != KERN_SUCCESS)
2366                 panic("vm_pageout_iothread_external: create failed");
2367
2368         thread_deallocate(vm_pageout_external_iothread);
2369
2370         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2371                                               MINPRI_KERNEL,
2372                                               &thread);
2373         if (result != KERN_SUCCESS)
2374                 panic("vm_pageout_garbage_collect: create failed");
2375
2376         thread_deallocate(thread);
2377
2378         vm_object_reaper_init();
2379
2380
2381         vm_pageout_continue();
2382
2383         /*
2384          * Unreached code!
2385          *
2386          * The vm_pageout_continue() call above never returns, so the code below is never
2387          * executed.  We take advantage of this to declare several DTrace VM related probe
2388          * points that our kernel doesn't have an analog for.  These are probe points that
2389          * exist in Solaris and are in the DTrace documentation, so people may have written
2390          * scripts that use them.  Declaring the probe points here means their scripts will
2391          * compile and execute which we want for portability of the scripts, but since this
2392          * section of code is never reached, the probe points will simply never fire.  Yes,
2393          * this is basically a hack.  The problem is the DTrace probe points were chosen with
2394          * Solaris specific VM events in mind, not portability to different VM implementations.
2395          */
2396
2397         DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2398         DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2399         DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2400         DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2401         DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2402         DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2403         DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2404         /*NOTREACHED*/
2405 }
2406
2407 kern_return_t
2408 vm_pageout_internal_start(void)
2409 {
2410         kern_return_t result;
2411
2412         vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2413         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2414         if (result == KERN_SUCCESS)
2415                 thread_deallocate(vm_pageout_internal_iothread);
2416         return result;
2417 }
2418
2419 #define UPL_DELAYED_UNLOCK_LIMIT  (MAX_UPL_TRANSFER / 2)
2420
2421 static upl_t
2422 upl_create(int type, int flags, upl_size_t size)
2423 {
2424         upl_t   upl;
2425         int     page_field_size = 0;
2426         int     upl_flags = 0;
2427         int     upl_size  = sizeof(struct upl);
2428
2429         if (type & UPL_CREATE_LITE) {
2430                 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2431                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2432
2433                 upl_flags |= UPL_LITE;
2434         }
2435         if (type & UPL_CREATE_INTERNAL) {
2436                 upl_size += sizeof(struct upl_page_info) * (size/PAGE_SIZE);
2437
2438                 upl_flags |= UPL_INTERNAL;
2439         }
2440         upl = (upl_t)kalloc(upl_size + page_field_size);
2441
2442         if (page_field_size)
2443                 bzero((char *)upl + upl_size, page_field_size);
2444
2445         upl->flags = upl_flags | flags;
2446         upl->src_object = NULL;
2447         upl->kaddr = (vm_offset_t)0;
2448         upl->size = 0;
2449         upl->map_object = NULL;
2450         upl->ref_count = 1;
2451         upl->highest_page = 0;
2452         upl_lock_init(upl);
2453 #ifdef UPL_DEBUG
2454         upl->ubc_alias1 = 0;
2455         upl->ubc_alias2 = 0;
2456 #endif /* UPL_DEBUG */
2457         return(upl);
2458 }
2459
2460 static void
2461 upl_destroy(upl_t upl)
2462 {
2463         int     page_field_size;  /* bit field in word size buf */
2464         int     size;
2465
2466 #ifdef UPL_DEBUG
2467         {
2468                 vm_object_t     object;
2469
2470                 if (upl->flags & UPL_SHADOWED) {
2471                         object = upl->map_object->shadow;
2472                 } else {
2473                         object = upl->map_object;
2474                 }
2475                 vm_object_lock(object);
2476                 queue_remove(&object->uplq, upl, upl_t, uplq);
2477                 vm_object_unlock(object);
2478         }
2479 #endif /* UPL_DEBUG */
2480         /*
2481          * drop a reference on the map_object whether or
2482          * not a pageout object is inserted
2483          */
2484         if (upl->flags & UPL_SHADOWED)
2485                 vm_object_deallocate(upl->map_object);
2486
2487         if (upl->flags & UPL_DEVICE_MEMORY)
2488                 size = PAGE_SIZE;
2489         else
2490                 size = upl->size;
2491         page_field_size = 0;
2492
2493         if (upl->flags & UPL_LITE) {
2494                 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2495                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2496         }
2497         if (upl->flags & UPL_INTERNAL) {
2498                 kfree(upl,
2499                       sizeof(struct upl) +
2500                       (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2501                       + page_field_size);
2502         } else {
2503                 kfree(upl, sizeof(struct upl) + page_field_size);
2504         }
2505 }
2506
2507 void uc_upl_dealloc(upl_t upl);
2508 __private_extern__ void
2509 uc_upl_dealloc(upl_t upl)
2510 {
2511         if (--upl->ref_count == 0)
2512                 upl_destroy(upl);
2513 }
2514
2515 void
2516 upl_deallocate(upl_t upl)
2517 {
2518         if (--upl->ref_count == 0)
2519                 upl_destroy(upl);
2520 }
2521
2522 /*
2523  * Statistics about UPL enforcement of copy-on-write obligations.
2524  */
2525 unsigned long upl_cow = 0;
2526 unsigned long upl_cow_again = 0;
2527 unsigned long upl_cow_contiguous = 0;
2528 unsigned long upl_cow_pages = 0;
2529 unsigned long upl_cow_again_pages = 0;
2530 unsigned long upl_cow_contiguous_pages = 0;
2531
2532 /*
2533  *      Routine:        vm_object_upl_request
2534  *      Purpose:
2535  *              Cause the population of a portion of a vm_object.
2536  *              Depending on the nature of the request, the pages
2537  *              returned may be contain valid data or be uninitialized.
2538  *              A page list structure, listing the physical pages
2539  *              will be returned upon request.
2540  *              This function is called by the file system or any other
2541  *              supplier of backing store to a pager.
2542  *              IMPORTANT NOTE: The caller must still respect the relationship
2543  *              between the vm_object and its backing memory object.  The
2544  *              caller MUST NOT substitute changes in the backing file
2545  *              without first doing a memory_object_lock_request on the
2546  *              target range unless it is know that the pages are not
2547  *              shared with another entity at the pager level.
2548  *              Copy_in_to:
2549  *                      if a page list structure is present
2550  *                      return the mapped physical pages, where a
2551  *                      page is not present, return a non-initialized
2552  *                      one.  If the no_sync bit is turned on, don't
2553  *                      call the pager unlock to synchronize with other
2554  *                      possible copies of the page. Leave pages busy
2555  *                      in the original object, if a page list structure
2556  *                      was specified.  When a commit of the page list
2557  *                      pages is done, the dirty bit will be set for each one.
2558  *              Copy_out_from:
2559  *                      If a page list structure is present, return
2560  *                      all mapped pages.  Where a page does not exist
2561  *                      map a zero filled one. Leave pages busy in
2562  *                      the original object.  If a page list structure
2563  *                      is not specified, this call is a no-op.
2564  *
2565  *              Note:  access of default pager objects has a rather interesting
2566  *              twist.  The caller of this routine, presumably the file system
2567  *              page cache handling code, will never actually make a request
2568  *              against a default pager backed object.  Only the default
2569  *              pager will make requests on backing store related vm_objects
2570  *              In this way the default pager can maintain the relationship
2571  *              between backing store files (abstract memory objects) and
2572  *              the vm_objects (cache objects), they support.
2573  *
2574  */
2575
2576 __private_extern__ kern_return_t
2577 vm_object_upl_request(
2578         vm_object_t             object,
2579         vm_object_offset_t      offset,
2580         upl_size_t              size,
2581         upl_t                   *upl_ptr,
2582         upl_page_info_array_t   user_page_list,
2583         unsigned int            *page_list_count,
2584         int                     cntrl_flags)
2585 {
2586         vm_page_t               dst_page = VM_PAGE_NULL;
2587         vm_object_offset_t      dst_offset;
2588         upl_size_t              xfer_size;
2589         boolean_t               dirty;
2590         boolean_t               hw_dirty;
2591         upl_t                   upl = NULL;
2592         unsigned int            entry;
2593 #if MACH_CLUSTER_STATS
2594         boolean_t               encountered_lrp = FALSE;
2595 #endif
2596         vm_page_t               alias_page = NULL;
2597         int                     refmod_state = 0;
2598         wpl_array_t             lite_list = NULL;
2599         vm_object_t             last_copy_object;
2600         int                     delayed_unlock = 0;
2601         int                     j;
2602
2603         if (cntrl_flags & ~UPL_VALID_FLAGS) {
2604                 /*
2605                  * For forward compatibility's sake,
2606                  * reject any unknown flag.
2607                  */
2608                 return KERN_INVALID_VALUE;
2609         }
2610         if ( (!object->internal) && (object->paging_offset != 0) )
2611                 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2612         if (object->phys_contiguous)
2613                 panic("vm_object_upl_request: contiguous object specified\n");
2614
2615
2616         if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2617                 size = MAX_UPL_SIZE * PAGE_SIZE;
2618
2619         if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2620                 *page_list_count = MAX_UPL_SIZE;
2621
2622         if (cntrl_flags & UPL_SET_INTERNAL) {
2623                 if (cntrl_flags & UPL_SET_LITE) {
2624
2625                         upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2626
2627                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2628                         lite_list = (wpl_array_t)
2629                                         (((uintptr_t)user_page_list) +
2630                                         ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2631                 } else {
2632                         upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
2633
2634                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2635                 }
2636         } else {
2637                 if (cntrl_flags & UPL_SET_LITE) {
2638
2639                         upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
2640
2641                         lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
2642                 } else {
2643                         upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
2644                 }
2645         }
2646         *upl_ptr = upl;
2647
2648         if (user_page_list)
2649                 user_page_list[0].device = FALSE;
2650
2651         if (cntrl_flags & UPL_SET_LITE) {
2652                 upl->map_object = object;
2653         } else {
2654                 upl->map_object = vm_object_allocate(size);
2655                 /*
2656                  * No neeed to lock the new object: nobody else knows
2657                  * about it yet, so it's all ours so far.
2658                  */
2659                 upl->map_object->shadow = object;
2660                 upl->map_object->pageout = TRUE;
2661                 upl->map_object->can_persist = FALSE;
2662                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2663                 upl->map_object->shadow_offset = offset;
2664                 upl->map_object->wimg_bits = object->wimg_bits;
2665
2666                 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2667
2668                 upl->flags |= UPL_SHADOWED;
2669         }
2670         /*
2671          * ENCRYPTED SWAP:
2672          * Just mark the UPL as "encrypted" here.
2673          * We'll actually encrypt the pages later,
2674          * in upl_encrypt(), when the caller has
2675          * selected which pages need to go to swap.
2676          */
2677         if (cntrl_flags & UPL_ENCRYPT)
2678                 upl->flags |= UPL_ENCRYPTED;
2679
2680         if (cntrl_flags & UPL_FOR_PAGEOUT)
2681                 upl->flags |= UPL_PAGEOUT;
2682
2683         vm_object_lock(object);
2684         vm_object_paging_begin(object);
2685
2686         /*
2687          * we can lock in the paging_offset once paging_in_progress is set
2688          */
2689         upl->size = size;
2690         upl->offset = offset + object->paging_offset;
2691
2692 #ifdef UPL_DEBUG
2693         queue_enter(&object->uplq, upl, upl_t, uplq);
2694 #endif /* UPL_DEBUG */
2695
2696         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
2697                 /*
2698                  * Honor copy-on-write obligations
2699                  *
2700                  * The caller is gathering these pages and
2701                  * might modify their contents.  We need to
2702                  * make sure that the copy object has its own
2703                  * private copies of these pages before we let
2704                  * the caller modify them.
2705                  */
2706                 vm_object_update(object,
2707                                  offset,
2708                                  size,
2709                                  NULL,
2710                                  NULL,
2711                                  FALSE, /* should_return */
2712                                  MEMORY_OBJECT_COPY_SYNC,
2713                                  VM_PROT_NO_CHANGE);
2714                 upl_cow++;
2715                 upl_cow_pages += size >> PAGE_SHIFT;
2716         }
2717         /*
2718          * remember which copy object we synchronized with
2719          */
2720         last_copy_object = object->copy;
2721         entry = 0;
2722
2723         xfer_size = size;
2724         dst_offset = offset;
2725
2726         while (xfer_size) {
2727
2728                 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2729                         if (delayed_unlock) {
2730                                 delayed_unlock = 0;
2731                                 vm_page_unlock_queues();
2732                         }
2733                         vm_object_unlock(object);
2734                         VM_PAGE_GRAB_FICTITIOUS(alias_page);
2735                         goto relock;
2736                 }
2737                 if (delayed_unlock == 0) {
2738                         /*
2739                          * pageout_scan takes the vm_page_lock_queues first
2740                          * then tries for the object lock... to avoid what
2741                          * is effectively a lock inversion, we'll go to the
2742                          * trouble of taking them in that same order... otherwise
2743                          * if this object contains the majority of the pages resident
2744                          * in the UBC (or a small set of large objects actively being
2745                          * worked on contain the majority of the pages), we could
2746                          * cause the pageout_scan thread to 'starve' in its attempt
2747                          * to find pages to move to the free queue, since it has to
2748                          * successfully acquire the object lock of any candidate page
2749                          * before it can steal/clean it.
2750                          */
2751                         vm_object_unlock(object);
2752 relock:
2753                         for (j = 0; ; j++) {
2754                                 vm_page_lock_queues();
2755
2756                                 if (vm_object_lock_try(object))
2757                                         break;
2758                                 vm_page_unlock_queues();
2759                                 mutex_pause(j);
2760                         }
2761                         delayed_unlock = 1;
2762                 }
2763                 if (cntrl_flags & UPL_COPYOUT_FROM) {
2764                         upl->flags |= UPL_PAGE_SYNC_DONE;
2765
2766                         if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2767                                 dst_page->fictitious ||
2768                                 dst_page->absent ||
2769                                 dst_page->error ||
2770                                (dst_page->wire_count && !dst_page->pageout && !dst_page->list_req_pending)) {
2771
2772                                 if (user_page_list)
2773                                         user_page_list[entry].phys_addr = 0;
2774
2775                                 goto delay_unlock_queues;
2776                         }
2777                         /*
2778                          * grab this up front...
2779                          * a high percentange of the time we're going to
2780                          * need the hardware modification state a bit later
2781                          * anyway... so we can eliminate an extra call into
2782                          * the pmap layer by grabbing it here and recording it
2783                          */
2784                         if (dst_page->pmapped)
2785                                 refmod_state = pmap_get_refmod(dst_page->phys_page);
2786                         else
2787                                 refmod_state = 0;
2788
2789                         if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
2790                                 /*
2791                                  * page is on inactive list and referenced...
2792                                  * reactivate it now... this gets it out of the
2793                                  * way of vm_pageout_scan which would have to
2794                                  * reactivate it upon tripping over it
2795                                  */
2796                                 vm_page_activate(dst_page);
2797                                 VM_STAT_INCR(reactivations);
2798                         }
2799                         if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2800                                 /*
2801                                  * we're only asking for DIRTY pages to be returned
2802                                  */
2803                                 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2804                                         /*
2805                                          * if we were the page stolen by vm_pageout_scan to be
2806                                          * cleaned (as opposed to a buddy being clustered in
2807                                          * or this request is not being driven by a PAGEOUT cluster
2808                                          * then we only need to check for the page being dirty or
2809                                          * precious to decide whether to return it
2810                                          */
2811                                         if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
2812                                                 goto check_busy;
2813                                         goto dont_return;
2814                                 }
2815                                 /*
2816                                  * this is a request for a PAGEOUT cluster and this page
2817                                  * is merely along for the ride as a 'buddy'... not only
2818                                  * does it have to be dirty to be returned, but it also
2819                                  * can't have been referenced recently... note that we've
2820                                  * already filtered above based on whether this page is
2821                                  * currently on the inactive queue or it meets the page
2822                                  * ticket (generation count) check
2823                                  */
2824                                 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2825                                      ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2826                                         goto check_busy;
2827                                 }
2828 dont_return:
2829                                 /*
2830                                  * if we reach here, we're not to return
2831                                  * the page... go on to the next one
2832                                  */
2833                                 if (user_page_list)
2834                                         user_page_list[entry].phys_addr = 0;
2835
2836                                 goto delay_unlock_queues;
2837                         }
2838 check_busy:
2839                         if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
2840                                 if (cntrl_flags & UPL_NOBLOCK) {
2841                                         if (user_page_list)
2842                                                 user_page_list[entry].phys_addr = 0;
2843
2844                                         goto delay_unlock_queues;
2845                                 }
2846                                 /*
2847                                  * someone else is playing with the
2848                                  * page.  We will have to wait.
2849                                  */
2850                                 delayed_unlock = 0;
2851                                 vm_page_unlock_queues();
2852
2853                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2854
2855                                 continue;
2856                         }
2857                         /*
2858                          * Someone else already cleaning the page?
2859                          */
2860                         if ((dst_page->cleaning || dst_page->absent || dst_page->wire_count != 0) && !dst_page->list_req_pending) {
2861                                 if (user_page_list)
2862                                         user_page_list[entry].phys_addr = 0;
2863
2864                                 goto delay_unlock_queues;
2865                         }
2866                         /*
2867                          * ENCRYPTED SWAP:
2868                          * The caller is gathering this page and might
2869                          * access its contents later on.  Decrypt the
2870                          * page before adding it to the UPL, so that
2871                          * the caller never sees encrypted data.
2872                          */
2873                         if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
2874                                 int  was_busy;
2875
2876                                 delayed_unlock = 0;
2877                                 vm_page_unlock_queues();
2878                                 /*
2879                                  * save the current state of busy
2880                                  * mark page as busy while decrypt
2881                                  * is in progress since it will drop
2882                                  * the object lock...
2883                                  */
2884                                 was_busy = dst_page->busy;
2885                                 dst_page->busy = TRUE;
2886
2887                                 vm_page_decrypt(dst_page, 0);
2888                                 vm_page_decrypt_for_upl_counter++;
2889                                 /*
2890                                  * restore to original busy state
2891                                  */
2892                                 dst_page->busy = was_busy;
2893
2894                                 vm_page_lock_queues();
2895                                 delayed_unlock = 1;
2896                         }
2897                         if (dst_page->pageout_queue == TRUE)
2898                                 /*
2899                                  * we've buddied up a page for a clustered pageout
2900                                  * that has already been moved to the pageout
2901                                  * queue by pageout_scan... we need to remove
2902                                  * it from the queue and drop the laundry count
2903                                  * on that queue
2904                                  */
2905                                 vm_pageout_queue_steal(dst_page);
2906 #if MACH_CLUSTER_STATS
2907                         /*
2908                          * pageout statistics gathering.  count
2909                          * all the pages we will page out that
2910                          * were not counted in the initial
2911                          * vm_pageout_scan work
2912                          */
2913                         if (dst_page->list_req_pending)
2914                                 encountered_lrp = TRUE;
2915                         if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
2916                                 if (encountered_lrp)
2917                                         CLUSTER_STAT(pages_at_higher_offsets++;)
2918                                 else
2919                                         CLUSTER_STAT(pages_at_lower_offsets++;)
2920                         }
2921 #endif
2922                         /*
2923                          * Turn off busy indication on pending
2924                          * pageout.  Note: we can only get here
2925                          * in the request pending case.
2926                          */
2927                         dst_page->list_req_pending = FALSE;
2928                         dst_page->busy = FALSE;
2929
2930                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
2931                         dirty = hw_dirty ? TRUE : dst_page->dirty;
2932
2933                         if (dst_page->phys_page > upl->highest_page)
2934                                 upl->highest_page = dst_page->phys_page;
2935
2936                         if (cntrl_flags & UPL_SET_LITE) {
2937                                 int     pg_num;
2938
2939                                 pg_num = (dst_offset-offset)/PAGE_SIZE;
2940                                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
2941
2942                                 if (hw_dirty)
2943                                         pmap_clear_modify(dst_page->phys_page);
2944
2945                                 /*
2946                                  * Mark original page as cleaning
2947                                  * in place.
2948                                  */
2949                                 dst_page->cleaning = TRUE;
2950                                 dst_page->precious = FALSE;
2951                         } else {
2952                                 /*
2953                                  * use pageclean setup, it is more
2954                                  * convenient even for the pageout
2955                                  * cases here
2956                                  */
2957                                 vm_object_lock(upl->map_object);
2958                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
2959                                 vm_object_unlock(upl->map_object);
2960
2961                                 alias_page->absent = FALSE;
2962                                 alias_page = NULL;
2963                         }
2964 #if     MACH_PAGEMAP
2965                         /*
2966                          * Record that this page has been
2967                          * written out
2968                          */
2969                         vm_external_state_set(object->existence_map, dst_page->offset);
2970 #endif  /*MACH_PAGEMAP*/
2971                         dst_page->dirty = dirty;
2972
2973                         if (!dirty)
2974                                 dst_page->precious = TRUE;
2975
2976                         if (dst_page->pageout)
2977                                 dst_page->busy = TRUE;
2978
2979                         if ( (cntrl_flags & UPL_ENCRYPT) ) {
2980                                 /*
2981                                  * ENCRYPTED SWAP:
2982                                  * We want to deny access to the target page
2983                                  * because its contents are about to be
2984                                  * encrypted and the user would be very
2985                                  * confused to see encrypted data instead
2986                                  * of their data.
2987                                  * We also set "encrypted_cleaning" to allow
2988                                  * vm_pageout_scan() to demote that page
2989                                  * from "adjacent/clean-in-place" to
2990                                  * "target/clean-and-free" if it bumps into
2991                                  * this page during its scanning while we're
2992                                  * still processing this cluster.
2993                                  */
2994                                 dst_page->busy = TRUE;
2995                                 dst_page->encrypted_cleaning = TRUE;
2996                         }
2997                         if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2998                                 /*
2999                                  * deny access to the target page
3000                                  * while it is being worked on
3001                                  */
3002                                 if ((!dst_page->pageout) && (dst_page->wire_count == 0)) {
3003                                         dst_page->busy = TRUE;
3004                                         dst_page->pageout = TRUE;
3005                                         vm_page_wire(dst_page);
3006                                 }
3007                         }
3008                 } else {
3009                         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3010                                 /*
3011                                  * Honor copy-on-write obligations
3012                                  *
3013                                  * The copy object has changed since we
3014                                  * last synchronized for copy-on-write.
3015                                  * Another copy object might have been
3016                                  * inserted while we released the object's
3017                                  * lock.  Since someone could have seen the
3018                                  * original contents of the remaining pages
3019                                  * through that new object, we have to
3020                                  * synchronize with it again for the remaining
3021                                  * pages only.  The previous pages are "busy"
3022                                  * so they can not be seen through the new
3023                                  * mapping.  The new mapping will see our
3024                                  * upcoming changes for those previous pages,
3025                                  * but that's OK since they couldn't see what
3026                                  * was there before.  It's just a race anyway
3027                                  * and there's no guarantee of consistency or
3028                                  * atomicity.  We just don't want new mappings
3029                                  * to see both the *before* and *after* pages.
3030                                  */
3031                                 if (object->copy != VM_OBJECT_NULL) {
3032                                         delayed_unlock = 0;
3033                                         vm_page_unlock_queues();
3034
3035                                         vm_object_update(
3036                                                 object,
3037                                                 dst_offset,/* current offset */
3038                                                 xfer_size, /* remaining size */
3039                                                 NULL,
3040                                                 NULL,
3041                                                 FALSE,     /* should_return */
3042                                                 MEMORY_OBJECT_COPY_SYNC,
3043                                                 VM_PROT_NO_CHANGE);
3044
3045                                         upl_cow_again++;
3046                                         upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3047
3048                                         vm_page_lock_queues();
3049                                         delayed_unlock = 1;
3050                                 }
3051                                 /*
3052                                  * remember the copy object we synced with
3053                                  */
3054                                 last_copy_object = object->copy;
3055                         }
3056                         dst_page = vm_page_lookup(object, dst_offset);
3057
3058                         if (dst_page != VM_PAGE_NULL) {
3059                                 if ( !(dst_page->list_req_pending) ) {
3060                                         if ((cntrl_flags & UPL_RET_ONLY_ABSENT) && !dst_page->absent) {
3061                                                 /*
3062                                                  * skip over pages already present in the cache
3063                                                  */
3064                                                 if (user_page_list)
3065                                                         user_page_list[entry].phys_addr = 0;
3066
3067                                                 goto delay_unlock_queues;
3068                                         }
3069                                         if (dst_page->cleaning) {
3070                                                 /*
3071                                                  * someone else is writing to the page... wait...
3072                                                  */
3073                                                 delayed_unlock = 0;
3074                                                 vm_page_unlock_queues();
3075
3076                                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3077
3078                                                 continue;
3079                                         }
3080                                 } else {
3081                                         if (dst_page->fictitious &&
3082                                             dst_page->phys_page == vm_page_fictitious_addr) {
3083                                                 assert( !dst_page->speculative);
3084                                                 /*
3085                                                  * dump the fictitious page
3086                                                  */
3087                                                 dst_page->list_req_pending = FALSE;
3088
3089                                                 vm_page_free(dst_page);
3090
3091                                                 dst_page = NULL;
3092                                         } else if (dst_page->absent) {
3093                                                 /*
3094                                                  * the default_pager case
3095                                                  */
3096                                                 dst_page->list_req_pending = FALSE;
3097                                                 dst_page->busy = FALSE;
3098                                         }
3099                                 }
3100                         }
3101                         if (dst_page == VM_PAGE_NULL) {
3102                                 if (object->private) {
3103                                         /*
3104                                          * This is a nasty wrinkle for users
3105                                          * of upl who encounter device or
3106                                          * private memory however, it is
3107                                          * unavoidable, only a fault can
3108                                          * resolve the actual backing
3109                                          * physical page by asking the
3110                                          * backing device.
3111                                          */
3112                                         if (user_page_list)
3113                                                 user_page_list[entry].phys_addr = 0;
3114
3115                                         goto delay_unlock_queues;
3116                                 }
3117                                 /*
3118                                  * need to allocate a page
3119                                  */
3120                                 dst_page = vm_page_grab();
3121
3122                                 if (dst_page == VM_PAGE_NULL) {
3123                                         if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3124                                                /*
3125                                                 * we don't want to stall waiting for pages to come onto the free list
3126                                                 * while we're already holding absent pages in this UPL
3127                                                 * the caller will deal with the empty slots
3128                                                 */
3129                                                 if (user_page_list)
3130                                                         user_page_list[entry].phys_addr = 0;
3131
3132                                                 goto try_next_page;
3133                                         }
3134                                         /*
3135                                          * no pages available... wait
3136                                          * then try again for the same
3137                                          * offset...
3138                                          */
3139                                         delayed_unlock = 0;
3140                                         vm_page_unlock_queues();
3141
3142                                         vm_object_unlock(object);
3143                                         VM_PAGE_WAIT();
3144
3145                                         /*
3146                                          * pageout_scan takes the vm_page_lock_queues first
3147                                          * then tries for the object lock... to avoid what
3148                                          * is effectively a lock inversion, we'll go to the
3149                                          * trouble of taking them in that same order... otherwise
3150                                          * if this object contains the majority of the pages resident
3151                                          * in the UBC (or a small set of large objects actively being
3152                                          * worked on contain the majority of the pages), we could
3153                                          * cause the pageout_scan thread to 'starve' in its attempt
3154                                          * to find pages to move to the free queue, since it has to
3155                                          * successfully acquire the object lock of any candidate page
3156                                          * before it can steal/clean it.
3157                                          */
3158                                         for (j = 0; ; j++) {
3159                                                 vm_page_lock_queues();
3160
3161                                                 if (vm_object_lock_try(object))
3162                                                         break;
3163                                                 vm_page_unlock_queues();
3164                                                 mutex_pause(j);
3165                                         }
3166                                         delayed_unlock = 1;
3167
3168                                         continue;
3169                                 }
3170                                 vm_page_insert_internal(dst_page, object, dst_offset, TRUE);
3171
3172                                 dst_page->absent = TRUE;
3173                                 dst_page->busy = FALSE;
3174
3175                                 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3176                                         /*
3177                                          * if UPL_RET_ONLY_ABSENT was specified,
3178                                          * than we're definitely setting up a
3179                                          * upl for a clustered read/pagein
3180                                          * operation... mark the pages as clustered
3181                                          * so upl_commit_range can put them on the
3182                                          * speculative list
3183                                          */
3184                                         dst_page->clustered = TRUE;
3185                                 }
3186                         }
3187                         /*
3188                          * ENCRYPTED SWAP:
3189                          */
3190                         if (cntrl_flags & UPL_ENCRYPT) {
3191                                 /*
3192                                  * The page is going to be encrypted when we
3193                                  * get it from the pager, so mark it so.
3194                                  */
3195                                 dst_page->encrypted = TRUE;
3196                         } else {
3197                                 /*
3198                                  * Otherwise, the page will not contain
3199                                  * encrypted data.
3200                                  */
3201                                 dst_page->encrypted = FALSE;
3202                         }
3203                         dst_page->overwriting = TRUE;
3204
3205                         if (dst_page->fictitious) {
3206                                 panic("need corner case for fictitious page");
3207                         }
3208                         if (dst_page->busy) {
3209                                 /*
3210                                  * someone else is playing with the
3211                                  * page.  We will have to wait.
3212                                  */
3213                                 delayed_unlock = 0;
3214                                 vm_page_unlock_queues();
3215
3216                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3217
3218                                 continue;
3219                         }
3220                         if (dst_page->pmapped) {
3221                                 if ( !(cntrl_flags & UPL_FILE_IO))
3222                                         /*
3223                                          * eliminate all mappings from the
3224                                          * original object and its prodigy
3225                                          */
3226                                         refmod_state = pmap_disconnect(dst_page->phys_page);
3227                                 else
3228                                         refmod_state = pmap_get_refmod(dst_page->phys_page);
3229                         } else
3230                                 refmod_state = 0;
3231
3232                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
3233                         dirty = hw_dirty ? TRUE : dst_page->dirty;
3234
3235                         if (cntrl_flags & UPL_SET_LITE) {
3236                                 int     pg_num;
3237
3238                                 pg_num = (dst_offset-offset)/PAGE_SIZE;
3239                                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3240
3241                                 if (hw_dirty)
3242                                         pmap_clear_modify(dst_page->phys_page);
3243
3244                                 /*
3245                                  * Mark original page as cleaning
3246                                  * in place.
3247                                  */
3248                                 dst_page->cleaning = TRUE;
3249                                 dst_page->precious = FALSE;
3250                         } else {
3251                                 /*
3252                                  * use pageclean setup, it is more
3253                                  * convenient even for the pageout
3254                                  * cases here
3255                                  */
3256                                 vm_object_lock(upl->map_object);
3257                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3258                                 vm_object_unlock(upl->map_object);
3259
3260                                 alias_page->absent = FALSE;
3261                                 alias_page = NULL;
3262                         }
3263
3264                         if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3265                                 /*
3266                                  * clean in place for read implies
3267                                  * that a write will be done on all
3268                                  * the pages that are dirty before
3269                                  * a upl commit is done.  The caller
3270                                  * is obligated to preserve the
3271                                  * contents of all pages marked dirty
3272                                  */
3273                                 upl->flags |= UPL_CLEAR_DIRTY;
3274                         }
3275                         dst_page->dirty = dirty;
3276
3277                         if (!dirty)
3278                                 dst_page->precious = TRUE;
3279
3280                         if (dst_page->wire_count == 0) {
3281                                 /*
3282                                  * deny access to the target page while
3283                                  * it is being worked on
3284                                  */
3285                                 dst_page->busy = TRUE;
3286                         } else
3287                                 vm_page_wire(dst_page);
3288
3289                         if (dst_page->clustered) {
3290                                 /*
3291                                  * expect the page not to be used
3292                                  * since it's coming in as part
3293                                  * of a speculative cluster...
3294                                  * pages that are 'consumed' will
3295                                  * get a hardware reference
3296                                  */
3297                                 dst_page->reference = FALSE;
3298                         } else {
3299                                 /*
3300                                  * expect the page to be used
3301                                  */
3302                                 dst_page->reference = TRUE;
3303                         }
3304                         dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3305                 }
3306                 if (dst_page->phys_page > upl->highest_page)
3307                         upl->highest_page = dst_page->phys_page;
3308                 if (user_page_list) {
3309                         user_page_list[entry].phys_addr = dst_page->phys_page;
3310                         user_page_list[entry].dirty     = dst_page->dirty;
3311                         user_page_list[entry].pageout   = dst_page->pageout;
3312                         user_page_list[entry].absent    = dst_page->absent;
3313                         user_page_list[entry].precious  = dst_page->precious;
3314
3315                         if (dst_page->clustered == TRUE)
3316                                 user_page_list[entry].speculative = dst_page->speculative;
3317                         else
3318                                 user_page_list[entry].speculative = FALSE;
3319                 }
3320                 /*
3321                  * if UPL_RET_ONLY_ABSENT is set, then
3322                  * we are working with a fresh page and we've
3323                  * just set the clustered flag on it to
3324                  * indicate that it was drug in as part of a
3325                  * speculative cluster... so leave it alone
3326                  */
3327                 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3328                         /*
3329                          * someone is explicitly grabbing this page...
3330                          * update clustered and speculative state
3331                          *
3332                          */
3333                         VM_PAGE_CONSUME_CLUSTERED(dst_page);
3334                 }
3335 delay_unlock_queues:
3336                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
3337                         /*
3338                          * pageout_scan takes the vm_page_lock_queues first
3339                          * then tries for the object lock... to avoid what
3340                          * is effectively a lock inversion, we'll go to the
3341                          * trouble of taking them in that same order... otherwise
3342                          * if this object contains the majority of the pages resident
3343                          * in the UBC (or a small set of large objects actively being
3344                          * worked on contain the majority of the pages), we could
3345                          * cause the pageout_scan thread to 'starve' in its attempt
3346                          * to find pages to move to the free queue, since it has to
3347                          * successfully acquire the object lock of any candidate page
3348                          * before it can steal/clean it.
3349                          */
3350                         vm_object_unlock(object);
3351                         mutex_yield(&vm_page_queue_lock);
3352
3353                         for (j = 0; ; j++) {
3354                                 if (vm_object_lock_try(object))
3355                                         break;
3356                                 vm_page_unlock_queues();
3357                                 mutex_pause(j);
3358                                 vm_page_lock_queues();
3359                         }
3360                         delayed_unlock = 1;
3361                 }
3362 try_next_page:
3363                 entry++;
3364                 dst_offset += PAGE_SIZE_64;
3365                 xfer_size -= PAGE_SIZE;
3366         }
3367         if (alias_page != NULL) {
3368                 if (delayed_unlock == 0) {
3369                         vm_page_lock_queues();
3370                         delayed_unlock = 1;
3371                 }
3372                 vm_page_free(alias_page);
3373         }
3374         if (delayed_unlock)
3375                 vm_page_unlock_queues();
3376
3377         if (page_list_count != NULL) {
3378                 if (upl->flags & UPL_INTERNAL)
3379                         *page_list_count = 0;
3380                 else if (*page_list_count > entry)
3381                         *page_list_count = entry;
3382         }
3383         vm_object_unlock(object);
3384
3385         return KERN_SUCCESS;
3386 }
3387
3388 /* JMM - Backward compatability for now */
3389 kern_return_t
3390 vm_fault_list_request(                  /* forward */
3391         memory_object_control_t         control,
3392         vm_object_offset_t      offset,
3393         upl_size_t              size,
3394         upl_t                   *upl_ptr,
3395         upl_page_info_t         **user_page_list_ptr,
3396         unsigned int            page_list_count,
3397         int                     cntrl_flags);
3398 kern_return_t
3399 vm_fault_list_request(
3400         memory_object_control_t         control,
3401         vm_object_offset_t      offset,
3402         upl_size_t              size,
3403         upl_t                   *upl_ptr,
3404         upl_page_info_t         **user_page_list_ptr,
3405         unsigned int            page_list_count,
3406         int                     cntrl_flags)
3407 {
3408         unsigned int            local_list_count;
3409         upl_page_info_t         *user_page_list;
3410         kern_return_t           kr;
3411
3412         if (user_page_list_ptr != NULL) {
3413                 local_list_count = page_list_count;
3414                 user_page_list = *user_page_list_ptr;
3415         } else {
3416                 local_list_count = 0;
3417                 user_page_list = NULL;
3418         }
3419         kr =  memory_object_upl_request(control,
3420                                 offset,
3421                                 size,
3422                                 upl_ptr,
3423                                 user_page_list,
3424                                 &local_list_count,
3425                                 cntrl_flags);
3426
3427         if(kr != KERN_SUCCESS)
3428                 return kr;
3429
3430         if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3431                 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3432         }
3433
3434         return KERN_SUCCESS;
3435 }
3436
3437
3438
3439 /*
3440  *      Routine:        vm_object_super_upl_request
3441  *      Purpose:
3442  *              Cause the population of a portion of a vm_object
3443  *              in much the same way as memory_object_upl_request.
3444  *              Depending on the nature of the request, the pages
3445  *              returned may be contain valid data or be uninitialized.
3446  *              However, the region may be expanded up to the super
3447  *              cluster size provided.
3448  */
3449
3450 __private_extern__ kern_return_t
3451 vm_object_super_upl_request(
3452         vm_object_t object,
3453         vm_object_offset_t      offset,
3454         upl_size_t              size,
3455         upl_size_t              super_cluster,
3456         upl_t                   *upl,
3457         upl_page_info_t         *user_page_list,
3458         unsigned int            *page_list_count,
3459         int                     cntrl_flags)
3460 {
3461         if (object->paging_offset > offset)
3462                 return KERN_FAILURE;
3463
3464         assert(object->paging_in_progress);
3465         offset = offset - object->paging_offset;
3466
3467         if (super_cluster > size) {
3468
3469                 vm_object_offset_t      base_offset;
3470                 upl_size_t              super_size;
3471
3472                 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3473                 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3474                 super_size = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3475
3476                 if (offset > (base_offset + super_size)) {
3477                         panic("vm_object_super_upl_request: Missed target pageout"
3478                               " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3479                               offset, base_offset, super_size, super_cluster,
3480                               size, object->paging_offset);
3481                 }
3482                 /*
3483                  * apparently there is a case where the vm requests a
3484                  * page to be written out who's offset is beyond the
3485                  * object size
3486                  */
3487                 if ((offset + size) > (base_offset + super_size))
3488                         super_size = (offset + size) - base_offset;
3489
3490                 offset = base_offset;
3491                 size = super_size;
3492         }
3493         return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3494 }
3495
3496
3497 kern_return_t
3498 vm_map_create_upl(
3499         vm_map_t                map,
3500         vm_map_address_t        offset,
3501         upl_size_t              *upl_size,
3502         upl_t                   *upl,
3503         upl_page_info_array_t   page_list,
3504         unsigned int            *count,
3505         int                     *flags)
3506 {
3507         vm_map_entry_t  entry;
3508         int             caller_flags;
3509         int             force_data_sync;
3510         int             sync_cow_data;
3511         vm_object_t     local_object;
3512         vm_map_offset_t local_offset;
3513         vm_map_offset_t local_start;
3514         kern_return_t   ret;
3515
3516         caller_flags = *flags;
3517
3518         if (caller_flags & ~UPL_VALID_FLAGS) {
3519                 /*
3520                  * For forward compatibility's sake,
3521                  * reject any unknown flag.
3522                  */
3523                 return KERN_INVALID_VALUE;
3524         }
3525         force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3526         sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3527
3528         if (upl == NULL)
3529                 return KERN_INVALID_ARGUMENT;
3530
3531 REDISCOVER_ENTRY:
3532         vm_map_lock(map);
3533
3534         if (vm_map_lookup_entry(map, offset, &entry)) {
3535
3536                 if ((entry->vme_end - offset) < *upl_size)
3537                         *upl_size = entry->vme_end - offset;
3538
3539                 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3540                         *flags = 0;
3541
3542                         if (entry->object.vm_object != VM_OBJECT_NULL) {
3543                                 if (entry->object.vm_object->private)
3544                                         *flags = UPL_DEV_MEMORY;
3545
3546                                 if (entry->object.vm_object->phys_contiguous)
3547                                         *flags |= UPL_PHYS_CONTIG;
3548                         }
3549                         vm_map_unlock(map);
3550
3551                         return KERN_SUCCESS;
3552                 }
3553                 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3554                         if ((*upl_size/page_size) > MAX_UPL_SIZE)
3555                                 *upl_size = MAX_UPL_SIZE * page_size;
3556                 }
3557                 /*
3558                  *      Create an object if necessary.
3559                  */
3560                 if (entry->object.vm_object == VM_OBJECT_NULL) {
3561                         entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3562                         entry->offset = 0;
3563                 }
3564                 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3565                         if (!(entry->protection & VM_PROT_WRITE)) {
3566                                 vm_map_unlock(map);
3567                                 return KERN_PROTECTION_FAILURE;
3568                         }
3569                         if (entry->needs_copy)  {
3570                                 vm_map_t                local_map;
3571                                 vm_object_t             object;
3572                                 vm_object_offset_t      new_offset;
3573                                 vm_prot_t               prot;
3574                                 boolean_t               wired;
3575                                 vm_map_version_t        version;
3576                                 vm_map_t                real_map;
3577
3578                                 local_map = map;
3579                                 vm_map_lock_write_to_read(map);
3580
3581                                 if (vm_map_lookup_locked(&local_map,
3582                                                          offset, VM_PROT_WRITE,
3583                                                          OBJECT_LOCK_EXCLUSIVE,
3584                                                          &version, &object,
3585                                                          &new_offset, &prot, &wired,
3586                                                          NULL,
3587                                                          &real_map)) {
3588                                         vm_map_unlock(local_map);
3589                                         return KERN_FAILURE;
3590                                 }
3591                                 if (real_map != map)
3592                                         vm_map_unlock(real_map);
3593                                 vm_object_unlock(object);
3594                                 vm_map_unlock(local_map);
3595
3596                                 goto REDISCOVER_ENTRY;
3597                         }
3598                 }
3599                 if (entry->is_sub_map) {
3600                         vm_map_t        submap;
3601
3602                         submap = entry->object.sub_map;
3603                         local_start = entry->vme_start;
3604                         local_offset = entry->offset;
3605
3606                         vm_map_reference(submap);
3607                         vm_map_unlock(map);
3608
3609                         ret = vm_map_create_upl(submap,
3610                                                 local_offset + (offset - local_start),
3611                                                 upl_size, upl, page_list, count, flags);
3612                         vm_map_deallocate(submap);
3613
3614                         return ret;
3615                 }
3616                 if (sync_cow_data) {
3617                         if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3618                                 local_object = entry->object.vm_object;
3619                                 local_start = entry->vme_start;
3620                                 local_offset = entry->offset;
3621
3622                                 vm_object_reference(local_object);
3623                                 vm_map_unlock(map);
3624
3625                                 if (entry->object.vm_object->shadow && entry->object.vm_object->copy) {
3626                                         vm_object_lock_request(
3627                                                                local_object->shadow,
3628                                                                (vm_object_offset_t)
3629                                                                ((offset - local_start) +
3630                                                                 local_offset) +
3631                                                                local_object->shadow_offset,
3632                                                                *upl_size, FALSE,
3633                                                                MEMORY_OBJECT_DATA_SYNC,
3634                                                                VM_PROT_NO_CHANGE);
3635                                 }
3636                                 sync_cow_data = FALSE;
3637                                 vm_object_deallocate(local_object);
3638
3639                                 goto REDISCOVER_ENTRY;
3640                         }
3641                 }
3642                 if (force_data_sync) {
3643                         local_object = entry->object.vm_object;
3644                         local_start = entry->vme_start;
3645                         local_offset = entry->offset;
3646
3647                         vm_object_reference(local_object);
3648                         vm_map_unlock(map);
3649
3650                         vm_object_lock_request(
3651                                                local_object,
3652                                                (vm_object_offset_t)
3653                                                ((offset - local_start) + local_offset),
3654                                                (vm_object_size_t)*upl_size, FALSE,
3655                                                MEMORY_OBJECT_DATA_SYNC,
3656                                                VM_PROT_NO_CHANGE);
3657
3658                         force_data_sync = FALSE;
3659                         vm_object_deallocate(local_object);
3660
3661                         goto REDISCOVER_ENTRY;
3662                 }
3663                 if (entry->object.vm_object->private)
3664                         *flags = UPL_DEV_MEMORY;
3665                 else
3666                         *flags = 0;
3667
3668                 if (entry->object.vm_object->phys_contiguous)
3669                         *flags |= UPL_PHYS_CONTIG;
3670
3671                 local_object = entry->object.vm_object;
3672                 local_offset = entry->offset;
3673                 local_start = entry->vme_start;
3674
3675                 vm_object_reference(local_object);
3676                 vm_map_unlock(map);
3677
3678                 ret = vm_object_iopl_request(local_object,
3679                                               (vm_object_offset_t) ((offset - local_start) + local_offset),
3680                                               *upl_size,
3681                                               upl,
3682                                               page_list,
3683                                               count,
3684                                               caller_flags);
3685                 vm_object_deallocate(local_object);
3686
3687                 return(ret);
3688         }
3689         vm_map_unlock(map);
3690
3691         return(KERN_FAILURE);
3692 }
3693
3694 /*
3695  * Internal routine to enter a UPL into a VM map.
3696  *
3697  * JMM - This should just be doable through the standard
3698  * vm_map_enter() API.
3699  */
3700 kern_return_t
3701 vm_map_enter_upl(
3702         vm_map_t                map,
3703         upl_t                   upl,
3704         vm_map_offset_t *dst_addr)
3705 {
3706         vm_map_size_t           size;
3707         vm_object_offset_t      offset;
3708         vm_map_offset_t         addr;
3709         vm_page_t               m;
3710         kern_return_t           kr;
3711
3712         if (upl == UPL_NULL)
3713                 return KERN_INVALID_ARGUMENT;
3714
3715         upl_lock(upl);
3716
3717         /*
3718          * check to see if already mapped
3719          */
3720         if (UPL_PAGE_LIST_MAPPED & upl->flags) {
3721                 upl_unlock(upl);
3722                 return KERN_FAILURE;
3723         }
3724
3725         if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3726                                                (upl->map_object->phys_contiguous))) {
3727                 vm_object_t             object;
3728                 vm_page_t               alias_page;
3729                 vm_object_offset_t      new_offset;
3730                 int                     pg_num;
3731                 wpl_array_t             lite_list;
3732
3733                 if (upl->flags & UPL_INTERNAL) {
3734                         lite_list = (wpl_array_t)
3735                                 ((((uintptr_t)upl) + sizeof(struct upl))
3736                                  + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3737                 } else {
3738                         lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
3739                 }
3740                 object = upl->map_object;
3741                 upl->map_object = vm_object_allocate(upl->size);
3742
3743                 vm_object_lock(upl->map_object);
3744
3745                 upl->map_object->shadow = object;
3746                 upl->map_object->pageout = TRUE;
3747                 upl->map_object->can_persist = FALSE;
3748                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3749                 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
3750                 upl->map_object->wimg_bits = object->wimg_bits;
3751                 offset = upl->map_object->shadow_offset;
3752                 new_offset = 0;
3753                 size = upl->size;
3754
3755                 upl->flags |= UPL_SHADOWED;
3756
3757                 while (size) {
3758                         pg_num = (new_offset)/PAGE_SIZE;
3759
3760                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3761
3762                                 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3763
3764                                 vm_object_lock(object);
3765
3766                                 m = vm_page_lookup(object, offset);
3767                                 if (m == VM_PAGE_NULL) {
3768                                         panic("vm_upl_map: page missing\n");
3769                                 }
3770
3771                                 /*
3772                                  * Convert the fictitious page to a private
3773                                  * shadow of the real page.
3774                                  */
3775                                 assert(alias_page->fictitious);
3776                                 alias_page->fictitious = FALSE;
3777                                 alias_page->private = TRUE;
3778                                 alias_page->pageout = TRUE;
3779                                 /*
3780                                  * since m is a page in the upl it must
3781                                  * already be wired or BUSY, so it's
3782                                  * safe to assign the underlying physical
3783                                  * page to the alias
3784                                  */
3785                                 alias_page->phys_page = m->phys_page;
3786
3787                                 vm_object_unlock(object);
3788
3789                                 vm_page_lockspin_queues();
3790                                 vm_page_wire(alias_page);
3791                                 vm_page_unlock_queues();
3792
3793                                 /*
3794                                  * ENCRYPTED SWAP:
3795                                  * The virtual page ("m") has to be wired in some way
3796                                  * here or its physical page ("m->phys_page") could
3797                                  * be recycled at any time.
3798                                  * Assuming this is enforced by the caller, we can't
3799                                  * get an encrypted page here.  Since the encryption
3800                                  * key depends on the VM page's "pager" object and
3801                                  * the "paging_offset", we couldn't handle 2 pageable
3802                                  * VM pages (with different pagers and paging_offsets)
3803                                  * sharing the same physical page:  we could end up
3804                                  * encrypting with one key (via one VM page) and
3805                                  * decrypting with another key (via the alias VM page).
3806                                  */
3807                                 ASSERT_PAGE_DECRYPTED(m);
3808
3809                                 vm_page_insert(alias_page, upl->map_object, new_offset);
3810
3811                                 assert(!alias_page->wanted);
3812                                 alias_page->busy = FALSE;
3813                                 alias_page->absent = FALSE;
3814                         }
3815                         size -= PAGE_SIZE;
3816                         offset += PAGE_SIZE_64;
3817                         new_offset += PAGE_SIZE_64;
3818                 }
3819                 vm_object_unlock(upl->map_object);
3820         }
3821         if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3822                 offset = upl->offset - upl->map_object->paging_offset;
3823         else
3824                 offset = 0;
3825         size = upl->size;
3826
3827         vm_object_reference(upl->map_object);
3828
3829         *dst_addr = 0;
3830         /*
3831          * NEED A UPL_MAP ALIAS
3832          */
3833         kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3834                           VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3835                           VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3836
3837         if (kr != KERN_SUCCESS) {
3838                 upl_unlock(upl);
3839                 return(kr);
3840         }
3841         vm_object_lock(upl->map_object);
3842
3843         for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
3844                 m = vm_page_lookup(upl->map_object, offset);
3845
3846                 if (m) {
3847                         unsigned int    cache_attr;
3848                         cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3849
3850                         m->pmapped = TRUE;
3851                         m->wpmapped = TRUE;
3852
3853                         PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
3854                 }
3855                 offset += PAGE_SIZE_64;
3856         }
3857         vm_object_unlock(upl->map_object);
3858
3859         /*
3860          * hold a reference for the mapping
3861          */
3862         upl->ref_count++;
3863         upl->flags |= UPL_PAGE_LIST_MAPPED;
3864         upl->kaddr = *dst_addr;
3865         upl_unlock(upl);
3866
3867         return KERN_SUCCESS;
3868 }
3869
3870 /*
3871  * Internal routine to remove a UPL mapping from a VM map.
3872  *
3873  * XXX - This should just be doable through a standard
3874  * vm_map_remove() operation.  Otherwise, implicit clean-up
3875  * of the target map won't be able to correctly remove
3876  * these (and release the reference on the UPL).  Having
3877  * to do this means we can't map these into user-space
3878  * maps yet.
3879  */
3880 kern_return_t
3881 vm_map_remove_upl(
3882         vm_map_t        map,
3883         upl_t           upl)
3884 {
3885         vm_address_t    addr;
3886         upl_size_t      size;
3887
3888         if (upl == UPL_NULL)
3889                 return KERN_INVALID_ARGUMENT;
3890
3891         upl_lock(upl);
3892
3893         if (upl->flags & UPL_PAGE_LIST_MAPPED) {
3894                 addr = upl->kaddr;
3895                 size = upl->size;
3896
3897                 assert(upl->ref_count > 1);
3898                 upl->ref_count--;               /* removing mapping ref */
3899
3900                 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3901                 upl->kaddr = (vm_offset_t) 0;
3902                 upl_unlock(upl);
3903
3904                 vm_map_remove(map,
3905                               vm_map_trunc_page(addr),
3906                               vm_map_round_page(addr + size),
3907                               VM_MAP_NO_FLAGS);
3908
3909                 return KERN_SUCCESS;
3910         }
3911         upl_unlock(upl);
3912
3913         return KERN_FAILURE;
3914 }
3915
3916 kern_return_t
3917 upl_commit_range(
3918         upl_t                   upl,
3919         upl_offset_t            offset,
3920         upl_size_t              size,
3921         int                     flags,
3922         upl_page_info_t         *page_list,
3923         mach_msg_type_number_t  count,
3924         boolean_t               *empty)
3925 {
3926         upl_size_t              xfer_size;
3927         vm_object_t             shadow_object;
3928         vm_object_t             object;
3929         vm_object_offset_t      target_offset;
3930         int                     entry;
3931         wpl_array_t             lite_list;
3932         int                     occupied;
3933         int                     delayed_unlock = 0;
3934         int                     clear_refmod = 0;
3935         int                     pgpgout_count = 0;
3936         int                     j;
3937
3938         *empty = FALSE;
3939
3940         if (upl == UPL_NULL)
3941                 return KERN_INVALID_ARGUMENT;
3942
3943         if (count == 0)
3944                 page_list = NULL;
3945
3946         if (upl->flags & UPL_DEVICE_MEMORY)
3947                 xfer_size = 0;
3948         else if ((offset + size) <= upl->size)
3949                 xfer_size = size;
3950         else
3951                 return KERN_FAILURE;
3952
3953         upl_lock(upl);
3954
3955         if (upl->flags & UPL_ACCESS_BLOCKED) {
3956                 /*
3957                  * We used this UPL to block access to the pages by marking
3958                  * them "busy".  Now we need to clear the "busy" bit to allow
3959                  * access to these pages again.
3960                  */
3961                 flags |= UPL_COMMIT_ALLOW_ACCESS;
3962         }
3963         if (upl->flags & UPL_CLEAR_DIRTY)
3964                 flags |= UPL_COMMIT_CLEAR_DIRTY;
3965
3966         if (upl->flags & UPL_INTERNAL)
3967                 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
3968                                            + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3969         else
3970                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3971
3972         object = upl->map_object;
3973
3974         if (upl->flags & UPL_SHADOWED) {
3975                 vm_object_lock(object);
3976                 shadow_object = object->shadow;
3977         } else {
3978                 shadow_object = object;
3979         }
3980         entry = offset/PAGE_SIZE;
3981         target_offset = (vm_object_offset_t)offset;
3982
3983         /*
3984          * pageout_scan takes the vm_page_lock_queues first
3985          * then tries for the object lock... to avoid what
3986          * is effectively a lock inversion, we'll go to the
3987          * trouble of taking them in that same order... otherwise
3988          * if this object contains the majority of the pages resident
3989          * in the UBC (or a small set of large objects actively being
3990          * worked on contain the majority of the pages), we could
3991          * cause the pageout_scan thread to 'starve' in its attempt
3992          * to find pages to move to the free queue, since it has to
3993          * successfully acquire the object lock of any candidate page
3994          * before it can steal/clean it.
3995          */
3996         for (j = 0; ; j++) {
3997                 vm_page_lock_queues();
3998
3999                 if (vm_object_lock_try(shadow_object))
4000                         break;
4001                 vm_page_unlock_queues();
4002                 mutex_pause(j);
4003         }
4004         delayed_unlock = 1;
4005
4006         while (xfer_size) {
4007                 vm_page_t       t, m;
4008
4009                 m = VM_PAGE_NULL;
4010
4011                 if (upl->flags & UPL_LITE) {
4012                         int     pg_num;
4013
4014                         pg_num = target_offset/PAGE_SIZE;
4015
4016                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4017                                 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4018
4019                                 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4020                         }
4021                 }
4022                 if (upl->flags & UPL_SHADOWED) {
4023                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4024
4025                                 t->pageout = FALSE;
4026
4027                                 vm_page_free(t);
4028
4029                                 if (m == VM_PAGE_NULL)
4030                                         m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4031                         }
4032                 }
4033                 if (m != VM_PAGE_NULL) {
4034
4035                         clear_refmod = 0;
4036
4037                         if (upl->flags & UPL_IO_WIRE) {
4038
4039                                 vm_page_unwire(m);
4040
4041                                 if (page_list)
4042                                         page_list[entry].phys_addr = 0;
4043
4044                                 if (flags & UPL_COMMIT_SET_DIRTY)
4045                                         m->dirty = TRUE;
4046                                 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4047                                         m->dirty = FALSE;
4048                                         if (m->cs_validated && !m->cs_tainted) {
4049                                                 /*
4050                                                  * CODE SIGNING:
4051                                                  * This page is no longer dirty
4052                                                  * but could have been modified,
4053                                                  * so it will need to be
4054                                                  * re-validated.
4055                                                  */
4056                                                 m->cs_validated = FALSE;
4057                                                 vm_cs_validated_resets++;
4058                                         }
4059                                         clear_refmod |= VM_MEM_MODIFIED;
4060                                 }
4061                                 if (flags & UPL_COMMIT_INACTIVATE)
4062                                         vm_page_deactivate(m);
4063
4064                                 if (clear_refmod)
4065                                         pmap_clear_refmod(m->phys_page, clear_refmod);
4066
4067                                 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4068                                         /*
4069                                          * We blocked access to the pages in this UPL.
4070                                          * Clear the "busy" bit and wake up any waiter
4071                                          * for this page.
4072                                          */
4073                                         PAGE_WAKEUP_DONE(m);
4074                                 }
4075                                 goto commit_next_page;
4076                         }
4077                         /*
4078                          * make sure to clear the hardware
4079                          * modify or reference bits before
4080                          * releasing the BUSY bit on this page
4081                          * otherwise we risk losing a legitimate
4082                          * change of state
4083                          */
4084                         if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4085                                 m->dirty = FALSE;
4086                                 if (m->cs_validated && !m->cs_tainted) {
4087                                         /*
4088                                          * CODE SIGNING:
4089                                          * This page is no longer dirty
4090                                          * but could have been modified,
4091                                          * so it will need to be
4092                                          * re-validated.
4093                                          */
4094                                         m->cs_validated = FALSE;
4095                                         vm_cs_validated_resets++;
4096                                 }
4097                                 clear_refmod |= VM_MEM_MODIFIED;
4098                         }
4099                         if (clear_refmod)
4100                                 pmap_clear_refmod(m->phys_page, clear_refmod);
4101
4102                         if (page_list) {
4103                                 upl_page_info_t *p;
4104
4105                                 p = &(page_list[entry]);
4106
4107                                 if (p->phys_addr && p->pageout && !m->pageout) {
4108                                         m->busy = TRUE;
4109                                         m->pageout = TRUE;
4110                                         vm_page_wire(m);
4111                                 } else if (p->phys_addr &&
4112                                            !p->pageout && m->pageout &&
4113                                            !m->dump_cleaning) {
4114                                         m->pageout = FALSE;
4115                                         m->absent = FALSE;
4116                                         m->overwriting = FALSE;
4117                                         vm_page_unwire(m);
4118
4119                                         PAGE_WAKEUP_DONE(m);
4120                                 }
4121                                 page_list[entry].phys_addr = 0;
4122                         }
4123                         m->dump_cleaning = FALSE;
4124
4125                         if (m->laundry)
4126                                 vm_pageout_throttle_up(m);
4127
4128                         if (m->pageout) {
4129                                 m->cleaning = FALSE;
4130                                 m->encrypted_cleaning = FALSE;
4131                                 m->pageout = FALSE;
4132 #if MACH_CLUSTER_STATS
4133                                 if (m->wanted) vm_pageout_target_collisions++;
4134 #endif
4135                                 m->dirty = FALSE;
4136                                 if (m->cs_validated && !m->cs_tainted) {
4137                                         /*
4138                                          * CODE SIGNING:
4139                                          * This page is no longer dirty
4140                                          * but could have been modified,
4141                                          * so it will need to be
4142                                          * re-validated.
4143                                          */
4144                                         m->cs_validated = FALSE;
4145                                         vm_cs_validated_resets++;
4146                                 }
4147
4148                                 if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))
4149                                         m->dirty = TRUE;
4150
4151                                 if (m->dirty) {
4152                                        /*
4153                                         * page was re-dirtied after we started
4154                                         * the pageout... reactivate it since
4155                                         * we don't know whether the on-disk
4156                                         * copy matches what is now in memory
4157                                         */
4158                                         vm_page_unwire(m);
4159
4160                                         if (upl->flags & UPL_PAGEOUT) {
4161                                                 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4162                                                 VM_STAT_INCR(reactivations);
4163                                                 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4164                                         }
4165                                         PAGE_WAKEUP_DONE(m);
4166                                 } else {
4167                                         /*
4168                                          * page has been successfully cleaned
4169                                          * go ahead and free it for other use
4170                                          */
4171
4172                                         if (m->object->internal) {
4173                                                 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4174                                         } else {
4175                                                 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4176                                         }
4177
4178                                         vm_page_free(m);
4179
4180                                         if (upl->flags & UPL_PAGEOUT) {
4181                                                 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4182
4183                                                 if (page_list[entry].dirty) {
4184                                                         VM_STAT_INCR(pageouts);
4185                                                         DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4186                                                         pgpgout_count++;
4187                                                 }
4188                                         }
4189                                 }
4190                                 goto commit_next_page;
4191                         }
4192 #if MACH_CLUSTER_STATS
4193                         if (m->wpmapped)
4194                                 m->dirty = pmap_is_modified(m->phys_page);
4195
4196                         if (m->dirty)   vm_pageout_cluster_dirtied++;
4197                         else            vm_pageout_cluster_cleaned++;
4198                         if (m->wanted)  vm_pageout_cluster_collisions++;
4199 #endif
4200                         m->dirty = FALSE;
4201                         if (m->cs_validated && !m->cs_tainted) {
4202                                 /*
4203                                  * CODE SIGNING:
4204                                  * This page is no longer dirty
4205                                  * but could have been modified,
4206                                  * so it will need to be
4207                                  * re-validated.
4208                                  */
4209                                 m->cs_validated = FALSE;
4210                                 vm_cs_validated_resets++;
4211                         }
4212
4213                         if ((m->busy) && (m->cleaning)) {
4214                                 /*
4215                                  * the request_page_list case
4216                                  */
4217                                 m->absent = FALSE;
4218                                 m->overwriting = FALSE;
4219                                 m->busy = FALSE;
4220                         } else if (m->overwriting) {
4221                                 /*
4222                                  * alternate request page list, write to
4223                                  * page_list case.  Occurs when the original
4224                                  * page was wired at the time of the list
4225                                  * request
4226                                  */
4227                                 assert(m->wire_count != 0);
4228                                 vm_page_unwire(m);/* reactivates */
4229                                 m->overwriting = FALSE;
4230                         }
4231                         m->cleaning = FALSE;
4232                         m->encrypted_cleaning = FALSE;
4233
4234                         /*
4235                          * It is a part of the semantic of COPYOUT_FROM
4236                          * UPLs that a commit implies cache sync
4237                          * between the vm page and the backing store
4238                          * this can be used to strip the precious bit
4239                          * as well as clean
4240                          */
4241                         if (upl->flags & UPL_PAGE_SYNC_DONE)
4242                                 m->precious = FALSE;
4243
4244                         if (flags & UPL_COMMIT_SET_DIRTY)
4245                                 m->dirty = TRUE;
4246
4247                         if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4248                                 vm_page_deactivate(m);
4249                         } else if (!m->active && !m->inactive && !m->speculative) {
4250
4251                                 if (m->clustered)
4252                                         vm_page_speculate(m, TRUE);
4253                                 else if (m->reference)
4254                                         vm_page_activate(m);
4255                                 else
4256                                         vm_page_deactivate(m);
4257                         }
4258                         if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4259                                 /*
4260                                  * We blocked access to the pages in this URL.
4261                                  * Clear the "busy" bit on this page before we
4262                                  * wake up any waiter.
4263                                  */
4264                                 m->busy = FALSE;
4265                         }
4266                         /*
4267                          * Wakeup any thread waiting for the page to be un-cleaning.
4268                          */
4269                         PAGE_WAKEUP(m);
4270                 }
4271 commit_next_page:
4272                 target_offset += PAGE_SIZE_64;
4273                 xfer_size -= PAGE_SIZE;
4274                 entry++;
4275
4276                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4277                         /*
4278                          * pageout_scan takes the vm_page_lock_queues first
4279                          * then tries for the object lock... to avoid what
4280                          * is effectively a lock inversion, we'll go to the
4281                          * trouble of taking them in that same order... otherwise
4282                          * if this object contains the majority of the pages resident
4283                          * in the UBC (or a small set of large objects actively being
4284                          * worked on contain the majority of the pages), we could
4285                          * cause the pageout_scan thread to 'starve' in its attempt
4286                          * to find pages to move to the free queue, since it has to
4287                          * successfully acquire the object lock of any candidate page
4288                          * before it can steal/clean it.
4289                          */
4290                         vm_object_unlock(shadow_object);
4291                         mutex_yield(&vm_page_queue_lock);
4292
4293                         for (j = 0; ; j++) {
4294                                 if (vm_object_lock_try(shadow_object))
4295                                         break;
4296                                 vm_page_unlock_queues();
4297                                 mutex_pause(j);
4298                                 vm_page_lock_queues();
4299                         }
4300                         delayed_unlock = 1;
4301                 }
4302         }
4303         if (delayed_unlock)
4304                 vm_page_unlock_queues();
4305
4306         occupied = 1;
4307
4308         if (upl->flags & UPL_DEVICE_MEMORY)  {
4309                 occupied = 0;
4310         } else if (upl->flags & UPL_LITE) {
4311                 int     pg_num;
4312                 int     i;
4313
4314                 pg_num = upl->size/PAGE_SIZE;
4315                 pg_num = (pg_num + 31) >> 5;
4316                 occupied = 0;
4317
4318                 for (i = 0; i < pg_num; i++) {
4319                         if (lite_list[i] != 0) {
4320                                 occupied = 1;
4321                                 break;
4322                         }
4323                 }
4324         } else {
4325                 if (queue_empty(&upl->map_object->memq))
4326                         occupied = 0;
4327         }
4328         if (occupied == 0) {
4329                 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4330                         *empty = TRUE;
4331
4332                 if (object == shadow_object) {
4333                         /*
4334                          * this is not a paging object
4335                          * so we need to drop the paging reference
4336                          * that was taken when we created the UPL
4337                          * against this object
4338                          */
4339                         vm_object_paging_end(shadow_object);
4340                 } else {
4341                          /*
4342                           * we dontated the paging reference to
4343                           * the map object... vm_pageout_object_terminate
4344                           * will drop this reference
4345                           */
4346                 }
4347         }
4348         vm_object_unlock(shadow_object);
4349         if (object != shadow_object)
4350                 vm_object_unlock(object);
4351         upl_unlock(upl);
4352
4353         if (pgpgout_count) {
4354                 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
4355         }
4356
4357         return KERN_SUCCESS;
4358 }
4359
4360 kern_return_t
4361 upl_abort_range(
4362         upl_t                   upl,
4363         upl_offset_t            offset,
4364         upl_size_t              size,
4365         int                     error,
4366         boolean_t               *empty)
4367 {
4368         upl_size_t              xfer_size;
4369         vm_object_t             shadow_object;
4370         vm_object_t             object;
4371         vm_object_offset_t      target_offset;
4372         int                     entry;
4373         wpl_array_t             lite_list;
4374         int                     occupied;
4375         int                     delayed_unlock = 0;
4376         int                     j;
4377
4378         *empty = FALSE;
4379
4380         if (upl == UPL_NULL)
4381                 return KERN_INVALID_ARGUMENT;
4382
4383         if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
4384                 return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
4385
4386         if (upl->flags & UPL_DEVICE_MEMORY)
4387                 xfer_size = 0;
4388         else if ((offset + size) <= upl->size)
4389                 xfer_size = size;
4390         else
4391                 return KERN_FAILURE;
4392
4393         upl_lock(upl);
4394
4395         if (upl->flags & UPL_INTERNAL) {
4396                 lite_list = (wpl_array_t)
4397                         ((((uintptr_t)upl) + sizeof(struct upl))
4398                         + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4399         } else {
4400                 lite_list = (wpl_array_t)
4401                         (((uintptr_t)upl) + sizeof(struct upl));
4402         }
4403         object = upl->map_object;
4404
4405         if (upl->flags & UPL_SHADOWED) {
4406                 vm_object_lock(object);
4407                 shadow_object = object->shadow;
4408         } else
4409                 shadow_object = object;
4410
4411         entry = offset/PAGE_SIZE;
4412         target_offset = (vm_object_offset_t)offset;
4413
4414         /*
4415          * pageout_scan takes the vm_page_lock_queues first
4416          * then tries for the object lock... to avoid what
4417          * is effectively a lock inversion, we'll go to the
4418          * trouble of taking them in that same order... otherwise
4419          * if this object contains the majority of the pages resident
4420          * in the UBC (or a small set of large objects actively being
4421          * worked on contain the majority of the pages), we could
4422          * cause the pageout_scan thread to 'starve' in its attempt
4423          * to find pages to move to the free queue, since it has to
4424          * successfully acquire the object lock of any candidate page
4425          * before it can steal/clean it.
4426          */
4427         for (j = 0; ; j++) {
4428                 vm_page_lock_queues();
4429
4430                 if (vm_object_lock_try(shadow_object))
4431                         break;
4432                 vm_page_unlock_queues();
4433                 mutex_pause(j);
4434         }
4435         delayed_unlock = 1;
4436
4437         while (xfer_size) {
4438                 vm_page_t       t, m;
4439
4440                 m = VM_PAGE_NULL;
4441
4442                 if (upl->flags & UPL_LITE) {
4443                         int     pg_num;
4444                         pg_num = target_offset/PAGE_SIZE;
4445
4446                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4447                                 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4448
4449                                 m = vm_page_lookup(shadow_object, target_offset +
4450                                                    (upl->offset - shadow_object->paging_offset));
4451                         }
4452                 }
4453                 if (upl->flags & UPL_SHADOWED) {
4454                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4455                                 t->pageout = FALSE;
4456
4457                                 vm_page_free(t);
4458
4459                                 if (m == VM_PAGE_NULL)
4460                                         m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4461                         }
4462                 }
4463                 if (m != VM_PAGE_NULL) {
4464
4465                         if (m->absent) {
4466                                 boolean_t must_free = TRUE;
4467
4468                                 m->clustered = FALSE;
4469                                 /*
4470                                  * COPYOUT = FALSE case
4471                                  * check for error conditions which must
4472                                  * be passed back to the pages customer
4473                                  */
4474                                 if (error & UPL_ABORT_RESTART) {
4475                                         m->restart = TRUE;
4476                                         m->absent = FALSE;
4477                                         m->error = TRUE;
4478                                         m->unusual = TRUE;
4479                                         must_free = FALSE;
4480                                 } else if (error & UPL_ABORT_UNAVAILABLE) {
4481                                         m->restart = FALSE;
4482                                         m->unusual = TRUE;
4483                                         must_free = FALSE;
4484                                 } else if (error & UPL_ABORT_ERROR) {
4485                                         m->restart = FALSE;
4486                                         m->absent = FALSE;
4487                                         m->error = TRUE;
4488                                         m->unusual = TRUE;
4489                                         must_free = FALSE;
4490                                 }
4491
4492                                 /*
4493                                  * ENCRYPTED SWAP:
4494                                  * If the page was already encrypted,
4495                                  * we don't really need to decrypt it
4496                                  * now.  It will get decrypted later,
4497                                  * on demand, as soon as someone needs
4498                                  * to access its contents.
4499                                  */
4500
4501                                 m->cleaning = FALSE;
4502                                 m->encrypted_cleaning = FALSE;
4503                                 m->overwriting = FALSE;
4504                                 PAGE_WAKEUP_DONE(m);
4505
4506                                 if (must_free == TRUE)
4507                                         vm_page_free(m);
4508                                 else
4509                                         vm_page_activate(m);
4510                         } else {
4511                                 /*
4512                                  * Handle the trusted pager throttle.
4513                                  */
4514                                 if (m->laundry)
4515                                         vm_pageout_throttle_up(m);
4516
4517                                 if (m->pageout) {
4518                                         assert(m->busy);
4519                                         assert(m->wire_count == 1);
4520                                         m->pageout = FALSE;
4521                                         vm_page_unwire(m);
4522                                 }
4523                                 m->dump_cleaning = FALSE;
4524                                 m->cleaning = FALSE;
4525                                 m->encrypted_cleaning = FALSE;
4526                                 m->overwriting = FALSE;
4527 #if     MACH_PAGEMAP
4528                                 vm_external_state_clr(m->object->existence_map, m->offset);
4529 #endif  /* MACH_PAGEMAP */
4530                                 if (error & UPL_ABORT_DUMP_PAGES) {
4531                                         pmap_disconnect(m->phys_page);
4532                                         vm_page_free(m);
4533                                 } else {
4534                                         if (error & UPL_ABORT_REFERENCE) {
4535                                                 /*
4536                                                  * we've been told to explictly
4537                                                  * reference this page... for
4538                                                  * file I/O, this is done by
4539                                                  * implementing an LRU on the inactive q
4540                                                  */
4541                                                 vm_page_lru(m);
4542                                         }
4543                                         PAGE_WAKEUP_DONE(m);
4544                                 }
4545                         }
4546                 }
4547                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4548                         /*
4549                          * pageout_scan takes the vm_page_lock_queues first
4550                          * then tries for the object lock... to avoid what
4551                          * is effectively a lock inversion, we'll go to the
4552                          * trouble of taking them in that same order... otherwise
4553                          * if this object contains the majority of the pages resident
4554                          * in the UBC (or a small set of large objects actively being
4555                          * worked on contain the majority of the pages), we could
4556                          * cause the pageout_scan thread to 'starve' in its attempt
4557                          * to find pages to move to the free queue, since it has to
4558                          * successfully acquire the object lock of any candidate page
4559                          * before it can steal/clean it.
4560                          */
4561                         vm_object_unlock(shadow_object);
4562                         mutex_yield(&vm_page_queue_lock);
4563
4564                         for (j = 0; ; j++) {
4565                                 if (vm_object_lock_try(shadow_object))
4566                                         break;
4567                                 vm_page_unlock_queues();
4568                                 mutex_pause(j);
4569                                 vm_page_lock_queues();
4570                         }
4571                         delayed_unlock = 1;
4572                 }
4573                 target_offset += PAGE_SIZE_64;
4574                 xfer_size -= PAGE_SIZE;
4575                 entry++;
4576         }
4577         if (delayed_unlock)
4578                 vm_page_unlock_queues();
4579
4580         occupied = 1;
4581
4582         if (upl->flags & UPL_DEVICE_MEMORY)  {
4583                 occupied = 0;
4584         } else if (upl->flags & UPL_LITE) {
4585                 int     pg_num;
4586                 int     i;
4587
4588                 pg_num = upl->size/PAGE_SIZE;
4589                 pg_num = (pg_num + 31) >> 5;
4590                 occupied = 0;
4591
4592                 for (i = 0; i < pg_num; i++) {
4593                         if (lite_list[i] != 0) {
4594                                 occupied = 1;
4595                                 break;
4596                         }
4597                 }
4598         } else {
4599                 if (queue_empty(&upl->map_object->memq))
4600                         occupied = 0;
4601         }
4602         if (occupied == 0) {
4603                 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4604                         *empty = TRUE;
4605
4606                 if (object == shadow_object) {
4607                         /*
4608                          * this is not a paging object
4609                          * so we need to drop the paging reference
4610                          * that was taken when we created the UPL
4611                          * against this object
4612                          */
4613                         vm_object_paging_end(shadow_object);
4614                 } else {
4615                          /*
4616                           * we dontated the paging reference to
4617                           * the map object... vm_pageout_object_terminate
4618                           * will drop this reference
4619                           */
4620                 }
4621         }
4622         vm_object_unlock(shadow_object);
4623         if (object != shadow_object)
4624                 vm_object_unlock(object);
4625         upl_unlock(upl);
4626
4627         return KERN_SUCCESS;
4628 }
4629
4630
4631 kern_return_t
4632 upl_abort(
4633         upl_t   upl,
4634         int     error)
4635 {
4636         boolean_t       empty;
4637
4638         return upl_abort_range(upl, 0, upl->size, error, &empty);
4639 }
4640
4641
4642 /* an option on commit should be wire */
4643 kern_return_t
4644 upl_commit(
4645         upl_t                   upl,
4646         upl_page_info_t         *page_list,
4647         mach_msg_type_number_t  count)
4648 {
4649         boolean_t       empty;
4650
4651         return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
4652 }
4653
4654
4655 kern_return_t
4656 vm_object_iopl_request(
4657         vm_object_t             object,
4658         vm_object_offset_t      offset,
4659         upl_size_t              size,
4660         upl_t                   *upl_ptr,
4661         upl_page_info_array_t   user_page_list,
4662         unsigned int            *page_list_count,
4663         int                     cntrl_flags)
4664 {
4665         vm_page_t               dst_page;
4666         vm_object_offset_t      dst_offset;
4667         upl_size_t              xfer_size;
4668         upl_t                   upl = NULL;
4669         unsigned int            entry;
4670         wpl_array_t             lite_list = NULL;
4671         int                     delayed_unlock = 0;
4672         int                     no_zero_fill = FALSE;
4673         u_int32_t               psize;
4674         kern_return_t           ret;
4675         vm_prot_t               prot;
4676         struct vm_object_fault_info fault_info;
4677
4678
4679         if (cntrl_flags & ~UPL_VALID_FLAGS) {
4680                 /*
4681                  * For forward compatibility's sake,
4682                  * reject any unknown flag.
4683                  */
4684                 return KERN_INVALID_VALUE;
4685         }
4686         if (vm_lopage_poolsize == 0)
4687                 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4688
4689         if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4690                 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4691                         return KERN_INVALID_VALUE;
4692
4693                 if (object->phys_contiguous) {
4694                         if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4695                                 return KERN_INVALID_ADDRESS;
4696
4697                         if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4698                                 return KERN_INVALID_ADDRESS;
4699                 }
4700         }
4701
4702         if (cntrl_flags & UPL_ENCRYPT) {
4703                 /*
4704                  * ENCRYPTED SWAP:
4705                  * The paging path doesn't use this interface,
4706                  * so we don't support the UPL_ENCRYPT flag
4707                  * here.  We won't encrypt the pages.
4708                  */
4709                 assert(! (cntrl_flags & UPL_ENCRYPT));
4710         }
4711         if (cntrl_flags & UPL_NOZEROFILL)
4712                 no_zero_fill = TRUE;
4713
4714         if (cntrl_flags & UPL_COPYOUT_FROM)
4715                 prot = VM_PROT_READ;
4716         else
4717                 prot = VM_PROT_READ | VM_PROT_WRITE;
4718
4719         if (((size/page_size) > MAX_UPL_SIZE) && !object->phys_contiguous)
4720                 size = MAX_UPL_SIZE * page_size;
4721
4722         if (cntrl_flags & UPL_SET_INTERNAL) {
4723                 if (page_list_count != NULL)
4724                         *page_list_count = MAX_UPL_SIZE;
4725         }
4726         if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4727             ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
4728                 return KERN_INVALID_ARGUMENT;
4729
4730         if ((!object->internal) && (object->paging_offset != 0))
4731                 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4732
4733
4734         if (object->phys_contiguous)
4735                 psize = PAGE_SIZE;
4736         else
4737                 psize = size;
4738
4739         if (cntrl_flags & UPL_SET_INTERNAL) {
4740                 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4741
4742                 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4743                 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
4744                                            ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
4745         } else {
4746                 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4747
4748                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4749         }
4750         if (user_page_list)
4751                 user_page_list[0].device = FALSE;
4752         *upl_ptr = upl;
4753
4754         upl->map_object = object;
4755         upl->size = size;
4756
4757         vm_object_lock(object);
4758         vm_object_paging_begin(object);
4759         /*
4760          * paging in progress also protects the paging_offset
4761          */
4762         upl->offset = offset + object->paging_offset;
4763
4764         if (object->phys_contiguous) {
4765 #ifdef UPL_DEBUG
4766                 queue_enter(&object->uplq, upl, upl_t, uplq);
4767 #endif /* UPL_DEBUG */
4768
4769                 vm_object_unlock(object);
4770
4771                 /*
4772                  * don't need any shadow mappings for this one
4773                  * since it is already I/O memory
4774                  */
4775                 upl->flags |= UPL_DEVICE_MEMORY;
4776
4777                 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4778
4779                 if (user_page_list) {
4780                         user_page_list[0].phys_addr = (offset + object->shadow_offset)>>PAGE_SHIFT;
4781                         user_page_list[0].device = TRUE;
4782                 }
4783                 if (page_list_count != NULL) {
4784                         if (upl->flags & UPL_INTERNAL)
4785                                 *page_list_count = 0;
4786                         else
4787                                 *page_list_count = 1;
4788                 }
4789                 return KERN_SUCCESS;
4790         }
4791         /*
4792          * Protect user space from future COW operations
4793          */
4794         object->true_share = TRUE;
4795
4796         if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4797                 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4798
4799 #ifdef UPL_DEBUG
4800         queue_enter(&object->uplq, upl, upl_t, uplq);
4801 #endif /* UPL_DEBUG */
4802
4803         if (cntrl_flags & UPL_BLOCK_ACCESS) {
4804                 /*
4805                  * The user requested that access to the pages in this URL
4806                  * be blocked until the UPL is commited or aborted.
4807                  */
4808                 upl->flags |= UPL_ACCESS_BLOCKED;
4809         }
4810         entry = 0;
4811
4812         xfer_size = size;
4813         dst_offset = offset;
4814
4815         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
4816         fault_info.user_tag  = 0;
4817         fault_info.lo_offset = offset;
4818         fault_info.hi_offset = offset + xfer_size;
4819         fault_info.no_cache  = FALSE;
4820
4821         while (xfer_size) {
4822                 vm_fault_return_t       result;
4823                 int                     pg_num;
4824
4825                 dst_page = vm_page_lookup(object, dst_offset);
4826
4827                 /*
4828                  * ENCRYPTED SWAP:
4829                  * If the page is encrypted, we need to decrypt it,
4830                  * so force a soft page fault.
4831                  */
4832                 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4833                     (dst_page->encrypted) ||
4834                     (dst_page->unusual && (dst_page->error ||
4835                                            dst_page->restart ||
4836                                            dst_page->absent ||
4837                                            dst_page->fictitious))) {
4838
4839                    do {
4840                         vm_page_t       top_page;
4841                         kern_return_t   error_code;
4842                         int             interruptible;
4843
4844                         if (delayed_unlock) {
4845                                 delayed_unlock = 0;
4846                                 vm_page_unlock_queues();
4847                         }
4848                         if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
4849                                 interruptible = THREAD_ABORTSAFE;
4850                         else
4851                                 interruptible = THREAD_UNINT;
4852
4853                         fault_info.interruptible = interruptible;
4854                         fault_info.cluster_size = xfer_size;
4855
4856                         result = vm_fault_page(object, dst_offset,
4857                                                prot | VM_PROT_WRITE, FALSE,
4858                                                &prot, &dst_page, &top_page,
4859                                                (int *)0,
4860                                                &error_code, no_zero_fill,
4861                                                FALSE, &fault_info);
4862
4863                         switch (result) {
4864
4865                         case VM_FAULT_SUCCESS:
4866
4867                                 PAGE_WAKEUP_DONE(dst_page);
4868                                 /*
4869                                  *      Release paging references and
4870                                  *      top-level placeholder page, if any.
4871                                  */
4872                                 if (top_page != VM_PAGE_NULL) {
4873                                         vm_object_t local_object;
4874
4875                                         local_object = top_page->object;
4876
4877                                         if (top_page->object != dst_page->object) {
4878                                                 vm_object_lock(local_object);
4879                                                 VM_PAGE_FREE(top_page);
4880                                                 vm_object_paging_end(local_object);
4881                                                 vm_object_unlock(local_object);
4882                                         } else {
4883                                                 VM_PAGE_FREE(top_page);
4884                                                 vm_object_paging_end(local_object);
4885                                         }
4886                                 }
4887                                 break;
4888
4889                         case VM_FAULT_RETRY:
4890                                 vm_object_lock(object);
4891                                 vm_object_paging_begin(object);
4892                                 break;
4893
4894                         case VM_FAULT_FICTITIOUS_SHORTAGE:
4895                                 vm_page_more_fictitious();
4896
4897                                 vm_object_lock(object);
4898                                 vm_object_paging_begin(object);
4899                                 break;
4900
4901                         case VM_FAULT_MEMORY_SHORTAGE:
4902                                 if (vm_page_wait(interruptible)) {
4903                                         vm_object_lock(object);
4904                                         vm_object_paging_begin(object);
4905                                         break;
4906                                 }
4907                                 /* fall thru */
4908
4909                         case VM_FAULT_INTERRUPTED:
4910                                 error_code = MACH_SEND_INTERRUPTED;
4911                         case VM_FAULT_MEMORY_ERROR:
4912                                 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
4913
4914                                 vm_object_lock(object);
4915                                 vm_object_paging_begin(object);
4916                                 goto return_err;
4917                         }
4918                    } while (result != VM_FAULT_SUCCESS);
4919                 }
4920
4921                 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4922                      dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4923                         vm_page_t       low_page;
4924                         int             refmod;
4925
4926                         /*
4927                          * support devices that can't DMA above 32 bits
4928                          * by substituting pages from a pool of low address
4929                          * memory for any pages we find above the 4G mark
4930                          * can't substitute if the page is already wired because
4931                          * we don't know whether that physical address has been
4932                          * handed out to some other 64 bit capable DMA device to use
4933                          */
4934                         if (dst_page->wire_count) {
4935                                 ret = KERN_PROTECTION_FAILURE;
4936                                 goto return_err;
4937                         }
4938                         if (delayed_unlock) {
4939                                 delayed_unlock = 0;
4940                                 vm_page_unlock_queues();
4941                         }
4942                         low_page = vm_page_grablo();
4943
4944                         if (low_page == VM_PAGE_NULL) {
4945                                 ret = KERN_RESOURCE_SHORTAGE;
4946                                 goto return_err;
4947                         }
4948                         /*
4949                          * from here until the vm_page_replace completes
4950                          * we musn't drop the object lock... we don't
4951                          * want anyone refaulting this page in and using
4952                          * it after we disconnect it... we want the fault
4953                          * to find the new page being substituted.
4954                          */
4955                         if (dst_page->pmapped)
4956                                 refmod = pmap_disconnect(dst_page->phys_page);
4957                         else
4958                                 refmod = 0;
4959                         vm_page_copy(dst_page, low_page);
4960
4961                         low_page->reference = dst_page->reference;
4962                         low_page->dirty     = dst_page->dirty;
4963
4964                         if (refmod & VM_MEM_REFERENCED)
4965                                 low_page->reference = TRUE;
4966                         if (refmod & VM_MEM_MODIFIED)
4967                                 low_page->dirty = TRUE;
4968
4969                         vm_page_lock_queues();
4970                         vm_page_replace(low_page, object, dst_offset);
4971                         /*
4972                          * keep the queue lock since we're going to
4973                          * need it immediately
4974                          */
4975                         delayed_unlock = 1;
4976
4977                         dst_page = low_page;
4978                         /*
4979                          * vm_page_grablo returned the page marked
4980                          * BUSY... we don't need a PAGE_WAKEUP_DONE
4981                          * here, because we've never dropped the object lock
4982                          */
4983                         dst_page->busy = FALSE;
4984                 }
4985                 if (delayed_unlock == 0)
4986                         vm_page_lock_queues();
4987
4988                 vm_page_wire(dst_page);
4989
4990                 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4991                         /*
4992                          * Mark the page "busy" to block any future page fault
4993                          * on this page.  We'll also remove the mapping
4994                          * of all these pages before leaving this routine.
4995                          */
4996                         assert(!dst_page->fictitious);
4997                         dst_page->busy = TRUE;
4998                 }
4999                 pg_num = (dst_offset-offset)/PAGE_SIZE;
5000                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5001
5002                 /*
5003                  * expect the page to be used
5004                  * page queues lock must be held to set 'reference'
5005                  */
5006                 dst_page->reference = TRUE;
5007
5008                 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5009                         dst_page->dirty = TRUE;
5010
5011                 if (dst_page->phys_page > upl->highest_page)
5012                         upl->highest_page = dst_page->phys_page;
5013
5014                 if (user_page_list) {
5015                         user_page_list[entry].phys_addr = dst_page->phys_page;
5016                         user_page_list[entry].dirty     = dst_page->dirty;
5017                         user_page_list[entry].pageout   = dst_page->pageout;
5018                         user_page_list[entry].absent    = dst_page->absent;
5019                         user_page_list[entry].precious  = dst_page->precious;
5020
5021                         if (dst_page->clustered == TRUE)
5022                                 user_page_list[entry].speculative = dst_page->speculative;
5023                         else
5024                                 user_page_list[entry].speculative = FALSE;
5025                 }
5026                 /*
5027                  * someone is explicitly grabbing this page...
5028                  * update clustered and speculative state
5029                  *
5030                  */
5031                 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5032
5033                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
5034                         mutex_yield(&vm_page_queue_lock);
5035                         delayed_unlock = 1;
5036                 }
5037                 entry++;
5038                 dst_offset += PAGE_SIZE_64;
5039                 xfer_size -= PAGE_SIZE;
5040         }
5041         if (delayed_unlock)
5042                 vm_page_unlock_queues();
5043
5044         if (page_list_count != NULL) {
5045                 if (upl->flags & UPL_INTERNAL)
5046                         *page_list_count = 0;
5047                 else if (*page_list_count > entry)
5048                         *page_list_count = entry;
5049         }
5050         vm_object_unlock(object);
5051
5052         if (cntrl_flags & UPL_BLOCK_ACCESS) {
5053                 /*
5054                  * We've marked all the pages "busy" so that future
5055                  * page faults will block.
5056                  * Now remove the mapping for these pages, so that they
5057                  * can't be accessed without causing a page fault.
5058                  */
5059                 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5060                                        PMAP_NULL, 0, VM_PROT_NONE);
5061         }
5062         return KERN_SUCCESS;
5063
5064 return_err:
5065         if (delayed_unlock)
5066                 vm_page_unlock_queues();
5067
5068         for (; offset < dst_offset; offset += PAGE_SIZE) {
5069                 dst_page = vm_page_lookup(object, offset);
5070
5071                 if (dst_page == VM_PAGE_NULL)
5072                         panic("vm_object_iopl_request: Wired pages missing. \n");
5073
5074                 vm_page_lockspin_queues();
5075                 vm_page_unwire(dst_page);
5076                 vm_page_unlock_queues();
5077
5078                 VM_STAT_INCR(reactivations);
5079         }
5080         vm_object_paging_end(object);
5081         vm_object_unlock(object);
5082         upl_destroy(upl);
5083
5084         return ret;
5085 }
5086
5087 kern_return_t
5088 upl_transpose(
5089         upl_t           upl1,
5090         upl_t           upl2)
5091 {
5092         kern_return_t           retval;
5093         boolean_t               upls_locked;
5094         vm_object_t             object1, object2;
5095
5096         if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5097                 return KERN_INVALID_ARGUMENT;
5098         }
5099
5100         upls_locked = FALSE;
5101
5102         /*
5103          * Since we need to lock both UPLs at the same time,
5104          * avoid deadlocks by always taking locks in the same order.
5105          */
5106         if (upl1 < upl2) {
5107                 upl_lock(upl1);
5108                 upl_lock(upl2);
5109         } else {
5110                 upl_lock(upl2);
5111                 upl_lock(upl1);
5112         }
5113         upls_locked = TRUE;     /* the UPLs will need to be unlocked */
5114
5115         object1 = upl1->map_object;
5116         object2 = upl2->map_object;
5117
5118         if (upl1->offset != 0 || upl2->offset != 0 ||
5119             upl1->size != upl2->size) {
5120                 /*
5121                  * We deal only with full objects, not subsets.
5122                  * That's because we exchange the entire backing store info
5123                  * for the objects: pager, resident pages, etc...  We can't do
5124                  * only part of it.
5125                  */
5126                 retval = KERN_INVALID_VALUE;
5127                 goto done;
5128         }
5129
5130         /*
5131          * Tranpose the VM objects' backing store.
5132          */
5133         retval = vm_object_transpose(object1, object2,
5134                                      (vm_object_size_t) upl1->size);
5135
5136         if (retval == KERN_SUCCESS) {
5137                 /*
5138                  * Make each UPL point to the correct VM object, i.e. the
5139                  * object holding the pages that the UPL refers to...
5140                  */
5141 #ifdef UPL_DEBUG
5142                 queue_remove(&object1->uplq, upl1, upl_t, uplq);
5143                 queue_remove(&object2->uplq, upl2, upl_t, uplq);
5144 #endif
5145                 upl1->map_object = object2;
5146                 upl2->map_object = object1;
5147 #ifdef UPL_DEBUG
5148                 queue_enter(&object1->uplq, upl2, upl_t, uplq);
5149                 queue_enter(&object2->uplq, upl1, upl_t, uplq);
5150 #endif
5151         }
5152
5153 done:
5154         /*
5155          * Cleanup.
5156          */
5157         if (upls_locked) {
5158                 upl_unlock(upl1);
5159                 upl_unlock(upl2);
5160                 upls_locked = FALSE;
5161         }
5162
5163         return retval;
5164 }
5165
5166 /*
5167  * ENCRYPTED SWAP:
5168  *
5169  * Rationale:  the user might have some encrypted data on disk (via
5170  * FileVault or any other mechanism).  That data is then decrypted in
5171  * memory, which is safe as long as the machine is secure.  But that
5172  * decrypted data in memory could be paged out to disk by the default
5173  * pager.  The data would then be stored on disk in clear (not encrypted)
5174  * and it could be accessed by anyone who gets physical access to the
5175  * disk (if the laptop or the disk gets stolen for example).  This weakens
5176  * the security offered by FileVault.
5177  *
5178  * Solution:  the default pager will optionally request that all the
5179  * pages it gathers for pageout be encrypted, via the UPL interfaces,
5180  * before it sends this UPL to disk via the vnode_pageout() path.
5181  *
5182  * Notes:
5183  *
5184  * To avoid disrupting the VM LRU algorithms, we want to keep the
5185  * clean-in-place mechanisms, which allow us to send some extra pages to
5186  * swap (clustering) without actually removing them from the user's
5187  * address space.  We don't want the user to unknowingly access encrypted
5188  * data, so we have to actually remove the encrypted pages from the page
5189  * table.  When the user accesses the data, the hardware will fail to
5190  * locate the virtual page in its page table and will trigger a page
5191  * fault.  We can then decrypt the page and enter it in the page table
5192  * again.  Whenever we allow the user to access the contents of a page,
5193  * we have to make sure it's not encrypted.
5194  *
5195  *
5196  */
5197 /*
5198  * ENCRYPTED SWAP:
5199  * Reserve of virtual addresses in the kernel address space.
5200  * We need to map the physical pages in the kernel, so that we
5201  * can call the encryption/decryption routines with a kernel
5202  * virtual address.  We keep this pool of pre-allocated kernel
5203  * virtual addresses so that we don't have to scan the kernel's
5204  * virtaul address space each time we need to encrypt or decrypt
5205  * a physical page.
5206  * It would be nice to be able to encrypt and decrypt in physical
5207  * mode but that might not always be more efficient...
5208  */
5209 decl_simple_lock_data(,vm_paging_lock)
5210 #define VM_PAGING_NUM_PAGES     64
5211 vm_map_offset_t vm_paging_base_address = 0;
5212 boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5213 int             vm_paging_max_index = 0;
5214 int             vm_paging_page_waiter = 0;
5215 int             vm_paging_page_waiter_total = 0;
5216 unsigned long   vm_paging_no_kernel_page = 0;
5217 unsigned long   vm_paging_objects_mapped = 0;
5218 unsigned long   vm_paging_pages_mapped = 0;
5219 unsigned long   vm_paging_objects_mapped_slow = 0;
5220 unsigned long   vm_paging_pages_mapped_slow = 0;
5221
5222 void
5223 vm_paging_map_init(void)
5224 {
5225         kern_return_t   kr;
5226         vm_map_offset_t page_map_offset;
5227         vm_map_entry_t  map_entry;
5228
5229         assert(vm_paging_base_address == 0);
5230
5231         /*
5232          * Initialize our pool of pre-allocated kernel
5233          * virtual addresses.
5234          */
5235         page_map_offset = 0;
5236         kr = vm_map_find_space(kernel_map,
5237                                &page_map_offset,
5238                                VM_PAGING_NUM_PAGES * PAGE_SIZE,
5239                                0,
5240                                0,
5241                                &map_entry);
5242         if (kr != KERN_SUCCESS) {
5243                 panic("vm_paging_map_init: kernel_map full\n");
5244         }
5245         map_entry->object.vm_object = kernel_object;
5246         map_entry->offset =
5247                 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5248         vm_object_reference(kernel_object);
5249         vm_map_unlock(kernel_map);
5250
5251         assert(vm_paging_base_address == 0);
5252         vm_paging_base_address = page_map_offset;
5253 }
5254
5255 /*
5256  * ENCRYPTED SWAP:
5257  * vm_paging_map_object:
5258  *      Maps part of a VM object's pages in the kernel
5259  *      virtual address space, using the pre-allocated
5260  *      kernel virtual addresses, if possible.
5261  * Context:
5262  *      The VM object is locked.  This lock will get
5263  *      dropped and re-acquired though, so the caller
5264  *      must make sure the VM object is kept alive
5265  *      (by holding a VM map that has a reference
5266  *      on it, for example, or taking an extra reference).
5267  *      The page should also be kept busy to prevent
5268  *      it from being reclaimed.
5269  */
5270 kern_return_t
5271 vm_paging_map_object(
5272         vm_map_offset_t         *address,
5273         vm_page_t               page,
5274         vm_object_t             object,
5275         vm_object_offset_t      offset,
5276         vm_map_size_t           *size,
5277         boolean_t               can_unlock_object)
5278 {
5279         kern_return_t           kr;
5280         vm_map_offset_t         page_map_offset;
5281         vm_map_size_t           map_size;
5282         vm_object_offset_t      object_offset;
5283         int                     i;
5284
5285
5286         if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5287                 assert(page->busy);
5288                 /*
5289                  * Use one of the pre-allocated kernel virtual addresses
5290                  * and just enter the VM page in the kernel address space
5291                  * at that virtual address.
5292                  */
5293                 simple_lock(&vm_paging_lock);
5294
5295                 /*
5296                  * Try and find an available kernel virtual address
5297                  * from our pre-allocated pool.
5298                  */
5299                 page_map_offset = 0;
5300                 for (;;) {
5301                         for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5302                                 if (vm_paging_page_inuse[i] == FALSE) {
5303                                         page_map_offset =
5304                                                 vm_paging_base_address +
5305                                                 (i * PAGE_SIZE);
5306                                         break;
5307                                 }
5308                         }
5309                         if (page_map_offset != 0) {
5310                                 /* found a space to map our page ! */
5311                                 break;
5312                         }
5313
5314                         if (can_unlock_object) {
5315                                 /*
5316                                  * If we can afford to unlock the VM object,
5317                                  * let's take the slow path now...
5318                                  */
5319                                 break;
5320                         }
5321                         /*
5322                          * We can't afford to unlock the VM object, so
5323                          * let's wait for a space to become available...
5324                          */
5325                         vm_paging_page_waiter_total++;
5326                         vm_paging_page_waiter++;
5327                         thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
5328                                                        &vm_paging_lock,
5329                                                        THREAD_UNINT);
5330                         vm_paging_page_waiter--;
5331                         /* ... and try again */
5332                 }
5333
5334                 if (page_map_offset != 0) {
5335                         /*
5336                          * We found a kernel virtual address;
5337                          * map the physical page to that virtual address.
5338                          */
5339                         if (i > vm_paging_max_index) {
5340                                 vm_paging_max_index = i;
5341                         }
5342                         vm_paging_page_inuse[i] = TRUE;
5343                         simple_unlock(&vm_paging_lock);
5344
5345                         if (page->pmapped == FALSE) {
5346                                 pmap_sync_page_data_phys(page->phys_page);
5347                         }
5348                         page->pmapped = TRUE;
5349
5350                         /*
5351                          * Keep the VM object locked over the PMAP_ENTER
5352                          * and the actual use of the page by the kernel,
5353                          * or this pmap mapping might get undone by a
5354                          * vm_object_pmap_protect() call...
5355                          */
5356                         PMAP_ENTER(kernel_pmap,
5357                                    page_map_offset,
5358                                    page,
5359                                    VM_PROT_DEFAULT,
5360                                    ((int) page->object->wimg_bits &
5361                                     VM_WIMG_MASK),
5362                                    TRUE);
5363                         vm_paging_objects_mapped++;
5364                         vm_paging_pages_mapped++;
5365                         *address = page_map_offset;
5366
5367                         /* all done and mapped, ready to use ! */
5368                         return KERN_SUCCESS;
5369                 }
5370
5371                 /*
5372                  * We ran out of pre-allocated kernel virtual
5373                  * addresses.  Just map the page in the kernel
5374                  * the slow and regular way.
5375                  */
5376                 vm_paging_no_kernel_page++;
5377                 simple_unlock(&vm_paging_lock);
5378         }
5379
5380         if (! can_unlock_object) {
5381                 return KERN_NOT_SUPPORTED;
5382         }
5383
5384         object_offset = vm_object_trunc_page(offset);
5385         map_size = vm_map_round_page(*size);
5386
5387         /*
5388          * Try and map the required range of the object
5389          * in the kernel_map
5390          */
5391
5392         vm_object_reference_locked(object);     /* for the map entry */
5393         vm_object_unlock(object);
5394
5395         kr = vm_map_enter(kernel_map,
5396                           address,
5397                           map_size,
5398                           0,
5399                           VM_FLAGS_ANYWHERE,
5400                           object,
5401                           object_offset,
5402                           FALSE,
5403                           VM_PROT_DEFAULT,
5404                           VM_PROT_ALL,
5405                           VM_INHERIT_NONE);
5406         if (kr != KERN_SUCCESS) {
5407                 *address = 0;
5408                 *size = 0;
5409                 vm_object_deallocate(object);   /* for the map entry */
5410                 vm_object_lock(object);
5411                 return kr;
5412         }
5413
5414         *size = map_size;
5415
5416         /*
5417          * Enter the mapped pages in the page table now.
5418          */
5419         vm_object_lock(object);
5420         /*
5421          * VM object must be kept locked from before PMAP_ENTER()
5422          * until after the kernel is done accessing the page(s).
5423          * Otherwise, the pmap mappings in the kernel could be
5424          * undone by a call to vm_object_pmap_protect().
5425          */
5426
5427         for (page_map_offset = 0;
5428              map_size != 0;
5429              map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5430                 unsigned int    cache_attr;
5431
5432                 page = vm_page_lookup(object, offset + page_map_offset);
5433                 if (page == VM_PAGE_NULL) {
5434                         printf("vm_paging_map_object: no page !?");
5435                         vm_object_unlock(object);
5436                         kr = vm_map_remove(kernel_map, *address, *size,
5437                                            VM_MAP_NO_FLAGS);
5438                         assert(kr == KERN_SUCCESS);
5439                         *address = 0;
5440                         *size = 0;
5441                         vm_object_lock(object);
5442                         return KERN_MEMORY_ERROR;
5443                 }
5444                 if (page->pmapped == FALSE) {
5445                         pmap_sync_page_data_phys(page->phys_page);
5446                 }
5447                 page->pmapped = TRUE;
5448                 page->wpmapped = TRUE;
5449                 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5450
5451                 //assert(pmap_verify_free(page->phys_page));
5452                 PMAP_ENTER(kernel_pmap,
5453                            *address + page_map_offset,
5454                            page,
5455                            VM_PROT_DEFAULT,
5456                            cache_attr,
5457                            TRUE);
5458         }
5459
5460         vm_paging_objects_mapped_slow++;
5461         vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5462
5463         return KERN_SUCCESS;
5464 }
5465
5466 /*
5467  * ENCRYPTED SWAP:
5468  * vm_paging_unmap_object:
5469  *      Unmaps part of a VM object's pages from the kernel
5470  *      virtual address space.
5471  * Context:
5472  *      The VM object is locked.  This lock will get
5473  *      dropped and re-acquired though.
5474  */
5475 void
5476 vm_paging_unmap_object(
5477         vm_object_t     object,
5478         vm_map_offset_t start,
5479         vm_map_offset_t end)
5480 {
5481         kern_return_t   kr;
5482         int             i;
5483
5484         if ((vm_paging_base_address == 0) ||
5485             (start < vm_paging_base_address) ||
5486             (end > (vm_paging_base_address
5487                      + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5488                 /*
5489                  * We didn't use our pre-allocated pool of
5490                  * kernel virtual address.  Deallocate the
5491                  * virtual memory.
5492                  */
5493                 if (object != VM_OBJECT_NULL) {
5494                         vm_object_unlock(object);
5495                 }
5496                 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5497                 if (object != VM_OBJECT_NULL) {
5498                         vm_object_lock(object);
5499                 }
5500                 assert(kr == KERN_SUCCESS);
5501         } else {
5502                 /*
5503                  * We used a kernel virtual address from our
5504                  * pre-allocated pool.  Put it back in the pool
5505                  * for next time.
5506                  */
5507                 assert(end - start == PAGE_SIZE);
5508                 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5509
5510                 /* undo the pmap mapping */
5511                 pmap_remove(kernel_pmap, start, end);
5512
5513                 simple_lock(&vm_paging_lock);
5514                 vm_paging_page_inuse[i] = FALSE;
5515                 if (vm_paging_page_waiter) {
5516                         thread_wakeup(&vm_paging_page_waiter);
5517                 }
5518                 simple_unlock(&vm_paging_lock);
5519         }
5520 }
5521
5522 #if CRYPTO
5523 /*
5524  * Encryption data.
5525  * "iv" is the "initial vector".  Ideally, we want to
5526  * have a different one for each page we encrypt, so that
5527  * crackers can't find encryption patterns too easily.
5528  */
5529 #define SWAP_CRYPT_AES_KEY_SIZE 128     /* XXX 192 and 256 don't work ! */
5530 boolean_t               swap_crypt_ctx_initialized = FALSE;
5531 aes_32t                 swap_crypt_key[8]; /* big enough for a 256 key */
5532 aes_ctx                 swap_crypt_ctx;
5533 const unsigned char     swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5534
5535 #if DEBUG
5536 boolean_t               swap_crypt_ctx_tested = FALSE;
5537 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5538 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5539 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5540 #endif /* DEBUG */
5541
5542 extern u_long random(void);
5543
5544 /*
5545  * Initialize the encryption context: key and key size.
5546  */
5547 void swap_crypt_ctx_initialize(void); /* forward */
5548 void
5549 swap_crypt_ctx_initialize(void)
5550 {
5551         unsigned int    i;
5552
5553         /*
5554          * No need for locking to protect swap_crypt_ctx_initialized
5555          * because the first use of encryption will come from the
5556          * pageout thread (we won't pagein before there's been a pageout)
5557          * and there's only one pageout thread.
5558          */
5559         if (swap_crypt_ctx_initialized == FALSE) {
5560                 for (i = 0;
5561                      i < (sizeof (swap_crypt_key) /
5562                           sizeof (swap_crypt_key[0]));
5563                      i++) {
5564                         swap_crypt_key[i] = random();
5565                 }
5566                 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5567                                 SWAP_CRYPT_AES_KEY_SIZE,
5568                                 &swap_crypt_ctx.encrypt);
5569                 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5570                                 SWAP_CRYPT_AES_KEY_SIZE,
5571                                 &swap_crypt_ctx.decrypt);
5572                 swap_crypt_ctx_initialized = TRUE;
5573         }
5574
5575 #if DEBUG
5576         /*
5577          * Validate the encryption algorithms.
5578          */
5579         if (swap_crypt_ctx_tested == FALSE) {
5580                 /* initialize */
5581                 for (i = 0; i < 4096; i++) {
5582                         swap_crypt_test_page_ref[i] = (char) i;
5583                 }
5584                 /* encrypt */
5585                 aes_encrypt_cbc(swap_crypt_test_page_ref,
5586                                 swap_crypt_null_iv,
5587                                 PAGE_SIZE / AES_BLOCK_SIZE,
5588                                 swap_crypt_test_page_encrypt,
5589                                 &swap_crypt_ctx.encrypt);
5590                 /* decrypt */
5591                 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5592                                 swap_crypt_null_iv,
5593                                 PAGE_SIZE / AES_BLOCK_SIZE,
5594                                 swap_crypt_test_page_decrypt,
5595                                 &swap_crypt_ctx.decrypt);
5596                 /* compare result with original */
5597                 for (i = 0; i < 4096; i ++) {
5598                         if (swap_crypt_test_page_decrypt[i] !=
5599                             swap_crypt_test_page_ref[i]) {
5600                                 panic("encryption test failed");
5601                         }
5602                 }
5603
5604                 /* encrypt again */
5605                 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5606                                 swap_crypt_null_iv,
5607                                 PAGE_SIZE / AES_BLOCK_SIZE,
5608                                 swap_crypt_test_page_decrypt,
5609                                 &swap_crypt_ctx.encrypt);
5610                 /* decrypt in place */
5611                 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5612                                 swap_crypt_null_iv,
5613                                 PAGE_SIZE / AES_BLOCK_SIZE,
5614                                 swap_crypt_test_page_decrypt,
5615                                 &swap_crypt_ctx.decrypt);
5616                 for (i = 0; i < 4096; i ++) {
5617                         if (swap_crypt_test_page_decrypt[i] !=
5618                             swap_crypt_test_page_ref[i]) {
5619                                 panic("in place encryption test failed");
5620                         }
5621                 }
5622
5623                 swap_crypt_ctx_tested = TRUE;
5624         }
5625 #endif /* DEBUG */
5626 }
5627
5628 /*
5629  * ENCRYPTED SWAP:
5630  * vm_page_encrypt:
5631  *      Encrypt the given page, for secure paging.
5632  *      The page might already be mapped at kernel virtual
5633  *      address "kernel_mapping_offset".  Otherwise, we need
5634  *      to map it.
5635  *
5636  * Context:
5637  *      The page's object is locked, but this lock will be released
5638  *      and re-acquired.
5639  *      The page is busy and not accessible by users (not entered in any pmap).
5640  */
5641 void
5642 vm_page_encrypt(
5643         vm_page_t       page,
5644         vm_map_offset_t kernel_mapping_offset)
5645 {
5646         kern_return_t           kr;
5647         vm_map_size_t           kernel_mapping_size;
5648         vm_offset_t             kernel_vaddr;
5649         union {
5650                 unsigned char   aes_iv[AES_BLOCK_SIZE];
5651                 struct {
5652                         memory_object_t         pager_object;
5653                         vm_object_offset_t      paging_offset;
5654                 } vm;
5655         } encrypt_iv;
5656
5657         if (! vm_pages_encrypted) {
5658                 vm_pages_encrypted = TRUE;
5659         }
5660
5661         assert(page->busy);
5662         assert(page->dirty || page->precious);
5663
5664         if (page->encrypted) {
5665                 /*
5666                  * Already encrypted: no need to do it again.
5667                  */
5668                 vm_page_encrypt_already_encrypted_counter++;
5669                 return;
5670         }
5671         ASSERT_PAGE_DECRYPTED(page);
5672
5673         /*
5674          * Take a paging-in-progress reference to keep the object
5675          * alive even if we have to unlock it (in vm_paging_map_object()
5676          * for example)...
5677          */
5678         vm_object_paging_begin(page->object);
5679
5680         if (kernel_mapping_offset == 0) {
5681                 /*
5682                  * The page hasn't already been mapped in kernel space
5683                  * by the caller.  Map it now, so that we can access
5684                  * its contents and encrypt them.
5685                  */
5686                 kernel_mapping_size = PAGE_SIZE;
5687                 kr = vm_paging_map_object(&kernel_mapping_offset,
5688                                           page,
5689                                           page->object,
5690                                           page->offset,
5691                                           &kernel_mapping_size,
5692                                           FALSE);
5693                 if (kr != KERN_SUCCESS) {
5694                         panic("vm_page_encrypt: "
5695                               "could not map page in kernel: 0x%x\n",
5696                               kr);
5697                 }
5698         } else {
5699                 kernel_mapping_size = 0;
5700         }
5701         kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5702
5703         if (swap_crypt_ctx_initialized == FALSE) {
5704                 swap_crypt_ctx_initialize();
5705         }
5706         assert(swap_crypt_ctx_initialized);
5707
5708         /*
5709          * Prepare an "initial vector" for the encryption.
5710          * We use the "pager" and the "paging_offset" for that
5711          * page to obfuscate the encrypted data a bit more and
5712          * prevent crackers from finding patterns that they could
5713          * use to break the key.
5714          */
5715         bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5716         encrypt_iv.vm.pager_object = page->object->pager;
5717         encrypt_iv.vm.paging_offset =
5718                 page->object->paging_offset + page->offset;
5719
5720         /* encrypt the "initial vector" */
5721         aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5722                         swap_crypt_null_iv,
5723                         1,
5724                         &encrypt_iv.aes_iv[0],
5725                         &swap_crypt_ctx.encrypt);
5726
5727         /*
5728          * Encrypt the page.
5729          */
5730         aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5731                         &encrypt_iv.aes_iv[0],
5732                         PAGE_SIZE / AES_BLOCK_SIZE,
5733                         (unsigned char *) kernel_vaddr,
5734                         &swap_crypt_ctx.encrypt);
5735
5736         vm_page_encrypt_counter++;
5737
5738         /*
5739          * Unmap the page from the kernel's address space,
5740          * if we had to map it ourselves.  Otherwise, let
5741          * the caller undo the mapping if needed.
5742          */
5743         if (kernel_mapping_size != 0) {
5744                 vm_paging_unmap_object(page->object,
5745                                        kernel_mapping_offset,
5746                                        kernel_mapping_offset + kernel_mapping_size);
5747         }
5748
5749         /*
5750          * Clear the "reference" and "modified" bits.
5751          * This should clean up any impact the encryption had
5752          * on them.
5753          * The page was kept busy and disconnected from all pmaps,
5754          * so it can't have been referenced or modified from user
5755          * space.
5756          * The software bits will be reset later after the I/O
5757          * has completed (in upl_commit_range()).
5758          */
5759         pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
5760
5761         page->encrypted = TRUE;
5762
5763         vm_object_paging_end(page->object);
5764 }
5765
5766 /*
5767  * ENCRYPTED SWAP:
5768  * vm_page_decrypt:
5769  *      Decrypt the given page.
5770  *      The page might already be mapped at kernel virtual
5771  *      address "kernel_mapping_offset".  Otherwise, we need
5772  *      to map it.
5773  *
5774  * Context:
5775  *      The page's VM object is locked but will be unlocked and relocked.
5776  *      The page is busy and not accessible by users (not entered in any pmap).
5777  */
5778 void
5779 vm_page_decrypt(
5780         vm_page_t       page,
5781         vm_map_offset_t kernel_mapping_offset)
5782 {
5783         kern_return_t           kr;
5784         vm_map_size_t           kernel_mapping_size;
5785         vm_offset_t             kernel_vaddr;
5786         union {
5787                 unsigned char   aes_iv[AES_BLOCK_SIZE];
5788                 struct {
5789                         memory_object_t         pager_object;
5790                         vm_object_offset_t      paging_offset;
5791                 } vm;
5792         } decrypt_iv;
5793
5794         assert(page->busy);
5795         assert(page->encrypted);
5796
5797         /*
5798          * Take a paging-in-progress reference to keep the object
5799          * alive even if we have to unlock it (in vm_paging_map_object()
5800          * for example)...
5801          */
5802         vm_object_paging_begin(page->object);
5803
5804         if (kernel_mapping_offset == 0) {
5805                 /*
5806                  * The page hasn't already been mapped in kernel space
5807                  * by the caller.  Map it now, so that we can access
5808                  * its contents and decrypt them.
5809                  */
5810                 kernel_mapping_size = PAGE_SIZE;
5811                 kr = vm_paging_map_object(&kernel_mapping_offset,
5812                                           page,
5813                                           page->object,
5814                                           page->offset,
5815                                           &kernel_mapping_size,
5816                                           FALSE);
5817                 if (kr != KERN_SUCCESS) {
5818                         panic("vm_page_decrypt: "
5819                               "could not map page in kernel: 0x%x\n",
5820                               kr);
5821                 }
5822         } else {
5823                 kernel_mapping_size = 0;
5824         }
5825         kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5826
5827         assert(swap_crypt_ctx_initialized);
5828
5829         /*
5830          * Prepare an "initial vector" for the decryption.
5831          * It has to be the same as the "initial vector" we
5832          * used to encrypt that page.
5833          */
5834         bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5835         decrypt_iv.vm.pager_object = page->object->pager;
5836         decrypt_iv.vm.paging_offset =
5837                 page->object->paging_offset + page->offset;
5838
5839         /* encrypt the "initial vector" */
5840         aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5841                         swap_crypt_null_iv,
5842                         1,
5843                         &decrypt_iv.aes_iv[0],
5844                         &swap_crypt_ctx.encrypt);
5845
5846         /*
5847          * Decrypt the page.
5848          */
5849         aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5850                         &decrypt_iv.aes_iv[0],
5851                         PAGE_SIZE / AES_BLOCK_SIZE,
5852                         (unsigned char *) kernel_vaddr,
5853                         &swap_crypt_ctx.decrypt);
5854         vm_page_decrypt_counter++;
5855
5856         /*
5857          * Unmap the page from the kernel's address space,
5858          * if we had to map it ourselves.  Otherwise, let
5859          * the caller undo the mapping if needed.
5860          */
5861         if (kernel_mapping_size != 0) {
5862                 vm_paging_unmap_object(page->object,
5863                                        kernel_vaddr,
5864                                        kernel_vaddr + PAGE_SIZE);
5865         }
5866
5867         /*
5868          * After decryption, the page is actually clean.
5869          * It was encrypted as part of paging, which "cleans"
5870          * the "dirty" pages.
5871          * Noone could access it after it was encrypted
5872          * and the decryption doesn't count.
5873          */
5874         page->dirty = FALSE;
5875         if (page->cs_validated && !page->cs_tainted) {
5876                 /*
5877                  * CODE SIGNING:
5878                  * This page is no longer dirty
5879                  * but could have been modified,
5880                  * so it will need to be
5881                  * re-validated.
5882                  */
5883                 page->cs_validated = FALSE;
5884                 vm_cs_validated_resets++;
5885         }
5886         pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
5887
5888         page->encrypted = FALSE;
5889
5890         /*
5891          * We've just modified the page's contents via the data cache and part
5892          * of the new contents might still be in the cache and not yet in RAM.
5893          * Since the page is now available and might get gathered in a UPL to
5894          * be part of a DMA transfer from a driver that expects the memory to
5895          * be coherent at this point, we have to flush the data cache.
5896          */
5897         pmap_sync_page_attributes_phys(page->phys_page);
5898         /*
5899          * Since the page is not mapped yet, some code might assume that it
5900          * doesn't need to invalidate the instruction cache when writing to
5901          * that page.  That code relies on "pmapped" being FALSE, so that the
5902          * caches get synchronized when the page is first mapped.
5903          */
5904         assert(pmap_verify_free(page->phys_page));
5905         page->pmapped = FALSE;
5906         page->wpmapped = FALSE;
5907
5908         vm_object_paging_end(page->object);
5909 }
5910
5911 unsigned long upl_encrypt_upls = 0;
5912 unsigned long upl_encrypt_pages = 0;
5913
5914 /*
5915  * ENCRYPTED SWAP:
5916  *
5917  * upl_encrypt:
5918  *      Encrypts all the pages in the UPL, within the specified range.
5919  *
5920  */
5921 void
5922 upl_encrypt(
5923         upl_t                   upl,
5924         upl_offset_t            crypt_offset,
5925         upl_size_t              crypt_size)
5926 {
5927         upl_size_t              upl_size;
5928         upl_offset_t            upl_offset;
5929         vm_object_t             upl_object;
5930         vm_page_t               page;
5931         vm_object_t             shadow_object;
5932         vm_object_offset_t      shadow_offset;
5933         vm_object_offset_t      paging_offset;
5934         vm_object_offset_t      base_offset;
5935
5936         upl_encrypt_upls++;
5937         upl_encrypt_pages += crypt_size / PAGE_SIZE;
5938
5939         upl_object = upl->map_object;
5940         upl_offset = upl->offset;
5941         upl_size = upl->size;
5942
5943         vm_object_lock(upl_object);
5944
5945         /*
5946          * Find the VM object that contains the actual pages.
5947          */
5948         if (upl_object->pageout) {
5949                 shadow_object = upl_object->shadow;
5950                 /*
5951                  * The offset in the shadow object is actually also
5952                  * accounted for in upl->offset.  It possibly shouldn't be
5953                  * this way, but for now don't account for it twice.
5954                  */
5955                 shadow_offset = 0;
5956                 assert(upl_object->paging_offset == 0); /* XXX ? */
5957                 vm_object_lock(shadow_object);
5958         } else {
5959                 shadow_object = upl_object;
5960                 shadow_offset = 0;
5961         }
5962
5963         paging_offset = shadow_object->paging_offset;
5964         vm_object_paging_begin(shadow_object);
5965
5966         if (shadow_object != upl_object)
5967                 vm_object_unlock(upl_object);
5968
5969
5970         base_offset = shadow_offset;
5971         base_offset += upl_offset;
5972         base_offset += crypt_offset;
5973         base_offset -= paging_offset;
5974
5975         assert(crypt_offset + crypt_size <= upl_size);
5976
5977         for (upl_offset = 0;
5978              upl_offset < crypt_size;
5979              upl_offset += PAGE_SIZE) {
5980                 page = vm_page_lookup(shadow_object,
5981                                       base_offset + upl_offset);
5982                 if (page == VM_PAGE_NULL) {
5983                         panic("upl_encrypt: "
5984                               "no page for (obj=%p,off=%lld+%d)!\n",
5985                               shadow_object,
5986                               base_offset,
5987                               upl_offset);
5988                 }
5989                 /*
5990                  * Disconnect the page from all pmaps, so that nobody can
5991                  * access it while it's encrypted.  After that point, all
5992                  * accesses to this page will cause a page fault and block
5993                  * while the page is busy being encrypted.  After the
5994                  * encryption completes, any access will cause a
5995                  * page fault and the page gets decrypted at that time.
5996                  */
5997                 pmap_disconnect(page->phys_page);
5998                 vm_page_encrypt(page, 0);
5999
6000                 if (shadow_object == vm_pageout_scan_wants_object) {
6001                         /*
6002                          * Give vm_pageout_scan() a chance to convert more
6003                          * pages from "clean-in-place" to "clean-and-free",
6004                          * if it's interested in the same pages we selected
6005                          * in this cluster.
6006                          */
6007                         vm_object_unlock(shadow_object);
6008                         vm_object_lock(shadow_object);
6009                 }
6010         }
6011
6012         vm_object_paging_end(shadow_object);
6013         vm_object_unlock(shadow_object);
6014 }
6015
6016 #else /* CRYPTO */
6017 void
6018 upl_encrypt(
6019         __unused upl_t                  upl,
6020         __unused upl_offset_t   crypt_offset,
6021         __unused upl_size_t     crypt_size)
6022 {
6023 }
6024
6025 void
6026 vm_page_encrypt(
6027         __unused vm_page_t              page,
6028         __unused vm_map_offset_t        kernel_mapping_offset)
6029 {
6030 }
6031
6032 void
6033 vm_page_decrypt(
6034         __unused vm_page_t              page,
6035         __unused vm_map_offset_t        kernel_mapping_offset)
6036 {
6037 }
6038
6039 #endif /* CRYPTO */
6040
6041 vm_size_t
6042 upl_get_internal_pagelist_offset(void)
6043 {
6044         return sizeof(struct upl);
6045 }
6046
6047 void
6048 upl_clear_dirty(
6049         upl_t           upl,
6050         boolean_t       value)
6051 {
6052         if (value) {
6053                 upl->flags |= UPL_CLEAR_DIRTY;
6054         } else {
6055                 upl->flags &= ~UPL_CLEAR_DIRTY;
6056         }
6057 }
6058
6059
6060 #ifdef MACH_BSD
6061
6062 boolean_t  upl_device_page(upl_page_info_t *upl)
6063 {
6064         return(UPL_DEVICE_PAGE(upl));
6065 }
6066 boolean_t  upl_page_present(upl_page_info_t *upl, int index)
6067 {
6068         return(UPL_PAGE_PRESENT(upl, index));
6069 }
6070 boolean_t  upl_speculative_page(upl_page_info_t *upl, int index)
6071 {
6072         return(UPL_SPECULATIVE_PAGE(upl, index));
6073 }
6074 boolean_t  upl_dirty_page(upl_page_info_t *upl, int index)
6075 {
6076         return(UPL_DIRTY_PAGE(upl, index));
6077 }
6078 boolean_t  upl_valid_page(upl_page_info_t *upl, int index)
6079 {
6080         return(UPL_VALID_PAGE(upl, index));
6081 }
6082 ppnum_t  upl_phys_page(upl_page_info_t *upl, int index)
6083 {
6084         return(UPL_PHYS_PAGE(upl, index));
6085 }
6086
6087
6088 void
6089 vm_countdirtypages(void)
6090 {
6091         vm_page_t m;
6092         int dpages;
6093         int pgopages;
6094         int precpages;
6095
6096
6097         dpages=0;
6098         pgopages=0;
6099         precpages=0;
6100
6101         vm_page_lock_queues();
6102         m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6103         do {
6104                 if (m ==(vm_page_t )0) break;
6105
6106                 if(m->dirty) dpages++;
6107                 if(m->pageout) pgopages++;
6108                 if(m->precious) precpages++;
6109
6110                 assert(m->object != kernel_object);
6111                 m = (vm_page_t) queue_next(&m->pageq);
6112                 if (m ==(vm_page_t )0) break;
6113
6114         } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
6115         vm_page_unlock_queues();
6116
6117         vm_page_lock_queues();
6118         m = (vm_page_t) queue_first(&vm_page_queue_throttled);
6119         do {
6120                 if (m ==(vm_page_t )0) break;
6121
6122                 dpages++;
6123                 assert(m->dirty);
6124                 assert(!m->pageout);
6125                 assert(m->object != kernel_object);
6126                 m = (vm_page_t) queue_next(&m->pageq);
6127                 if (m ==(vm_page_t )0) break;
6128
6129         } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
6130         vm_page_unlock_queues();
6131
6132         vm_page_lock_queues();
6133         m = (vm_page_t) queue_first(&vm_page_queue_zf);
6134         do {
6135                 if (m ==(vm_page_t )0) break;
6136
6137                 if(m->dirty) dpages++;
6138                 if(m->pageout) pgopages++;
6139                 if(m->precious) precpages++;
6140
6141                 assert(m->object != kernel_object);
6142                 m = (vm_page_t) queue_next(&m->pageq);
6143                 if (m ==(vm_page_t )0) break;
6144
6145         } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
6146         vm_page_unlock_queues();
6147
6148         printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6149
6150         dpages=0;
6151         pgopages=0;
6152         precpages=0;
6153
6154         vm_page_lock_queues();
6155         m = (vm_page_t) queue_first(&vm_page_queue_active);
6156
6157         do {
6158                 if(m == (vm_page_t )0) break;
6159                 if(m->dirty) dpages++;
6160                 if(m->pageout) pgopages++;
6161                 if(m->precious) precpages++;
6162
6163                 assert(m->object != kernel_object);
6164                 m = (vm_page_t) queue_next(&m->pageq);
6165                 if(m == (vm_page_t )0) break;
6166
6167         } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
6168         vm_page_unlock_queues();
6169
6170         printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6171
6172 }
6173 #endif /* MACH_BSD */
6174
6175 ppnum_t upl_get_highest_page(
6176                              upl_t                      upl)
6177 {
6178         return upl->highest_page;
6179 }
6180
6181 #ifdef UPL_DEBUG
6182 kern_return_t  upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6183 {
6184         upl->ubc_alias1 = alias1;
6185         upl->ubc_alias2 = alias2;
6186         return KERN_SUCCESS;
6187 }
6188 int  upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6189 {
6190         if(al)
6191                 *al = upl->ubc_alias1;
6192         if(al2)
6193                 *al2 = upl->ubc_alias2;
6194         return KERN_SUCCESS;
6195 }
6196 #endif /* UPL_DEBUG */
6197
6198
6199
6200 #if     MACH_KDB
6201 #include <ddb/db_output.h>
6202 #include <ddb/db_print.h>
6203 #include <vm/vm_print.h>
6204
6205 #define printf  kdbprintf
6206 void            db_pageout(void);
6207
6208 void
6209 db_vm(void)
6210 {
6211
6212         iprintf("VM Statistics:\n");
6213         db_indent += 2;
6214         iprintf("pages:\n");
6215         db_indent += 2;
6216         iprintf("activ %5d  inact %5d  free  %5d",
6217                 vm_page_active_count, vm_page_inactive_count,
6218                 vm_page_free_count);
6219         printf("   wire  %5d  gobbl %5d\n",
6220                vm_page_wire_count, vm_page_gobble_count);
6221         db_indent -= 2;
6222         iprintf("target:\n");
6223         db_indent += 2;
6224         iprintf("min   %5d  inact %5d  free  %5d",
6225                 vm_page_free_min, vm_page_inactive_target,
6226                 vm_page_free_target);
6227         printf("   resrv %5d\n", vm_page_free_reserved);
6228         db_indent -= 2;
6229         iprintf("pause:\n");
6230         db_pageout();
6231         db_indent -= 2;
6232 }
6233
6234 #if     MACH_COUNTERS
6235 extern int c_laundry_pages_freed;
6236 #endif  /* MACH_COUNTERS */
6237
6238 void
6239 db_pageout(void)
6240 {
6241         iprintf("Pageout Statistics:\n");
6242         db_indent += 2;
6243         iprintf("active %5d  inactv %5d\n",
6244                 vm_pageout_active, vm_pageout_inactive);
6245         iprintf("nolock %5d  avoid  %5d  busy   %5d  absent %5d\n",
6246                 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6247                 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6248         iprintf("used   %5d  clean  %5d  dirty  %5d\n",
6249                 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6250                 vm_pageout_inactive_dirty);
6251 #if     MACH_COUNTERS
6252         iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6253 #endif  /* MACH_COUNTERS */
6254 #if     MACH_CLUSTER_STATS
6255         iprintf("Cluster Statistics:\n");
6256         db_indent += 2;
6257         iprintf("dirtied   %5d   cleaned  %5d   collisions  %5d\n",
6258                 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6259                 vm_pageout_cluster_collisions);
6260         iprintf("clusters  %5d   conversions  %5d\n",
6261                 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6262         db_indent -= 2;
6263         iprintf("Target Statistics:\n");
6264         db_indent += 2;
6265         iprintf("collisions   %5d   page_dirtied  %5d   page_freed  %5d\n",
6266                 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6267                 vm_pageout_target_page_freed);
6268         db_indent -= 2;
6269 #endif  /* MACH_CLUSTER_STATS */
6270         db_indent -= 2;
6271 }
6272
6273 #endif  /* MACH_KDB */