osfmk/vm/vm_pageout.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm/vm_pageout.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *      Date:   1985
  62  *
  63  *      The proverbial page-out daemon.
  64  */
  65
  66 #include <stdint.h>
  67
  68 #include <debug.h>
  69 #include <mach_pagemap.h>
  70 #include <mach_cluster_stats.h>
  71 #include <mach_kdb.h>
  72 #include <advisory_pageout.h>
  73
  74 #include <mach/mach_types.h>
  75 #include <mach/memory_object.h>
  76 #include <mach/memory_object_default.h>
  77 #include <mach/memory_object_control_server.h>
  78 #include <mach/mach_host_server.h>
  79 #include <mach/upl.h>
  80 #include <mach/vm_map.h>
  81 #include <mach/vm_param.h>
  82 #include <mach/vm_statistics.h>
  83 #include <mach/sdt.h>
  84
  85 #include <kern/kern_types.h>
  86 #include <kern/counters.h>
  87 #include <kern/host_statistics.h>
  88 #include <kern/machine.h>
  89 #include <kern/misc_protos.h>
  90 #include <kern/thread.h>
  91 #include <kern/xpr.h>
  92 #include <kern/kalloc.h>
  93
  94 #include <machine/vm_tuning.h>
  95
  96 #if CONFIG_EMBEDDED
  97 #include <sys/kern_memorystatus.h>
  98 #endif
  99
 100 #include <vm/pmap.h>
 101 #include <vm/vm_fault.h>
 102 #include <vm/vm_map.h>
 103 #include <vm/vm_object.h>
 104 #include <vm/vm_page.h>
 105 #include <vm/vm_pageout.h>
 106 #include <vm/vm_protos.h> /* must be last */
 107 #include <vm/memory_object.h>
 108 #include <vm/vm_purgeable_internal.h>
 109
 110 /*
 111  * ENCRYPTED SWAP:
 112  */
 113 #include <../bsd/crypto/aes/aes.h>
 114
  115
      /*
       * Default pageout tunables.  Each one is guarded by #ifndef, so a
       * definition already in effect when this point is reached takes
       * precedence over the default given here.
       */
  116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
  117 #ifdef  CONFIG_EMBEDDED
  118 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  2048
  119 #else
  120 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
  121 #endif  /* CONFIG_EMBEDDED */
  122 #endif  /* VM_PAGEOUT_BURST_ACTIVE_THROTTLE */
  123
  124 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
  125 #ifdef  CONFIG_EMBEDDED
  126 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
  127 #else
  128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
  129 #endif  /* CONFIG_EMBEDDED */
  130 #endif  /* VM_PAGEOUT_BURST_INACTIVE_THROTTLE */
  131
  132 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
  133 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
  134 #endif  /* VM_PAGEOUT_DEADLOCK_RELIEF */
  135
  136 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
  137 #define VM_PAGEOUT_INACTIVE_RELIEF 50   /* minimum number of pages to move to the inactive q */
  138 #endif  /* VM_PAGEOUT_INACTIVE_RELIEF */
  139
  140 #ifndef VM_PAGE_LAUNDRY_MAX
  141 #define VM_PAGE_LAUNDRY_MAX     16UL    /* maximum pageouts on a given pageout queue */
  142 #endif  /* VM_PAGE_LAUNDRY_MAX */
  143
  144 #ifndef VM_PAGEOUT_BURST_WAIT
  145 #define VM_PAGEOUT_BURST_WAIT   30      /* milliseconds per page */
  146 #endif  /* VM_PAGEOUT_BURST_WAIT */
  147
  148 #ifndef VM_PAGEOUT_EMPTY_WAIT
  149 #define VM_PAGEOUT_EMPTY_WAIT   200     /* milliseconds */
  150 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
  151
  152 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
  153 #define VM_PAGEOUT_DEADLOCK_WAIT        300     /* milliseconds */
  154 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
  155
  156 #ifndef VM_PAGEOUT_IDLE_WAIT
  157 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
  158 #endif  /* VM_PAGEOUT_IDLE_WAIT */
  159
      /* One twentieth (5%) of "total"; the division truncates for integer arguments. */
  160 #ifndef VM_PAGE_SPECULATIVE_TARGET
  161 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
  162 #endif /* VM_PAGE_SPECULATIVE_TARGET */
  163
      /* One two-hundredth (0.5%) of "total"; the division truncates for integer arguments. */
  164 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
  165 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
  166 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
 167
 168
  169 /*
  170  *      To obtain a reasonable LRU approximation, the inactive queue
  171  *      needs to be large enough to give pages on it a chance to be
  172  *      referenced a second time.  This macro defines the fraction
  173  *      of active+inactive pages that should be inactive (one third).
  174  *      The pageout daemon uses it to update vm_page_inactive_target.
  175  *
  176  *      If vm_page_free_count falls below vm_page_free_target and
  177  *      vm_page_inactive_count is below vm_page_inactive_target,
  178  *      then the pageout daemon starts running.
  179  */
  180
  181 #ifndef VM_PAGE_INACTIVE_TARGET
  182 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 3)
  183 #endif  /* VM_PAGE_INACTIVE_TARGET */
  184
  185 /*
  186  *      Once the pageout daemon starts running, it keeps going
  187  *      until vm_page_free_count meets or exceeds vm_page_free_target.
  188  *
  189  *      The default is 15 plus 1/100th of the argument when
  190  *      CONFIG_EMBEDDED is defined, and 15 plus 1/80th otherwise.
  191  */
  189
  190 #ifndef VM_PAGE_FREE_TARGET
  191 #ifdef  CONFIG_EMBEDDED
  192 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
  193 #else
  194 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
  195 #endif  /* CONFIG_EMBEDDED */
  196 #endif  /* VM_PAGE_FREE_TARGET */
  197
  198 /*
  199  *      The pageout daemon always starts running once vm_page_free_count
  200  *      falls below vm_page_free_min.
  201  *
  202  *      The default is 10 plus 1/200th of the argument when
  203  *      CONFIG_EMBEDDED is defined, and 10 plus 1/100th otherwise.
  204  */
  202
  203 #ifndef VM_PAGE_FREE_MIN
  204 #ifdef  CONFIG_EMBEDDED
  205 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
  206 #else
  207 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
  208 #endif  /* CONFIG_EMBEDDED */
  209 #endif  /* VM_PAGE_FREE_MIN */
  210
      /*
       * NOTE(review): presumably upper bounds applied to the values computed
       * with VM_PAGE_FREE_MIN() and VM_PAGE_FREE_TARGET(); the code that uses
       * them is not visible in this part of the file -- confirm.
       */
  211 #define VM_PAGE_FREE_MIN_LIMIT          1500
  212 #define VM_PAGE_FREE_TARGET_LIMIT       2000
  213
  214
  215 /*
  216  *      When vm_page_free_count falls below vm_page_free_reserved,
  217  *      only vm-privileged threads can allocate pages.  vm-privilege
  218  *      allows the pageout daemon and default pager (and any other
  219  *      associated threads needed for default pageout) to continue
  220  *      operation by dipping into the reserved pool of pages.
  221  *
  222  *      The default is n plus six times VM_PAGE_LAUNDRY_MAX.
  223  */
  222
  223 #ifndef VM_PAGE_FREE_RESERVED
  224 #define VM_PAGE_FREE_RESERVED(n)        \
  225         ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
  226 #endif  /* VM_PAGE_FREE_RESERVED */
  227
  228 /*
  229  *      When we dequeue pages from the inactive list, they are
  230  *      reactivated (ie, put back on the active queue) if referenced.
  231  *      However, it is possible to starve the free list if other
  232  *      processors are referencing pages faster than we can turn off
  233  *      the referenced bit.  So we limit the number of reactivations
  234  *      we will make per call of vm_pageout_scan().
  235  *
  236  *      NOTE(review): in the non-embedded case the limit is
  237  *      MAX(avail / 20, VM_PAGE_REACTIVATE_LIMIT_MAX).  Assuming MAX()
  238  *      is the usual "larger of two" macro, the constant acts as a
  239  *      lower bound on the limit rather than the upper bound its name
  240  *      suggests -- confirm that this is intended.
  241  */
  236 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
  237 #ifndef VM_PAGE_REACTIVATE_LIMIT
  238 #ifdef  CONFIG_EMBEDDED
  239 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
  240 #else
  241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
  242 #endif  /* CONFIG_EMBEDDED */
  243 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
  244 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       100
 245
 246
  247 /*
  248  * State of one pageout queue.  Two instances are defined in this file:
  249  * vm_pageout_queue_internal and vm_pageout_queue_external.
  250  *
  251  * Must hold the page queues lock to manipulate this structure.
  252  */
  251 struct vm_pageout_queue {
  252         queue_head_t    pgo_pending;    /* laundry pages to be processed by pager's iothread */
  253         unsigned int    pgo_laundry;    /* current count of laundry pages on queue or in flight */
  254         unsigned int    pgo_maxlaundry; /* VM_PAGE_Q_THROTTLED() is true once pgo_laundry reaches this value */
  255
  256         unsigned int    pgo_idle:1,     /* iothread is blocked waiting for work to do */
  257                         pgo_busy:1,     /* iothread is currently processing request from pgo_pending */
  258                         pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
  259                         :0;             /* zero-width bit-field: ends packing into the current storage unit */
  260 };
  261
      /*
       * True when the queue's laundry count (pgo_laundry) has reached or
       * exceeded its configured maximum (pgo_maxlaundry).  "q" is a pointer
       * to a struct vm_pageout_queue and is evaluated twice.
       */
  262 #define VM_PAGE_Q_THROTTLED(q)          \
  263         ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
 264
 265
  266 /*
  267  * Exported variable used to broadcast the activation of the pageout scan.
  268  * Working Set uses this to throttle its use of pmap removes.  In this
  269  * way, code which runs within memory in an uncontested context does
  270  * not keep encountering soft faults.
  271  */
  272
  273 unsigned int    vm_pageout_scan_event_counter = 0;
  274
  275 /*
  276  * Forward declarations for internal routines.
  277  * The static routines are private to this file; vm_pageout_continue()
  278  * and vm_pageout_scan() are declared with external linkage.
  279  */
  278
  279 static void vm_pageout_garbage_collect(int);
  280 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
  281 static void vm_pageout_iothread_external(void);
  282 static void vm_pageout_iothread_internal(void);
  283 static void vm_pageout_queue_steal(vm_page_t);
  284
  285 extern void vm_pageout_continue(void);
  286 extern void vm_pageout_scan(void);
  287
      /*
       * Thread handles for the two I/O threads; THREAD_NULL until assigned.
       * (The assignment is not visible in this part of the file.)
       */
  288 static thread_t vm_pageout_external_iothread = THREAD_NULL;
  289 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
  290
  291 unsigned int vm_pageout_reserved_internal = 0;
  292 unsigned int vm_pageout_reserved_really = 0;
  293
      /*
       * Run-time tunables, all zero at load time.
       * NOTE(review): presumably filled in from the VM_PAGEOUT_* defaults
       * defined earlier in this file during pageout initialization; that
       * code is not visible in this part of the file -- confirm.
       */
  294 unsigned int vm_pageout_idle_wait = 0;          /* milliseconds */
  295 unsigned int vm_pageout_empty_wait = 0;         /* milliseconds */
  296 unsigned int vm_pageout_burst_wait = 0;         /* milliseconds */
  297 unsigned int vm_pageout_deadlock_wait = 0;      /* milliseconds */
  298 unsigned int vm_pageout_deadlock_relief = 0;
  299 unsigned int vm_pageout_inactive_relief = 0;
  300 unsigned int vm_pageout_burst_active_throttle = 0;
  301 unsigned int vm_pageout_burst_inactive_throttle = 0;
  302
  303 /*
  304  *      Protection against zero fill flushing live working sets derived
  305  *      from existing backing store and files
  306  */
  307 unsigned int vm_accellerate_zf_pageout_trigger = 400;
  308 unsigned int zf_queue_min_count = 100;
  309 unsigned int vm_zf_count = 0;
  310 unsigned int vm_zf_queue_count = 0;
  311
  312 /*
  313  *      These variables record the pageout daemon's actions:
  314  *      how many pages it looks at and what happens to those pages.
  315  *      No locking needed because only one thread modifies the variables.
  316  */
  317
  318 unsigned int vm_pageout_active = 0;             /* debugging */
  319 unsigned int vm_pageout_inactive = 0;           /* debugging */
  320 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
  321 unsigned int vm_pageout_inactive_forced = 0;    /* debugging */
  322 unsigned int vm_pageout_inactive_nolock = 0;    /* debugging */
  323 unsigned int vm_pageout_inactive_avoid = 0;     /* debugging */
  324 unsigned int vm_pageout_inactive_busy = 0;      /* debugging */
  325 unsigned int vm_pageout_inactive_absent = 0;    /* debugging */
  326 unsigned int vm_pageout_inactive_used = 0;      /* debugging */
  327 unsigned int vm_pageout_inactive_clean = 0;     /* debugging */
  328 unsigned int vm_pageout_inactive_dirty = 0;     /* debugging */
  329 unsigned int vm_pageout_dirty_no_pager = 0;     /* debugging */
  330 unsigned int vm_pageout_purged_objects = 0;     /* debugging */
  331 unsigned int vm_stat_discard = 0;               /* debugging */
  332 unsigned int vm_stat_discard_sent = 0;          /* debugging */
  333 unsigned int vm_stat_discard_failure = 0;       /* debugging */
  334 unsigned int vm_stat_discard_throttle = 0;      /* debugging */
  335 unsigned int vm_pageout_reactivation_limit_exceeded = 0;        /* debugging */
  336 unsigned int vm_pageout_catch_ups = 0;                          /* debugging */
  337 unsigned int vm_pageout_inactive_force_reclaim = 0;     /* debugging */
  338
  339 unsigned int vm_pageout_scan_active_throttled = 0;
  340 unsigned int vm_pageout_scan_inactive_throttled = 0;
  341 unsigned int vm_pageout_scan_throttle = 0;                      /* debugging */
  342 unsigned int vm_pageout_scan_burst_throttle = 0;                /* debugging */
  343 unsigned int vm_pageout_scan_empty_throttle = 0;                /* debugging */
  344 unsigned int vm_pageout_scan_deadlock_detected = 0;             /* debugging */
  345 unsigned int vm_pageout_scan_active_throttle_success = 0;       /* debugging */
  346 unsigned int vm_pageout_scan_inactive_throttle_success = 0;     /* debugging */
  347 /*
  348  * Backing store throttle when BS is exhausted.
  349  * Set and cleared by vm_backing_store_disable(); its address is also
  350  * the event passed to thread_wakeup() when the flag is cleared.
  351  */
  350 unsigned int    vm_backing_store_low = 0;
  351
  352 unsigned int vm_pageout_out_of_line  = 0;
  353 unsigned int vm_pageout_in_place  = 0;
  354
  355 /*
  356  * ENCRYPTED SWAP:
  357  * counters and statistics...
  358  */
  359 unsigned long vm_page_decrypt_counter = 0;
  360 unsigned long vm_page_decrypt_for_upl_counter = 0;
  361 unsigned long vm_page_encrypt_counter = 0;
  362 unsigned long vm_page_encrypt_abort_counter = 0;
  363 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
  364 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
  365
      /*
       * The two pageout queues.  vm_pageout_cluster() queues a page on the
       * internal queue when its object is marked internal, and on the
       * external queue otherwise.
       */
  366 struct  vm_pageout_queue vm_pageout_queue_internal;
  367 struct  vm_pageout_queue vm_pageout_queue_external;
  368
  369 unsigned int vm_page_speculative_target = 0;
  370
  371 vm_object_t     vm_pageout_scan_wants_object = VM_OBJECT_NULL;
  372
  373 unsigned long vm_cs_validated_resets = 0;
 374
 375 /*
 376  *      Routine:        vm_backing_store_disable
 377  *      Purpose:
 378  *              Suspend non-privileged threads wishing to extend
 379  *              backing store when we are low on backing store
 380  *              (Synchronized by caller)
 381  */
 382 void
 383 vm_backing_store_disable(
 384         boolean_t       disable)
 385 {
 386         if(disable) {
 387                 vm_backing_store_low = 1;
 388         } else {
 389                 if(vm_backing_store_low) {
 390                         vm_backing_store_low = 0;
 391                         thread_wakeup((event_t) &vm_backing_store_low);
 392                 }
 393         }
 394 }
  395
  396
      /*
       * Optional cluster statistics.  When MACH_CLUSTER_STATS is enabled the
       * counters below exist and CLUSTER_STAT(clause) expands to "clause";
       * otherwise CLUSTER_STAT(clause) expands to nothing, so the wrapped
       * statement disappears from the build.
       */
  397 #if MACH_CLUSTER_STATS
  398 unsigned long vm_pageout_cluster_dirtied = 0;
  399 unsigned long vm_pageout_cluster_cleaned = 0;
  400 unsigned long vm_pageout_cluster_collisions = 0;
  401 unsigned long vm_pageout_cluster_clusters = 0;
  402 unsigned long vm_pageout_cluster_conversions = 0;
  403 unsigned long vm_pageout_target_collisions = 0;
  404 unsigned long vm_pageout_target_page_dirtied = 0;
  405 unsigned long vm_pageout_target_page_freed = 0;
  406 #define CLUSTER_STAT(clause)    clause
  407 #else   /* MACH_CLUSTER_STATS */
  408 #define CLUSTER_STAT(clause)
  409 #endif  /* MACH_CLUSTER_STATS */
 410
  411 /*
  412  *      Routine:        vm_pageout_object_terminate
  413  *      Purpose:
  414  *              Destroy the pageout_object, and perform all of the
  415  *              required cleanup actions.
  416  *
  417  *              Every page on the pageout object's memq is freed, and
  418  *              the page at the corresponding offset in the shadow
  419  *              object (if one is found) has its cleaning/pageout
  420  *              state resolved.
  421  *
  422  *      Parameters:
  423  *              object          the pageout object (object->pageout is
  424  *                              asserted); object->shadow is the object
  425  *                              whose pages were being cleaned.
  426  *
  427  *      In/Out conditions:
  428  *              The object must be locked, and will be returned locked.
  429  *              The shadow object is locked and unlocked inside this
  430  *              routine; the page queues lock is taken and dropped
  431  *              once per shadow page that is processed.
  432  */
  420 void
  421 vm_pageout_object_terminate(
  422         vm_object_t     object)
  423 {
  424         vm_object_t     shadow_object;
  425
  426         /*
  427          * Deal with the deallocation (last reference) of a pageout object
  428          * (used for cleaning-in-place) by dropping the paging references/
  429          * freeing pages in the original object.
  430          */
  431
  432         assert(object->pageout);
  433         shadow_object = object->shadow;
  434         vm_object_lock(shadow_object);
  435
  436         while (!queue_empty(&object->memq)) {
  437                 vm_page_t               p, m;
  438                 vm_object_offset_t      offset;
  439
  440                 p = (vm_page_t) queue_first(&object->memq);
  441
  442                 assert(p->private);
  443                 assert(p->pageout);
  444                 p->pageout = FALSE;
  445                 assert(!p->cleaning);
  446
                      /* remember the offset before the private page is freed */
  447                 offset = p->offset;
  448                 VM_PAGE_FREE(p);
  449                 p = VM_PAGE_NULL;
  450
  451                 m = vm_page_lookup(shadow_object,
  452                         offset + object->shadow_offset);
  453
                      /* no page at that offset in the shadow object: nothing to clean up */
  454                 if(m == VM_PAGE_NULL)
  455                         continue;
  456                 assert(m->cleaning);
  457                 /* used as a trigger on upl_commit etc to recognize the */
  458                 /* pageout daemon's subsequent desire to pageout a cleaning */
  459                 /* page.  When the bit is on the upl commit code will   */
  460                 /* respect the pageout bit in the target page over the  */
  461                 /* caller's page list indication */
  462                 m->dump_cleaning = FALSE;
  463
  464                 assert((m->dirty) || (m->precious) ||
  465                                 (m->busy && m->cleaning));
  466
  467                 /*
  468                  * Handle the trusted pager throttle.
  469                  * Also decrement the burst throttle (if external).
  470                  */
  471                 vm_page_lock_queues();
  472                 if (m->laundry) {
  473                         vm_pageout_throttle_up(m);
  474                 }
  475
  476                 /*
  477                  * Handle the "target" page(s). These pages are to be freed if
  478                  * successfully cleaned. Target pages are always busy, and are
  479                  * wired exactly once. The initial target pages are not mapped,
  480                  * (so cannot be referenced or modified) but converted target
  481                  * pages may have been modified between the selection as an
  482                  * adjacent page and conversion to a target.
  483                  */
  484                 if (m->pageout) {
  485                         assert(m->busy);
  486                         assert(m->wire_count == 1);
  487                         m->cleaning = FALSE;
  488                         m->encrypted_cleaning = FALSE;
  489                         m->pageout = FALSE;
  490 #if MACH_CLUSTER_STATS
  491                         if (m->wanted) vm_pageout_target_collisions++;
  492 #endif
  493                         /*
  494                          * Revoke all access to the page. Since the object is
  495                          * locked, and the page is busy, this prevents the page
  496                          * from being dirtied after the pmap_disconnect() call
  497                          * returns.
  498                          *
  499                          * Since the page is left "dirty" but "not modified", we
  500                          * can detect whether the page was redirtied during
  501                          * pageout by checking the modify state.
  502                          */
  503                         if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
  504                               m->dirty = TRUE;
  505                         else
  506                               m->dirty = FALSE;
  507
                              /* redirtied during pageout: keep the page; otherwise free it */
  508                         if (m->dirty) {
  509                                 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
  510                                 vm_page_unwire(m);/* reactivates */
  511                                 VM_STAT_INCR(reactivations);
  512                                 PAGE_WAKEUP_DONE(m);
  513                         } else {
  514                                 CLUSTER_STAT(vm_pageout_target_page_freed++;)
  515                                 vm_page_free(m);/* clears busy, etc. */
  516                         }
  517                         vm_page_unlock_queues();
  518                         continue;
  519                 }
  520                 /*
  521                  * Handle the "adjacent" pages. These pages were cleaned in
  522                  * place, and should be left alone.
  523                  * If the page is not on the active, inactive or throttled
  524                  * queue (and is not private), put it back on a queue:
  525                  * activate it if it is referenced, otherwise deactivate it.
  526                  * NOTE(review): this comment used to refer to prep_pin_count,
  527                  * which the code below does not test.
  528                  */
  526                 if (!m->active && !m->inactive && !m->throttled && !m->private) {
  527                         if (m->reference)
  528                                 vm_page_activate(m);
  529                         else
  530                                 vm_page_deactivate(m);
  531                 }
  532                 if((m->busy) && (m->cleaning)) {
  533
  534                         /* the request_page_list case, (COPY_OUT_FROM FALSE) */
  535                         m->busy = FALSE;
  536
  537                         /* We do not re-set m->dirty ! */
  538                         /* The page was busy so no extraneous activity     */
  539                         /* could have occurred. COPY_INTO is a read into the */
  540                         /* new pages. CLEAN_IN_PLACE does actually write   */
  541                         /* out the pages but handling outside of this code */
  542                         /* will take care of resetting dirty. We clear the */
  543                         /* modify however for the Programmed I/O case.     */
  544                         pmap_clear_modify(m->phys_page);
  545
  546                         m->absent = FALSE;
  547                         m->overwriting = FALSE;
  548                 } else if (m->overwriting) {
  549                         /* alternate request page list, write to page_list */
  550                         /* case.  Occurs when the original page was wired  */
  551                         /* at the time of the list request */
  552                         assert(m->wire_count != 0);
  553                         vm_page_unwire(m);/* reactivates */
  554                         m->overwriting = FALSE;
  555                 } else {
  556                 /*
  557                  * Set the dirty state according to whether or not the page was
  558                  * modified during the pageout. Note that we purposefully do
  559                  * NOT call pmap_clear_modify since the page is still mapped.
  560                  * If the page were to be dirtied between the 2 calls,
  561                  * this fact would be lost. This code is only necessary to
  562                  * maintain statistics, since the pmap module is always
  563                  * consulted if m->dirty is false.
  564                  * Without MACH_CLUSTER_STATS the dirty bit is simply cleared.
  565                  */
  565 #if MACH_CLUSTER_STATS
  566                         m->dirty = pmap_is_modified(m->phys_page);
  567
  568                         if (m->dirty)   vm_pageout_cluster_dirtied++;
  569                         else            vm_pageout_cluster_cleaned++;
  570                         if (m->wanted)  vm_pageout_cluster_collisions++;
  571 #else
  572                         m->dirty = 0;
  573 #endif
  574                 }
  575                 m->cleaning = FALSE;
  576                 m->encrypted_cleaning = FALSE;
  577
  578                 /*
  579                  * Wakeup any thread waiting for the page to be un-cleaning.
  580                  */
  581                 PAGE_WAKEUP(m);
  582                 vm_page_unlock_queues();
  583         }
  584         /*
  585          * Account for the paging reference taken in vm_paging_object_allocate.
  586          */
  587         vm_object_paging_end(shadow_object);
  588         vm_object_unlock(shadow_object);
  589
              /* by now the pageout object must be unreferenced and empty */
  590         assert(object->ref_count == 0);
  591         assert(object->paging_in_progress == 0);
  592         assert(object->resident_page_count == 0);
  593         return;
  594 }
 595
  596 /*
  597  * Routine:     vm_pageclean_setup
  598  *
  599  * Purpose:     setup a page to be cleaned (made non-dirty), but not
  600  *              necessarily flushed from the VM page cache.
  601  *              This is accomplished by cleaning in place.
  602  *
  603  *              The page must not be busy, and the object and page
  604  *              queues must be locked.
  605  *
  606  * Parameters:
  607  *              m               page to be cleaned in place; marked
  608  *                              cleaning and dirty, precious cleared.
  609  *              new_m           fictitious page (asserted) that is turned
  610  *                              into a private, wired, pageout page
  611  *                              referring to the same physical page as m.
  612  *              new_object      object into which new_m is inserted.
  613  *              new_offset      offset within new_object for new_m.
  614  */
  607 void
  608 vm_pageclean_setup(
  609         vm_page_t               m,
  610         vm_page_t               new_m,
  611         vm_object_t             new_object,
  612         vm_object_offset_t      new_offset)
  613 {
  614         assert(!m->busy);
      /* the following assertion is compiled out */
  615 #if 0
  616         assert(!m->cleaning);
  617 #endif
  618
  619         XPR(XPR_VM_PAGEOUT,
  620     "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
  621                 (integer_t)m->object, m->offset, (integer_t)m,
  622                 (integer_t)new_m, new_offset);
  623
              /*
               * Clear the pmap modify state while leaving m->dirty set below,
               * so the page is "dirty" but "not modified" during the clean.
               */
  624         pmap_clear_modify(m->phys_page);
  625
  626         /*
  627          * Mark original page as cleaning in place.
  628          */
  629         m->cleaning = TRUE;
  630         m->dirty = TRUE;
  631         m->precious = FALSE;
  632
  633         /*
  634          * Convert the fictitious page to a private shadow of
  635          * the real page.
  636          */
  637         assert(new_m->fictitious);
  638         assert(new_m->phys_page == vm_page_fictitious_addr);
  639         new_m->fictitious = FALSE;
  640         new_m->private = TRUE;
  641         new_m->pageout = TRUE;
  642         new_m->phys_page = m->phys_page;
  643         vm_page_wire(new_m);
  644
  645         vm_page_insert(new_m, new_object, new_offset);
  646         assert(!new_m->wanted);
  647         new_m->busy = FALSE;
  648 }
 649
  650 /*
  651  *      Routine:        vm_pageout_initialize_page
  652  *      Purpose:
  653  *              Causes the specified page to be initialized in
  654  *              the appropriate memory object. This routine is used to push
  655  *              pages into a copy-object when they are modified in the
  656  *              permanent object.
  657  *
  658  *              The page is moved to a temporary object and paged out.
  659  *
  660  *      In/out conditions:
  661  *              The page in question must not be on any pageout queues.
  662  *              The object to which it belongs must be locked.
  663  *              The page must be busy, but not hold a paging reference.
  664  *              The object is unlocked around the call to
  665  *              memory_object_data_initialize() and is locked again
  666  *              on normal return.
  667  *
  668  *      Implementation:
  669  *              Move this page to a completely new object.
  670  *
  671  *      NOTE(review): the code visible here does not move the page to
  672  *      another object; it marks the page busy/cleaning/pageout, wires
  673  *      it, and passes the pager, paging offset and PAGE_SIZE to
  674  *      memory_object_data_initialize().  The "temporary object" /
  675  *      "completely new object" wording above may be stale -- confirm.
  676  */
  668 void
  669 vm_pageout_initialize_page(
  670         vm_page_t       m)
  671 {
  672         vm_object_t             object;
  673         vm_object_offset_t      paging_offset;
  674         vm_page_t               holding_page;   /* only ever assigned NULL in this routine */
  675         memory_object_t         pager;
  676
  677         XPR(XPR_VM_PAGEOUT,
  678                 "vm_pageout_initialize_page, page 0x%X\n",
  679                 (integer_t)m, 0, 0, 0, 0);
  680         assert(m->busy);
  681
  682         /*
  683          *      Verify that we really want to clean this page
  684          */
  685         assert(!m->absent);
  686         assert(!m->error);
  687         assert(m->dirty);
  688
  689         /*
  690          *      Create a paging reference to let us play with the object.
  691          */
  692         object = m->object;
  693         paging_offset = m->offset + object->paging_offset;
  694
              /*
               * Run-time version of the checks asserted above.
               * NOTE(review): the unlock and return after panic() are
               * presumably never reached, assuming panic() does not return.
               */
  695         if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
  696                 VM_PAGE_FREE(m);
  697                 panic("reservation without pageout?"); /* alan */
  698                 vm_object_unlock(object);
  699
  700                 return;
  701         }
  702
  703         /*
  704          * If there's no pager, then we can't clean the page.  This should
  705          * never happen since this should be a copy object and therefore not
  706          * an external object, so the pager should always be there.
  707          */
  708
  709         pager = object->pager;
  710
  711         if (pager == MEMORY_OBJECT_NULL) {
  712                 VM_PAGE_FREE(m);
  713                 panic("missing pager for copy object");
  714                 return;
  715         }
  716
  717         /* set the page for future call to vm_fault_list_request */
  718         vm_object_paging_begin(object);
  719         holding_page = NULL;
  720         vm_page_lock_queues();
              /* as in vm_pageclean_setup(): pmap modify state cleared, m->dirty kept set */
  721         pmap_clear_modify(m->phys_page);
  722         m->dirty = TRUE;
  723         m->busy = TRUE;
  724         m->list_req_pending = TRUE;
  725         m->cleaning = TRUE;
  726         m->pageout = TRUE;
  727         vm_page_wire(m);
  728         vm_page_unlock_queues();
              /* the object lock is dropped across the call into the pager */
  729         vm_object_unlock(object);
  730
  731         /*
  732          *      Write the data to its pager.
  733          *      Note that the data is passed by naming the new object,
  734          *      not a virtual address; the pager interface has been
  735          *      manipulated to use the "internal memory" data type.
  736          *      [The object reference from its allocation is donated
  737          *      to the eventual recipient.]
  738          */
  739         memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
  740
              /* retake the object lock and drop the paging reference taken above */
  741         vm_object_lock(object);
  742         vm_object_paging_end(object);
  743 }
  744
      /*
       * Cluster statistics table, present only when MACH_CLUSTER_STATS is
       * enabled.  It has MAXCLUSTERPAGES (16) entries of three counters each.
       * NOTE(review): the code that updates the table is not visible in this
       * part of the file, so the indexing scheme is not documented here.
       */
  745 #if     MACH_CLUSTER_STATS
  746 #define MAXCLUSTERPAGES 16
  747 struct {
  748         unsigned long pages_in_cluster;
  749         unsigned long pages_at_higher_offsets;
  750         unsigned long pages_at_lower_offsets;
  751 } cluster_stats[MAXCLUSTERPAGES];
  752 #endif  /* MACH_CLUSTER_STATS */
 753
 754
  755 /*
  756  * vm_pageout_cluster:
  757  *
  758  * Given a page, queue it to the appropriate I/O thread,
  759  * which will page it out and attempt to clean adjacent pages
  760  * in the same operation.
  761  *
  762  * The page must be busy, and the object and queues locked. We will take a
  763  * paging reference to prevent deallocation or collapse when we
  764  * release the object lock back at the call site.  The I/O thread
  765  * is responsible for consuming this reference.
  766  *
  767  * The page must not be on any pageout queue.
  768  *
  769  * In this routine the page is wired, flagged list_req_pending /
  770  * cleaning / pageout / laundry, and appended to the pending list of
  771  * either the internal or the external pageout queue; that queue's
  772  * laundry count is incremented and, if the queue is marked idle, a
  773  * wakeup is posted on its pending list.
  774  */
  769
  770 void
  771 vm_pageout_cluster(vm_page_t m)
  772 {
  773         vm_object_t     object = m->object;
  774         struct          vm_pageout_queue *q;
  775
  776
  777         XPR(XPR_VM_PAGEOUT,
  778                 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
  779                 (integer_t)object, m->offset, (integer_t)m, 0, 0);
  780
  781         /*
  782          * Only a certain kind of page is appreciated here.
  783          */
  784         assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
  785         assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
  786         assert(!m->throttled);
  787
  788         /*
  789          * protect the object from collapse -
  790          * locking in the object's paging_offset.
  791          */
  792         vm_object_paging_begin(object);
  793
  794         /*
  795          * set the page for future call to vm_fault_list_request
  796          * page should already be marked busy
  797          */
  798         vm_page_wire(m);
  799         m->list_req_pending = TRUE;
  800         m->cleaning = TRUE;
  801         m->pageout = TRUE;
  802         m->laundry = TRUE;
  803
              /* choose the queue by whether the page's object is internal */
  804         if (object->internal == TRUE)
  805                 q = &vm_pageout_queue_internal;
  806         else
  807                 q = &vm_pageout_queue_external;
  808         q->pgo_laundry++;
  809
  810         m->pageout_queue = TRUE;
  811         queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
  812
              /* the queue is marked idle: clear the flag and post a wakeup on its pending list */
  813         if (q->pgo_idle == TRUE) {
  814                 q->pgo_idle = FALSE;
  815                 thread_wakeup((event_t) &q->pgo_pending);
  816         }
  817 }
 818
 819
 820 unsigned long vm_pageout_throttle_up_count = 0;      /* bumped on every call to vm_pageout_throttle_up() */
 821 /*
 822  * A page is back from laundry.  Clear its laundry flag, drop the
 823  * laundry count of its pageout queue and, if that queue is marked
 824  * throttled, wake up the waiter so more pages can go to laundry.
 825  *
 826  * Object and page queues must be locked.
 827  */
 828 void
 829 vm_pageout_throttle_up(
 830         vm_page_t       m)
 831 {
 832         struct vm_pageout_queue *q;
 833
 834         vm_pageout_throttle_up_count++;
 835
 836         assert(m->laundry);
 837         assert(m->object != VM_OBJECT_NULL);
 838         assert(m->object != kernel_object);
 839         /* same internal/external queue choice as made in vm_pageout_cluster() */
 840         if (m->object->internal == TRUE)
 841                 q = &vm_pageout_queue_internal;
 842         else
 843                 q = &vm_pageout_queue_external;
 844
 845         m->laundry = FALSE;
 846         q->pgo_laundry--;
 847         /* vm_pageout_scan sets pgo_throttled on the internal queue before it waits on pgo_laundry */
 848         if (q->pgo_throttled == TRUE) {
 849                 q->pgo_throttled = FALSE;
 850                 thread_wakeup((event_t) &q->pgo_laundry);
 851         }
 852 }
 853
 854
 855 /*
 856  *      vm_pageout_scan does the dirty work for the pageout daemon.
 857  *      It returns with vm_page_queue_free_lock held and
 858  *      vm_page_free_wanted == 0.
 859  */
 860
 861 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  (3 * MAX_UPL_TRANSFER)  /* delayed_unlock value past which vm_pageout_scan yields the queues lock */
 862 /* values of flow_control.state, used by vm_pageout_scan while the internal pageout queue is throttled */
 863 #define FCS_IDLE                0       /* no deadlock timer running */
 864 #define FCS_DELAYED             1       /* deadlock timer armed; waiting for the laundry to drain */
 865 #define FCS_DEADLOCK_DETECTED   2       /* timer expired; moving vm_pageout_deadlock_target pages */
 866
 867 struct flow_control {
 868         int             state;          /* one of the FCS_* values above */
 869         mach_timespec_t ts;             /* deadline after which FCS_DELAYED becomes FCS_DEADLOCK_DETECTED */
 870 };
 871
 872 void
 873 vm_pageout_scan(void)
 874 {
 875         unsigned int loop_count = 0;
 876         unsigned int inactive_burst_count = 0;
 877         unsigned int active_burst_count = 0;
 878         unsigned int reactivated_this_call;
 879         unsigned int reactivate_limit;
 880         vm_page_t   local_freeq = NULL;
 881         int         local_freed = 0;
 882         int         delayed_unlock;
 883         int         need_internal_inactive = 0;
 884         int         refmod_state = 0;
 885         int     vm_pageout_deadlock_target = 0;
 886         struct  vm_pageout_queue *iq;
 887         struct  vm_pageout_queue *eq;
 888         struct  vm_speculative_age_q *sq;
 889         struct  flow_control    flow_control;
 890         boolean_t inactive_throttled = FALSE;
 891         boolean_t try_failed;
 892         mach_timespec_t         ts;
 893         unsigned int msecs = 0;
 894         vm_object_t     object;
 895         vm_object_t     last_object_tried;
 896         int     zf_ratio;
 897         int     zf_run_count;
 898         uint32_t        catch_up_count = 0;
 899         uint32_t        inactive_reclaim_run;
 900         boolean_t       forced_reclaim;
 901
 902         flow_control.state = FCS_IDLE;
 903         iq = &vm_pageout_queue_internal;
 904         eq = &vm_pageout_queue_external;
 905         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
 906
 907
 908         XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
 909
 910
 911         vm_page_lock_queues();
 912         delayed_unlock = 1;     /* must be nonzero if Qs are locked, 0 if unlocked */
 913
 914         /*
 915          *      Calculate the max number of referenced pages on the inactive
 916          *      queue that we will reactivate.
 917          */
 918         reactivated_this_call = 0;
 919         reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
 920                                                     vm_page_inactive_count);
 921         inactive_reclaim_run = 0;
 922
 923
 924 /*???*/ /*
 925          *      We want to gradually dribble pages from the active queue
 926          *      to the inactive queue.  If we let the inactive queue get
 927          *      very small, and then suddenly dump many pages into it,
 928          *      those pages won't get a sufficient chance to be referenced
 929          *      before we start taking them from the inactive queue.
 930          *
 931          *      We must limit the rate at which we send pages to the pagers.
 932          *      data_write messages consume memory, for message buffers and
 933          *      for map-copy objects.  If we get too far ahead of the pagers,
 934          *      we can potentially run out of memory.
 935          *
 936          *      We can use the laundry count to limit directly the number
 937          *      of pages outstanding to the default pager.  A similar
 938          *      strategy for external pagers doesn't work, because
 939          *      external pagers don't have to deallocate the pages sent them,
 940          *      and because we might have to send pages to external pagers
 941          *      even if they aren't processing writes.  So we also
 942          *      use a burst count to limit writes to external pagers.
 943          *
 944          *      When memory is very tight, we can't rely on external pagers to
 945          *      clean pages.  They probably aren't running, because they
 946          *      aren't vm-privileged.  If we kept sending dirty pages to them,
 947          *      we could exhaust the free list.
 948          */
 949
 950
 951 Restart:
 952         assert(delayed_unlock!=0);
 953
 954         /*
 955          *      A page is "zero-filled" if it was not paged in from somewhere,
 956          *      and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
 957          *      Recalculate the zero-filled page ratio.  We use this to apportion
 958          *      victimized pages between the normal and zero-filled inactive
 959          *      queues according to their relative abundance in memory.  Thus if a task
 960          *      is flooding memory with zf pages, we begin to hunt them down.
 961          *      It would be better to throttle greedy tasks at a higher level,
 962          *      but at the moment mach vm cannot do this.
 963          */
 964         {
 965                 uint32_t  total  = vm_page_active_count + vm_page_inactive_count;
 966                 uint32_t  normal = total - vm_zf_count;
 967
 968                 /* zf_ratio is the number of zf pages we victimize per normal page */
 969
 970                 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
 971                         zf_ratio = 0;
 972                 else if ((vm_zf_count <= normal) || (normal == 0))
 973                         zf_ratio = 1;
 974                 else
 975                         zf_ratio = vm_zf_count / normal;
 976
 977                 zf_run_count = 0;
 978         }
 979
 980         /*
 981          *      Recalculate vm_page_inactive_target.
 982          */
 983         vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
 984                                                           vm_page_inactive_count +
 985                                                           vm_page_speculative_count);
 986         /*
 987          * don't want to wake the pageout_scan thread up everytime we fall below
 988          * the targets... set a low water mark at 0.25% below the target
 989          */
 990         vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
 991
 992         vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
 993                                                                 vm_page_inactive_count);
 994         object = NULL;
 995         last_object_tried = NULL;
 996         try_failed = FALSE;
 997
 998         if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
 999                 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1000         else
1001                 catch_up_count = 0;
1002
1003         for (;;) {
1004                 vm_page_t m;
1005
1006                 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1007
1008                 if (delayed_unlock == 0) {
1009                         vm_page_lock_queues();
1010                         delayed_unlock = 1;
1011                 }
1012
1013                 /*
1014                  *      Don't sweep through active queue more than the throttle
1015                  *      which should be kept relatively low
1016                  */
1017                 active_burst_count = vm_pageout_burst_active_throttle;
1018
1019                 /*
1020                  *      Move pages from active to inactive.
1021                  */
1022                 if (need_internal_inactive == 0 && (vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1023                         goto done_moving_active_pages;
1024
1025                 while (!queue_empty(&vm_page_queue_active) &&
1026                        (need_internal_inactive || active_burst_count)) {
1027
1028                         if (active_burst_count)
1029                                active_burst_count--;
1030
1031                         vm_pageout_active++;
1032
1033                         m = (vm_page_t) queue_first(&vm_page_queue_active);
1034
1035                         assert(m->active && !m->inactive);
1036                         assert(!m->laundry);
1037                         assert(m->object != kernel_object);
1038                         assert(m->phys_page != vm_page_guard_addr);
1039
1040                         DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1041
1042                         /*
1043                          * Try to lock object; since we've already got the
1044                          * page queues lock, we can only 'try' for this one.
1045                          * if the 'try' fails, we need to do a mutex_pause
1046                          * to allow the owner of the object lock a chance to
1047                          * run... otherwise, we're likely to trip over this
1048                          * object in the same state as we work our way through
1049                          * the queue... clumps of pages associated with the same
1050                          * object are fairly typical on the inactive and active queues
1051                          */
1052                         if (m->object != object) {
1053                                 if (object != NULL) {
1054                                         vm_object_unlock(object);
1055                                         object = NULL;
1056                                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1057                                 }
1058                                 if (!vm_object_lock_try_scan(m->object)) {
1059                                         /*
1060                                          * move page to end of active queue and continue
1061                                          */
1062                                         queue_remove(&vm_page_queue_active, m,
1063                                                      vm_page_t, pageq);
1064                                         queue_enter(&vm_page_queue_active, m,
1065                                                     vm_page_t, pageq);
1066
1067                                         try_failed = TRUE;
1068
1069                                         m = (vm_page_t) queue_first(&vm_page_queue_active);
1070                                         /*
1071                                          * this is the next object we're going to be interested in
1072                                          * try to make sure its available after the mutex_yield
1073                                          * returns control
1074                                          */
1075                                         vm_pageout_scan_wants_object = m->object;
1076
1077                                         goto done_with_activepage;
1078                                 }
1079                                 object = m->object;
1080
1081                                 try_failed = FALSE;
1082                         }
1083
1084                         /*
1085                          * if the page is BUSY, then we pull it
1086                          * off the active queue and leave it alone.
1087                          * when BUSY is cleared, it will get stuck
1088                          * back on the appropriate queue
1089                          */
1090                         if (m->busy) {
1091                                 queue_remove(&vm_page_queue_active, m,
1092                                              vm_page_t, pageq);
1093                                 m->pageq.next = NULL;
1094                                 m->pageq.prev = NULL;
1095
1096                                 if (!m->fictitious)
1097                                         vm_page_active_count--;
1098                                 m->active = FALSE;
1099
1100                                 goto done_with_activepage;
1101                         }
1102
1103                         /*
1104                          *      Deactivate the page while holding the object
1105                          *      locked, so we know the page is still not busy.
1106                          *      This should prevent races between pmap_enter
1107                          *      and pmap_clear_reference.  The page might be
1108                          *      absent or fictitious, but vm_page_deactivate
1109                          *      can handle that.
1110                          */
1111                         vm_page_deactivate(m);
1112
1113                         if (need_internal_inactive) {
1114                                 vm_pageout_scan_active_throttle_success++;
1115                                 need_internal_inactive--;
1116                         }
1117 done_with_activepage:
1118                         if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1119
1120                                 if (object != NULL) {
1121                                         vm_object_unlock(object);
1122                                         object = NULL;
1123                                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1124                                 }
1125                                 if (local_freeq) {
1126                                         vm_page_free_list(local_freeq);
1127
1128                                         local_freeq = NULL;
1129                                         local_freed = 0;
1130                                 }
1131                                 mutex_yield(&vm_page_queue_lock);
1132
1133                                 delayed_unlock = 1;
1134
1135                                 /*
1136                                  * continue the while loop processing
1137                                  * the active queue... need to hold
1138                                  * the page queues lock
1139                                  */
1140                         }
1141                 }
1142
1143
1144
1145                 /**********************************************************************
1146                  * above this point we're playing with the active queue
1147                  * below this point we're playing with the throttling mechanisms
1148                  * and the inactive queue
1149                  **********************************************************************/
1150
1151 done_moving_active_pages:
1152
1153                 /*
1154                  *      We are done if we have met our target *and*
1155                  *      nobody is still waiting for a page.
1156                  */
1157                 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1158                         if (object != NULL) {
1159                                 vm_object_unlock(object);
1160                                 object = NULL;
1161                         }
1162                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1163
1164                         if (local_freeq) {
1165                                 vm_page_free_list(local_freeq);
1166
1167                                 local_freeq = NULL;
1168                                 local_freed = 0;
1169                         }
1170                         /*
1171                          * inactive target still not met... keep going
1172                          * until we get the queues balanced
1173                          */
1174                         if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1175                             !queue_empty(&vm_page_queue_active))
1176                                 continue;
1177
1178                         mutex_lock(&vm_page_queue_free_lock);
1179
1180                         if ((vm_page_free_count >= vm_page_free_target) &&
1181                             (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1182
1183                                 vm_page_unlock_queues();
1184
1185                                 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1186
1187                                 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1188
1189                                 return;
1190                         }
1191                         mutex_unlock(&vm_page_queue_free_lock);
1192                 }
1193                 /*
1194                  * Before anything, we check if we have any ripe volatile objects around.
1195                  * If so, purge the first and see what it gives us.
1196                  */
1197                 assert (available_for_purge>=0);
1198                 if (available_for_purge)
1199                 {
1200                         if (object != NULL) {
1201                                 vm_object_unlock(object);
1202                                 object = NULL;
1203                         }
1204                         vm_purgeable_object_purge_one();
1205                         continue;
1206                 }
1207
1208                 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1209                         /*
1210                          * try to pull pages from the aging bins
1211                          * see vm_page.h for an explanation of how
1212                          * this mechanism works
1213                          */
1214                         struct vm_speculative_age_q     *aq;
1215                         mach_timespec_t ts_fully_aged;
1216                         boolean_t       can_steal = FALSE;
1217
1218                         aq = &vm_page_queue_speculative[speculative_steal_index];
1219
1220                         while (queue_empty(&aq->age_q)) {
1221
1222                                 speculative_steal_index++;
1223
1224                                 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1225                                         speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1226
1227                                 aq = &vm_page_queue_speculative[speculative_steal_index];
1228                         }
1229                         if (vm_page_speculative_count > vm_page_speculative_target)
1230                                 can_steal = TRUE;
1231                         else {
1232                                 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1233                                 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1234                                                       * 1000 * NSEC_PER_USEC;
1235
1236                                 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1237
1238                                 clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
1239
1240                                 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1241                                         can_steal = TRUE;
1242                         }
1243                         if (can_steal == TRUE)
1244                                 vm_page_speculate_ageit(aq);
1245                 }
1246
1247                 /*
1248                  * Sometimes we have to pause:
1249                  *      1) No inactive pages - nothing to do.
1250                  *      2) Flow control - default pageout queue is full
1251                  *      3) Loop control - no acceptable pages found on the inactive queue
1252                  *         within the last vm_pageout_burst_inactive_throttle iterations
1253                  */
1254                 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1255                     (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1256                         vm_pageout_scan_empty_throttle++;
1257                         msecs = vm_pageout_empty_wait;
1258                         goto vm_pageout_scan_delay;
1259
1260                 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1261                         vm_pageout_scan_burst_throttle++;
1262                         msecs = vm_pageout_burst_wait;
1263                         goto vm_pageout_scan_delay;
1264
1265                 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1266
1267                         switch (flow_control.state) {
1268
1269                         case FCS_IDLE:
1270 reset_deadlock_timer:
1271                                 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1272                                 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1273                                 clock_get_system_nanotime(&flow_control.ts.tv_sec,
1274                                                           (unsigned *)&flow_control.ts.tv_nsec);
1275                                 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1276
1277                                 flow_control.state = FCS_DELAYED;
1278                                 msecs = vm_pageout_deadlock_wait;
1279
1280                                 break;
1281
1282                         case FCS_DELAYED:
1283                                 clock_get_system_nanotime(&ts.tv_sec,
1284                                                           (unsigned *)&ts.tv_nsec);
1285
1286                                 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1287                                         /*
1288                                          * the pageout thread for the default pager is potentially
1289                                          * deadlocked since the
1290                                          * default pager queue has been throttled for more than the
1291                                          * allowable time... we need to move some clean pages or dirty
1292                                          * pages belonging to the external pagers if they aren't throttled
1293                                          * vm_page_free_wanted represents the number of threads currently
1294                                          * blocked waiting for pages... we'll move one page for each of
1295                                          * these plus a fixed amount to break the logjam... once we're done
1296                                          * moving this number of pages, we'll re-enter the FCS_DELAYED state
1297                                          * with a new timeout target since we have no way of knowing
1298                                          * whether we've broken the deadlock except through observation
1299                                          * of the queue associated with the default pager... we need to
1300                                          * stop moving pages and allow the system to run to see what
1301                                          * state it settles into.
1302                                          */
1303                                         vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1304                                         vm_pageout_scan_deadlock_detected++;
1305                                         flow_control.state = FCS_DEADLOCK_DETECTED;
1306
1307                                         thread_wakeup((event_t) &vm_pageout_garbage_collect);
1308                                         goto consider_inactive;
1309                                 }
1310                                 /*
1311                                  * just resniff instead of trying
1312                                  * to compute a new delay time... we're going to be
1313                                  * awakened immediately upon a laundry completion,
1314                                  * so we won't wait any longer than necessary
1315                                  */
1316                                 msecs = vm_pageout_idle_wait;
1317                                 break;
1318
1319                         case FCS_DEADLOCK_DETECTED:
1320                                 if (vm_pageout_deadlock_target)
1321                                         goto consider_inactive;
1322                                 goto reset_deadlock_timer;
1323
1324                         }
1325                         vm_pageout_scan_throttle++;
1326                         iq->pgo_throttled = TRUE;
1327 vm_pageout_scan_delay:
1328                         if (object != NULL) {
1329                                 vm_object_unlock(object);
1330                                 object = NULL;
1331                         }
1332                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1333
1334                         if (local_freeq) {
1335                                 vm_page_free_list(local_freeq);
1336
1337                                 local_freeq = NULL;
1338                                 local_freed = 0;
1339                         }
1340 #if CONFIG_EMBEDDED
1341                         {
1342                         int percent_avail;
1343
1344                         /*
1345                          * Decide if we need to send a memory status notification.
1346                          */
1347                         percent_avail =
1348                                 (vm_page_active_count + vm_page_inactive_count +
1349                                  vm_page_speculative_count + vm_page_free_count +
1350                                  vm_page_purgeable_count ) * 100 /
1351                                 atop_64(max_mem);
1352                         if (percent_avail >= (kern_memorystatus_level + 5) ||
1353                             percent_avail <= (kern_memorystatus_level - 5)) {
1354                                 kern_memorystatus_level = percent_avail;
1355                                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1356                         }
1357                         }
1358 #endif
1359                         assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1360
1361                         counter(c_vm_pageout_scan_block++);
1362
1363                         vm_page_unlock_queues();
1364
1365                         assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1366
1367                         thread_block(THREAD_CONTINUE_NULL);
1368
1369                         vm_page_lock_queues();
1370                         delayed_unlock = 1;
1371
1372                         iq->pgo_throttled = FALSE;
1373
1374                         if (loop_count >= vm_page_inactive_count)
1375                                 loop_count = 0;
1376                         inactive_burst_count = 0;
1377
1378                         goto Restart;
1379                         /*NOTREACHED*/
1380                 }
1381
1382
1383                 flow_control.state = FCS_IDLE;
1384 consider_inactive:
1385                 loop_count++;
1386                 inactive_burst_count++;
1387                 vm_pageout_inactive++;
1388
1389                 /* Choose a victim. */
1390
1391                 while (1) {
1392                         m = NULL;
1393
1394                         /*
1395                          * the most eligible pages are ones that were throttled because the
1396                          * pager wasn't ready at the time.  If a pager is ready now,
1397                          * see if one of these is useful.
1398                          */
1399                         if (!VM_PAGE_Q_THROTTLED(iq) && !queue_empty(&vm_page_queue_throttled)) {
1400                                 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
1401                                 break;
1402                         }
1403
1404                         /*
1405                          * The second most eligible pages are ones we paged in speculatively,
1406                          * but which have not yet been touched.
1407                          */
1408                         if ( !queue_empty(&sq->age_q) ) {
1409                                 m = (vm_page_t) queue_first(&sq->age_q);
1410                                 break;
1411                         }
1412                         /*
1413                          * Time for a zero-filled inactive page?
1414                          */
1415                         if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1416                              queue_empty(&vm_page_queue_inactive)) {
1417                                 if ( !queue_empty(&vm_page_queue_zf) ) {
1418                                         m = (vm_page_t) queue_first(&vm_page_queue_zf);
1419                                         zf_run_count++;
1420                                         break;
1421                                 }
1422                         }
1423                         /*
1424                          * It's either a normal inactive page or nothing.
1425                          */
1426                         if ( !queue_empty(&vm_page_queue_inactive) ) {
1427                                 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1428                                 zf_run_count = 0;
1429                                 break;
1430                         }
1431
1432                         panic("vm_pageout: no victim");
1433                 }
1434
1435                 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1436                 assert(!m->laundry);
1437                 assert(m->object != kernel_object);
1438                 assert(m->phys_page != vm_page_guard_addr);
1439
1440                 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1441
1442                 /*
1443                  * check to see if we currently are working
1444                  * with the same object... if so, we've
1445                  * already got the lock
1446                  */
1447                 if (m->object != object) {
1448                         /*
1449                          * the object associated with candidate page is
1450                          * different from the one we were just working
1451                          * with... dump the lock if we still own it
1452                          */
1453                         if (object != NULL) {
1454                                 vm_object_unlock(object);
1455                                 object = NULL;
1456                                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1457                         }
1458                         /*
1459                          * Try to lock object; since we've already got the
1460                          * page queues lock, we can only 'try' for this one.
1461                          * if the 'try' fails, we need to do a mutex_pause
1462                          * to allow the owner of the object lock a chance to
1463                          * run... otherwise, we're likely to trip over this
1464                          * object in the same state as we work our way through
1465                          * the queue... clumps of pages associated with the same
1466                          * object are fairly typical on the inactive and active queues
1467                          */
1468                         if (!vm_object_lock_try_scan(m->object)) {
1469                                 /*
1470                                  *      Move page to end and continue.
1471                                  *      Don't re-issue ticket
1472                                  */
1473                                 if (m->zero_fill) {
1474                                         queue_remove(&vm_page_queue_zf, m,
1475                                                      vm_page_t, pageq);
1476                                         queue_enter(&vm_page_queue_zf, m,
1477                                                     vm_page_t, pageq);
1478                                 } else if (m->speculative) {
1479                                         remque(&m->pageq);
1480                                         m->speculative = FALSE;
1481                                         vm_page_speculative_count--;
1482
1483                                         /*
1484                                          * move to the tail of the inactive queue
1485                                          * to get it out of the way... the speculative
1486                                          * queue is generally too small to depend
1487                                          * on there being enough pages from other
1488                                          * objects to make cycling it back on the
1489                                          * same queue a winning proposition
1490                                          */
1491                                         queue_enter(&vm_page_queue_inactive, m,
1492                                                     vm_page_t, pageq);
1493                                         m->inactive = TRUE;
1494                                         vm_page_inactive_count++;
1495                                         token_new_pagecount++;
1496                                 }  else if (m->throttled) {
1497                                         queue_remove(&vm_page_queue_throttled, m,
1498                                                      vm_page_t, pageq);
1499                                         m->throttled = FALSE;
1500                                         vm_page_throttled_count--;
1501
1502                                         /*
1503                                          * not throttled any more, so can stick
1504                                          * it on the inactive queue.
1505                                          */
1506                                         queue_enter(&vm_page_queue_inactive, m,
1507                                                     vm_page_t, pageq);
1508                                         m->inactive = TRUE;
1509                                         vm_page_inactive_count++;
1510                                         token_new_pagecount++;
1511                                 } else {
1512                                         queue_remove(&vm_page_queue_inactive, m,
1513                                                      vm_page_t, pageq);
1514 #if MACH_ASSERT
1515                                         vm_page_inactive_count--;       /* balance for purgeable queue asserts */
1516 #endif
1517                                         vm_purgeable_q_advance_all(1);
1518
1519                                         queue_enter(&vm_page_queue_inactive, m,
1520                                                     vm_page_t, pageq);
1521 #if MACH_ASSERT
1522                                         vm_page_inactive_count++;       /* balance for purgeable queue asserts */
1523 #endif
1524                                         token_new_pagecount++;
1525                                 }
1526                                 pmap_clear_reference(m->phys_page);
1527                                 m->reference = FALSE;
1528
1529                                 vm_pageout_inactive_nolock++;
1530
1531                                 if ( !queue_empty(&sq->age_q) )
1532                                         m = (vm_page_t) queue_first(&sq->age_q);
1533                                 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1534                                           queue_empty(&vm_page_queue_inactive)) {
1535                                         if ( !queue_empty(&vm_page_queue_zf) )
1536                                                 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1537                                 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1538                                         m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1539                                 }
1540                                 /*
1541                                  * this is the next object we're going to be interested in
1542                                  * try to make sure it's available after the mutex_yield
1543                                  * returns control
1544                                  */
1545                                 vm_pageout_scan_wants_object = m->object;
1546
1547                                 /*
1548                                  * force us to dump any collected free pages
1549                                  * and to pause before moving on
1550                                  */
1551                                 try_failed = TRUE;
1552
1553                                 goto done_with_inactivepage;
1554                         }
1555                         object = m->object;
1556                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1557
1558                         try_failed = FALSE;
1559                 }
1560
1561                 /*
1562                  *      Paging out pages of external objects which
1563                  *      are currently being created must be avoided.
1564                  *      The pager may claim for memory, thus leading to a
1565                  *      possible dead lock between it and the pageout thread,
1566                  *      if such pages are finally chosen. The remaining assumption
1567                  *      is that there will finally be enough available pages in the
1568                  *      inactive pool to page out in order to satisfy all memory
1569                  *      claimed by the thread which concurrently creates the pager.
1570                  */
1571                 if (!object->pager_initialized && object->pager_created) {
1572                         /*
1573                          *      Move page to end and continue, hoping that
1574                          *      there will be enough other inactive pages to
1575                          *      page out so that the thread which currently
1576                          *      initializes the pager will succeed.
1577                          *      Don't re-grant the ticket, the page should
1578                          *      be pulled from the queue and paged out whenever
1579                          *      one of its logically adjacent fellows is
1580                          *      targeted.
1581                          *
1582                          *      Pages found on the speculative list can never be
1583                          *      in this state... they always have a pager associated
1584                          *      with them.
1585                          */
1586                         assert(!m->speculative);
1587
1588                         if (m->zero_fill) {
1589                                 queue_remove(&vm_page_queue_zf, m,
1590                                              vm_page_t, pageq);
1591                                 queue_enter(&vm_page_queue_zf, m,
1592                                             vm_page_t, pageq);
1593                         } else {
1594                                 queue_remove(&vm_page_queue_inactive, m,
1595                                              vm_page_t, pageq);
1596 #if MACH_ASSERT
1597                                 vm_page_inactive_count--;       /* balance for purgeable queue asserts */
1598 #endif
1599                                 vm_purgeable_q_advance_all(1);
1600
1601                                 queue_enter(&vm_page_queue_inactive, m,
1602                                             vm_page_t, pageq);
1603 #if MACH_ASSERT
1604                                 vm_page_inactive_count++;       /* balance for purgeable queue asserts */
1605 #endif
1606                                 token_new_pagecount++;
1607                         }
1608                         vm_pageout_inactive_avoid++;
1609
1610                         goto done_with_inactivepage;
1611                 }
1612                 /*
1613                  *      Remove the page from its list.
1614                  */
1615                 if (m->speculative) {
1616                         remque(&m->pageq);
1617                         m->speculative = FALSE;
1618                         vm_page_speculative_count--;
1619                 } else if (m->throttled) {
1620                         queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1621                         m->throttled = FALSE;
1622                         vm_page_throttled_count--;
1623                 } else {
1624                         if (m->zero_fill) {
1625                                 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1626                                 vm_zf_queue_count--;
1627                         } else {
1628                                 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1629                         }
1630                         m->inactive = FALSE;
1631                         if (!m->fictitious)
1632                                 vm_page_inactive_count--;
1633                                 vm_purgeable_q_advance_all(1);
1634                 }
1635
1636                 /* If the object is empty, the page must be reclaimed even if dirty or used. */
1637                 /* If the page belongs to a volatile object, we stick it back on. */
1638                 if (object->copy == VM_OBJECT_NULL) {
1639                         if(object->purgable == VM_PURGABLE_EMPTY && !m->cleaning) {
1640                                 m->busy = TRUE;
1641                                 if (m->pmapped == TRUE) {
1642                                         /* unmap the page */
1643                                         refmod_state = pmap_disconnect(m->phys_page);
1644                                         if (refmod_state & VM_MEM_MODIFIED) {
1645                                                 m->dirty = TRUE;
1646                                         }
1647                                 }
1648                                 if (m->dirty || m->precious) {
1649                                         /* we saved the cost of cleaning this page ! */
1650                                         vm_page_purged_count++;
1651                                 }
1652                                 goto reclaim_page;
1653                         }
1654                         if (object->purgable == VM_PURGABLE_VOLATILE) {
1655                                 /* if it's wired, we can't put it on our queue */
1656                                 assert(m->wire_count == 0);
1657                                 /* just stick it back on! */
1658                                 goto reactivate_page;
1659                         }
1660                 }
1661                 m->pageq.next = NULL;
1662                 m->pageq.prev = NULL;
1663
1664                 if ( !m->fictitious && catch_up_count)
1665                         catch_up_count--;
1666
1667                 /*
1668                  * ENCRYPTED SWAP:
1669                  * if this page has already been picked up as part of a
1670                  * page-out cluster, it will be busy because it is being
1671                  * encrypted (see vm_object_upl_request()).  But we still
1672                  * want to demote it from "clean-in-place" (aka "adjacent")
1673                  * to "clean-and-free" (aka "target"), so let's ignore its
1674                  * "busy" bit here and proceed to check for "cleaning" a
1675                  * little bit below...
1676                  */
1677                 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1678                         /*
1679                          *      Somebody is already playing with this page.
1680                          *      Leave it off the pageout queues.
1681                          *
1682                          */
1683                         vm_pageout_inactive_busy++;
1684
1685                         goto done_with_inactivepage;
1686                 }
1687
1688                 /*
1689                  *      If it's absent or in error, we can reclaim the page.
1690                  */
1691
1692                 if (m->absent || m->error) {
1693                         vm_pageout_inactive_absent++;
1694 reclaim_page:
1695                         if (vm_pageout_deadlock_target) {
1696                                 vm_pageout_scan_inactive_throttle_success++;
1697                                 vm_pageout_deadlock_target--;
1698                         }
1699
1700                         DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1701
1702                         if (m->object->internal) {
1703                                 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1704                         } else {
1705                                 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1706                         }
1707
1708                         vm_page_free_prepare(m);
1709
1710                         assert(m->pageq.next == NULL &&
1711                                m->pageq.prev == NULL);
1712                         m->pageq.next = (queue_entry_t)local_freeq;
1713                         local_freeq = m;
1714                         local_freed++;
1715
1716                         inactive_burst_count = 0;
1717
1718                         goto done_with_inactivepage;
1719                 }
1720
1721                 assert(!m->private);
1722                 assert(!m->fictitious);
1723
1724                 /*
1725                  *      If already cleaning this page in place, convert from
1726                  *      "adjacent" to "target". We can leave the page mapped,
1727                  *      and vm_pageout_object_terminate will determine whether
1728                  *      to free or reactivate.
1729                  */
1730
1731                 if (m->cleaning) {
1732                         m->busy = TRUE;
1733                         m->pageout = TRUE;
1734                         m->dump_cleaning = TRUE;
1735                         vm_page_wire(m);
1736
1737                         CLUSTER_STAT(vm_pageout_cluster_conversions++);
1738
1739                         inactive_burst_count = 0;
1740
1741                         goto done_with_inactivepage;
1742                 }
1743
1744                 /*
1745                  *      If it's being used, reactivate.
1746                  *      (Fictitious pages are either busy or absent.)
1747                  *      First, update the reference and dirty bits
1748                  *      to make sure the page is unreferenced.
1749                  */
1750                 refmod_state = -1;
1751
1752                 if (m->reference == FALSE && m->pmapped == TRUE) {
1753                         refmod_state = pmap_get_refmod(m->phys_page);
1754
1755                         if (refmod_state & VM_MEM_REFERENCED)
1756                                 m->reference = TRUE;
1757                         if (refmod_state & VM_MEM_MODIFIED)
1758                                 m->dirty = TRUE;
1759                 }
1760                 if (m->reference && !m->no_cache) {
1761                         /*
1762                          * The page we pulled off the inactive list has
1763                          * been referenced.  It is possible for other
1764                          * processors to be touching pages faster than we
1765                          * can clear the referenced bit and traverse the
1766                          * inactive queue, so we limit the number of
1767                          * reactivations.
1768                          */
1769                         if (++reactivated_this_call >= reactivate_limit) {
1770                                 vm_pageout_reactivation_limit_exceeded++;
1771                         } else if (catch_up_count) {
1772                                 vm_pageout_catch_ups++;
1773                         } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
1774                                 vm_pageout_inactive_force_reclaim++;
1775                         } else {
1776                                 /*
1777                                  * The page was being used, so put back on active list.
1778                                  */
1779 reactivate_page:
1780                                 vm_page_activate(m);
1781                                 VM_STAT_INCR(reactivations);
1782
1783                                 vm_pageout_inactive_used++;
1784                                 inactive_burst_count = 0;
1785
1786                                 goto done_with_inactivepage;
1787                         }
1788                         /*
1789                          * Make sure we call pmap_get_refmod() if it
1790                          * wasn't already called just above, to update
1791                          * the dirty bit.
1792                          */
1793                         if ((refmod_state == -1) && !m->dirty && m->pmapped) {
1794                                 refmod_state = pmap_get_refmod(m->phys_page);
1795                                 if (refmod_state & VM_MEM_MODIFIED)
1796                                         m->dirty = TRUE;
1797                         }
1798                         forced_reclaim = TRUE;
1799                 } else {
1800                         forced_reclaim = FALSE;
1801                 }
1802
1803                 XPR(XPR_VM_PAGEOUT,
1804                 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1805                 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1806
1807                 /*
1808                  * we've got a candidate page to steal...
1809                  *
1810                  * m->dirty is up to date courtesy of the
1811                  * preceding check for m->reference... if
1812                  * we get here, then m->reference had to be
1813                  * FALSE (or possibly "reactivate_limit" was
1814                  * exceeded), but in either case we called
1815                  * pmap_get_refmod() and updated both
1816                  * m->reference and m->dirty
1817                  *
1818                  * if it's dirty or precious we need to
1819                  * see if the target queue is throttled
1820                  * if it is, we need to skip over it by moving it back
1821                  * to the end of the inactive queue
1822                  */
1823                 inactive_throttled = FALSE;
1824
1825                 if (m->dirty || m->precious) {
1826                         if (object->internal) {
1827                                 if (VM_PAGE_Q_THROTTLED(iq))
1828                                         inactive_throttled = TRUE;
1829                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
1830                                 inactive_throttled = TRUE;
1831                         }
1832                 }
1833                 if (inactive_throttled == TRUE) {
1834 throttle_inactive:
1835                         if (!IP_VALID(memory_manager_default) &&
1836                                 object->internal &&
1837                                 (object->purgable == VM_PURGABLE_DENY ||
1838                                  object->purgable == VM_PURGABLE_NONVOLATILE)) {
1839                                 queue_enter(&vm_page_queue_throttled, m,
1840                                             vm_page_t, pageq);
1841                                 m->throttled = TRUE;
1842                                 vm_page_throttled_count++;
1843                         } else {
1844                                 if (m->zero_fill) {
1845                                         queue_enter(&vm_page_queue_zf, m,
1846                                                     vm_page_t, pageq);
1847                                         vm_zf_queue_count++;
1848                                 } else
1849                                         queue_enter(&vm_page_queue_inactive, m,
1850                                                     vm_page_t, pageq);
1851                                 m->inactive = TRUE;
1852                                 if (!m->fictitious) {
1853                                         vm_page_inactive_count++;
1854                                         token_new_pagecount++;
1855                                 }
1856                         }
1857                         vm_pageout_scan_inactive_throttled++;
1858                         goto done_with_inactivepage;
1859                 }
1860
1861                 /*
1862                  * we've got a page that we can steal...
1863                  * eliminate all mappings and make sure
1864                  * we have the up-to-date modified state
1865                  * first take the page BUSY, so that no new
1866                  * mappings can be made
1867                  */
1868                 m->busy = TRUE;
1869
1870                 /*
1871                  * if we need to do a pmap_disconnect then we
1872                  * need to re-evaluate m->dirty since the pmap_disconnect
1873                  * provides the true state atomically... the
1874                  * page was still mapped up to the pmap_disconnect
1875                  * and may have been dirtied at the last microsecond
1876                  *
1877                  * we also check for the page being referenced 'late'
1878                  * if it was, we first need to do a WAKEUP_DONE on it
1879                  * since we already set m->busy = TRUE, before
1880                  * going off to reactivate it
1881                  *
1882                  * Note that if 'pmapped' is FALSE then the page is not
1883                  * and has not been in any map, so there is no point calling
1884                  * pmap_disconnect().  m->dirty and/or m->reference could
1885                  * have been set in anticipation of likely usage of the page.
1886                  */
1887                 if (m->pmapped == TRUE) {
1888                         refmod_state = pmap_disconnect(m->phys_page);
1889
1890                         if (refmod_state & VM_MEM_MODIFIED)
1891                                 m->dirty = TRUE;
1892                         if (refmod_state & VM_MEM_REFERENCED) {
1893
1894                                 /* If m->reference is already set, this page must have
1895                                  * already failed the reactivate_limit test, so don't
1896                                  * bump the counts twice.
1897                                  */
1898                                 if ( ! m->reference ) {
1899                                         m->reference = TRUE;
1900                                         if (forced_reclaim ||
1901                                             ++reactivated_this_call >= reactivate_limit)
1902                                                 vm_pageout_reactivation_limit_exceeded++;
1903                                         else {
1904                                                 PAGE_WAKEUP_DONE(m);
1905                                                 goto reactivate_page;
1906                                         }
1907                                 }
1908                         }
1909                 }
1910                 /*
1911                  * reset our count of pages that have been reclaimed
1912                  * since the last page was 'stolen'
1913                  */
1914                 inactive_reclaim_run = 0;
1915
1916                 /*
1917                  *      If it's clean and not precious, we can free the page.
1918                  */
1919                 if (!m->dirty && !m->precious) {
1920                         vm_pageout_inactive_clean++;
1921                         goto reclaim_page;
1922                 }
1923
1924                 /*
1925                  * The page may have been dirtied since the last check
1926                  * for a throttled target queue (which may have been skipped
1927                  * if the page was clean then).  With the dirty page
1928                  * disconnected here, we can make one final check.
1929                  */
1930                 {
1931                         boolean_t disconnect_throttled = FALSE;
1932                         if (object->internal) {
1933                                 if (VM_PAGE_Q_THROTTLED(iq))
1934                                         disconnect_throttled = TRUE;
1935                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
1936                                 disconnect_throttled = TRUE;
1937                         }
1938
1939                         if (disconnect_throttled == TRUE) {
1940                                 PAGE_WAKEUP_DONE(m);
1941                                 goto throttle_inactive;
1942                         }
1943                 }
1944
1945                 vm_pageout_cluster(m);
1946
1947                 vm_pageout_inactive_dirty++;
1948
1949                 inactive_burst_count = 0;
1950
1951 done_with_inactivepage:
1952                 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1953
1954                         if (object != NULL) {
1955                                 vm_object_unlock(object);
1956                                 object = NULL;
1957                                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1958                         }
1959                         if (local_freeq) {
1960                                 vm_page_free_list(local_freeq);
1961
1962                                 local_freeq = NULL;
1963                                 local_freed = 0;
1964                         }
1965                         mutex_yield(&vm_page_queue_lock);
1966
1967                         delayed_unlock = 1;
1968                 }
1969                 /*
1970                  * back to top of pageout scan loop
1971                  */
1972         }
1973 }
1974
1975 /* Baseline page count from which vm_page_free_reserve() subtracts the reserved pages. */
1976 int vm_page_free_count_init;
1977 /* vm_page_free_reserve: add 'pages' to vm_page_free_reserved, then recompute vm_page_free_min and vm_page_free_target from the new reserve. */
1978 void
1979 vm_page_free_reserve(
1980         int pages)     /* number of pages to add to the reserve */
1981 {
1982         int             free_after_reserve;     /* vm_page_free_count_init minus the updated reserve */
1983
1984         vm_page_free_reserved += pages;
1985
1986         free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1987         /* minimum = reserve + VM_PAGE_FREE_MIN() of the remainder, capped at VM_PAGE_FREE_MIN_LIMIT */
1988         vm_page_free_min = vm_page_free_reserved +
1989                 VM_PAGE_FREE_MIN(free_after_reserve);
1990
1991         if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
1992                 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
1993         /* target = reserve + VM_PAGE_FREE_TARGET() of the remainder, capped at VM_PAGE_FREE_TARGET_LIMIT */
1994         vm_page_free_target = vm_page_free_reserved +
1995                 VM_PAGE_FREE_TARGET(free_after_reserve);
1996
1997         if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
1998                 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
1999         /* keep the target at least 5 pages above the minimum; this runs after the cap, so the result may exceed VM_PAGE_FREE_TARGET_LIMIT */
2000         if (vm_page_free_target < vm_page_free_min + 5)
2001                 vm_page_free_target = vm_page_free_min + 5;
2002
2003 }
2004
2005 /*
2006  *      vm_pageout is the high level pageout daemon.
2007  */
2008 /* vm_pageout_continue: run one vm_pageout_scan() pass, then wait on &vm_page_free_wanted, passing this function to thread_block() as the continuation. */
2009 void
2010 vm_pageout_continue(void)
2011 {
2012         DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2013         vm_pageout_scan_event_counter++;        /* count this scan pass */
2014         vm_pageout_scan();
2015         /* vm_pageout_scan() returns with vm_page_queue_free_lock held; it is dropped below */
2016         assert(vm_page_free_wanted == 0);
2017         assert(vm_page_free_wanted_privileged == 0);
2018         assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);      /* register the wait before the lock is released */
2019         mutex_unlock(&vm_page_queue_free_lock);
2020
2021         counter(c_vm_pageout_block++);
2022         thread_block((thread_continue_t)vm_pageout_continue);   /* block, naming this function as the continuation */
2023         /*NOTREACHED*/
2024 }
2025
2026
2027 /*
2028  * vm_pageout_queue_steal: remove page 'm' from the pgo_pending list of the internal or external pageout queue, selected by m->object->internal.
2029  * must be called with the queues and object locks held
2030  */
2031 static void
2032 vm_pageout_queue_steal(vm_page_t m)
2033 {
2034         struct vm_pageout_queue *q;
2035         /* internal objects use vm_pageout_queue_internal, all others vm_pageout_queue_external */
2036         if (m->object->internal == TRUE)
2037                 q = &vm_pageout_queue_internal;
2038         else
2039                 q = &vm_pageout_queue_external;
2040         /* clear the page's laundry and pageout_queue flags and unlink it from the pending list */
2041         m->laundry = FALSE;
2042         m->pageout_queue = FALSE;
2043         queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
2044         /* the page has been unlinked above, so reset its queue links */
2045         m->pageq.next = NULL;
2046         m->pageq.prev = NULL;
2047
2048         vm_object_paging_end(m->object);        /* NOTE(review): presumably balances a paging reference taken when the page was queued -- confirm */
2049
2050         q->pgo_laundry--;       /* one fewer page counted in this queue's pgo_laundry */
2051 }
2052
2053
2054 #ifdef FAKE_DEADLOCK
2055 /* State used by vm_pageout_iothread_continue() only when FAKE_DEADLOCK is defined. */
2056 #define FAKE_COUNT      5000    /* internal_count value at which the iothread attempts a kmem_alloc() of (vm_page_free_count + vm_page_free_reserved) pages and frees it on success */
2057
2058 int internal_count = 0;        /* pages dequeued from vm_pageout_queue_internal since the last reset */
2059 int fake_deadlock = 0; /* number of times internal_count has reached FAKE_COUNT */
2060
2061 #endif /* FAKE_DEADLOCK */
2062
2063 static void
2064 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2065 {
2066         vm_page_t       m = NULL;
2067         vm_object_t     object;
2068         boolean_t       need_wakeup;
2069         memory_object_t pager;
2070         thread_t        self = current_thread();
2071
2072         if ((vm_pageout_internal_iothread != THREAD_NULL)
2073             && (self == vm_pageout_external_iothread )
2074             && (self->options & TH_OPT_VMPRIV))
2075                 self->options &= ~TH_OPT_VMPRIV;
2076
2077         vm_page_lockspin_queues();
2078
2079         while ( !queue_empty(&q->pgo_pending) ) {
2080
2081                    q->pgo_busy = TRUE;
2082                    queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2083                    m->pageout_queue = FALSE;
2084                    vm_page_unlock_queues();
2085
2086                    m->pageq.next = NULL;
2087                    m->pageq.prev = NULL;
2088 #ifdef FAKE_DEADLOCK
2089                    if (q == &vm_pageout_queue_internal) {
2090                            vm_offset_t addr;
2091                            int  pg_count;
2092
2093                            internal_count++;
2094
2095                            if ((internal_count == FAKE_COUNT)) {
2096
2097                                    pg_count = vm_page_free_count + vm_page_free_reserved;
2098
2099                                    if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2100                                            kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2101                                    }
2102                                    internal_count = 0;
2103                                    fake_deadlock++;
2104                            }
2105                    }
2106 #endif
2107                    object = m->object;
2108
2109                    vm_object_lock(object);
2110
2111                    if (!object->pager_initialized) {
2112
2113                            /*
2114                             *   If there is no memory object for the page, create
2115                             *   one and hand it to the default pager.
2116                             */
2117
2118                            if (!object->pager_initialized)
2119                                    vm_object_collapse(object,
2120                                                       (vm_object_offset_t) 0,
2121                                                       TRUE);
2122                            if (!object->pager_initialized)
2123                                    vm_object_pager_create(object);
2124                            if (!object->pager_initialized) {
2125                                    /*
2126                                     *   Still no pager for the object.
2127                                     *   Reactivate the page.
2128                                     *
2129                                     *   Should only happen if there is no
2130                                     *   default pager.
2131                                     */
2132                                    m->list_req_pending = FALSE;
2133                                    m->cleaning = FALSE;
2134                                    m->pageout = FALSE;
2135
2136                                    vm_page_lockspin_queues();
2137                                    vm_page_unwire(m);
2138                                    vm_pageout_throttle_up(m);
2139                                    vm_pageout_dirty_no_pager++;
2140                                    vm_page_activate(m);
2141                                    vm_page_unlock_queues();
2142
2143                                    /*
2144                                     *   And we are done with it.
2145                                     */
2146                                    PAGE_WAKEUP_DONE(m);
2147
2148                                    vm_object_paging_end(object);
2149                                    vm_object_unlock(object);
2150
2151                                    vm_page_lockspin_queues();
2152                                    continue;
2153                            }
2154                    }
2155                    pager = object->pager;
2156                    if (pager == MEMORY_OBJECT_NULL) {
2157                            /*
2158                             * This pager has been destroyed by either
2159                             * memory_object_destroy or vm_object_destroy, and
2160                             * so there is nowhere for the page to go.
2161                             * Just free the page... VM_PAGE_FREE takes
2162                             * care of cleaning up all the state...
2163                             * including doing the vm_pageout_throttle_up
2164                             */
2165
2166                            VM_PAGE_FREE(m);
2167
2168                            vm_object_paging_end(object);
2169                            vm_object_unlock(object);
2170
2171                            vm_page_lockspin_queues();
2172                            continue;
2173                    }
2174                    vm_object_unlock(object);
2175                    /*
2176                     * we expect the paging_in_progress reference to have
2177                     * already been taken on the object before it was added
2178                     * to the appropriate pageout I/O queue... this will
2179                     * keep the object from being terminated and/or the
2180                     * paging_offset from changing until the I/O has
2181                     * completed... therefore no need to lock the object to
2182                     * pull the paging_offset from it.
2183                     *
2184                     * Send the data to the pager.
2185                     * any pageout clustering happens there
2186                     */
2187                    memory_object_data_return(pager,
2188                                              m->offset + object->paging_offset,
2189                                              PAGE_SIZE,
2190                                              NULL,
2191                                              NULL,
2192                                              FALSE,
2193                                              FALSE,
2194                                              0);
2195
2196                    vm_object_lock(object);
2197                    vm_object_paging_end(object);
2198                    vm_object_unlock(object);
2199
2200                    vm_page_lockspin_queues();
2201         }
2202         assert_wait((event_t) q, THREAD_UNINT);
2203
2204
2205         if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2206                 q->pgo_throttled = FALSE;
2207                 need_wakeup = TRUE;
2208         } else
2209                 need_wakeup = FALSE;
2210
2211         q->pgo_busy = FALSE;
2212         q->pgo_idle = TRUE;
2213         vm_page_unlock_queues();
2214
2215         if (need_wakeup == TRUE)
2216                 thread_wakeup((event_t) &q->pgo_laundry);
2217
2218         thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2219         /*NOTREACHED*/
2220 }
2221
2222
2223 static void
2224 vm_pageout_iothread_external(void)
2225 {
2226         thread_t        self = current_thread();
2227
2228         self->options |= TH_OPT_VMPRIV;
2229
2230         vm_pageout_iothread_continue(&vm_pageout_queue_external);
2231         /*NOTREACHED*/
2232 }
2233
2234
2235 static void
2236 vm_pageout_iothread_internal(void)
2237 {
2238         thread_t        self = current_thread();
2239
2240         self->options |= TH_OPT_VMPRIV;
2241
2242         vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2243         /*NOTREACHED*/
2244 }
2245
2246 static void
2247 vm_pageout_garbage_collect(int collect)
2248 {
2249         if (collect) {
2250                 stack_collect();
2251
2252                 /*
2253                  * consider_zone_gc should be last, because the other operations
2254                  * might return memory to zones.
2255                  */
2256                 consider_machine_collect();
2257                 consider_zone_gc();
2258
2259                 consider_machine_adjust();
2260         }
2261
2262         assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2263
2264         thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2265         /*NOTREACHED*/
2266 }
2267
2268
2269
2270 void
2271 vm_pageout(void)
2272 {
2273         thread_t        self = current_thread();
2274         thread_t        thread;
2275         kern_return_t   result;
2276         spl_t           s;
2277
2278         /*
2279          * Set thread privileges.
2280          */
2281         s = splsched();
2282         thread_lock(self);
2283         self->priority = BASEPRI_PREEMPT - 1;
2284         set_sched_pri(self, self->priority);
2285         thread_unlock(self);
2286
2287         if (!self->reserved_stack)
2288                 self->reserved_stack = self->kernel_stack;
2289
2290         splx(s);
2291
2292         /*
2293          *      Initialize some paging parameters.
2294          */
2295
2296         if (vm_pageout_idle_wait == 0)
2297                 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2298
2299         if (vm_pageout_burst_wait == 0)
2300                 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2301
2302         if (vm_pageout_empty_wait == 0)
2303                 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2304
2305         if (vm_pageout_deadlock_wait == 0)
2306                 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2307
2308         if (vm_pageout_deadlock_relief == 0)
2309                 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2310
2311         if (vm_pageout_inactive_relief == 0)
2312                 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2313
2314         if (vm_pageout_burst_active_throttle == 0)
2315                 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2316
2317         if (vm_pageout_burst_inactive_throttle == 0)
2318                 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2319
2320         /*
2321          * Set kernel task to low backing store privileged
2322          * status
2323          */
2324         task_lock(kernel_task);
2325         kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2326         task_unlock(kernel_task);
2327
2328         vm_page_free_count_init = vm_page_free_count;
2329
2330         /*
2331          * even if we've already called vm_page_free_reserve
2332          * call it again here to insure that the targets are
2333          * accurately calculated (it uses vm_page_free_count_init)
2334          * calling it with an arg of 0 will not change the reserve
2335          * but will re-calculate free_min and free_target
2336          */
2337         if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2338                 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2339         } else
2340                 vm_page_free_reserve(0);
2341
2342
2343         queue_init(&vm_pageout_queue_external.pgo_pending);
2344         vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2345         vm_pageout_queue_external.pgo_laundry = 0;
2346         vm_pageout_queue_external.pgo_idle = FALSE;
2347         vm_pageout_queue_external.pgo_busy = FALSE;
2348         vm_pageout_queue_external.pgo_throttled = FALSE;
2349
2350         queue_init(&vm_pageout_queue_internal.pgo_pending);
2351         vm_pageout_queue_internal.pgo_maxlaundry = 0;
2352         vm_pageout_queue_internal.pgo_laundry = 0;
2353         vm_pageout_queue_internal.pgo_idle = FALSE;
2354         vm_pageout_queue_internal.pgo_busy = FALSE;
2355         vm_pageout_queue_internal.pgo_throttled = FALSE;
2356
2357
2358         /* internal pageout thread started when default pager registered first time */
2359         /* external pageout and garbage collection threads started here */
2360
2361         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2362                                               BASEPRI_PREEMPT - 1,
2363                                               &vm_pageout_external_iothread);
2364         if (result != KERN_SUCCESS)
2365                 panic("vm_pageout_iothread_external: create failed");
2366
2367         thread_deallocate(vm_pageout_external_iothread);
2368
2369         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2370                                               MINPRI_KERNEL,
2371                                               &thread);
2372         if (result != KERN_SUCCESS)
2373                 panic("vm_pageout_garbage_collect: create failed");
2374
2375         thread_deallocate(thread);
2376
2377         vm_object_reaper_init();
2378
2379
2380         vm_pageout_continue();
2381
2382         /*
2383          * Unreached code!
2384          *
2385          * The vm_pageout_continue() call above never returns, so the code below is never
2386          * executed.  We take advantage of this to declare several DTrace VM related probe
2387          * points that our kernel doesn't have an analog for.  These are probe points that
2388          * exist in Solaris and are in the DTrace documentation, so people may have written
2389          * scripts that use them.  Declaring the probe points here means their scripts will
2390          * compile and execute which we want for portability of the scripts, but since this
2391          * section of code is never reached, the probe points will simply never fire.  Yes,
2392          * this is basically a hack.  The problem is the DTrace probe points were chosen with
2393          * Solaris specific VM events in mind, not portability to different VM implementations.
2394          */
2395
2396         DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2397         DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2398         DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2399         DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2400         DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2401         DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2402         DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2403         /*NOTREACHED*/
2404 }
2405
2406 kern_return_t
2407 vm_pageout_internal_start(void)
2408 {
2409         kern_return_t result;
2410
2411         vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2412         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2413         if (result == KERN_SUCCESS)
2414                 thread_deallocate(vm_pageout_internal_iothread);
2415         return result;
2416 }
2417
/*
 * Half of MAX_UPL_TRANSFER.
 * NOTE(review): going by the name, this presumably caps how many pages
 * are handled before a delayed unlock is forced; the use sites are not
 * nearby -- confirm there.
 */
#define UPL_DELAYED_UNLOCK_LIMIT  (MAX_UPL_TRANSFER / 2)
2419
2420 static upl_t
2421 upl_create(int type, int flags, upl_size_t size)
2422 {
2423         upl_t   upl;
2424         int     page_field_size = 0;
2425         int     upl_flags = 0;
2426         int     upl_size  = sizeof(struct upl);
2427
2428         if (type & UPL_CREATE_LITE) {
2429                 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2430                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2431
2432                 upl_flags |= UPL_LITE;
2433         }
2434         if (type & UPL_CREATE_INTERNAL) {
2435                 upl_size += sizeof(struct upl_page_info) * (size/PAGE_SIZE);
2436
2437                 upl_flags |= UPL_INTERNAL;
2438         }
2439         upl = (upl_t)kalloc(upl_size + page_field_size);
2440
2441         if (page_field_size)
2442                 bzero((char *)upl + upl_size, page_field_size);
2443
2444         upl->flags = upl_flags | flags;
2445         upl->src_object = NULL;
2446         upl->kaddr = (vm_offset_t)0;
2447         upl->size = 0;
2448         upl->map_object = NULL;
2449         upl->ref_count = 1;
2450         upl->highest_page = 0;
2451         upl_lock_init(upl);
2452 #ifdef UPL_DEBUG
2453         upl->ubc_alias1 = 0;
2454         upl->ubc_alias2 = 0;
2455 #endif /* UPL_DEBUG */
2456         return(upl);
2457 }
2458
2459 static void
2460 upl_destroy(upl_t upl)
2461 {
2462         int     page_field_size;  /* bit field in word size buf */
2463         int     size;
2464
2465 #ifdef UPL_DEBUG
2466         {
2467                 vm_object_t     object;
2468
2469                 if (upl->flags & UPL_SHADOWED) {
2470                         object = upl->map_object->shadow;
2471                 } else {
2472                         object = upl->map_object;
2473                 }
2474                 vm_object_lock(object);
2475                 queue_remove(&object->uplq, upl, upl_t, uplq);
2476                 vm_object_unlock(object);
2477         }
2478 #endif /* UPL_DEBUG */
2479         /*
2480          * drop a reference on the map_object whether or
2481          * not a pageout object is inserted
2482          */
2483         if (upl->flags & UPL_SHADOWED)
2484                 vm_object_deallocate(upl->map_object);
2485
2486         if (upl->flags & UPL_DEVICE_MEMORY)
2487                 size = PAGE_SIZE;
2488         else
2489                 size = upl->size;
2490         page_field_size = 0;
2491
2492         if (upl->flags & UPL_LITE) {
2493                 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2494                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2495         }
2496         if (upl->flags & UPL_INTERNAL) {
2497                 kfree(upl,
2498                       sizeof(struct upl) +
2499                       (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2500                       + page_field_size);
2501         } else {
2502                 kfree(upl, sizeof(struct upl) + page_field_size);
2503         }
2504 }
2505
2506 void uc_upl_dealloc(upl_t upl);
2507 __private_extern__ void
2508 uc_upl_dealloc(upl_t upl)
2509 {
2510         if (--upl->ref_count == 0)
2511                 upl_destroy(upl);
2512 }
2513
2514 void
2515 upl_deallocate(upl_t upl)
2516 {
2517         if (--upl->ref_count == 0)
2518                 upl_destroy(upl);
2519 }
2520
/*
 * Statistics about UPL enforcement of copy-on-write obligations.
 *
 * vm_object_upl_request() increments upl_cow, and adds the request's
 * page count to upl_cow_pages, each time it calls vm_object_update()
 * with MEMORY_OBJECT_COPY_SYNC for a UPL_WILL_MODIFY request on an
 * object that has a copy object.
 * NOTE(review): the "again" and "contiguous" counters are presumably the
 * matching event/page counts for repeated syncs and for physically
 * contiguous objects -- confirm at their update sites.
 */
unsigned long upl_cow = 0;
unsigned long upl_cow_again = 0;
unsigned long upl_cow_contiguous = 0;
unsigned long upl_cow_pages = 0;
unsigned long upl_cow_again_pages = 0;
unsigned long upl_cow_contiguous_pages = 0;
2530
2531 /*
2532  *      Routine:        vm_object_upl_request
2533  *      Purpose:
2534  *              Cause the population of a portion of a vm_object.
2535  *              Depending on the nature of the request, the pages
 *              returned may contain valid data or be uninitialized.
2537  *              A page list structure, listing the physical pages
2538  *              will be returned upon request.
2539  *              This function is called by the file system or any other
2540  *              supplier of backing store to a pager.
2541  *              IMPORTANT NOTE: The caller must still respect the relationship
2542  *              between the vm_object and its backing memory object.  The
2543  *              caller MUST NOT substitute changes in the backing file
2544  *              without first doing a memory_object_lock_request on the
 *              target range unless it is known that the pages are not
2546  *              shared with another entity at the pager level.
2547  *              Copy_in_to:
2548  *                      if a page list structure is present
2549  *                      return the mapped physical pages, where a
2550  *                      page is not present, return a non-initialized
2551  *                      one.  If the no_sync bit is turned on, don't
2552  *                      call the pager unlock to synchronize with other
2553  *                      possible copies of the page. Leave pages busy
2554  *                      in the original object, if a page list structure
2555  *                      was specified.  When a commit of the page list
2556  *                      pages is done, the dirty bit will be set for each one.
2557  *              Copy_out_from:
2558  *                      If a page list structure is present, return
2559  *                      all mapped pages.  Where a page does not exist
2560  *                      map a zero filled one. Leave pages busy in
2561  *                      the original object.  If a page list structure
2562  *                      is not specified, this call is a no-op.
2563  *
2564  *              Note:  access of default pager objects has a rather interesting
2565  *              twist.  The caller of this routine, presumably the file system
2566  *              page cache handling code, will never actually make a request
2567  *              against a default pager backed object.  Only the default
2568  *              pager will make requests on backing store related vm_objects
2569  *              In this way the default pager can maintain the relationship
2570  *              between backing store files (abstract memory objects) and
2571  *              the vm_objects (cache objects), they support.
2572  *
2573  */
2574
2575 __private_extern__ kern_return_t
2576 vm_object_upl_request(
2577         vm_object_t             object,
2578         vm_object_offset_t      offset,
2579         upl_size_t              size,
2580         upl_t                   *upl_ptr,
2581         upl_page_info_array_t   user_page_list,
2582         unsigned int            *page_list_count,
2583         int                     cntrl_flags)
2584 {
2585         vm_page_t               dst_page = VM_PAGE_NULL;
2586         vm_object_offset_t      dst_offset;
2587         upl_size_t              xfer_size;
2588         boolean_t               dirty;
2589         boolean_t               hw_dirty;
2590         upl_t                   upl = NULL;
2591         unsigned int            entry;
2592 #if MACH_CLUSTER_STATS
2593         boolean_t               encountered_lrp = FALSE;
2594 #endif
2595         vm_page_t               alias_page = NULL;
2596         int                     refmod_state = 0;
2597         wpl_array_t             lite_list = NULL;
2598         vm_object_t             last_copy_object;
2599         int                     delayed_unlock = 0;
2600         int                     j;
2601
2602         if (cntrl_flags & ~UPL_VALID_FLAGS) {
2603                 /*
2604                  * For forward compatibility's sake,
2605                  * reject any unknown flag.
2606                  */
2607                 return KERN_INVALID_VALUE;
2608         }
2609         if ( (!object->internal) && (object->paging_offset != 0) )
2610                 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2611         if (object->phys_contiguous)
2612                 panic("vm_object_upl_request: contiguous object specified\n");
2613
2614
2615         if ((size / PAGE_SIZE) > MAX_UPL_TRANSFER)
2616                 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2617
2618         if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2619                 *page_list_count = MAX_UPL_TRANSFER;
2620
2621         if (cntrl_flags & UPL_SET_INTERNAL) {
2622                 if (cntrl_flags & UPL_SET_LITE) {
2623
2624                         upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2625
2626                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2627                         lite_list = (wpl_array_t)
2628                                         (((uintptr_t)user_page_list) +
2629                                         ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2630                 } else {
2631                         upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
2632
2633                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2634                 }
2635         } else {
2636                 if (cntrl_flags & UPL_SET_LITE) {
2637
2638                         upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
2639
2640                         lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
2641                 } else {
2642                         upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
2643                 }
2644         }
2645         *upl_ptr = upl;
2646
2647         if (user_page_list)
2648                 user_page_list[0].device = FALSE;
2649
2650         if (cntrl_flags & UPL_SET_LITE) {
2651                 upl->map_object = object;
2652         } else {
2653                 upl->map_object = vm_object_allocate(size);
2654                 /*
2655                  * No neeed to lock the new object: nobody else knows
2656                  * about it yet, so it's all ours so far.
2657                  */
2658                 upl->map_object->shadow = object;
2659                 upl->map_object->pageout = TRUE;
2660                 upl->map_object->can_persist = FALSE;
2661                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2662                 upl->map_object->shadow_offset = offset;
2663                 upl->map_object->wimg_bits = object->wimg_bits;
2664
2665                 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2666
2667                 upl->flags |= UPL_SHADOWED;
2668         }
2669         /*
2670          * ENCRYPTED SWAP:
2671          * Just mark the UPL as "encrypted" here.
2672          * We'll actually encrypt the pages later,
2673          * in upl_encrypt(), when the caller has
2674          * selected which pages need to go to swap.
2675          */
2676         if (cntrl_flags & UPL_ENCRYPT)
2677                 upl->flags |= UPL_ENCRYPTED;
2678
2679         if (cntrl_flags & UPL_FOR_PAGEOUT)
2680                 upl->flags |= UPL_PAGEOUT;
2681
2682         vm_object_lock(object);
2683         vm_object_paging_begin(object);
2684
2685         /*
2686          * we can lock in the paging_offset once paging_in_progress is set
2687          */
2688         upl->size = size;
2689         upl->offset = offset + object->paging_offset;
2690
2691 #ifdef UPL_DEBUG
2692         queue_enter(&object->uplq, upl, upl_t, uplq);
2693 #endif /* UPL_DEBUG */
2694
2695         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
2696                 /*
2697                  * Honor copy-on-write obligations
2698                  *
2699                  * The caller is gathering these pages and
2700                  * might modify their contents.  We need to
2701                  * make sure that the copy object has its own
2702                  * private copies of these pages before we let
2703                  * the caller modify them.
2704                  */
2705                 vm_object_update(object,
2706                                  offset,
2707                                  size,
2708                                  NULL,
2709                                  NULL,
2710                                  FALSE, /* should_return */
2711                                  MEMORY_OBJECT_COPY_SYNC,
2712                                  VM_PROT_NO_CHANGE);
2713                 upl_cow++;
2714                 upl_cow_pages += size >> PAGE_SHIFT;
2715         }
2716         /*
2717          * remember which copy object we synchronized with
2718          */
2719         last_copy_object = object->copy;
2720         entry = 0;
2721
2722         xfer_size = size;
2723         dst_offset = offset;
2724
2725         while (xfer_size) {
2726
2727                 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2728                         if (delayed_unlock) {
2729                                 delayed_unlock = 0;
2730                                 vm_page_unlock_queues();
2731                         }
2732                         vm_object_unlock(object);
2733                         VM_PAGE_GRAB_FICTITIOUS(alias_page);
2734                         goto relock;
2735                 }
2736                 if (delayed_unlock == 0) {
2737                         /*
2738                          * pageout_scan takes the vm_page_lock_queues first
2739                          * then tries for the object lock... to avoid what
2740                          * is effectively a lock inversion, we'll go to the
2741                          * trouble of taking them in that same order... otherwise
2742                          * if this object contains the majority of the pages resident
2743                          * in the UBC (or a small set of large objects actively being
2744                          * worked on contain the majority of the pages), we could
2745                          * cause the pageout_scan thread to 'starve' in its attempt
2746                          * to find pages to move to the free queue, since it has to
2747                          * successfully acquire the object lock of any candidate page
2748                          * before it can steal/clean it.
2749                          */
2750                         vm_object_unlock(object);
2751 relock:
2752                         for (j = 0; ; j++) {
2753                                 vm_page_lock_queues();
2754
2755                                 if (vm_object_lock_try(object))
2756                                         break;
2757                                 vm_page_unlock_queues();
2758                                 mutex_pause(j);
2759                         }
2760                         delayed_unlock = 1;
2761                 }
2762                 if (cntrl_flags & UPL_COPYOUT_FROM) {
2763                         upl->flags |= UPL_PAGE_SYNC_DONE;
2764
2765                         if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2766                                 dst_page->fictitious ||
2767                                 dst_page->absent ||
2768                                 dst_page->error ||
2769                                (dst_page->wire_count && !dst_page->pageout && !dst_page->list_req_pending)) {
2770
2771                                 if (user_page_list)
2772                                         user_page_list[entry].phys_addr = 0;
2773
2774                                 goto delay_unlock_queues;
2775                         }
2776                         /*
2777                          * grab this up front...
2778                          * a high percentange of the time we're going to
2779                          * need the hardware modification state a bit later
2780                          * anyway... so we can eliminate an extra call into
2781                          * the pmap layer by grabbing it here and recording it
2782                          */
2783                         if (dst_page->pmapped)
2784                                 refmod_state = pmap_get_refmod(dst_page->phys_page);
2785                         else
2786                                 refmod_state = 0;
2787
2788                         if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
2789                                 /*
2790                                  * page is on inactive list and referenced...
2791                                  * reactivate it now... this gets it out of the
2792                                  * way of vm_pageout_scan which would have to
2793                                  * reactivate it upon tripping over it
2794                                  */
2795                                 vm_page_activate(dst_page);
2796                                 VM_STAT_INCR(reactivations);
2797                         }
2798                         if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2799                                 /*
2800                                  * we're only asking for DIRTY pages to be returned
2801                                  */
2802                                 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2803                                         /*
2804                                          * if we were the page stolen by vm_pageout_scan to be
2805                                          * cleaned (as opposed to a buddy being clustered in)
2806                                          * or this request is not being driven by a PAGEOUT cluster
2807                                          * then we only need to check for the page being dirty or
2808                                          * precious to decide whether to return it
2809                                          */
2810                                         if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
2811                                                 goto check_busy;
2812                                         goto dont_return;
2813                                 }
2814                                 /*
2815                                  * this is a request for a PAGEOUT cluster and this page
2816                                  * is merely along for the ride as a 'buddy'... not only
2817                                  * does it have to be dirty to be returned, but it also
2818                                  * can't have been referenced recently... note that we've
2819                                  * already filtered above based on whether this page is
2820                                  * currently on the inactive queue or it meets the page
2821                                  * ticket (generation count) check
2822                                  */
2823                                 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2824                                      ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2825                                         goto check_busy;
2826                                 }
2827 dont_return:
2828                                 /*
2829                                  * if we reach here, we're not to return
2830                                  * the page... go on to the next one
2831                                  */
2832                                 if (user_page_list)
2833                                         user_page_list[entry].phys_addr = 0;
2834
2835                                 goto delay_unlock_queues;
2836                         }
2837 check_busy:
2838                         if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
2839                                 if (cntrl_flags & UPL_NOBLOCK) {
2840                                         if (user_page_list)
2841                                                 user_page_list[entry].phys_addr = 0;
2842
2843                                         goto delay_unlock_queues;
2844                                 }
2845                                 /*
2846                                  * someone else is playing with the
2847                                  * page.  We will have to wait.
2848                                  */
2849                                 delayed_unlock = 0;
2850                                 vm_page_unlock_queues();
2851
2852                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2853
2854                                 continue;
2855                         }
2856                         /*
2857                          * Someone else already cleaning the page?
2858                          */
2859                         if ((dst_page->cleaning || dst_page->absent || dst_page->wire_count != 0) && !dst_page->list_req_pending) {
2860                                 if (user_page_list)
2861                                         user_page_list[entry].phys_addr = 0;
2862
2863                                 goto delay_unlock_queues;
2864                         }
2865                         /*
2866                          * ENCRYPTED SWAP:
2867                          * The caller is gathering this page and might
2868                          * access its contents later on.  Decrypt the
2869                          * page before adding it to the UPL, so that
2870                          * the caller never sees encrypted data.
2871                          */
2872                         if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
2873                                 int  was_busy;
2874
2875                                 delayed_unlock = 0;
2876                                 vm_page_unlock_queues();
2877                                 /*
2878                                  * save the current state of busy
2879                                  * mark page as busy while decrypt
2880                                  * is in progress since it will drop
2881                                  * the object lock...
2882                                  */
2883                                 was_busy = dst_page->busy;
2884                                 dst_page->busy = TRUE;
2885
2886                                 vm_page_decrypt(dst_page, 0);
2887                                 vm_page_decrypt_for_upl_counter++;
2888                                 /*
2889                                  * restore to original busy state
2890                                  */
2891                                 dst_page->busy = was_busy;
2892
2893                                 vm_page_lock_queues();
2894                                 delayed_unlock = 1;
2895                         }
2896                         if (dst_page->pageout_queue == TRUE)
2897                                 /*
2898                                  * we've buddied up a page for a clustered pageout
2899                                  * that has already been moved to the pageout
2900                                  * queue by pageout_scan... we need to remove
2901                                  * it from the queue and drop the laundry count
2902                                  * on that queue
2903                                  */
2904                                 vm_pageout_queue_steal(dst_page);
2905 #if MACH_CLUSTER_STATS
2906                         /*
2907                          * pageout statistics gathering.  count
2908                          * all the pages we will page out that
2909                          * were not counted in the initial
2910                          * vm_pageout_scan work
2911                          */
2912                         if (dst_page->list_req_pending)
2913                                 encountered_lrp = TRUE;
2914                         if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
2915                                 if (encountered_lrp)
2916                                         CLUSTER_STAT(pages_at_higher_offsets++;)
2917                                 else
2918                                         CLUSTER_STAT(pages_at_lower_offsets++;)
2919                         }
2920 #endif
2921                         /*
2922                          * Turn off busy indication on pending
2923                          * pageout.  Note: we can only get here
2924                          * in the request pending case.
2925                          */
2926                         dst_page->list_req_pending = FALSE;
2927                         dst_page->busy = FALSE;
2928
2929                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
2930                         dirty = hw_dirty ? TRUE : dst_page->dirty;
2931
2932                         if (dst_page->phys_page > upl->highest_page)
2933                                 upl->highest_page = dst_page->phys_page;
2934
2935                         if (cntrl_flags & UPL_SET_LITE) {
2936                                 int     pg_num;
2937
2938                                 pg_num = (dst_offset-offset)/PAGE_SIZE;
2939                                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
2940
2941                                 if (hw_dirty)
2942                                         pmap_clear_modify(dst_page->phys_page);
2943
2944                                 /*
2945                                  * Mark original page as cleaning
2946                                  * in place.
2947                                  */
2948                                 dst_page->cleaning = TRUE;
2949                                 dst_page->precious = FALSE;
2950                         } else {
2951                                 /*
2952                                  * use pageclean setup, it is more
2953                                  * convenient even for the pageout
2954                                  * cases here
2955                                  */
2956                                 vm_object_lock(upl->map_object);
2957                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
2958                                 vm_object_unlock(upl->map_object);
2959
2960                                 alias_page->absent = FALSE;
2961                                 alias_page = NULL;
2962                         }
2963 #if     MACH_PAGEMAP
2964                         /*
2965                          * Record that this page has been
2966                          * written out
2967                          */
2968                         vm_external_state_set(object->existence_map, dst_page->offset);
2969 #endif  /*MACH_PAGEMAP*/
2970                         dst_page->dirty = dirty;
2971
2972                         if (!dirty)
2973                                 dst_page->precious = TRUE;
2974
2975                         if (dst_page->pageout)
2976                                 dst_page->busy = TRUE;
2977
2978                         if ( (cntrl_flags & UPL_ENCRYPT) ) {
2979                                 /*
2980                                  * ENCRYPTED SWAP:
2981                                  * We want to deny access to the target page
2982                                  * because its contents are about to be
2983                                  * encrypted and the user would be very
2984                                  * confused to see encrypted data instead
2985                                  * of their data.
2986                                  * We also set "encrypted_cleaning" to allow
2987                                  * vm_pageout_scan() to demote that page
2988                                  * from "adjacent/clean-in-place" to
2989                                  * "target/clean-and-free" if it bumps into
2990                                  * this page during its scanning while we're
2991                                  * still processing this cluster.
2992                                  */
2993                                 dst_page->busy = TRUE;
2994                                 dst_page->encrypted_cleaning = TRUE;
2995                         }
2996                         if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2997                                 /*
2998                                  * deny access to the target page
2999                                  * while it is being worked on
3000                                  */
3001                                 if ((!dst_page->pageout) && (dst_page->wire_count == 0)) {
3002                                         dst_page->busy = TRUE;
3003                                         dst_page->pageout = TRUE;
3004                                         vm_page_wire(dst_page);
3005                                 }
3006                         }
3007                 } else {
3008                         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3009                                 /*
3010                                  * Honor copy-on-write obligations
3011                                  *
3012                                  * The copy object has changed since we
3013                                  * last synchronized for copy-on-write.
3014                                  * Another copy object might have been
3015                                  * inserted while we released the object's
3016                                  * lock.  Since someone could have seen the
3017                                  * original contents of the remaining pages
3018                                  * through that new object, we have to
3019                                  * synchronize with it again for the remaining
3020                                  * pages only.  The previous pages are "busy"
3021                                  * so they can not be seen through the new
3022                                  * mapping.  The new mapping will see our
3023                                  * upcoming changes for those previous pages,
3024                                  * but that's OK since they couldn't see what
3025                                  * was there before.  It's just a race anyway
3026                                  * and there's no guarantee of consistency or
3027                                  * atomicity.  We just don't want new mappings
3028                                  * to see both the *before* and *after* pages.
3029                                  */
3030                                 if (object->copy != VM_OBJECT_NULL) {
3031                                         delayed_unlock = 0;
3032                                         vm_page_unlock_queues();
3033
3034                                         vm_object_update(
3035                                                 object,
3036                                                 dst_offset,/* current offset */
3037                                                 xfer_size, /* remaining size */
3038                                                 NULL,
3039                                                 NULL,
3040                                                 FALSE,     /* should_return */
3041                                                 MEMORY_OBJECT_COPY_SYNC,
3042                                                 VM_PROT_NO_CHANGE);
3043
3044                                         upl_cow_again++;
3045                                         upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3046
3047                                         vm_page_lock_queues();
3048                                         delayed_unlock = 1;
3049                                 }
3050                                 /*
3051                                  * remember the copy object we synced with
3052                                  */
3053                                 last_copy_object = object->copy;
3054                         }
3055                         dst_page = vm_page_lookup(object, dst_offset);
3056
3057                         if (dst_page != VM_PAGE_NULL) {
3058                                 if ( !(dst_page->list_req_pending) ) {
3059                                         if ((cntrl_flags & UPL_RET_ONLY_ABSENT) && !dst_page->absent) {
3060                                                 /*
3061                                                  * skip over pages already present in the cache
3062                                                  */
3063                                                 if (user_page_list)
3064                                                         user_page_list[entry].phys_addr = 0;
3065
3066                                                 goto delay_unlock_queues;
3067                                         }
3068                                         if (dst_page->cleaning) {
3069                                                 /*
3070                                                  * someone else is writing to the page... wait...
3071                                                  */
3072                                                 delayed_unlock = 0;
3073                                                 vm_page_unlock_queues();
3074
3075                                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3076
3077                                                 continue;
3078                                         }
3079                                 } else {
3080                                         if (dst_page->fictitious &&
3081                                             dst_page->phys_page == vm_page_fictitious_addr) {
3082                                                 assert( !dst_page->speculative);
3083                                                 /*
3084                                                  * dump the fictitious page
3085                                                  */
3086                                                 dst_page->list_req_pending = FALSE;
3087
3088                                                 vm_page_free(dst_page);
3089
3090                                                 dst_page = NULL;
3091                                         } else if (dst_page->absent) {
3092                                                 /*
3093                                                  * the default_pager case
3094                                                  */
3095                                                 dst_page->list_req_pending = FALSE;
3096                                                 dst_page->busy = FALSE;
3097                                         }
3098                                 }
3099                         }
3100                         if (dst_page == VM_PAGE_NULL) {
3101                                 if (object->private) {
3102                                         /*
3103                                          * This is a nasty wrinkle for users
3104                                          * of upl who encounter device or
3105                                          * private memory however, it is
3106                                          * unavoidable, only a fault can
3107                                          * resolve the actual backing
3108                                          * physical page by asking the
3109                                          * backing device.
3110                                          */
3111                                         if (user_page_list)
3112                                                 user_page_list[entry].phys_addr = 0;
3113
3114                                         goto delay_unlock_queues;
3115                                 }
3116                                 /*
3117                                  * need to allocate a page
3118                                  */
3119                                 dst_page = vm_page_grab();
3120
3121                                 if (dst_page == VM_PAGE_NULL) {
3122                                         if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3123                                                /*
3124                                                 * we don't want to stall waiting for pages to come onto the free list
3125                                                 * while we're already holding absent pages in this UPL
3126                                                 * the caller will deal with the empty slots
3127                                                 */
3128                                                 if (user_page_list)
3129                                                         user_page_list[entry].phys_addr = 0;
3130
3131                                                 goto try_next_page;
3132                                         }
3133                                         /*
3134                                          * no pages available... wait
3135                                          * then try again for the same
3136                                          * offset...
3137                                          */
3138                                         delayed_unlock = 0;
3139                                         vm_page_unlock_queues();
3140
3141                                         vm_object_unlock(object);
3142                                         VM_PAGE_WAIT();
3143
3144                                         /*
3145                                          * pageout_scan takes the vm_page_lock_queues first
3146                                          * then tries for the object lock... to avoid what
3147                                          * is effectively a lock inversion, we'll go to the
3148                                          * trouble of taking them in that same order... otherwise
3149                                          * if this object contains the majority of the pages resident
3150                                          * in the UBC (or a small set of large objects actively being
3151                                          * worked on contain the majority of the pages), we could
3152                                          * cause the pageout_scan thread to 'starve' in its attempt
3153                                          * to find pages to move to the free queue, since it has to
3154                                          * successfully acquire the object lock of any candidate page
3155                                          * before it can steal/clean it.
3156                                          */
3157                                         for (j = 0; ; j++) {
3158                                                 vm_page_lock_queues();
3159
3160                                                 if (vm_object_lock_try(object))
3161                                                         break;
3162                                                 vm_page_unlock_queues();
3163                                                 mutex_pause(j);
3164                                         }
3165                                         delayed_unlock = 1;
3166
3167                                         continue;
3168                                 }
3169                                 vm_page_insert_internal(dst_page, object, dst_offset, TRUE);
3170
3171                                 dst_page->absent = TRUE;
3172                                 dst_page->busy = FALSE;
3173
3174                                 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3175                                         /*
3176                                          * if UPL_RET_ONLY_ABSENT was specified,
3177                                          * then we're definitely setting up a
3178                                          * upl for a clustered read/pagein
3179                                          * operation... mark the pages as clustered
3180                                          * so upl_commit_range can put them on the
3181                                          * speculative list
3182                                          */
3183                                         dst_page->clustered = TRUE;
3184                                 }
3185                         }
3186                         /*
3187                          * ENCRYPTED SWAP:
3188                          */
3189                         if (cntrl_flags & UPL_ENCRYPT) {
3190                                 /*
3191                                  * The page is going to be encrypted when we
3192                                  * get it from the pager, so mark it so.
3193                                  */
3194                                 dst_page->encrypted = TRUE;
3195                         } else {
3196                                 /*
3197                                  * Otherwise, the page will not contain
3198                                  * encrypted data.
3199                                  */
3200                                 dst_page->encrypted = FALSE;
3201                         }
3202                         dst_page->overwriting = TRUE;
3203
3204                         if (dst_page->fictitious) {
3205                                 panic("need corner case for fictitious page");
3206                         }
3207                         if (dst_page->busy) {
3208                                 /*
3209                                  * someone else is playing with the
3210                                  * page.  We will have to wait.
3211                                  */
3212                                 delayed_unlock = 0;
3213                                 vm_page_unlock_queues();
3214
3215                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3216
3217                                 continue;
3218                         }
3219                         if (dst_page->pmapped) {
3220                                 if ( !(cntrl_flags & UPL_FILE_IO))
3221                                         /*
3222                                          * eliminate all mappings from the
3223                                          * original object and its progeny
3224                                          */
3225                                         refmod_state = pmap_disconnect(dst_page->phys_page);
3226                                 else
3227                                         refmod_state = pmap_get_refmod(dst_page->phys_page);
3228                         } else
3229                                 refmod_state = 0;
3230
3231                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
3232                         dirty = hw_dirty ? TRUE : dst_page->dirty;
3233
3234                         if (cntrl_flags & UPL_SET_LITE) {
3235                                 int     pg_num;
3236
3237                                 pg_num = (dst_offset-offset)/PAGE_SIZE;
3238                                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3239
3240                                 if (hw_dirty)
3241                                         pmap_clear_modify(dst_page->phys_page);
3242
3243                                 /*
3244                                  * Mark original page as cleaning
3245                                  * in place.
3246                                  */
3247                                 dst_page->cleaning = TRUE;
3248                                 dst_page->precious = FALSE;
3249                         } else {
3250                                 /*
3251                                  * use pageclean setup, it is more
3252                                  * convenient even for the pageout
3253                                  * cases here
3254                                  */
3255                                 vm_object_lock(upl->map_object);
3256                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3257                                 vm_object_unlock(upl->map_object);
3258
3259                                 alias_page->absent = FALSE;
3260                                 alias_page = NULL;
3261                         }
3262
3263                         if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3264                                 /*
3265                                  * clean in place for read implies
3266                                  * that a write will be done on all
3267                                  * the pages that are dirty before
3268                                  * a upl commit is done.  The caller
3269                                  * is obligated to preserve the
3270                                  * contents of all pages marked dirty
3271                                  */
3272                                 upl->flags |= UPL_CLEAR_DIRTY;
3273                         }
3274                         dst_page->dirty = dirty;
3275
3276                         if (!dirty)
3277                                 dst_page->precious = TRUE;
3278
3279                         if (dst_page->wire_count == 0) {
3280                                 /*
3281                                  * deny access to the target page while
3282                                  * it is being worked on
3283                                  */
3284                                 dst_page->busy = TRUE;
3285                         } else
3286                                 vm_page_wire(dst_page);
3287
3288                         if (dst_page->clustered) {
3289                                 /*
3290                                  * expect the page not to be used
3291                                  * since it's coming in as part
3292                                  * of a speculative cluster...
3293                                  * pages that are 'consumed' will
3294                                  * get a hardware reference
3295                                  */
3296                                 dst_page->reference = FALSE;
3297                         } else {
3298                                 /*
3299                                  * expect the page to be used
3300                                  */
3301                                 dst_page->reference = TRUE;
3302                         }
3303                         dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3304                 }
3305                 if (dst_page->phys_page > upl->highest_page)
3306                         upl->highest_page = dst_page->phys_page;
3307                 if (user_page_list) {
3308                         user_page_list[entry].phys_addr = dst_page->phys_page;
3309                         user_page_list[entry].dirty     = dst_page->dirty;
3310                         user_page_list[entry].pageout   = dst_page->pageout;
3311                         user_page_list[entry].absent    = dst_page->absent;
3312                         user_page_list[entry].precious  = dst_page->precious;
3313
3314                         if (dst_page->clustered == TRUE)
3315                                 user_page_list[entry].speculative = dst_page->speculative;
3316                         else
3317                                 user_page_list[entry].speculative = FALSE;
3318                 }
3319                 /*
3320                  * if UPL_RET_ONLY_ABSENT is set, then
3321                  * we are working with a fresh page and we've
3322                  * just set the clustered flag on it to
3323                  * indicate that it was dragged in as part of a
3324                  * speculative cluster... so leave it alone
3325                  */
3326                 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3327                         /*
3328                          * someone is explicitly grabbing this page...
3329                          * update clustered and speculative state
3330                          *
3331                          */
3332                         VM_PAGE_CONSUME_CLUSTERED(dst_page);
3333                 }
3334 delay_unlock_queues:
3335                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
3336                         /*
3337                          * pageout_scan takes the vm_page_lock_queues first
3338                          * then tries for the object lock... to avoid what
3339                          * is effectively a lock inversion, we'll go to the
3340                          * trouble of taking them in that same order... otherwise
3341                          * if this object contains the majority of the pages resident
3342                          * in the UBC (or a small set of large objects actively being
3343                          * worked on contain the majority of the pages), we could
3344                          * cause the pageout_scan thread to 'starve' in its attempt
3345                          * to find pages to move to the free queue, since it has to
3346                          * successfully acquire the object lock of any candidate page
3347                          * before it can steal/clean it.
3348                          */
3349                         vm_object_unlock(object);
3350                         mutex_yield(&vm_page_queue_lock);
3351
3352                         for (j = 0; ; j++) {
3353                                 if (vm_object_lock_try(object))
3354                                         break;
3355                                 vm_page_unlock_queues();
3356                                 mutex_pause(j);
3357                                 vm_page_lock_queues();
3358                         }
3359                         delayed_unlock = 1;
3360                 }
3361 try_next_page:
3362                 entry++;
3363                 dst_offset += PAGE_SIZE_64;
3364                 xfer_size -= PAGE_SIZE;
3365         }
3366         if (alias_page != NULL) {
3367                 if (delayed_unlock == 0) {
3368                         vm_page_lock_queues();
3369                         delayed_unlock = 1;
3370                 }
3371                 vm_page_free(alias_page);
3372         }
3373         if (delayed_unlock)
3374                 vm_page_unlock_queues();
3375
3376         if (page_list_count != NULL) {
3377                 if (upl->flags & UPL_INTERNAL)
3378                         *page_list_count = 0;
3379                 else if (*page_list_count > entry)
3380                         *page_list_count = entry;
3381         }
3382         vm_object_unlock(object);
3383
3384         return KERN_SUCCESS;
3385 }
3386
3387 /* JMM - Backward compatability for now */
3388 kern_return_t
3389 vm_fault_list_request(                  /* forward */
3390         memory_object_control_t         control,
3391         vm_object_offset_t      offset,
3392         upl_size_t              size,
3393         upl_t                   *upl_ptr,
3394         upl_page_info_t         **user_page_list_ptr,
3395         unsigned int            page_list_count,
3396         int                     cntrl_flags);
3397 kern_return_t
3398 vm_fault_list_request(
3399         memory_object_control_t         control,
3400         vm_object_offset_t      offset,
3401         upl_size_t              size,
3402         upl_t                   *upl_ptr,
3403         upl_page_info_t         **user_page_list_ptr,
3404         unsigned int            page_list_count,
3405         int                     cntrl_flags)
3406 {
3407         unsigned int            local_list_count;
3408         upl_page_info_t         *user_page_list;
3409         kern_return_t           kr;
3410
3411         if (user_page_list_ptr != NULL) {
3412                 local_list_count = page_list_count;
3413                 user_page_list = *user_page_list_ptr;
3414         } else {
3415                 local_list_count = 0;
3416                 user_page_list = NULL;
3417         }
3418         kr =  memory_object_upl_request(control,
3419                                 offset,
3420                                 size,
3421                                 upl_ptr,
3422                                 user_page_list,
3423                                 &local_list_count,
3424                                 cntrl_flags);
3425
3426         if(kr != KERN_SUCCESS)
3427                 return kr;
3428
3429         if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3430                 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3431         }
3432
3433         return KERN_SUCCESS;
3434 }
3435
3436
3437
3438 /*
3439  *      Routine:        vm_object_super_upl_request
3440  *      Purpose:
3441  *              Cause the population of a portion of a vm_object
3442  *              in much the same way as memory_object_upl_request.
3443  *              Depending on the nature of the request, the pages
3444  *              returned may be contain valid data or be uninitialized.
3445  *              However, the region may be expanded up to the super
3446  *              cluster size provided.
3447  */
3448
3449 __private_extern__ kern_return_t
3450 vm_object_super_upl_request(
3451         vm_object_t object,
3452         vm_object_offset_t      offset,
3453         upl_size_t              size,
3454         upl_size_t              super_cluster,
3455         upl_t                   *upl,
3456         upl_page_info_t         *user_page_list,
3457         unsigned int            *page_list_count,
3458         int                     cntrl_flags)
3459 {
3460         if (object->paging_offset > offset)
3461                 return KERN_FAILURE;
3462
3463         assert(object->paging_in_progress);
3464         offset = offset - object->paging_offset;
3465
3466         if (super_cluster > size) {
3467
3468                 vm_object_offset_t      base_offset;
3469                 upl_size_t              super_size;
3470
3471                 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3472                 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3473                 super_size = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3474
3475                 if (offset > (base_offset + super_size)) {
3476                         panic("vm_object_super_upl_request: Missed target pageout"
3477                               " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3478                               offset, base_offset, super_size, super_cluster,
3479                               size, object->paging_offset);
3480                 }
3481                 /*
3482                  * apparently there is a case where the vm requests a
3483                  * page to be written out who's offset is beyond the
3484                  * object size
3485                  */
3486                 if ((offset + size) > (base_offset + super_size))
3487                         super_size = (offset + size) - base_offset;
3488
3489                 offset = base_offset;
3490                 size = super_size;
3491         }
3492         return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3493 }
3494
3495
3496 kern_return_t
3497 vm_map_create_upl(
3498         vm_map_t                map,
3499         vm_map_address_t        offset,
3500         upl_size_t              *upl_size,
3501         upl_t                   *upl,
3502         upl_page_info_array_t   page_list,
3503         unsigned int            *count,
3504         int                     *flags)
3505 {
3506         vm_map_entry_t  entry;
3507         int             caller_flags;
3508         int             force_data_sync;
3509         int             sync_cow_data;
3510         vm_object_t     local_object;
3511         vm_map_offset_t local_offset;
3512         vm_map_offset_t local_start;
3513         kern_return_t   ret;
3514
3515         caller_flags = *flags;
3516
3517         if (caller_flags & ~UPL_VALID_FLAGS) {
3518                 /*
3519                  * For forward compatibility's sake,
3520                  * reject any unknown flag.
3521                  */
3522                 return KERN_INVALID_VALUE;
3523         }
3524         force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3525         sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3526
3527         if (upl == NULL)
3528                 return KERN_INVALID_ARGUMENT;
3529
3530 REDISCOVER_ENTRY:
3531         vm_map_lock(map);
3532
3533         if (vm_map_lookup_entry(map, offset, &entry)) {
3534
3535                 if ((entry->vme_end - offset) < *upl_size)
3536                         *upl_size = entry->vme_end - offset;
3537
3538                 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3539                         *flags = 0;
3540
3541                         if (entry->object.vm_object != VM_OBJECT_NULL) {
3542                                 if (entry->object.vm_object->private)
3543                                         *flags = UPL_DEV_MEMORY;
3544
3545                                 if (entry->object.vm_object->phys_contiguous)
3546                                         *flags |= UPL_PHYS_CONTIG;
3547                         }
3548                         vm_map_unlock(map);
3549
3550                         return KERN_SUCCESS;
3551                 }
3552                 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3553                         if ((*upl_size/page_size) > MAX_UPL_TRANSFER)
3554                                 *upl_size = MAX_UPL_TRANSFER * page_size;
3555                 }
3556                 /*
3557                  *      Create an object if necessary.
3558                  */
3559                 if (entry->object.vm_object == VM_OBJECT_NULL) {
3560                         entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3561                         entry->offset = 0;
3562                 }
3563                 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3564                         if (!(entry->protection & VM_PROT_WRITE)) {
3565                                 vm_map_unlock(map);
3566                                 return KERN_PROTECTION_FAILURE;
3567                         }
3568                         if (entry->needs_copy)  {
3569                                 vm_map_t                local_map;
3570                                 vm_object_t             object;
3571                                 vm_object_offset_t      new_offset;
3572                                 vm_prot_t               prot;
3573                                 boolean_t               wired;
3574                                 vm_map_version_t        version;
3575                                 vm_map_t                real_map;
3576
3577                                 local_map = map;
3578                                 vm_map_lock_write_to_read(map);
3579
3580                                 if (vm_map_lookup_locked(&local_map,
3581                                                          offset, VM_PROT_WRITE,
3582                                                          OBJECT_LOCK_EXCLUSIVE,
3583                                                          &version, &object,
3584                                                          &new_offset, &prot, &wired,
3585                                                          NULL,
3586                                                          &real_map)) {
3587                                         vm_map_unlock(local_map);
3588                                         return KERN_FAILURE;
3589                                 }
3590                                 if (real_map != map)
3591                                         vm_map_unlock(real_map);
3592                                 vm_object_unlock(object);
3593                                 vm_map_unlock(local_map);
3594
3595                                 goto REDISCOVER_ENTRY;
3596                         }
3597                 }
3598                 if (entry->is_sub_map) {
3599                         vm_map_t        submap;
3600
3601                         submap = entry->object.sub_map;
3602                         local_start = entry->vme_start;
3603                         local_offset = entry->offset;
3604
3605                         vm_map_reference(submap);
3606                         vm_map_unlock(map);
3607
3608                         ret = vm_map_create_upl(submap,
3609                                                 local_offset + (offset - local_start),
3610                                                 upl_size, upl, page_list, count, flags);
3611                         vm_map_deallocate(submap);
3612
3613                         return ret;
3614                 }
3615                 if (sync_cow_data) {
3616                         if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3617                                 local_object = entry->object.vm_object;
3618                                 local_start = entry->vme_start;
3619                                 local_offset = entry->offset;
3620
3621                                 vm_object_reference(local_object);
3622                                 vm_map_unlock(map);
3623
3624                                 if (entry->object.vm_object->shadow && entry->object.vm_object->copy) {
3625                                         vm_object_lock_request(
3626                                                                local_object->shadow,
3627                                                                (vm_object_offset_t)
3628                                                                ((offset - local_start) +
3629                                                                 local_offset) +
3630                                                                local_object->shadow_offset,
3631                                                                *upl_size, FALSE,
3632                                                                MEMORY_OBJECT_DATA_SYNC,
3633                                                                VM_PROT_NO_CHANGE);
3634                                 }
3635                                 sync_cow_data = FALSE;
3636                                 vm_object_deallocate(local_object);
3637
3638                                 goto REDISCOVER_ENTRY;
3639                         }
3640                 }
3641                 if (force_data_sync) {
3642                         local_object = entry->object.vm_object;
3643                         local_start = entry->vme_start;
3644                         local_offset = entry->offset;
3645
3646                         vm_object_reference(local_object);
3647                         vm_map_unlock(map);
3648
3649                         vm_object_lock_request(
3650                                                local_object,
3651                                                (vm_object_offset_t)
3652                                                ((offset - local_start) + local_offset),
3653                                                (vm_object_size_t)*upl_size, FALSE,
3654                                                MEMORY_OBJECT_DATA_SYNC,
3655                                                VM_PROT_NO_CHANGE);
3656
3657                         force_data_sync = FALSE;
3658                         vm_object_deallocate(local_object);
3659
3660                         goto REDISCOVER_ENTRY;
3661                 }
3662                 if (entry->object.vm_object->private)
3663                         *flags = UPL_DEV_MEMORY;
3664                 else
3665                         *flags = 0;
3666
3667                 if (entry->object.vm_object->phys_contiguous)
3668                         *flags |= UPL_PHYS_CONTIG;
3669
3670                 local_object = entry->object.vm_object;
3671                 local_offset = entry->offset;
3672                 local_start = entry->vme_start;
3673
3674                 vm_object_reference(local_object);
3675                 vm_map_unlock(map);
3676
3677                 ret = vm_object_iopl_request(local_object,
3678                                               (vm_object_offset_t) ((offset - local_start) + local_offset),
3679                                               *upl_size,
3680                                               upl,
3681                                               page_list,
3682                                               count,
3683                                               caller_flags);
3684                 vm_object_deallocate(local_object);
3685
3686                 return(ret);
3687         }
3688         vm_map_unlock(map);
3689
3690         return(KERN_FAILURE);
3691 }
3692
3693 /*
3694  * Internal routine to enter a UPL into a VM map.
3695  *
3696  * JMM - This should just be doable through the standard
3697  * vm_map_enter() API.
3698  */
3699 kern_return_t
3700 vm_map_enter_upl(
3701         vm_map_t                map,
3702         upl_t                   upl,
3703         vm_map_offset_t *dst_addr)
3704 {
3705         vm_map_size_t           size;
3706         vm_object_offset_t      offset;
3707         vm_map_offset_t         addr;
3708         vm_page_t               m;
3709         kern_return_t           kr;
3710
3711         if (upl == UPL_NULL)
3712                 return KERN_INVALID_ARGUMENT;
3713
3714         upl_lock(upl);
3715
3716         /*
3717          * check to see if already mapped
3718          */
3719         if (UPL_PAGE_LIST_MAPPED & upl->flags) {
3720                 upl_unlock(upl);
3721                 return KERN_FAILURE;
3722         }
3723
3724         if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3725                                                (upl->map_object->phys_contiguous))) {
3726                 vm_object_t             object;
3727                 vm_page_t               alias_page;
3728                 vm_object_offset_t      new_offset;
3729                 int                     pg_num;
3730                 wpl_array_t             lite_list;
3731
3732                 if (upl->flags & UPL_INTERNAL) {
3733                         lite_list = (wpl_array_t)
3734                                 ((((uintptr_t)upl) + sizeof(struct upl))
3735                                  + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3736                 } else {
3737                         lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
3738                 }
3739                 object = upl->map_object;
3740                 upl->map_object = vm_object_allocate(upl->size);
3741
3742                 vm_object_lock(upl->map_object);
3743
3744                 upl->map_object->shadow = object;
3745                 upl->map_object->pageout = TRUE;
3746                 upl->map_object->can_persist = FALSE;
3747                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3748                 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
3749                 upl->map_object->wimg_bits = object->wimg_bits;
3750                 offset = upl->map_object->shadow_offset;
3751                 new_offset = 0;
3752                 size = upl->size;
3753
3754                 upl->flags |= UPL_SHADOWED;
3755
3756                 while (size) {
3757                         pg_num = (new_offset)/PAGE_SIZE;
3758
3759                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3760
3761                                 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3762
3763                                 vm_object_lock(object);
3764
3765                                 m = vm_page_lookup(object, offset);
3766                                 if (m == VM_PAGE_NULL) {
3767                                         panic("vm_upl_map: page missing\n");
3768                                 }
3769
3770                                 /*
3771                                  * Convert the fictitious page to a private
3772                                  * shadow of the real page.
3773                                  */
3774                                 assert(alias_page->fictitious);
3775                                 alias_page->fictitious = FALSE;
3776                                 alias_page->private = TRUE;
3777                                 alias_page->pageout = TRUE;
3778                                 /*
3779                                  * since m is a page in the upl it must
3780                                  * already be wired or BUSY, so it's
3781                                  * safe to assign the underlying physical
3782                                  * page to the alias
3783                                  */
3784                                 alias_page->phys_page = m->phys_page;
3785
3786                                 vm_object_unlock(object);
3787
3788                                 vm_page_lockspin_queues();
3789                                 vm_page_wire(alias_page);
3790                                 vm_page_unlock_queues();
3791
3792                                 /*
3793                                  * ENCRYPTED SWAP:
3794                                  * The virtual page ("m") has to be wired in some way
3795                                  * here or its physical page ("m->phys_page") could
3796                                  * be recycled at any time.
3797                                  * Assuming this is enforced by the caller, we can't
3798                                  * get an encrypted page here.  Since the encryption
3799                                  * key depends on the VM page's "pager" object and
3800                                  * the "paging_offset", we couldn't handle 2 pageable
3801                                  * VM pages (with different pagers and paging_offsets)
3802                                  * sharing the same physical page:  we could end up
3803                                  * encrypting with one key (via one VM page) and
3804                                  * decrypting with another key (via the alias VM page).
3805                                  */
3806                                 ASSERT_PAGE_DECRYPTED(m);
3807
3808                                 vm_page_insert(alias_page, upl->map_object, new_offset);
3809
3810                                 assert(!alias_page->wanted);
3811                                 alias_page->busy = FALSE;
3812                                 alias_page->absent = FALSE;
3813                         }
3814                         size -= PAGE_SIZE;
3815                         offset += PAGE_SIZE_64;
3816                         new_offset += PAGE_SIZE_64;
3817                 }
3818                 vm_object_unlock(upl->map_object);
3819         }
3820         if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3821                 offset = upl->offset - upl->map_object->paging_offset;
3822         else
3823                 offset = 0;
3824         size = upl->size;
3825
3826         vm_object_reference(upl->map_object);
3827
3828         *dst_addr = 0;
3829         /*
3830          * NEED A UPL_MAP ALIAS
3831          */
3832         kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3833                           VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3834                           VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3835
3836         if (kr != KERN_SUCCESS) {
3837                 upl_unlock(upl);
3838                 return(kr);
3839         }
3840         vm_object_lock(upl->map_object);
3841
3842         for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
3843                 m = vm_page_lookup(upl->map_object, offset);
3844
3845                 if (m) {
3846                         unsigned int    cache_attr;
3847                         cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3848
3849                         m->pmapped = TRUE;
3850                         m->wpmapped = TRUE;
3851
3852                         PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
3853                 }
3854                 offset += PAGE_SIZE_64;
3855         }
3856         vm_object_unlock(upl->map_object);
3857
3858         /*
3859          * hold a reference for the mapping
3860          */
3861         upl->ref_count++;
3862         upl->flags |= UPL_PAGE_LIST_MAPPED;
3863         upl->kaddr = *dst_addr;
3864         upl_unlock(upl);
3865
3866         return KERN_SUCCESS;
3867 }
3868
3869 /*
3870  * Internal routine to remove a UPL mapping from a VM map.
3871  *
3872  * XXX - This should just be doable through a standard
3873  * vm_map_remove() operation.  Otherwise, implicit clean-up
3874  * of the target map won't be able to correctly remove
3875  * these (and release the reference on the UPL).  Having
3876  * to do this means we can't map these into user-space
3877  * maps yet.
3878  */
3879 kern_return_t
3880 vm_map_remove_upl(
3881         vm_map_t        map,
3882         upl_t           upl)
3883 {
3884         vm_address_t    addr;
3885         upl_size_t      size;
3886
3887         if (upl == UPL_NULL)
3888                 return KERN_INVALID_ARGUMENT;
3889
3890         upl_lock(upl);
3891
3892         if (upl->flags & UPL_PAGE_LIST_MAPPED) {
3893                 addr = upl->kaddr;
3894                 size = upl->size;
3895
3896                 assert(upl->ref_count > 1);
3897                 upl->ref_count--;               /* removing mapping ref */
3898
3899                 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3900                 upl->kaddr = (vm_offset_t) 0;
3901                 upl_unlock(upl);
3902
3903                 vm_map_remove(map,
3904                               vm_map_trunc_page(addr),
3905                               vm_map_round_page(addr + size),
3906                               VM_MAP_NO_FLAGS);
3907
3908                 return KERN_SUCCESS;
3909         }
3910         upl_unlock(upl);
3911
3912         return KERN_FAILURE;
3913 }
3914
3915 kern_return_t
3916 upl_commit_range(
3917         upl_t                   upl,
3918         upl_offset_t            offset,
3919         upl_size_t              size,
3920         int                     flags,
3921         upl_page_info_t         *page_list,
3922         mach_msg_type_number_t  count,
3923         boolean_t               *empty)
3924 {
3925         upl_size_t              xfer_size;
3926         vm_object_t             shadow_object;
3927         vm_object_t             object;
3928         vm_object_offset_t      target_offset;
3929         int                     entry;
3930         wpl_array_t             lite_list;
3931         int                     occupied;
3932         int                     delayed_unlock = 0;
3933         int                     clear_refmod = 0;
3934         int                     pgpgout_count = 0;
3935         int                     j;
3936
3937         *empty = FALSE;
3938
3939         if (upl == UPL_NULL)
3940                 return KERN_INVALID_ARGUMENT;
3941
3942         if (count == 0)
3943                 page_list = NULL;
3944
3945         if (upl->flags & UPL_DEVICE_MEMORY)
3946                 xfer_size = 0;
3947         else if ((offset + size) <= upl->size)
3948                 xfer_size = size;
3949         else
3950                 return KERN_FAILURE;
3951
3952         upl_lock(upl);
3953
3954         if (upl->flags & UPL_ACCESS_BLOCKED) {
3955                 /*
3956                  * We used this UPL to block access to the pages by marking
3957                  * them "busy".  Now we need to clear the "busy" bit to allow
3958                  * access to these pages again.
3959                  */
3960                 flags |= UPL_COMMIT_ALLOW_ACCESS;
3961         }
3962         if (upl->flags & UPL_CLEAR_DIRTY)
3963                 flags |= UPL_COMMIT_CLEAR_DIRTY;
3964
3965         if (upl->flags & UPL_INTERNAL)
3966                 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
3967                                            + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3968         else
3969                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3970
3971         object = upl->map_object;
3972
3973         if (upl->flags & UPL_SHADOWED) {
3974                 vm_object_lock(object);
3975                 shadow_object = object->shadow;
3976         } else {
3977                 shadow_object = object;
3978         }
3979         entry = offset/PAGE_SIZE;
3980         target_offset = (vm_object_offset_t)offset;
3981
3982         /*
3983          * pageout_scan takes the vm_page_lock_queues first
3984          * then tries for the object lock... to avoid what
3985          * is effectively a lock inversion, we'll go to the
3986          * trouble of taking them in that same order... otherwise
3987          * if this object contains the majority of the pages resident
3988          * in the UBC (or a small set of large objects actively being
3989          * worked on contain the majority of the pages), we could
3990          * cause the pageout_scan thread to 'starve' in its attempt
3991          * to find pages to move to the free queue, since it has to
3992          * successfully acquire the object lock of any candidate page
3993          * before it can steal/clean it.
3994          */
3995         for (j = 0; ; j++) {
3996                 vm_page_lock_queues();
3997
3998                 if (vm_object_lock_try(shadow_object))
3999                         break;
4000                 vm_page_unlock_queues();
4001                 mutex_pause(j);
4002         }
4003         delayed_unlock = 1;
4004
4005         while (xfer_size) {
4006                 vm_page_t       t, m;
4007
4008                 m = VM_PAGE_NULL;
4009
4010                 if (upl->flags & UPL_LITE) {
4011                         int     pg_num;
4012
4013                         pg_num = target_offset/PAGE_SIZE;
4014
4015                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4016                                 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4017
4018                                 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4019                         }
4020                 }
4021                 if (upl->flags & UPL_SHADOWED) {
4022                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4023
4024                                 t->pageout = FALSE;
4025
4026                                 vm_page_free(t);
4027
4028                                 if (m == VM_PAGE_NULL)
4029                                         m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4030                         }
4031                 }
4032                 if (m != VM_PAGE_NULL) {
4033
4034                         clear_refmod = 0;
4035
4036                         if (upl->flags & UPL_IO_WIRE) {
4037
4038                                 vm_page_unwire(m);
4039
4040                                 if (page_list)
4041                                         page_list[entry].phys_addr = 0;
4042
4043                                 if (flags & UPL_COMMIT_SET_DIRTY)
4044                                         m->dirty = TRUE;
4045                                 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4046                                         m->dirty = FALSE;
4047                                         if (m->cs_validated && !m->cs_tainted) {
4048                                                 /*
4049                                                  * CODE SIGNING:
4050                                                  * This page is no longer dirty
4051                                                  * but could have been modified,
4052                                                  * so it will need to be
4053                                                  * re-validated.
4054                                                  */
4055                                                 m->cs_validated = FALSE;
4056                                                 vm_cs_validated_resets++;
4057                                         }
4058                                         clear_refmod |= VM_MEM_MODIFIED;
4059                                 }
4060                                 if (flags & UPL_COMMIT_INACTIVATE)
4061                                         vm_page_deactivate(m);
4062
4063                                 if (clear_refmod)
4064                                         pmap_clear_refmod(m->phys_page, clear_refmod);
4065
4066                                 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4067                                         /*
4068                                          * We blocked access to the pages in this UPL.
4069                                          * Clear the "busy" bit and wake up any waiter
4070                                          * for this page.
4071                                          */
4072                                         PAGE_WAKEUP_DONE(m);
4073                                 }
4074                                 goto commit_next_page;
4075                         }
4076                         /*
4077                          * make sure to clear the hardware
4078                          * modify or reference bits before
4079                          * releasing the BUSY bit on this page
4080                          * otherwise we risk losing a legitimate
4081                          * change of state
4082                          */
4083                         if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4084                                 m->dirty = FALSE;
4085                                 if (m->cs_validated && !m->cs_tainted) {
4086                                         /*
4087                                          * CODE SIGNING:
4088                                          * This page is no longer dirty
4089                                          * but could have been modified,
4090                                          * so it will need to be
4091                                          * re-validated.
4092                                          */
4093                                         m->cs_validated = FALSE;
4094                                         vm_cs_validated_resets++;
4095                                 }
4096                                 clear_refmod |= VM_MEM_MODIFIED;
4097                         }
4098                         if (clear_refmod)
4099                                 pmap_clear_refmod(m->phys_page, clear_refmod);
4100
4101                         if (page_list) {
4102                                 upl_page_info_t *p;
4103
4104                                 p = &(page_list[entry]);
4105
4106                                 if (p->phys_addr && p->pageout && !m->pageout) {
4107                                         m->busy = TRUE;
4108                                         m->pageout = TRUE;
4109                                         vm_page_wire(m);
4110                                 } else if (p->phys_addr &&
4111                                            !p->pageout && m->pageout &&
4112                                            !m->dump_cleaning) {
4113                                         m->pageout = FALSE;
4114                                         m->absent = FALSE;
4115                                         m->overwriting = FALSE;
4116                                         vm_page_unwire(m);
4117
4118                                         PAGE_WAKEUP_DONE(m);
4119                                 }
4120                                 page_list[entry].phys_addr = 0;
4121                         }
4122                         m->dump_cleaning = FALSE;
4123
4124                         if (m->laundry)
4125                                 vm_pageout_throttle_up(m);
4126
4127                         if (m->pageout) {
4128                                 m->cleaning = FALSE;
4129                                 m->encrypted_cleaning = FALSE;
4130                                 m->pageout = FALSE;
4131 #if MACH_CLUSTER_STATS
4132                                 if (m->wanted) vm_pageout_target_collisions++;
4133 #endif
4134                                 m->dirty = FALSE;
4135                                 if (m->cs_validated && !m->cs_tainted) {
4136                                         /*
4137                                          * CODE SIGNING:
4138                                          * This page is no longer dirty
4139                                          * but could have been modified,
4140                                          * so it will need to be
4141                                          * re-validated.
4142                                          */
4143                                         m->cs_validated = FALSE;
4144                                         vm_cs_validated_resets++;
4145                                 }
4146
4147                                 if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))
4148                                         m->dirty = TRUE;
4149
4150                                 if (m->dirty) {
4151                                        /*
4152                                         * page was re-dirtied after we started
4153                                         * the pageout... reactivate it since
4154                                         * we don't know whether the on-disk
4155                                         * copy matches what is now in memory
4156                                         */
4157                                         vm_page_unwire(m);
4158
4159                                         if (upl->flags & UPL_PAGEOUT) {
4160                                                 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4161                                                 VM_STAT_INCR(reactivations);
4162                                                 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4163                                         }
4164                                         PAGE_WAKEUP_DONE(m);
4165                                 } else {
4166                                         /*
4167                                          * page has been successfully cleaned
4168                                          * go ahead and free it for other use
4169                                          */
4170
4171                                         if (m->object->internal) {
4172                                                 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4173                                         } else {
4174                                                 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4175                                         }
4176
4177                                         vm_page_free(m);
4178
4179                                         if (upl->flags & UPL_PAGEOUT) {
4180                                                 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4181
4182                                                 if (page_list[entry].dirty) {
4183                                                         VM_STAT_INCR(pageouts);
4184                                                         DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4185                                                         pgpgout_count++;
4186                                                 }
4187                                         }
4188                                 }
4189                                 goto commit_next_page;
4190                         }
4191 #if MACH_CLUSTER_STATS
4192                         if (m->wpmapped)
4193                                 m->dirty = pmap_is_modified(m->phys_page);
4194
4195                         if (m->dirty)   vm_pageout_cluster_dirtied++;
4196                         else            vm_pageout_cluster_cleaned++;
4197                         if (m->wanted)  vm_pageout_cluster_collisions++;
4198 #endif
4199                         m->dirty = FALSE;
4200                         if (m->cs_validated && !m->cs_tainted) {
4201                                 /*
4202                                  * CODE SIGNING:
4203                                  * This page is no longer dirty
4204                                  * but could have been modified,
4205                                  * so it will need to be
4206                                  * re-validated.
4207                                  */
4208                                 m->cs_validated = FALSE;
4209                                 vm_cs_validated_resets++;
4210                         }
4211
4212                         if ((m->busy) && (m->cleaning)) {
4213                                 /*
4214                                  * the request_page_list case
4215                                  */
4216                                 m->absent = FALSE;
4217                                 m->overwriting = FALSE;
4218                                 m->busy = FALSE;
4219                         } else if (m->overwriting) {
4220                                 /*
4221                                  * alternate request page list, write to
4222                                  * page_list case.  Occurs when the original
4223                                  * page was wired at the time of the list
4224                                  * request
4225                                  */
4226                                 assert(m->wire_count != 0);
4227                                 vm_page_unwire(m);/* reactivates */
4228                                 m->overwriting = FALSE;
4229                         }
4230                         m->cleaning = FALSE;
4231                         m->encrypted_cleaning = FALSE;
4232
4233                         /*
4234                          * It is a part of the semantic of COPYOUT_FROM
4235                          * UPLs that a commit implies cache sync
4236                          * between the vm page and the backing store
4237                          * this can be used to strip the precious bit
4238                          * as well as clean
4239                          */
4240                         if (upl->flags & UPL_PAGE_SYNC_DONE)
4241                                 m->precious = FALSE;
4242
4243                         if (flags & UPL_COMMIT_SET_DIRTY)
4244                                 m->dirty = TRUE;
4245
4246                         if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4247                                 vm_page_deactivate(m);
4248                         } else if (!m->active && !m->inactive && !m->speculative) {
4249
4250                                 if (m->clustered)
4251                                         vm_page_speculate(m, TRUE);
4252                                 else if (m->reference)
4253                                         vm_page_activate(m);
4254                                 else
4255                                         vm_page_deactivate(m);
4256                         }
4257                         if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4258                                 /*
4259                                  * We blocked access to the pages in this UPL.
4260                                  * Clear the "busy" bit on this page before we
4261                                  * wake up any waiter.
4262                                  */
4263                                 m->busy = FALSE;
4264                         }
4265                         /*
4266                          * Wakeup any thread waiting for the page to be un-cleaning.
4267                          */
4268                         PAGE_WAKEUP(m);
4269                 }
4270 commit_next_page:
4271                 target_offset += PAGE_SIZE_64;
4272                 xfer_size -= PAGE_SIZE;
4273                 entry++;
4274
4275                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4276                         /*
4277                          * pageout_scan takes the vm_page_lock_queues first
4278                          * then tries for the object lock... to avoid what
4279                          * is effectively a lock inversion, we'll go to the
4280                          * trouble of taking them in that same order... otherwise
4281                          * if this object contains the majority of the pages resident
4282                          * in the UBC (or a small set of large objects actively being
4283                          * worked on contain the majority of the pages), we could
4284                          * cause the pageout_scan thread to 'starve' in its attempt
4285                          * to find pages to move to the free queue, since it has to
4286                          * successfully acquire the object lock of any candidate page
4287                          * before it can steal/clean it.
4288                          */
4289                         vm_object_unlock(shadow_object);
4290                         mutex_yield(&vm_page_queue_lock);
4291
4292                         for (j = 0; ; j++) {
4293                                 if (vm_object_lock_try(shadow_object))
4294                                         break;
4295                                 vm_page_unlock_queues();
4296                                 mutex_pause(j);
4297                                 vm_page_lock_queues();
4298                         }
4299                         delayed_unlock = 1;
4300                 }
4301         }
4302         if (delayed_unlock)
4303                 vm_page_unlock_queues();
4304
4305         occupied = 1;
4306
4307         if (upl->flags & UPL_DEVICE_MEMORY)  {
4308                 occupied = 0;
4309         } else if (upl->flags & UPL_LITE) {
4310                 int     pg_num;
4311                 int     i;
4312
4313                 pg_num = upl->size/PAGE_SIZE;
4314                 pg_num = (pg_num + 31) >> 5;
4315                 occupied = 0;
4316
4317                 for (i = 0; i < pg_num; i++) {
4318                         if (lite_list[i] != 0) {
4319                                 occupied = 1;
4320                                 break;
4321                         }
4322                 }
4323         } else {
4324                 if (queue_empty(&upl->map_object->memq))
4325                         occupied = 0;
4326         }
4327         if (occupied == 0) {
4328                 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4329                         *empty = TRUE;
4330
4331                 if (object == shadow_object) {
4332                         /*
4333                          * this is not a paging object
4334                          * so we need to drop the paging reference
4335                          * that was taken when we created the UPL
4336                          * against this object
4337                          */
4338                         vm_object_paging_end(shadow_object);
4339                 } else {
4340                          /*
4341                           * we donated the paging reference to
4342                           * the map object... vm_pageout_object_terminate
4343                           * will drop this reference
4344                           */
4345                 }
4346         }
4347         vm_object_unlock(shadow_object);
4348         if (object != shadow_object)
4349                 vm_object_unlock(object);
4350         upl_unlock(upl);
4351
4352         if (pgpgout_count) {
4353                 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
4354         }
4355
4356         return KERN_SUCCESS;
4357 }
4358
     /*
      * upl_abort_range:
      *
      * Abort the pages of "upl" that fall in [offset, offset + size).
      *
      * "error" is a mask of UPL_ABORT_* bits that selects how each page is
      * disposed of.  Unless the call is handed off to upl_commit_range(),
      * *empty is set to TRUE only when no pages remain in the UPL and the
      * UPL's flags contain UPL_COMMIT_NOTIFY_EMPTY; otherwise it is FALSE.
      *
      * Returns KERN_INVALID_ARGUMENT for a null UPL, KERN_FAILURE if the
      * range extends past upl->size, the result of upl_commit_range() for
      * an I/O-wired UPL aborted without UPL_ABORT_DUMP_PAGES, and
      * KERN_SUCCESS otherwise.
      *
      * NOTE(review): "empty" is dereferenced unconditionally, so callers
      * presumably must always pass a valid pointer -- confirm.
      * NOTE(review): "offset + size" is not checked for wrap-around and
      * xfer_size is stepped down by PAGE_SIZE until it reaches zero, so
      * the range is presumably expected to be a sane multiple of the page
      * size -- confirm against callers.
      */
4359 kern_return_t
4360 upl_abort_range(
4361         upl_t                   upl,
4362         upl_offset_t            offset,
4363         upl_size_t              size,
4364         int                     error,
4365         boolean_t               *empty)
4366 {
4367         upl_size_t              xfer_size;
4368         vm_object_t             shadow_object;
4369         vm_object_t             object;
4370         vm_object_offset_t      target_offset;
4371         int                     entry;          /* page index; updated but never read in this routine */
4372         wpl_array_t             lite_list;
4373         int                     occupied;
4374         int                     delayed_unlock = 0;
4375         int                     j;
4376
4377         *empty = FALSE;
4378
4379         if (upl == UPL_NULL)
4380                 return KERN_INVALID_ARGUMENT;
4381
             /*
              * An I/O-wired UPL that is not being dumped is handed to
              * upl_commit_range() with no flags and no page list.
              */
4382         if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
4383                 return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
4384
             /*
              * A device-memory UPL has no pages to walk (xfer_size = 0);
              * otherwise the requested range must fit inside the UPL.
              */
4385         if (upl->flags & UPL_DEVICE_MEMORY)
4386                 xfer_size = 0;
4387         else if ((offset + size) <= upl->size)
4388                 xfer_size = size;
4389         else
4390                 return KERN_FAILURE;
4391
4392         upl_lock(upl);
4393
             /*
              * The lite list (one bit per page, 32 pages per word) is found
              * by address arithmetic from the upl itself: directly after
              * struct upl, or, for an internal UPL, after a further
              * (upl->size / PAGE_SIZE) upl_page_info_t entries.
              */
4394         if (upl->flags & UPL_INTERNAL) {
4395                 lite_list = (wpl_array_t)
4396                         ((((uintptr_t)upl) + sizeof(struct upl))
4397                         + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4398         } else {
4399                 lite_list = (wpl_array_t)
4400                         (((uintptr_t)upl) + sizeof(struct upl));
4401         }
4402         object = upl->map_object;
4403
             /*
              * For a shadowed UPL the map object is locked here (and stays
              * locked until the end of the routine); the pages being
              * aborted are looked up in its shadow.
              */
4404         if (upl->flags & UPL_SHADOWED) {
4405                 vm_object_lock(object);
4406                 shadow_object = object->shadow;
4407         } else
4408                 shadow_object = object;
4409
4410         entry = offset/PAGE_SIZE;
4411         target_offset = (vm_object_offset_t)offset;
4412
4413         /*
4414          * pageout_scan takes the vm_page_lock_queues first
4415          * then tries for the object lock... to avoid what
4416          * is effectively a lock inversion, we'll go to the
4417          * trouble of taking them in that same order... otherwise
4418          * if this object contains the majority of the pages resident
4419          * in the UBC (or a small set of large objects actively being
4420          * worked on contain the majority of the pages), we could
4421          * cause the pageout_scan thread to 'starve' in its attempt
4422          * to find pages to move to the free queue, since it has to
4423          * successfully acquire the object lock of any candidate page
4424          * before it can steal/clean it.
4425          */
4426         for (j = 0; ; j++) {
4427                 vm_page_lock_queues();
4428
4429                 if (vm_object_lock_try(shadow_object))
4430                         break;
4431                 vm_page_unlock_queues();
4432                 mutex_pause(j);
4433         }
4434         delayed_unlock = 1;     /* page queues lock and shadow object lock are both held from here */
4435
             /* one iteration per page of the requested range */
4436         while (xfer_size) {
4437                 vm_page_t       t, m;
4438
4439                 m = VM_PAGE_NULL;
4440
                     /*
                      * Lite UPL: only a page whose bit is set in the lite
                      * list is looked up in the shadow object, and the bit
                      * is cleared as the page is claimed.
                      */
4441                 if (upl->flags & UPL_LITE) {
4442                         int     pg_num;
4443                         pg_num = target_offset/PAGE_SIZE;
4444
4445                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4446                                 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4447
4448                                 m = vm_page_lookup(shadow_object, target_offset +
4449                                                    (upl->offset - shadow_object->paging_offset));
4450                         }
4451                 }
                     /*
                      * Shadowed UPL: a page found in the map object at this
                      * offset has its pageout flag cleared and is freed.  If
                      * the lite list did not already yield a page, the shadow
                      * object is consulted at target_offset + shadow_offset.
                      */
4452                 if (upl->flags & UPL_SHADOWED) {
4453                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4454                                 t->pageout = FALSE;
4455
4456                                 vm_page_free(t);
4457
4458                                 if (m == VM_PAGE_NULL)
4459                                         m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4460                         }
4461                 }
4462                 if (m != VM_PAGE_NULL) {
4463
4464                         if (m->absent) {
                                     /*
                                      * An absent page is freed unless one of the
                                      * error bits tested below asks for it to be
                                      * kept and marked instead.
                                      */
4465                                 boolean_t must_free = TRUE;
4466
4467                                 m->clustered = FALSE;
4468                                 /*
4469                                  * COPYOUT = FALSE case
4470                                  * check for error conditions which must
4471                                  * be passed back to the pages customer
4472                                  */
4473                                 if (error & UPL_ABORT_RESTART) {
4474                                         m->restart = TRUE;
4475                                         m->absent = FALSE;
4476                                         m->error = TRUE;
4477                                         m->unusual = TRUE;
4478                                         must_free = FALSE;
4479                                 } else if (error & UPL_ABORT_UNAVAILABLE) {
                                             /* note: the page is left marked absent in this case */
4480                                         m->restart = FALSE;
4481                                         m->unusual = TRUE;
4482                                         must_free = FALSE;
4483                                 } else if (error & UPL_ABORT_ERROR) {
4484                                         m->restart = FALSE;
4485                                         m->absent = FALSE;
4486                                         m->error = TRUE;
4487                                         m->unusual = TRUE;
4488                                         must_free = FALSE;
4489                                 }
4490
4491                                 /*
4492                                  * ENCRYPTED SWAP:
4493                                  * If the page was already encrypted,
4494                                  * we don't really need to decrypt it
4495                                  * now.  It will get decrypted later,
4496                                  * on demand, as soon as someone needs
4497                                  * to access its contents.
4498                                  */
4499
4500                                 m->cleaning = FALSE;
4501                                 m->encrypted_cleaning = FALSE;
4502                                 m->overwriting = FALSE;
4503                                 PAGE_WAKEUP_DONE(m);
4504
4505                                 if (must_free == TRUE)
4506                                         vm_page_free(m);
4507                                 else
4508                                         vm_page_activate(m);
4509                         } else {
4510                                 /*
4511                                  * Handle the trusted pager throttle.
4512                                  */
4513                                 if (m->laundry)
4514                                         vm_pageout_throttle_up(m);
4515
                                     /*
                                      * A page marked for pageout is asserted to be
                                      * busy with a wire count of exactly 1; clear
                                      * the mark and unwire it.
                                      */
4516                                 if (m->pageout) {
4517                                         assert(m->busy);
4518                                         assert(m->wire_count == 1);
4519                                         m->pageout = FALSE;
4520                                         vm_page_unwire(m);
4521                                 }
4522                                 m->dump_cleaning = FALSE;
4523                                 m->cleaning = FALSE;
4524                                 m->encrypted_cleaning = FALSE;
4525                                 m->overwriting = FALSE;
4526 #if     MACH_PAGEMAP
4527                                 vm_external_state_clr(m->object->existence_map, m->offset);
4528 #endif  /* MACH_PAGEMAP */
                                     /*
                                      * UPL_ABORT_DUMP_PAGES: pmap_disconnect() the
                                      * physical page and free it.  Otherwise the
                                      * page is kept and its waiters are woken.
                                      */
4529                                 if (error & UPL_ABORT_DUMP_PAGES) {
4530                                         pmap_disconnect(m->phys_page);
4531                                         vm_page_free(m);
4532                                 } else {
4533                                         if (error & UPL_ABORT_REFERENCE) {
4534                                                 /*
4535                                                  * we've been told to explicitly
4536                                                  * reference this page... for
4537                                                  * file I/O, this is done by
4538                                                  * implementing an LRU on the inactive q
4539                                                  */
4540                                                 vm_page_lru(m);
4541                                         }
4542                                         PAGE_WAKEUP_DONE(m);
4543                                 }
4544                         }
4545                 }
                     /*
                      * delayed_unlock counts loop iterations since the locks
                      * were last (re)taken; once it passes
                      * UPL_DELAYED_UNLOCK_LIMIT the shadow object lock is
                      * dropped, the page queues lock is yielded, and the
                      * object lock is retaken with the queues lock held.
                      */
4546                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4547                         /*
4548                          * pageout_scan takes the vm_page_lock_queues first
4549                          * then tries for the object lock... to avoid what
4550                          * is effectively a lock inversion, we'll go to the
4551                          * trouble of taking them in that same order... otherwise
4552                          * if this object contains the majority of the pages resident
4553                          * in the UBC (or a small set of large objects actively being
4554                          * worked on contain the majority of the pages), we could
4555                          * cause the pageout_scan thread to 'starve' in its attempt
4556                          * to find pages to move to the free queue, since it has to
4557                          * successfully acquire the object lock of any candidate page
4558                          * before it can steal/clean it.
4559                          */
4560                         vm_object_unlock(shadow_object);
4561                         mutex_yield(&vm_page_queue_lock);
4562
4563                         for (j = 0; ; j++) {
4564                                 if (vm_object_lock_try(shadow_object))
4565                                         break;
4566                                 vm_page_unlock_queues();
4567                                 mutex_pause(j);
4568                                 vm_page_lock_queues();
4569                         }
4570                         delayed_unlock = 1;
4571                 }
4572                 target_offset += PAGE_SIZE_64;
4573                 xfer_size -= PAGE_SIZE;
4574                 entry++;
4575         }
4576         if (delayed_unlock)
4577                 vm_page_unlock_queues();
4578
             /*
              * Decide whether the UPL still holds any pages: a device-memory
              * UPL never does, a lite UPL does if any word of its lite list
              * is non-zero, and any other UPL does if the map object's page
              * queue is not empty.
              */
4579         occupied = 1;
4580
4581         if (upl->flags & UPL_DEVICE_MEMORY)  {
4582                 occupied = 0;
4583         } else if (upl->flags & UPL_LITE) {
4584                 int     pg_num;
4585                 int     i;
4586
4587                 pg_num = upl->size/PAGE_SIZE;
4588                 pg_num = (pg_num + 31) >> 5;   /* number of 32-bit lite-list words, rounded up */
4589                 occupied = 0;
4590
4591                 for (i = 0; i < pg_num; i++) {
4592                         if (lite_list[i] != 0) {
4593                                 occupied = 1;
4594                                 break;
4595                         }
4596                 }
4597         } else {
4598                 if (queue_empty(&upl->map_object->memq))
4599                         occupied = 0;
4600         }
4601         if (occupied == 0) {
4602                 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4603                         *empty = TRUE;
4604
4605                 if (object == shadow_object) {
4606                         /*
4607                          * this is not a paging object
4608                          * so we need to drop the paging reference
4609                          * that was taken when we created the UPL
4610                          * against this object
4611                          */
4612                         vm_object_paging_end(shadow_object);
4613                 } else {
4614                          /*
4615                           * we donated the paging reference to
4616                           * the map object... vm_pageout_object_terminate
4617                           * will drop this reference
4618                           */
4619                 }
4620         }
4621         vm_object_unlock(shadow_object);
4622         if (object != shadow_object)
4623                 vm_object_unlock(object);
4624         upl_unlock(upl);
4625
4626         return KERN_SUCCESS;
4627 }
4628
4629
     /*
      * upl_abort:
      *
      * Abort every page of "upl" by calling upl_abort_range() over
      * [0, upl->size) with the given UPL_ABORT_* "error" bits.  The
      * "empty" indication produced by upl_abort_range() is discarded.
      *
      * Returns KERN_INVALID_ARGUMENT if "upl" is UPL_NULL, otherwise
      * whatever upl_abort_range() returns.
      */
4630 kern_return_t
4631 upl_abort(
4632         upl_t   upl,
4633         int     error)
4634 {
4635         boolean_t       empty;
4636
             /*
              * upl->size is read to build the argument list below, so a null
              * UPL has to be rejected here: the UPL_NULL check inside
              * upl_abort_range() would run only after the dereference.
              */
             if (upl == UPL_NULL)
                     return KERN_INVALID_ARGUMENT;

4637         return upl_abort_range(upl, 0, upl->size, error, &empty);
4638 }
4639
4640
4641 /* an option on commit should be wire */
     /*
      * upl_commit:
      *
      * Commit every page of "upl" by calling upl_commit_range() over
      * [0, upl->size) with 0 passed for the commit flags, handing
      * "page_list" and "count" through unchanged.  The "empty"
      * indication produced by upl_commit_range() is discarded.
      *
      * Returns KERN_INVALID_ARGUMENT if "upl" is UPL_NULL, otherwise
      * whatever upl_commit_range() returns.
      */
4642 kern_return_t
4643 upl_commit(
4644         upl_t                   upl,
4645         upl_page_info_t         *page_list,
4646         mach_msg_type_number_t  count)
4647 {
4648         boolean_t       empty;
4649
             /*
              * upl->size is read to build the argument list below, so a null
              * UPL has to be rejected before that dereference.
              */
             if (upl == UPL_NULL)
                     return KERN_INVALID_ARGUMENT;

4650         return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
4651 }
4652
4653
4654 kern_return_t
4655 vm_object_iopl_request(
4656         vm_object_t             object,
4657         vm_object_offset_t      offset,
4658         upl_size_t              size,
4659         upl_t                   *upl_ptr,
4660         upl_page_info_array_t   user_page_list,
4661         unsigned int            *page_list_count,
4662         int                     cntrl_flags)
4663 {
4664         vm_page_t               dst_page;
4665         vm_object_offset_t      dst_offset;
4666         upl_size_t              xfer_size;
4667         upl_t                   upl = NULL;
4668         unsigned int            entry;
4669         wpl_array_t             lite_list = NULL;
4670         int                     delayed_unlock = 0;
4671         int                     no_zero_fill = FALSE;
4672         u_int32_t               psize;
4673         kern_return_t           ret;
4674         vm_prot_t               prot;
4675         struct vm_object_fault_info fault_info;
4676
4677
4678         if (cntrl_flags & ~UPL_VALID_FLAGS) {
4679                 /*
4680                  * For forward compatibility's sake,
4681                  * reject any unknown flag.
4682                  */
4683                 return KERN_INVALID_VALUE;
4684         }
4685         if (vm_lopage_poolsize == 0)
4686                 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4687
4688         if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4689                 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4690                         return KERN_INVALID_VALUE;
4691
4692                 if (object->phys_contiguous) {
4693                         if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4694                                 return KERN_INVALID_ADDRESS;
4695
4696                         if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4697                                 return KERN_INVALID_ADDRESS;
4698                 }
4699         }
4700
4701         if (cntrl_flags & UPL_ENCRYPT) {
4702                 /*
4703                  * ENCRYPTED SWAP:
4704                  * The paging path doesn't use this interface,
4705                  * so we don't support the UPL_ENCRYPT flag
4706                  * here.  We won't encrypt the pages.
4707                  */
4708                 assert(! (cntrl_flags & UPL_ENCRYPT));
4709         }
4710         if (cntrl_flags & UPL_NOZEROFILL)
4711                 no_zero_fill = TRUE;
4712
4713         if (cntrl_flags & UPL_COPYOUT_FROM)
4714                 prot = VM_PROT_READ;
4715         else
4716                 prot = VM_PROT_READ | VM_PROT_WRITE;
4717
4718         if (((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous)
4719                 size = MAX_UPL_TRANSFER * page_size;
4720
4721         if (cntrl_flags & UPL_SET_INTERNAL) {
4722                 if (page_list_count != NULL)
4723                         *page_list_count = MAX_UPL_TRANSFER;
4724         }
4725         if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4726             ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
4727                 return KERN_INVALID_ARGUMENT;
4728
4729         if ((!object->internal) && (object->paging_offset != 0))
4730                 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4731
4732
4733         if (object->phys_contiguous)
4734                 psize = PAGE_SIZE;
4735         else
4736                 psize = size;
4737
4738         if (cntrl_flags & UPL_SET_INTERNAL) {
4739                 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4740
4741                 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4742                 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
4743                                            ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
4744         } else {
4745                 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4746
4747                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4748         }
4749         if (user_page_list)
4750                 user_page_list[0].device = FALSE;
4751         *upl_ptr = upl;
4752
4753         upl->map_object = object;
4754         upl->size = size;
4755
4756         vm_object_lock(object);
4757         vm_object_paging_begin(object);
4758         /*
4759          * paging in progress also protects the paging_offset
4760          */
4761         upl->offset = offset + object->paging_offset;
4762
4763         if (object->phys_contiguous) {
4764 #ifdef UPL_DEBUG
4765                 queue_enter(&object->uplq, upl, upl_t, uplq);
4766 #endif /* UPL_DEBUG */
4767
4768                 vm_object_unlock(object);
4769
4770                 /*
4771                  * don't need any shadow mappings for this one
4772                  * since it is already I/O memory
4773                  */
4774                 upl->flags |= UPL_DEVICE_MEMORY;
4775
4776                 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4777
4778                 if (user_page_list) {
4779                         user_page_list[0].phys_addr = (offset + object->shadow_offset)>>PAGE_SHIFT;
4780                         user_page_list[0].device = TRUE;
4781                 }
4782                 if (page_list_count != NULL) {
4783                         if (upl->flags & UPL_INTERNAL)
4784                                 *page_list_count = 0;
4785                         else
4786                                 *page_list_count = 1;
4787                 }
4788                 return KERN_SUCCESS;
4789         }
4790         /*
4791          * Protect user space from future COW operations
4792          */
4793         object->true_share = TRUE;
4794
4795         if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4796                 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4797
4798 #ifdef UPL_DEBUG
4799         queue_enter(&object->uplq, upl, upl_t, uplq);
4800 #endif /* UPL_DEBUG */
4801
4802         if (cntrl_flags & UPL_BLOCK_ACCESS) {
4803                 /*
4804                  * The user requested that access to the pages in this UPL
4805                  * be blocked until the UPL is committed or aborted.
4806                  */
4807                 upl->flags |= UPL_ACCESS_BLOCKED;
4808         }
4809         entry = 0;
4810
4811         xfer_size = size;
4812         dst_offset = offset;
4813
4814         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
4815         fault_info.user_tag  = 0;
4816         fault_info.lo_offset = offset;
4817         fault_info.hi_offset = offset + xfer_size;
4818         fault_info.no_cache  = FALSE;
4819
4820         while (xfer_size) {
4821                 vm_fault_return_t       result;
4822                 int                     pg_num;
4823
4824                 dst_page = vm_page_lookup(object, dst_offset);
4825
4826                 /*
4827                  * ENCRYPTED SWAP:
4828                  * If the page is encrypted, we need to decrypt it,
4829                  * so force a soft page fault.
4830                  */
4831                 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4832                     (dst_page->encrypted) ||
4833                     (dst_page->unusual && (dst_page->error ||
4834                                            dst_page->restart ||
4835                                            dst_page->absent ||
4836                                            dst_page->fictitious))) {
4837
4838                    do {
4839                         vm_page_t       top_page;
4840                         kern_return_t   error_code;
4841                         int             interruptible;
4842
4843                         if (delayed_unlock) {
4844                                 delayed_unlock = 0;
4845                                 vm_page_unlock_queues();
4846                         }
4847                         if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
4848                                 interruptible = THREAD_ABORTSAFE;
4849                         else
4850                                 interruptible = THREAD_UNINT;
4851
4852                         fault_info.interruptible = interruptible;
4853                         fault_info.cluster_size = xfer_size;
4854
4855                         result = vm_fault_page(object, dst_offset,
4856                                                prot | VM_PROT_WRITE, FALSE,
4857                                                &prot, &dst_page, &top_page,
4858                                                (int *)0,
4859                                                &error_code, no_zero_fill,
4860                                                FALSE, &fault_info);
4861
4862                         switch (result) {
4863
4864                         case VM_FAULT_SUCCESS:
4865
4866                                 PAGE_WAKEUP_DONE(dst_page);
4867                                 /*
4868                                  *      Release paging references and
4869                                  *      top-level placeholder page, if any.
4870                                  */
4871                                 if (top_page != VM_PAGE_NULL) {
4872                                         vm_object_t local_object;
4873
4874                                         local_object = top_page->object;
4875
4876                                         if (top_page->object != dst_page->object) {
4877                                                 vm_object_lock(local_object);
4878                                                 VM_PAGE_FREE(top_page);
4879                                                 vm_object_paging_end(local_object);
4880                                                 vm_object_unlock(local_object);
4881                                         } else {
4882                                                 VM_PAGE_FREE(top_page);
4883                                                 vm_object_paging_end(local_object);
4884                                         }
4885                                 }
4886                                 break;
4887
4888                         case VM_FAULT_RETRY:
4889                                 vm_object_lock(object);
4890                                 vm_object_paging_begin(object);
4891                                 break;
4892
4893                         case VM_FAULT_FICTITIOUS_SHORTAGE:
4894                                 vm_page_more_fictitious();
4895
4896                                 vm_object_lock(object);
4897                                 vm_object_paging_begin(object);
4898                                 break;
4899
4900                         case VM_FAULT_MEMORY_SHORTAGE:
4901                                 if (vm_page_wait(interruptible)) {
4902                                         vm_object_lock(object);
4903                                         vm_object_paging_begin(object);
4904                                         break;
4905                                 }
4906                                 /* fall thru */
4907
4908                         case VM_FAULT_INTERRUPTED:
4909                                 error_code = MACH_SEND_INTERRUPTED;
4910                         case VM_FAULT_MEMORY_ERROR:
4911                                 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
4912
4913                                 vm_object_lock(object);
4914                                 vm_object_paging_begin(object);
4915                                 goto return_err;
4916                         }
4917                    } while (result != VM_FAULT_SUCCESS);
4918                 }
4919
4920                 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4921                      dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4922                         vm_page_t       low_page;
4923                         int             refmod;
4924
4925                         /*
4926                          * support devices that can't DMA above 32 bits
4927                          * by substituting pages from a pool of low address
4928                          * memory for any pages we find above the 4G mark
4929                          * can't substitute if the page is already wired because
4930                          * we don't know whether that physical address has been
4931                          * handed out to some other 64 bit capable DMA device to use
4932                          */
4933                         if (dst_page->wire_count) {
4934                                 ret = KERN_PROTECTION_FAILURE;
4935                                 goto return_err;
4936                         }
4937                         if (delayed_unlock) {
4938                                 delayed_unlock = 0;
4939                                 vm_page_unlock_queues();
4940                         }
4941                         low_page = vm_page_grablo();
4942
4943                         if (low_page == VM_PAGE_NULL) {
4944                                 ret = KERN_RESOURCE_SHORTAGE;
4945                                 goto return_err;
4946                         }
4947                         /*
4948                          * from here until the vm_page_replace completes
4949                          * we mustn't drop the object lock... we don't
4950                          * want anyone refaulting this page in and using
4951                          * it after we disconnect it... we want the fault
4952                          * to find the new page being substituted.
4953                          */
4954                         if (dst_page->pmapped)
4955                                 refmod = pmap_disconnect(dst_page->phys_page);
4956                         else
4957                                 refmod = 0;
4958                         vm_page_copy(dst_page, low_page);
4959
4960                         low_page->reference = dst_page->reference;
4961                         low_page->dirty     = dst_page->dirty;
4962
4963                         if (refmod & VM_MEM_REFERENCED)
4964                                 low_page->reference = TRUE;
4965                         if (refmod & VM_MEM_MODIFIED)
4966                                 low_page->dirty = TRUE;
4967
4968                         vm_page_lock_queues();
4969                         vm_page_replace(low_page, object, dst_offset);
4970                         /*
4971                          * keep the queue lock since we're going to
4972                          * need it immediately
4973                          */
4974                         delayed_unlock = 1;
4975
4976                         dst_page = low_page;
4977                         /*
4978                          * vm_page_grablo returned the page marked
4979                          * BUSY... we don't need a PAGE_WAKEUP_DONE
4980                          * here, because we've never dropped the object lock
4981                          */
4982                         dst_page->busy = FALSE;
4983                 }
4984                 if (delayed_unlock == 0)
4985                         vm_page_lock_queues();
4986
4987                 vm_page_wire(dst_page);
4988
4989                 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4990                         /*
4991                          * Mark the page "busy" to block any future page fault
4992                          * on this page.  We'll also remove the mapping
4993                          * of all these pages before leaving this routine.
4994                          */
4995                         assert(!dst_page->fictitious);
4996                         dst_page->busy = TRUE;
4997                 }
4998                 pg_num = (dst_offset-offset)/PAGE_SIZE;
4999                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5000
5001                 /*
5002                  * expect the page to be used
5003                  * page queues lock must be held to set 'reference'
5004                  */
5005                 dst_page->reference = TRUE;
5006
5007                 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5008                         dst_page->dirty = TRUE;
5009
5010                 if (dst_page->phys_page > upl->highest_page)
5011                         upl->highest_page = dst_page->phys_page;
5012
5013                 if (user_page_list) {
5014                         user_page_list[entry].phys_addr = dst_page->phys_page;
5015                         user_page_list[entry].dirty     = dst_page->dirty;
5016                         user_page_list[entry].pageout   = dst_page->pageout;
5017                         user_page_list[entry].absent    = dst_page->absent;
5018                         user_page_list[entry].precious  = dst_page->precious;
5019
5020                         if (dst_page->clustered == TRUE)
5021                                 user_page_list[entry].speculative = dst_page->speculative;
5022                         else
5023                                 user_page_list[entry].speculative = FALSE;
5024                 }
5025                 /*
5026                  * someone is explicitly grabbing this page...
5027                  * update clustered and speculative state
5028                  *
5029                  */
5030                 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5031
5032                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
5033                         mutex_yield(&vm_page_queue_lock);
5034                         delayed_unlock = 1;
5035                 }
5036                 entry++;
5037                 dst_offset += PAGE_SIZE_64;
5038                 xfer_size -= PAGE_SIZE;
5039         }
5040         if (delayed_unlock)
5041                 vm_page_unlock_queues();
5042
5043         if (page_list_count != NULL) {
5044                 if (upl->flags & UPL_INTERNAL)
5045                         *page_list_count = 0;
5046                 else if (*page_list_count > entry)
5047                         *page_list_count = entry;
5048         }
5049         vm_object_unlock(object);
5050
5051         if (cntrl_flags & UPL_BLOCK_ACCESS) {
5052                 /*
5053                  * We've marked all the pages "busy" so that future
5054                  * page faults will block.
5055                  * Now remove the mapping for these pages, so that they
5056                  * can't be accessed without causing a page fault.
5057                  */
5058                 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5059                                        PMAP_NULL, 0, VM_PROT_NONE);
5060         }
5061         return KERN_SUCCESS;
5062
5063 return_err:
5064         if (delayed_unlock)
5065                 vm_page_unlock_queues();
5066
5067         for (; offset < dst_offset; offset += PAGE_SIZE) {
5068                 dst_page = vm_page_lookup(object, offset);
5069
5070                 if (dst_page == VM_PAGE_NULL)
5071                         panic("vm_object_iopl_request: Wired pages missing. \n");
5072
5073                 vm_page_lockspin_queues();
5074                 vm_page_unwire(dst_page);
5075                 vm_page_unlock_queues();
5076
5077                 VM_STAT_INCR(reactivations);
5078         }
5079         vm_object_paging_end(object);
5080         vm_object_unlock(object);
5081         upl_destroy(upl);
5082
5083         return ret;
5084 }
5085
5086 kern_return_t
5087 upl_transpose(
5088         upl_t           upl1,
5089         upl_t           upl2)
5090 {
5091         kern_return_t           retval;
5092         boolean_t               upls_locked;
5093         vm_object_t             object1, object2;
5094
5095         if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5096                 return KERN_INVALID_ARGUMENT;
5097         }
5098
5099         upls_locked = FALSE;
5100
5101         /*
5102          * Since we need to lock both UPLs at the same time,
5103          * avoid deadlocks by always taking locks in the same order.
5104          */
5105         if (upl1 < upl2) {
5106                 upl_lock(upl1);
5107                 upl_lock(upl2);
5108         } else {
5109                 upl_lock(upl2);
5110                 upl_lock(upl1);
5111         }
5112         upls_locked = TRUE;     /* the UPLs will need to be unlocked */
5113
5114         object1 = upl1->map_object;
5115         object2 = upl2->map_object;
5116
5117         if (upl1->offset != 0 || upl2->offset != 0 ||
5118             upl1->size != upl2->size) {
5119                 /*
5120                  * We deal only with full objects, not subsets.
5121                  * That's because we exchange the entire backing store info
5122                  * for the objects: pager, resident pages, etc...  We can't do
5123                  * only part of it.
5124                  */
5125                 retval = KERN_INVALID_VALUE;
5126                 goto done;
5127         }
5128
5129         /*
5130          * Tranpose the VM objects' backing store.
5131          */
5132         retval = vm_object_transpose(object1, object2,
5133                                      (vm_object_size_t) upl1->size);
5134
5135         if (retval == KERN_SUCCESS) {
5136                 /*
5137                  * Make each UPL point to the correct VM object, i.e. the
5138                  * object holding the pages that the UPL refers to...
5139                  */
5140 #ifdef UPL_DEBUG
5141                 queue_remove(&object1->uplq, upl1, upl_t, uplq);
5142                 queue_remove(&object2->uplq, upl2, upl_t, uplq);
5143 #endif
5144                 upl1->map_object = object2;
5145                 upl2->map_object = object1;
5146 #ifdef UPL_DEBUG
5147                 queue_enter(&object1->uplq, upl2, upl_t, uplq);
5148                 queue_enter(&object2->uplq, upl1, upl_t, uplq);
5149 #endif
5150         }
5151
5152 done:
5153         /*
5154          * Cleanup.
5155          */
5156         if (upls_locked) {
5157                 upl_unlock(upl1);
5158                 upl_unlock(upl2);
5159                 upls_locked = FALSE;
5160         }
5161
5162         return retval;
5163 }
5164
5165 /*
5166  * ENCRYPTED SWAP:
5167  *
5168  * Rationale:  the user might have some encrypted data on disk (via
5169  * FileVault or any other mechanism).  That data is then decrypted in
5170  * memory, which is safe as long as the machine is secure.  But that
5171  * decrypted data in memory could be paged out to disk by the default
5172  * pager.  The data would then be stored on disk in clear (not encrypted)
5173  * and it could be accessed by anyone who gets physical access to the
5174  * disk (if the laptop or the disk gets stolen for example).  This weakens
5175  * the security offered by FileVault.
5176  *
5177  * Solution:  the default pager will optionally request that all the
5178  * pages it gathers for pageout be encrypted, via the UPL interfaces,
5179  * before it sends this UPL to disk via the vnode_pageout() path.
5180  *
5181  * Notes:
5182  *
5183  * To avoid disrupting the VM LRU algorithms, we want to keep the
5184  * clean-in-place mechanisms, which allow us to send some extra pages to
5185  * swap (clustering) without actually removing them from the user's
5186  * address space.  We don't want the user to unknowingly access encrypted
5187  * data, so we have to actually remove the encrypted pages from the page
5188  * table.  When the user accesses the data, the hardware will fail to
5189  * locate the virtual page in its page table and will trigger a page
5190  * fault.  We can then decrypt the page and enter it in the page table
5191  * again.  Whenever we allow the user to access the contents of a page,
5192  * we have to make sure it's not encrypted.
5193  *
5194  *
5195  */
5196 /*
5197  * ENCRYPTED SWAP:
5198  * Reserve of virtual addresses in the kernel address space.
5199  * We need to map the physical pages in the kernel, so that we
5200  * can call the encryption/decryption routines with a kernel
5201  * virtual address.  We keep this pool of pre-allocated kernel
5202  * virtual addresses so that we don't have to scan the kernel's
5203  * virtual address space each time we need to encrypt or decrypt
5204  * a physical page.
5205  * It would be nice to be able to encrypt and decrypt in physical
5206  * mode but that might not always be more efficient...
5207  */
5208 decl_simple_lock_data(,vm_paging_lock)
5209 #define VM_PAGING_NUM_PAGES     64
5210 vm_map_offset_t vm_paging_base_address = 0;
5211 boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5212 int             vm_paging_max_index = 0;
5213 int             vm_paging_page_waiter = 0;
5214 int             vm_paging_page_waiter_total = 0;
5215 unsigned long   vm_paging_no_kernel_page = 0;
5216 unsigned long   vm_paging_objects_mapped = 0;
5217 unsigned long   vm_paging_pages_mapped = 0;
5218 unsigned long   vm_paging_objects_mapped_slow = 0;
5219 unsigned long   vm_paging_pages_mapped_slow = 0;
5220
5221 void
5222 vm_paging_map_init(void)
5223 {
5224         kern_return_t   kr;
5225         vm_map_offset_t page_map_offset;
5226         vm_map_entry_t  map_entry;
5227
5228         assert(vm_paging_base_address == 0);
5229
5230         /*
5231          * Initialize our pool of pre-allocated kernel
5232          * virtual addresses.
5233          */
5234         page_map_offset = 0;
5235         kr = vm_map_find_space(kernel_map,
5236                                &page_map_offset,
5237                                VM_PAGING_NUM_PAGES * PAGE_SIZE,
5238                                0,
5239                                0,
5240                                &map_entry);
5241         if (kr != KERN_SUCCESS) {
5242                 panic("vm_paging_map_init: kernel_map full\n");
5243         }
5244         map_entry->object.vm_object = kernel_object;
5245         map_entry->offset =
5246                 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5247         vm_object_reference(kernel_object);
5248         vm_map_unlock(kernel_map);
5249
5250         assert(vm_paging_base_address == 0);
5251         vm_paging_base_address = page_map_offset;
5252 }
5253
5254 /*
5255  * ENCRYPTED SWAP:
5256  * vm_paging_map_object:
5257  *      Maps part of a VM object's pages in the kernel
5258  *      virtual address space, using the pre-allocated
5259  *      kernel virtual addresses, if possible.
5260  * Context:
5261  *      The VM object is locked.  This lock will get
5262  *      dropped and re-acquired though, so the caller
5263  *      must make sure the VM object is kept alive
5264  *      (by holding a VM map that has a reference
5265  *      on it, for example, or taking an extra reference).
5266  *      The page should also be kept busy to prevent
5267  *      it from being reclaimed.
5268  */
5269 kern_return_t
5270 vm_paging_map_object(
5271         vm_map_offset_t         *address,
5272         vm_page_t               page,
5273         vm_object_t             object,
5274         vm_object_offset_t      offset,
5275         vm_map_size_t           *size,
5276         boolean_t               can_unlock_object)
5277 {
5278         kern_return_t           kr;
5279         vm_map_offset_t         page_map_offset;
5280         vm_map_size_t           map_size;
5281         vm_object_offset_t      object_offset;
5282         int                     i;
5283
5284
5285         if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5286                 assert(page->busy);
5287                 /*
5288                  * Use one of the pre-allocated kernel virtual addresses
5289                  * and just enter the VM page in the kernel address space
5290                  * at that virtual address.
5291                  */
5292                 simple_lock(&vm_paging_lock);
5293
5294                 /*
5295                  * Try and find an available kernel virtual address
5296                  * from our pre-allocated pool.
5297                  */
5298                 page_map_offset = 0;
5299                 for (;;) {
5300                         for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5301                                 if (vm_paging_page_inuse[i] == FALSE) {
5302                                         page_map_offset =
5303                                                 vm_paging_base_address +
5304                                                 (i * PAGE_SIZE);
5305                                         break;
5306                                 }
5307                         }
5308                         if (page_map_offset != 0) {
5309                                 /* found a space to map our page ! */
5310                                 break;
5311                         }
5312
5313                         if (can_unlock_object) {
5314                                 /*
5315                                  * If we can afford to unlock the VM object,
5316                                  * let's take the slow path now...
5317                                  */
5318                                 break;
5319                         }
5320                         /*
5321                          * We can't afford to unlock the VM object, so
5322                          * let's wait for a space to become available...
5323                          */
5324                         vm_paging_page_waiter_total++;
5325                         vm_paging_page_waiter++;
5326                         thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
5327                                                        &vm_paging_lock,
5328                                                        THREAD_UNINT);
5329                         vm_paging_page_waiter--;
5330                         /* ... and try again */
5331                 }
5332
5333                 if (page_map_offset != 0) {
5334                         /*
5335                          * We found a kernel virtual address;
5336                          * map the physical page to that virtual address.
5337                          */
5338                         if (i > vm_paging_max_index) {
5339                                 vm_paging_max_index = i;
5340                         }
5341                         vm_paging_page_inuse[i] = TRUE;
5342                         simple_unlock(&vm_paging_lock);
5343
5344                         if (page->pmapped == FALSE) {
5345                                 pmap_sync_page_data_phys(page->phys_page);
5346                         }
5347                         page->pmapped = TRUE;
5348
5349                         /*
5350                          * Keep the VM object locked over the PMAP_ENTER
5351                          * and the actual use of the page by the kernel,
5352                          * or this pmap mapping might get undone by a
5353                          * vm_object_pmap_protect() call...
5354                          */
5355                         PMAP_ENTER(kernel_pmap,
5356                                    page_map_offset,
5357                                    page,
5358                                    VM_PROT_DEFAULT,
5359                                    ((int) page->object->wimg_bits &
5360                                     VM_WIMG_MASK),
5361                                    TRUE);
5362                         vm_paging_objects_mapped++;
5363                         vm_paging_pages_mapped++;
5364                         *address = page_map_offset;
5365
5366                         /* all done and mapped, ready to use ! */
5367                         return KERN_SUCCESS;
5368                 }
5369
5370                 /*
5371                  * We ran out of pre-allocated kernel virtual
5372                  * addresses.  Just map the page in the kernel
5373                  * the slow and regular way.
5374                  */
5375                 vm_paging_no_kernel_page++;
5376                 simple_unlock(&vm_paging_lock);
5377         }
5378
5379         if (! can_unlock_object) {
5380                 return KERN_NOT_SUPPORTED;
5381         }
5382
5383         object_offset = vm_object_trunc_page(offset);
5384         map_size = vm_map_round_page(*size);
5385
5386         /*
5387          * Try and map the required range of the object
5388          * in the kernel_map
5389          */
5390
5391         vm_object_reference_locked(object);     /* for the map entry */
5392         vm_object_unlock(object);
5393
5394         kr = vm_map_enter(kernel_map,
5395                           address,
5396                           map_size,
5397                           0,
5398                           VM_FLAGS_ANYWHERE,
5399                           object,
5400                           object_offset,
5401                           FALSE,
5402                           VM_PROT_DEFAULT,
5403                           VM_PROT_ALL,
5404                           VM_INHERIT_NONE);
5405         if (kr != KERN_SUCCESS) {
5406                 *address = 0;
5407                 *size = 0;
5408                 vm_object_deallocate(object);   /* for the map entry */
5409                 vm_object_lock(object);
5410                 return kr;
5411         }
5412
5413         *size = map_size;
5414
5415         /*
5416          * Enter the mapped pages in the page table now.
5417          */
5418         vm_object_lock(object);
5419         /*
5420          * VM object must be kept locked from before PMAP_ENTER()
5421          * until after the kernel is done accessing the page(s).
5422          * Otherwise, the pmap mappings in the kernel could be
5423          * undone by a call to vm_object_pmap_protect().
5424          */
5425
5426         for (page_map_offset = 0;
5427              map_size != 0;
5428              map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5429                 unsigned int    cache_attr;
5430
5431                 page = vm_page_lookup(object, offset + page_map_offset);
5432                 if (page == VM_PAGE_NULL) {
5433                         printf("vm_paging_map_object: no page !?");
5434                         vm_object_unlock(object);
5435                         kr = vm_map_remove(kernel_map, *address, *size,
5436                                            VM_MAP_NO_FLAGS);
5437                         assert(kr == KERN_SUCCESS);
5438                         *address = 0;
5439                         *size = 0;
5440                         vm_object_lock(object);
5441                         return KERN_MEMORY_ERROR;
5442                 }
5443                 if (page->pmapped == FALSE) {
5444                         pmap_sync_page_data_phys(page->phys_page);
5445                 }
5446                 page->pmapped = TRUE;
5447                 page->wpmapped = TRUE;
5448                 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5449
5450                 //assert(pmap_verify_free(page->phys_page));
5451                 PMAP_ENTER(kernel_pmap,
5452                            *address + page_map_offset,
5453                            page,
5454                            VM_PROT_DEFAULT,
5455                            cache_attr,
5456                            TRUE);
5457         }
5458
5459         vm_paging_objects_mapped_slow++;
5460         vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5461
5462         return KERN_SUCCESS;
5463 }
5464
5465 /*
5466  * ENCRYPTED SWAP:
5467  * vm_paging_unmap_object:
5468  *      Unmaps part of a VM object's pages from the kernel
5469  *      virtual address space.
5470  * Context:
5471  *      The VM object is locked.  This lock will get
5472  *      dropped and re-acquired though.
5473  */
5474 void
5475 vm_paging_unmap_object(
5476         vm_object_t     object,
5477         vm_map_offset_t start,
5478         vm_map_offset_t end)
5479 {
5480         kern_return_t   kr;
5481         int             i;
5482
5483         if ((vm_paging_base_address == 0) ||
5484             (start < vm_paging_base_address) ||
5485             (end > (vm_paging_base_address
5486                      + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5487                 /*
5488                  * We didn't use our pre-allocated pool of
5489                  * kernel virtual address.  Deallocate the
5490                  * virtual memory.
5491                  */
5492                 if (object != VM_OBJECT_NULL) {
5493                         vm_object_unlock(object);
5494                 }
5495                 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5496                 if (object != VM_OBJECT_NULL) {
5497                         vm_object_lock(object);
5498                 }
5499                 assert(kr == KERN_SUCCESS);
5500         } else {
5501                 /*
5502                  * We used a kernel virtual address from our
5503                  * pre-allocated pool.  Put it back in the pool
5504                  * for next time.
5505                  */
5506                 assert(end - start == PAGE_SIZE);
5507                 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5508
5509                 /* undo the pmap mapping */
5510                 pmap_remove(kernel_pmap, start, end);
5511
5512                 simple_lock(&vm_paging_lock);
5513                 vm_paging_page_inuse[i] = FALSE;
5514                 if (vm_paging_page_waiter) {
5515                         thread_wakeup(&vm_paging_page_waiter);
5516                 }
5517                 simple_unlock(&vm_paging_lock);
5518         }
5519 }
5520
5521 #if CRYPTO
/*
 * Encryption data.
 * "iv" is the "initial vector" (initialization vector) fed to the
 * CBC cipher routines.  Ideally, we want to have a different one for
 * each page we encrypt, so that crackers can't find encryption
 * patterns too easily.
 */
/*
 * Key size handed to aes_encrypt_key()/aes_decrypt_key();
 * presumably expressed in bits -- TODO confirm against the AES API.
 */
#define SWAP_CRYPT_AES_KEY_SIZE 128     /* XXX 192 and 256 don't work ! */
/* set to TRUE by swap_crypt_ctx_initialize() once the key is set up */
boolean_t               swap_crypt_ctx_initialized = FALSE;
/* raw key material, filled from random() in swap_crypt_ctx_initialize() */
aes_32t                 swap_crypt_key[8]; /* big enough for a 256 key */
/* AES context; its "encrypt" and "decrypt" members are used separately */
aes_ctx                 swap_crypt_ctx;
/* fixed IV: first byte is 0xa, the remaining bytes are zero-initialized */
const unsigned char     swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };

#if DEBUG
/* one-time self-test state: done flag plus three 4096-byte aligned buffers */
boolean_t               swap_crypt_ctx_tested = FALSE;
unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
#endif /* DEBUG */

/* source of the key material used by swap_crypt_ctx_initialize() */
extern u_long random(void);
5542
5543 /*
5544  * Initialize the encryption context: key and key size.
5545  */
5546 void swap_crypt_ctx_initialize(void); /* forward */
5547 void
5548 swap_crypt_ctx_initialize(void)
5549 {
5550         unsigned int    i;
5551
5552         /*
5553          * No need for locking to protect swap_crypt_ctx_initialized
5554          * because the first use of encryption will come from the
5555          * pageout thread (we won't pagein before there's been a pageout)
5556          * and there's only one pageout thread.
5557          */
5558         if (swap_crypt_ctx_initialized == FALSE) {
5559                 for (i = 0;
5560                      i < (sizeof (swap_crypt_key) /
5561                           sizeof (swap_crypt_key[0]));
5562                      i++) {
5563                         swap_crypt_key[i] = random();
5564                 }
5565                 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5566                                 SWAP_CRYPT_AES_KEY_SIZE,
5567                                 &swap_crypt_ctx.encrypt);
5568                 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5569                                 SWAP_CRYPT_AES_KEY_SIZE,
5570                                 &swap_crypt_ctx.decrypt);
5571                 swap_crypt_ctx_initialized = TRUE;
5572         }
5573
5574 #if DEBUG
5575         /*
5576          * Validate the encryption algorithms.
5577          */
5578         if (swap_crypt_ctx_tested == FALSE) {
5579                 /* initialize */
5580                 for (i = 0; i < 4096; i++) {
5581                         swap_crypt_test_page_ref[i] = (char) i;
5582                 }
5583                 /* encrypt */
5584                 aes_encrypt_cbc(swap_crypt_test_page_ref,
5585                                 swap_crypt_null_iv,
5586                                 PAGE_SIZE / AES_BLOCK_SIZE,
5587                                 swap_crypt_test_page_encrypt,
5588                                 &swap_crypt_ctx.encrypt);
5589                 /* decrypt */
5590                 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5591                                 swap_crypt_null_iv,
5592                                 PAGE_SIZE / AES_BLOCK_SIZE,
5593                                 swap_crypt_test_page_decrypt,
5594                                 &swap_crypt_ctx.decrypt);
5595                 /* compare result with original */
5596                 for (i = 0; i < 4096; i ++) {
5597                         if (swap_crypt_test_page_decrypt[i] !=
5598                             swap_crypt_test_page_ref[i]) {
5599                                 panic("encryption test failed");
5600                         }
5601                 }
5602
5603                 /* encrypt again */
5604                 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5605                                 swap_crypt_null_iv,
5606                                 PAGE_SIZE / AES_BLOCK_SIZE,
5607                                 swap_crypt_test_page_decrypt,
5608                                 &swap_crypt_ctx.encrypt);
5609                 /* decrypt in place */
5610                 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5611                                 swap_crypt_null_iv,
5612                                 PAGE_SIZE / AES_BLOCK_SIZE,
5613                                 swap_crypt_test_page_decrypt,
5614                                 &swap_crypt_ctx.decrypt);
5615                 for (i = 0; i < 4096; i ++) {
5616                         if (swap_crypt_test_page_decrypt[i] !=
5617                             swap_crypt_test_page_ref[i]) {
5618                                 panic("in place encryption test failed");
5619                         }
5620                 }
5621
5622                 swap_crypt_ctx_tested = TRUE;
5623         }
5624 #endif /* DEBUG */
5625 }
5626
5627 /*
5628  * ENCRYPTED SWAP:
5629  * vm_page_encrypt:
5630  *      Encrypt the given page, for secure paging.
5631  *      The page might already be mapped at kernel virtual
5632  *      address "kernel_mapping_offset".  Otherwise, we need
5633  *      to map it.
5634  *
5635  * Context:
5636  *      The page's object is locked, but this lock will be released
5637  *      and re-acquired.
5638  *      The page is busy and not accessible by users (not entered in any pmap).
5639  */
5640 void
5641 vm_page_encrypt(
5642         vm_page_t       page,
5643         vm_map_offset_t kernel_mapping_offset)
5644 {
5645         kern_return_t           kr;
5646         vm_map_size_t           kernel_mapping_size;
5647         vm_offset_t             kernel_vaddr;
5648         union {
5649                 unsigned char   aes_iv[AES_BLOCK_SIZE];
5650                 struct {
5651                         memory_object_t         pager_object;
5652                         vm_object_offset_t      paging_offset;
5653                 } vm;
5654         } encrypt_iv;
5655
5656         if (! vm_pages_encrypted) {
5657                 vm_pages_encrypted = TRUE;
5658         }
5659
5660         assert(page->busy);
5661         assert(page->dirty || page->precious);
5662
5663         if (page->encrypted) {
5664                 /*
5665                  * Already encrypted: no need to do it again.
5666                  */
5667                 vm_page_encrypt_already_encrypted_counter++;
5668                 return;
5669         }
5670         ASSERT_PAGE_DECRYPTED(page);
5671
5672         /*
5673          * Take a paging-in-progress reference to keep the object
5674          * alive even if we have to unlock it (in vm_paging_map_object()
5675          * for example)...
5676          */
5677         vm_object_paging_begin(page->object);
5678
5679         if (kernel_mapping_offset == 0) {
5680                 /*
5681                  * The page hasn't already been mapped in kernel space
5682                  * by the caller.  Map it now, so that we can access
5683                  * its contents and encrypt them.
5684                  */
5685                 kernel_mapping_size = PAGE_SIZE;
5686                 kr = vm_paging_map_object(&kernel_mapping_offset,
5687                                           page,
5688                                           page->object,
5689                                           page->offset,
5690                                           &kernel_mapping_size,
5691                                           FALSE);
5692                 if (kr != KERN_SUCCESS) {
5693                         panic("vm_page_encrypt: "
5694                               "could not map page in kernel: 0x%x\n",
5695                               kr);
5696                 }
5697         } else {
5698                 kernel_mapping_size = 0;
5699         }
5700         kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5701
5702         if (swap_crypt_ctx_initialized == FALSE) {
5703                 swap_crypt_ctx_initialize();
5704         }
5705         assert(swap_crypt_ctx_initialized);
5706
5707         /*
5708          * Prepare an "initial vector" for the encryption.
5709          * We use the "pager" and the "paging_offset" for that
5710          * page to obfuscate the encrypted data a bit more and
5711          * prevent crackers from finding patterns that they could
5712          * use to break the key.
5713          */
5714         bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5715         encrypt_iv.vm.pager_object = page->object->pager;
5716         encrypt_iv.vm.paging_offset =
5717                 page->object->paging_offset + page->offset;
5718
5719         /* encrypt the "initial vector" */
5720         aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5721                         swap_crypt_null_iv,
5722                         1,
5723                         &encrypt_iv.aes_iv[0],
5724                         &swap_crypt_ctx.encrypt);
5725
5726         /*
5727          * Encrypt the page.
5728          */
5729         aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5730                         &encrypt_iv.aes_iv[0],
5731                         PAGE_SIZE / AES_BLOCK_SIZE,
5732                         (unsigned char *) kernel_vaddr,
5733                         &swap_crypt_ctx.encrypt);
5734
5735         vm_page_encrypt_counter++;
5736
5737         /*
5738          * Unmap the page from the kernel's address space,
5739          * if we had to map it ourselves.  Otherwise, let
5740          * the caller undo the mapping if needed.
5741          */
5742         if (kernel_mapping_size != 0) {
5743                 vm_paging_unmap_object(page->object,
5744                                        kernel_mapping_offset,
5745                                        kernel_mapping_offset + kernel_mapping_size);
5746         }
5747
5748         /*
5749          * Clear the "reference" and "modified" bits.
5750          * This should clean up any impact the encryption had
5751          * on them.
5752          * The page was kept busy and disconnected from all pmaps,
5753          * so it can't have been referenced or modified from user
5754          * space.
5755          * The software bits will be reset later after the I/O
5756          * has completed (in upl_commit_range()).
5757          */
5758         pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
5759
5760         page->encrypted = TRUE;
5761
5762         vm_object_paging_end(page->object);
5763 }
5764
5765 /*
5766  * ENCRYPTED SWAP:
5767  * vm_page_decrypt:
5768  *      Decrypt the given page.
5769  *      The page might already be mapped at kernel virtual
5770  *      address "kernel_mapping_offset".  Otherwise, we need
5771  *      to map it.
5772  *
5773  * Context:
5774  *      The page's VM object is locked but will be unlocked and relocked.
5775  *      The page is busy and not accessible by users (not entered in any pmap).
5776  */
5777 void
5778 vm_page_decrypt(
5779         vm_page_t       page,
5780         vm_map_offset_t kernel_mapping_offset)
5781 {
5782         kern_return_t           kr;
5783         vm_map_size_t           kernel_mapping_size;
5784         vm_offset_t             kernel_vaddr;
5785         union {
5786                 unsigned char   aes_iv[AES_BLOCK_SIZE];
5787                 struct {
5788                         memory_object_t         pager_object;
5789                         vm_object_offset_t      paging_offset;
5790                 } vm;
5791         } decrypt_iv;
5792
5793         assert(page->busy);
5794         assert(page->encrypted);
5795
5796         /*
5797          * Take a paging-in-progress reference to keep the object
5798          * alive even if we have to unlock it (in vm_paging_map_object()
5799          * for example)...
5800          */
5801         vm_object_paging_begin(page->object);
5802
5803         if (kernel_mapping_offset == 0) {
5804                 /*
5805                  * The page hasn't already been mapped in kernel space
5806                  * by the caller.  Map it now, so that we can access
5807                  * its contents and decrypt them.
5808                  */
5809                 kernel_mapping_size = PAGE_SIZE;
5810                 kr = vm_paging_map_object(&kernel_mapping_offset,
5811                                           page,
5812                                           page->object,
5813                                           page->offset,
5814                                           &kernel_mapping_size,
5815                                           FALSE);
5816                 if (kr != KERN_SUCCESS) {
5817                         panic("vm_page_decrypt: "
5818                               "could not map page in kernel: 0x%x\n",
5819                               kr);
5820                 }
5821         } else {
5822                 kernel_mapping_size = 0;
5823         }
5824         kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5825
5826         assert(swap_crypt_ctx_initialized);
5827
5828         /*
5829          * Prepare an "initial vector" for the decryption.
5830          * It has to be the same as the "initial vector" we
5831          * used to encrypt that page.
5832          */
5833         bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5834         decrypt_iv.vm.pager_object = page->object->pager;
5835         decrypt_iv.vm.paging_offset =
5836                 page->object->paging_offset + page->offset;
5837
5838         /* encrypt the "initial vector" */
5839         aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5840                         swap_crypt_null_iv,
5841                         1,
5842                         &decrypt_iv.aes_iv[0],
5843                         &swap_crypt_ctx.encrypt);
5844
5845         /*
5846          * Decrypt the page.
5847          */
5848         aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5849                         &decrypt_iv.aes_iv[0],
5850                         PAGE_SIZE / AES_BLOCK_SIZE,
5851                         (unsigned char *) kernel_vaddr,
5852                         &swap_crypt_ctx.decrypt);
5853         vm_page_decrypt_counter++;
5854
5855         /*
5856          * Unmap the page from the kernel's address space,
5857          * if we had to map it ourselves.  Otherwise, let
5858          * the caller undo the mapping if needed.
5859          */
5860         if (kernel_mapping_size != 0) {
5861                 vm_paging_unmap_object(page->object,
5862                                        kernel_vaddr,
5863                                        kernel_vaddr + PAGE_SIZE);
5864         }
5865
5866         /*
5867          * After decryption, the page is actually clean.
5868          * It was encrypted as part of paging, which "cleans"
5869          * the "dirty" pages.
5870          * Noone could access it after it was encrypted
5871          * and the decryption doesn't count.
5872          */
5873         page->dirty = FALSE;
5874         if (page->cs_validated && !page->cs_tainted) {
5875                 /*
5876                  * CODE SIGNING:
5877                  * This page is no longer dirty
5878                  * but could have been modified,
5879                  * so it will need to be
5880                  * re-validated.
5881                  */
5882                 page->cs_validated = FALSE;
5883                 vm_cs_validated_resets++;
5884         }
5885         pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
5886
5887         page->encrypted = FALSE;
5888
5889         /*
5890          * We've just modified the page's contents via the data cache and part
5891          * of the new contents might still be in the cache and not yet in RAM.
5892          * Since the page is now available and might get gathered in a UPL to
5893          * be part of a DMA transfer from a driver that expects the memory to
5894          * be coherent at this point, we have to flush the data cache.
5895          */
5896         pmap_sync_page_attributes_phys(page->phys_page);
5897         /*
5898          * Since the page is not mapped yet, some code might assume that it
5899          * doesn't need to invalidate the instruction cache when writing to
5900          * that page.  That code relies on "pmapped" being FALSE, so that the
5901          * caches get synchronized when the page is first mapped.
5902          */
5903         assert(pmap_verify_free(page->phys_page));
5904         page->pmapped = FALSE;
5905         page->wpmapped = FALSE;
5906
5907         vm_object_paging_end(page->object);
5908 }
5909
5910 unsigned long upl_encrypt_upls = 0;
5911 unsigned long upl_encrypt_pages = 0;
5912
5913 /*
5914  * ENCRYPTED SWAP:
5915  *
5916  * upl_encrypt:
5917  *      Encrypts all the pages in the UPL, within the specified range.
5918  *
5919  */
5920 void
5921 upl_encrypt(
5922         upl_t                   upl,
5923         upl_offset_t            crypt_offset,
5924         upl_size_t              crypt_size)
5925 {
5926         upl_size_t              upl_size;
5927         upl_offset_t            upl_offset;
5928         vm_object_t             upl_object;
5929         vm_page_t               page;
5930         vm_object_t             shadow_object;
5931         vm_object_offset_t      shadow_offset;
5932         vm_object_offset_t      paging_offset;
5933         vm_object_offset_t      base_offset;
5934
5935         upl_encrypt_upls++;
5936         upl_encrypt_pages += crypt_size / PAGE_SIZE;
5937
5938         upl_object = upl->map_object;
5939         upl_offset = upl->offset;
5940         upl_size = upl->size;
5941
5942         vm_object_lock(upl_object);
5943
5944         /*
5945          * Find the VM object that contains the actual pages.
5946          */
5947         if (upl_object->pageout) {
5948                 shadow_object = upl_object->shadow;
5949                 /*
5950                  * The offset in the shadow object is actually also
5951                  * accounted for in upl->offset.  It possibly shouldn't be
5952                  * this way, but for now don't account for it twice.
5953                  */
5954                 shadow_offset = 0;
5955                 assert(upl_object->paging_offset == 0); /* XXX ? */
5956                 vm_object_lock(shadow_object);
5957         } else {
5958                 shadow_object = upl_object;
5959                 shadow_offset = 0;
5960         }
5961
5962         paging_offset = shadow_object->paging_offset;
5963         vm_object_paging_begin(shadow_object);
5964
5965         if (shadow_object != upl_object)
5966                 vm_object_unlock(upl_object);
5967
5968
5969         base_offset = shadow_offset;
5970         base_offset += upl_offset;
5971         base_offset += crypt_offset;
5972         base_offset -= paging_offset;
5973
5974         assert(crypt_offset + crypt_size <= upl_size);
5975
5976         for (upl_offset = 0;
5977              upl_offset < crypt_size;
5978              upl_offset += PAGE_SIZE) {
5979                 page = vm_page_lookup(shadow_object,
5980                                       base_offset + upl_offset);
5981                 if (page == VM_PAGE_NULL) {
5982                         panic("upl_encrypt: "
5983                               "no page for (obj=%p,off=%lld+%d)!\n",
5984                               shadow_object,
5985                               base_offset,
5986                               upl_offset);
5987                 }
5988                 /*
5989                  * Disconnect the page from all pmaps, so that nobody can
5990                  * access it while it's encrypted.  After that point, all
5991                  * accesses to this page will cause a page fault and block
5992                  * while the page is busy being encrypted.  After the
5993                  * encryption completes, any access will cause a
5994                  * page fault and the page gets decrypted at that time.
5995                  */
5996                 pmap_disconnect(page->phys_page);
5997                 vm_page_encrypt(page, 0);
5998
5999                 if (shadow_object == vm_pageout_scan_wants_object) {
6000                         /*
6001                          * Give vm_pageout_scan() a chance to convert more
6002                          * pages from "clean-in-place" to "clean-and-free",
6003                          * if it's interested in the same pages we selected
6004                          * in this cluster.
6005                          */
6006                         vm_object_unlock(shadow_object);
6007                         vm_object_lock(shadow_object);
6008                 }
6009         }
6010
6011         vm_object_paging_end(shadow_object);
6012         vm_object_unlock(shadow_object);
6013 }
6014
6015 #else /* CRYPTO */
6016 void
6017 upl_encrypt(
6018         __unused upl_t                  upl,
6019         __unused upl_offset_t   crypt_offset,
6020         __unused upl_size_t     crypt_size)
6021 {
6022 }
6023
6024 void
6025 vm_page_encrypt(
6026         __unused vm_page_t              page,
6027         __unused vm_map_offset_t        kernel_mapping_offset)
6028 {
6029 }
6030
6031 void
6032 vm_page_decrypt(
6033         __unused vm_page_t              page,
6034         __unused vm_map_offset_t        kernel_mapping_offset)
6035 {
6036 }
6037
6038 #endif /* CRYPTO */
6039
6040 vm_size_t
6041 upl_get_internal_pagelist_offset(void)
6042 {
6043         return sizeof(struct upl);
6044 }
6045
6046 void
6047 upl_clear_dirty(
6048         upl_t           upl,
6049         boolean_t       value)
6050 {
6051         if (value) {
6052                 upl->flags |= UPL_CLEAR_DIRTY;
6053         } else {
6054                 upl->flags &= ~UPL_CLEAR_DIRTY;
6055         }
6056 }
6057
6058
6059 #ifdef MACH_BSD
6060
6061 boolean_t  upl_device_page(upl_page_info_t *upl)
6062 {
6063         return(UPL_DEVICE_PAGE(upl));
6064 }
6065 boolean_t  upl_page_present(upl_page_info_t *upl, int index)
6066 {
6067         return(UPL_PAGE_PRESENT(upl, index));
6068 }
6069 boolean_t  upl_speculative_page(upl_page_info_t *upl, int index)
6070 {
6071         return(UPL_SPECULATIVE_PAGE(upl, index));
6072 }
6073 boolean_t  upl_dirty_page(upl_page_info_t *upl, int index)
6074 {
6075         return(UPL_DIRTY_PAGE(upl, index));
6076 }
6077 boolean_t  upl_valid_page(upl_page_info_t *upl, int index)
6078 {
6079         return(UPL_VALID_PAGE(upl, index));
6080 }
6081 ppnum_t  upl_phys_page(upl_page_info_t *upl, int index)
6082 {
6083         return(UPL_PHYS_PAGE(upl, index));
6084 }
6085
6086
6087 void
6088 vm_countdirtypages(void)
6089 {
6090         vm_page_t m;
6091         int dpages;
6092         int pgopages;
6093         int precpages;
6094
6095
6096         dpages=0;
6097         pgopages=0;
6098         precpages=0;
6099
6100         vm_page_lock_queues();
6101         m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6102         do {
6103                 if (m ==(vm_page_t )0) break;
6104
6105                 if(m->dirty) dpages++;
6106                 if(m->pageout) pgopages++;
6107                 if(m->precious) precpages++;
6108
6109                 assert(m->object != kernel_object);
6110                 m = (vm_page_t) queue_next(&m->pageq);
6111                 if (m ==(vm_page_t )0) break;
6112
6113         } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
6114         vm_page_unlock_queues();
6115
6116         vm_page_lock_queues();
6117         m = (vm_page_t) queue_first(&vm_page_queue_throttled);
6118         do {
6119                 if (m ==(vm_page_t )0) break;
6120
6121                 dpages++;
6122                 assert(m->dirty);
6123                 assert(!m->pageout);
6124                 assert(m->object != kernel_object);
6125                 m = (vm_page_t) queue_next(&m->pageq);
6126                 if (m ==(vm_page_t )0) break;
6127
6128         } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
6129         vm_page_unlock_queues();
6130
6131         vm_page_lock_queues();
6132         m = (vm_page_t) queue_first(&vm_page_queue_zf);
6133         do {
6134                 if (m ==(vm_page_t )0) break;
6135
6136                 if(m->dirty) dpages++;
6137                 if(m->pageout) pgopages++;
6138                 if(m->precious) precpages++;
6139
6140                 assert(m->object != kernel_object);
6141                 m = (vm_page_t) queue_next(&m->pageq);
6142                 if (m ==(vm_page_t )0) break;
6143
6144         } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
6145         vm_page_unlock_queues();
6146
6147         printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6148
6149         dpages=0;
6150         pgopages=0;
6151         precpages=0;
6152
6153         vm_page_lock_queues();
6154         m = (vm_page_t) queue_first(&vm_page_queue_active);
6155
6156         do {
6157                 if(m == (vm_page_t )0) break;
6158                 if(m->dirty) dpages++;
6159                 if(m->pageout) pgopages++;
6160                 if(m->precious) precpages++;
6161
6162                 assert(m->object != kernel_object);
6163                 m = (vm_page_t) queue_next(&m->pageq);
6164                 if(m == (vm_page_t )0) break;
6165
6166         } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
6167         vm_page_unlock_queues();
6168
6169         printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6170
6171 }
6172 #endif /* MACH_BSD */
6173
6174 ppnum_t upl_get_highest_page(
6175                              upl_t                      upl)
6176 {
6177         return upl->highest_page;
6178 }
6179
6180 #ifdef UPL_DEBUG
6181 kern_return_t  upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6182 {
6183         upl->ubc_alias1 = alias1;
6184         upl->ubc_alias2 = alias2;
6185         return KERN_SUCCESS;
6186 }
6187 int  upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6188 {
6189         if(al)
6190                 *al = upl->ubc_alias1;
6191         if(al2)
6192                 *al2 = upl->ubc_alias2;
6193         return KERN_SUCCESS;
6194 }
6195 #endif /* UPL_DEBUG */
6196
6197
6198
6199 #if     MACH_KDB
6200 #include <ddb/db_output.h>
6201 #include <ddb/db_print.h>
6202 #include <vm/vm_print.h>
6203
6204 #define printf  kdbprintf
6205 void            db_pageout(void);
6206
6207 void
6208 db_vm(void)
6209 {
6210
6211         iprintf("VM Statistics:\n");
6212         db_indent += 2;
6213         iprintf("pages:\n");
6214         db_indent += 2;
6215         iprintf("activ %5d  inact %5d  free  %5d",
6216                 vm_page_active_count, vm_page_inactive_count,
6217                 vm_page_free_count);
6218         printf("   wire  %5d  gobbl %5d\n",
6219                vm_page_wire_count, vm_page_gobble_count);
6220         db_indent -= 2;
6221         iprintf("target:\n");
6222         db_indent += 2;
6223         iprintf("min   %5d  inact %5d  free  %5d",
6224                 vm_page_free_min, vm_page_inactive_target,
6225                 vm_page_free_target);
6226         printf("   resrv %5d\n", vm_page_free_reserved);
6227         db_indent -= 2;
6228         iprintf("pause:\n");
6229         db_pageout();
6230         db_indent -= 2;
6231 }
6232
6233 #if     MACH_COUNTERS
6234 extern int c_laundry_pages_freed;
6235 #endif  /* MACH_COUNTERS */
6236
6237 void
6238 db_pageout(void)
6239 {
6240         iprintf("Pageout Statistics:\n");
6241         db_indent += 2;
6242         iprintf("active %5d  inactv %5d\n",
6243                 vm_pageout_active, vm_pageout_inactive);
6244         iprintf("nolock %5d  avoid  %5d  busy   %5d  absent %5d\n",
6245                 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6246                 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6247         iprintf("used   %5d  clean  %5d  dirty  %5d\n",
6248                 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6249                 vm_pageout_inactive_dirty);
6250 #if     MACH_COUNTERS
6251         iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6252 #endif  /* MACH_COUNTERS */
6253 #if     MACH_CLUSTER_STATS
6254         iprintf("Cluster Statistics:\n");
6255         db_indent += 2;
6256         iprintf("dirtied   %5d   cleaned  %5d   collisions  %5d\n",
6257                 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6258                 vm_pageout_cluster_collisions);
6259         iprintf("clusters  %5d   conversions  %5d\n",
6260                 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6261         db_indent -= 2;
6262         iprintf("Target Statistics:\n");
6263         db_indent += 2;
6264         iprintf("collisions   %5d   page_dirtied  %5d   page_freed  %5d\n",
6265                 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6266                 vm_pageout_target_page_freed);
6267         db_indent -= 2;
6268 #endif  /* MACH_CLUSTER_STATS */
6269         db_indent -= 2;
6270 }
6271
6272 #endif  /* MACH_KDB */