osfmk/vm/vm_pageout.c

   1 /*
   2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm/vm_pageout.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *      Date:   1985
  62  *
  63  *      The proverbial page-out daemon.
  64  */
  65
  66 #include <stdint.h>
  67
  68 #include <debug.h>
  69 #include <mach_pagemap.h>
  70 #include <mach_cluster_stats.h>
  71 #include <mach_kdb.h>
  72 #include <advisory_pageout.h>
  73
  74 #include <mach/mach_types.h>
  75 #include <mach/memory_object.h>
  76 #include <mach/memory_object_default.h>
  77 #include <mach/memory_object_control_server.h>
  78 #include <mach/mach_host_server.h>
  79 #include <mach/upl.h>
  80 #include <mach/vm_map.h>
  81 #include <mach/vm_param.h>
  82 #include <mach/vm_statistics.h>
  83 #include <mach/sdt.h>
  84
  85 #include <kern/kern_types.h>
  86 #include <kern/counters.h>
  87 #include <kern/host_statistics.h>
  88 #include <kern/machine.h>
  89 #include <kern/misc_protos.h>
  90 #include <kern/thread.h>
  91 #include <kern/xpr.h>
  92 #include <kern/kalloc.h>
  93
  94 #include <machine/vm_tuning.h>
  95
  96 #if CONFIG_EMBEDDED
  97 #include <sys/kern_memorystatus.h>
  98 #endif
  99
 100 #include <vm/pmap.h>
 101 #include <vm/vm_fault.h>
 102 #include <vm/vm_map.h>
 103 #include <vm/vm_object.h>
 104 #include <vm/vm_page.h>
 105 #include <vm/vm_pageout.h>
 106 #include <vm/vm_protos.h> /* must be last */
 107 #include <vm/memory_object.h>
 108 #include <vm/vm_purgeable_internal.h>
 109
 110 /*
 111  * ENCRYPTED SWAP:
 112  */
 113 #include <../bsd/crypto/aes/aes.h>
 114
 115
 116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
 117 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
 118 #endif
 119
 120 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
 121 #ifdef  CONFIG_EMBEDDED
 122 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
 123 #else
 124 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
 125 #endif
 126 #endif
 127
 128 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
 129 #define VM_PAGEOUT_DEADLOCK_RELIEF 100  /* number of pages to move to break deadlock */
 130 #endif
 131
 132 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
 133 #define VM_PAGEOUT_INACTIVE_RELIEF 50   /* minimum number of pages to move to the inactive q */
 134 #endif
 135
 136 #ifndef VM_PAGE_LAUNDRY_MAX
 137 #define VM_PAGE_LAUNDRY_MAX     16UL    /* maximum pageouts on a given pageout queue */
 138 #endif  /* VM_PAGEOUT_LAUNDRY_MAX */
 139
 140 #ifndef VM_PAGEOUT_BURST_WAIT
 141 #define VM_PAGEOUT_BURST_WAIT   30      /* milliseconds per page */
 142 #endif  /* VM_PAGEOUT_BURST_WAIT */
 143
 144 #ifndef VM_PAGEOUT_EMPTY_WAIT
 145 #define VM_PAGEOUT_EMPTY_WAIT   200     /* milliseconds */
 146 #endif  /* VM_PAGEOUT_EMPTY_WAIT */
 147
 148 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
 149 #define VM_PAGEOUT_DEADLOCK_WAIT        300     /* milliseconds */
 150 #endif  /* VM_PAGEOUT_DEADLOCK_WAIT */
 151
 152 #ifndef VM_PAGEOUT_IDLE_WAIT
 153 #define VM_PAGEOUT_IDLE_WAIT    10      /* milliseconds */
 154 #endif  /* VM_PAGEOUT_IDLE_WAIT */
 155
 156 #ifndef VM_PAGE_SPECULATIVE_TARGET
 157 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
 158 #endif /* VM_PAGE_SPECULATIVE_TARGET */
 159
 160 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
 161 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
 162 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
 163
 164
 165 /*
 166  *      To obtain a reasonable LRU approximation, the inactive queue
 167  *      needs to be large enough to give pages on it a chance to be
 168  *      referenced a second time.  This macro defines the fraction
 169  *      of active+inactive pages that should be inactive.
 170  *      The pageout daemon uses it to update vm_page_inactive_target.
 171  *
 172  *      If vm_page_free_count falls below vm_page_free_target and
 173  *      vm_page_inactive_count is below vm_page_inactive_target,
 174  *      then the pageout daemon starts running.
 175  */
 176
 177 #ifndef VM_PAGE_INACTIVE_TARGET
 178 #define VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 3)
 179 #endif  /* VM_PAGE_INACTIVE_TARGET */
 180
 181 /*
 182  *      Once the pageout daemon starts running, it keeps going
 183  *      until vm_page_free_count meets or exceeds vm_page_free_target.
 184  */
 185
 186 #ifndef VM_PAGE_FREE_TARGET
 187 #ifdef  CONFIG_EMBEDDED
 188 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 100)
 189 #else
 190 #define VM_PAGE_FREE_TARGET(free)       (15 + (free) / 80)
 191 #endif
 192 #endif  /* VM_PAGE_FREE_TARGET */
 193
 194 /*
 195  *      The pageout daemon always starts running once vm_page_free_count
 196  *      falls below vm_page_free_min.
 197  */
 198
 199 #ifndef VM_PAGE_FREE_MIN
 200 #ifdef  CONFIG_EMBEDDED
 201 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 200)
 202 #else
 203 #define VM_PAGE_FREE_MIN(free)          (10 + (free) / 100)
 204 #endif
 205 #endif  /* VM_PAGE_FREE_MIN */
 206
 207 #define VM_PAGE_FREE_MIN_LIMIT          1500
 208 #define VM_PAGE_FREE_TARGET_LIMIT       2000
 209
 210
 211 /*
 212  *      When vm_page_free_count falls below vm_page_free_reserved,
 213  *      only vm-privileged threads can allocate pages.  vm-privilege
 214  *      allows the pageout daemon and default pager (and any other
 215  *      associated threads needed for default pageout) to continue
 216  *      operation by dipping into the reserved pool of pages.
 217  */
 218
 219 #ifndef VM_PAGE_FREE_RESERVED
 220 #define VM_PAGE_FREE_RESERVED(n)        \
 221         ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
 222 #endif  /* VM_PAGE_FREE_RESERVED */
 223
 224 /*
 225  *      When we dequeue pages from the inactive list, they are
 226  *      reactivated (ie, put back on the active queue) if referenced.
 227  *      However, it is possible to starve the free list if other
 228  *      processors are referencing pages faster than we can turn off
 229  *      the referenced bit.  So we limit the number of reactivations
 230  *      we will make per call of vm_pageout_scan().
 231  */
 232 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
 233 #ifndef VM_PAGE_REACTIVATE_LIMIT
 234 #ifdef  CONFIG_EMBEDDED
 235 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
 236 #else
 237 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
 238 #endif
 239 #endif  /* VM_PAGE_REACTIVATE_LIMIT */
 240 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM       100
 241
 242
 243 /*
 244  * must hold the page queues lock to
 245  * manipulate this structure
 246  */
 247 struct vm_pageout_queue {
 248         queue_head_t    pgo_pending;    /* laundry pages to be processed by pager's iothread */
 249         unsigned int    pgo_laundry;    /* current count of laundry pages on queue or in flight */
 250         unsigned int    pgo_maxlaundry;
 251
 252         unsigned int    pgo_idle:1,     /* iothread is blocked waiting for work to do */
 253                         pgo_busy:1,     /* iothread is currently processing request from pgo_pending */
 254                         pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
 255                         :0;
 256 };
 257
 258 #define VM_PAGE_Q_THROTTLED(q)          \
 259         ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
 260
 261
 262 /*
 263  * Exported variable used to broadcast the activation of the pageout scan
 264  * Working Set uses this to throttle its use of pmap removes.  In this
 265  * way, code which runs within memory in an uncontested context does
 266  * not keep encountering soft faults.
 267  */
 268
 269 unsigned int    vm_pageout_scan_event_counter = 0;
 270
 271 /*
 272  * Forward declarations for internal routines.
 273  */
 274
 275 static void vm_pageout_garbage_collect(int);
 276 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
 277 static void vm_pageout_iothread_external(void);
 278 static void vm_pageout_iothread_internal(void);
 279 static void vm_pageout_queue_steal(vm_page_t);
 280
 281 extern void vm_pageout_continue(void);
 282 extern void vm_pageout_scan(void);
 283
 284 static thread_t vm_pageout_external_iothread = THREAD_NULL;
 285 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
 286
 287 unsigned int vm_pageout_reserved_internal = 0;
 288 unsigned int vm_pageout_reserved_really = 0;
 289
 290 unsigned int vm_pageout_idle_wait = 0;          /* milliseconds */
 291 unsigned int vm_pageout_empty_wait = 0;         /* milliseconds */
 292 unsigned int vm_pageout_burst_wait = 0;         /* milliseconds */
 293 unsigned int vm_pageout_deadlock_wait = 0;      /* milliseconds */
 294 unsigned int vm_pageout_deadlock_relief = 0;
 295 unsigned int vm_pageout_inactive_relief = 0;
 296 unsigned int vm_pageout_burst_active_throttle = 0;
 297 unsigned int vm_pageout_burst_inactive_throttle = 0;
 298
 299 /*
 300  *      Protection against zero fill flushing live working sets derived
 301  *      from existing backing store and files
 302  */
 303 unsigned int vm_accellerate_zf_pageout_trigger = 400;
 304 unsigned int zf_queue_min_count = 100;
 305 unsigned int vm_zf_count = 0;
 306 unsigned int vm_zf_queue_count = 0;
 307
 308 /*
 309  *      These variables record the pageout daemon's actions:
 310  *      how many pages it looks at and what happens to those pages.
 311  *      No locking needed because only one thread modifies the variables.
 312  */
 313
 314 unsigned int vm_pageout_active = 0;             /* debugging */
 315 unsigned int vm_pageout_inactive = 0;           /* debugging */
 316 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
 317 unsigned int vm_pageout_inactive_forced = 0;    /* debugging */
 318 unsigned int vm_pageout_inactive_nolock = 0;    /* debugging */
 319 unsigned int vm_pageout_inactive_avoid = 0;     /* debugging */
 320 unsigned int vm_pageout_inactive_busy = 0;      /* debugging */
 321 unsigned int vm_pageout_inactive_absent = 0;    /* debugging */
 322 unsigned int vm_pageout_inactive_used = 0;      /* debugging */
 323 unsigned int vm_pageout_inactive_clean = 0;     /* debugging */
 324 unsigned int vm_pageout_inactive_dirty = 0;     /* debugging */
 325 unsigned int vm_pageout_dirty_no_pager = 0;     /* debugging */
 326 unsigned int vm_pageout_purged_objects = 0;     /* debugging */
 327 unsigned int vm_stat_discard = 0;               /* debugging */
 328 unsigned int vm_stat_discard_sent = 0;          /* debugging */
 329 unsigned int vm_stat_discard_failure = 0;       /* debugging */
 330 unsigned int vm_stat_discard_throttle = 0;      /* debugging */
 331 unsigned int vm_pageout_reactivation_limit_exceeded = 0;        /* debugging */
 332 unsigned int vm_pageout_catch_ups = 0;                          /* debugging */
 333 unsigned int vm_pageout_inactive_force_reclaim = 0;     /* debugging */
 334
 335 unsigned int vm_pageout_scan_active_throttled = 0;
 336 unsigned int vm_pageout_scan_inactive_throttled = 0;
 337 unsigned int vm_pageout_scan_throttle = 0;                      /* debugging */
 338 unsigned int vm_pageout_scan_burst_throttle = 0;                /* debugging */
 339 unsigned int vm_pageout_scan_empty_throttle = 0;                /* debugging */
 340 unsigned int vm_pageout_scan_deadlock_detected = 0;             /* debugging */
 341 unsigned int vm_pageout_scan_active_throttle_success = 0;       /* debugging */
 342 unsigned int vm_pageout_scan_inactive_throttle_success = 0;     /* debugging */
 343 /*
 344  * Backing store throttle when BS is exhausted
 345  */
 346 unsigned int    vm_backing_store_low = 0;
 347
 348 unsigned int vm_pageout_out_of_line  = 0;
 349 unsigned int vm_pageout_in_place  = 0;
 350
 351 /*
 352  * ENCRYPTED SWAP:
 353  * counters and statistics...
 354  */
 355 unsigned long vm_page_decrypt_counter = 0;
 356 unsigned long vm_page_decrypt_for_upl_counter = 0;
 357 unsigned long vm_page_encrypt_counter = 0;
 358 unsigned long vm_page_encrypt_abort_counter = 0;
 359 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
 360 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
 361
 362 struct  vm_pageout_queue vm_pageout_queue_internal;
 363 struct  vm_pageout_queue vm_pageout_queue_external;
 364
 365 unsigned int vm_page_speculative_target = 0;
 366
 367 vm_object_t     vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 368
 369 unsigned long vm_cs_validated_resets = 0;
 370
 371 /*
 372  *      Routine:        vm_backing_store_disable
 373  *      Purpose:
 374  *              Suspend non-privileged threads wishing to extend
 375  *              backing store when we are low on backing store
 376  *              (Synchronized by caller)
 377  */
 378 void
 379 vm_backing_store_disable(
 380         boolean_t       disable)
 381 {
 382         if(disable) {
 383                 vm_backing_store_low = 1;
 384         } else {
 385                 if(vm_backing_store_low) {
 386                         vm_backing_store_low = 0;
 387                         thread_wakeup((event_t) &vm_backing_store_low);
 388                 }
 389         }
 390 }
 391
 392
 393 #if MACH_CLUSTER_STATS
 394 unsigned long vm_pageout_cluster_dirtied = 0;
 395 unsigned long vm_pageout_cluster_cleaned = 0;
 396 unsigned long vm_pageout_cluster_collisions = 0;
 397 unsigned long vm_pageout_cluster_clusters = 0;
 398 unsigned long vm_pageout_cluster_conversions = 0;
 399 unsigned long vm_pageout_target_collisions = 0;
 400 unsigned long vm_pageout_target_page_dirtied = 0;
 401 unsigned long vm_pageout_target_page_freed = 0;
 402 #define CLUSTER_STAT(clause)    clause
 403 #else   /* MACH_CLUSTER_STATS */
 404 #define CLUSTER_STAT(clause)
 405 #endif  /* MACH_CLUSTER_STATS */
 406
 407 /*
 408  *      Routine:        vm_pageout_object_terminate
 409  *      Purpose:
 410  *              Destroy the pageout_object, and perform all of the
 411  *              required cleanup actions.
 412  *
 413  *      In/Out conditions:
 414  *              The object must be locked, and will be returned locked.
 415  */
 416 void
 417 vm_pageout_object_terminate(
 418         vm_object_t     object)
 419 {
 420         vm_object_t     shadow_object;
 421
 422         /*
 423          * Deal with the deallocation (last reference) of a pageout object
 424          * (used for cleaning-in-place) by dropping the paging references/
 425          * freeing pages in the original object.
 426          */
 427
 428         assert(object->pageout);
 429         shadow_object = object->shadow;
 430         vm_object_lock(shadow_object);
 431
 432         while (!queue_empty(&object->memq)) {
 433                 vm_page_t               p, m;
 434                 vm_object_offset_t      offset;
 435
 436                 p = (vm_page_t) queue_first(&object->memq);
 437
 438                 assert(p->private);
 439                 assert(p->pageout);
 440                 p->pageout = FALSE;
 441                 assert(!p->cleaning);
 442
 443                 offset = p->offset;
 444                 VM_PAGE_FREE(p);
 445                 p = VM_PAGE_NULL;
 446
 447                 m = vm_page_lookup(shadow_object,
 448                         offset + object->shadow_offset);
 449
 450                 if(m == VM_PAGE_NULL)
 451                         continue;
 452                 assert(m->cleaning);
 453                 /* used as a trigger on upl_commit etc to recognize the */
 454                 /* pageout daemon's subseqent desire to pageout a cleaning */
 455                 /* page.  When the bit is on the upl commit code will   */
 456                 /* respect the pageout bit in the target page over the  */
 457                 /* caller's page list indication */
 458                 m->dump_cleaning = FALSE;
 459
 460                 assert((m->dirty) || (m->precious) ||
 461                                 (m->busy && m->cleaning));
 462
 463                 /*
 464                  * Handle the trusted pager throttle.
 465                  * Also decrement the burst throttle (if external).
 466                  */
 467                 vm_page_lock_queues();
 468                 if (m->laundry) {
 469                         vm_pageout_throttle_up(m);
 470                 }
 471
 472                 /*
 473                  * Handle the "target" page(s). These pages are to be freed if
 474                  * successfully cleaned. Target pages are always busy, and are
 475                  * wired exactly once. The initial target pages are not mapped,
 476                  * (so cannot be referenced or modified) but converted target
 477                  * pages may have been modified between the selection as an
 478                  * adjacent page and conversion to a target.
 479                  */
 480                 if (m->pageout) {
 481                         assert(m->busy);
 482                         assert(m->wire_count == 1);
 483                         m->cleaning = FALSE;
 484                         m->encrypted_cleaning = FALSE;
 485                         m->pageout = FALSE;
 486 #if MACH_CLUSTER_STATS
 487                         if (m->wanted) vm_pageout_target_collisions++;
 488 #endif
 489                         /*
 490                          * Revoke all access to the page. Since the object is
 491                          * locked, and the page is busy, this prevents the page
 492                          * from being dirtied after the pmap_disconnect() call
 493                          * returns.
 494                          *
 495                          * Since the page is left "dirty" but "not modifed", we
 496                          * can detect whether the page was redirtied during
 497                          * pageout by checking the modify state.
 498                          */
 499                         if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
 500                               m->dirty = TRUE;
 501                         else
 502                               m->dirty = FALSE;
 503
 504                         if (m->dirty) {
 505                                 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
 506                                 vm_page_unwire(m);/* reactivates */
 507                                 VM_STAT_INCR(reactivations);
 508                                 PAGE_WAKEUP_DONE(m);
 509                         } else {
 510                                 CLUSTER_STAT(vm_pageout_target_page_freed++;)
 511                                 vm_page_free(m);/* clears busy, etc. */
 512                         }
 513                         vm_page_unlock_queues();
 514                         continue;
 515                 }
 516                 /*
 517                  * Handle the "adjacent" pages. These pages were cleaned in
 518                  * place, and should be left alone.
 519                  * If prep_pin_count is nonzero, then someone is using the
 520                  * page, so make it active.
 521                  */
 522                 if (!m->active && !m->inactive && !m->throttled && !m->private) {
 523                         if (m->reference)
 524                                 vm_page_activate(m);
 525                         else
 526                                 vm_page_deactivate(m);
 527                 }
 528                 if((m->busy) && (m->cleaning)) {
 529
 530                         /* the request_page_list case, (COPY_OUT_FROM FALSE) */
 531                         m->busy = FALSE;
 532
 533                         /* We do not re-set m->dirty ! */
 534                         /* The page was busy so no extraneous activity     */
 535                         /* could have occurred. COPY_INTO is a read into the */
 536                         /* new pages. CLEAN_IN_PLACE does actually write   */
 537                         /* out the pages but handling outside of this code */
 538                         /* will take care of resetting dirty. We clear the */
 539                         /* modify however for the Programmed I/O case.     */
 540                         pmap_clear_modify(m->phys_page);
 541
 542                         m->absent = FALSE;
 543                         m->overwriting = FALSE;
 544                 } else if (m->overwriting) {
 545                         /* alternate request page list, write to page_list */
 546                         /* case.  Occurs when the original page was wired  */
 547                         /* at the time of the list request */
 548                         assert(m->wire_count != 0);
 549                         vm_page_unwire(m);/* reactivates */
 550                         m->overwriting = FALSE;
 551                 } else {
 552                 /*
 553                  * Set the dirty state according to whether or not the page was
 554                  * modified during the pageout. Note that we purposefully do
 555                  * NOT call pmap_clear_modify since the page is still mapped.
 556                  * If the page were to be dirtied between the 2 calls, this
 557                  * this fact would be lost. This code is only necessary to
 558                  * maintain statistics, since the pmap module is always
 559                  * consulted if m->dirty is false.
 560                  */
 561 #if MACH_CLUSTER_STATS
 562                         m->dirty = pmap_is_modified(m->phys_page);
 563
 564                         if (m->dirty)   vm_pageout_cluster_dirtied++;
 565                         else            vm_pageout_cluster_cleaned++;
 566                         if (m->wanted)  vm_pageout_cluster_collisions++;
 567 #else
 568                         m->dirty = 0;
 569 #endif
 570                 }
 571                 m->cleaning = FALSE;
 572                 m->encrypted_cleaning = FALSE;
 573
 574                 /*
 575                  * Wakeup any thread waiting for the page to be un-cleaning.
 576                  */
 577                 PAGE_WAKEUP(m);
 578                 vm_page_unlock_queues();
 579         }
 580         /*
 581          * Account for the paging reference taken in vm_paging_object_allocate.
 582          */
 583         vm_object_paging_end(shadow_object);
 584         vm_object_unlock(shadow_object);
 585
 586         assert(object->ref_count == 0);
 587         assert(object->paging_in_progress == 0);
 588         assert(object->resident_page_count == 0);
 589         return;
 590 }
 591
 592 /*
 593  * Routine:     vm_pageclean_setup
 594  *
 595  * Purpose:     setup a page to be cleaned (made non-dirty), but not
 596  *              necessarily flushed from the VM page cache.
 597  *              This is accomplished by cleaning in place.
 598  *
 599  *              The page must not be busy, and the object and page
 600  *              queues must be locked.
 601  *
 602  */
 603 void
 604 vm_pageclean_setup(
 605         vm_page_t               m,
 606         vm_page_t               new_m,
 607         vm_object_t             new_object,
 608         vm_object_offset_t      new_offset)
 609 {
 610         assert(!m->busy);
 611 #if 0
 612         assert(!m->cleaning);
 613 #endif
 614
 615         XPR(XPR_VM_PAGEOUT,
 616     "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
 617                 (integer_t)m->object, m->offset, (integer_t)m,
 618                 (integer_t)new_m, new_offset);
 619
 620         pmap_clear_modify(m->phys_page);
 621
 622         /*
 623          * Mark original page as cleaning in place.
 624          */
 625         m->cleaning = TRUE;
 626         m->dirty = TRUE;
 627         m->precious = FALSE;
 628
 629         /*
 630          * Convert the fictitious page to a private shadow of
 631          * the real page.
 632          */
 633         assert(new_m->fictitious);
 634         assert(new_m->phys_page == vm_page_fictitious_addr);
 635         new_m->fictitious = FALSE;
 636         new_m->private = TRUE;
 637         new_m->pageout = TRUE;
 638         new_m->phys_page = m->phys_page;
 639         vm_page_wire(new_m);
 640
 641         vm_page_insert(new_m, new_object, new_offset);
 642         assert(!new_m->wanted);
 643         new_m->busy = FALSE;
 644 }
 645
 646 /*
 647  *      Routine:        vm_pageout_initialize_page
 648  *      Purpose:
 649  *              Causes the specified page to be initialized in
 650  *              the appropriate memory object. This routine is used to push
 651  *              pages into a copy-object when they are modified in the
 652  *              permanent object.
 653  *
 654  *              The page is moved to a temporary object and paged out.
 655  *
 656  *      In/out conditions:
 657  *              The page in question must not be on any pageout queues.
 658  *              The object to which it belongs must be locked.
 659  *              The page must be busy, but not hold a paging reference.
 660  *
 661  *      Implementation:
 662  *              Move this page to a completely new object.
 663  */
 664 void
 665 vm_pageout_initialize_page(
 666         vm_page_t       m)
 667 {
 668         vm_object_t             object;
 669         vm_object_offset_t      paging_offset;
 670         vm_page_t               holding_page;
 671         memory_object_t         pager;
 672
 673         XPR(XPR_VM_PAGEOUT,
 674                 "vm_pageout_initialize_page, page 0x%X\n",
 675                 (integer_t)m, 0, 0, 0, 0);
 676         assert(m->busy);
 677
 678         /*
 679          *      Verify that we really want to clean this page
 680          */
 681         assert(!m->absent);
 682         assert(!m->error);
 683         assert(m->dirty);
 684
 685         /*
 686          *      Create a paging reference to let us play with the object.
 687          */
 688         object = m->object;
 689         paging_offset = m->offset + object->paging_offset;
 690
 691         if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
 692                 VM_PAGE_FREE(m);
 693                 panic("reservation without pageout?"); /* alan */
 694                 vm_object_unlock(object);
 695
 696                 return;
 697         }
 698
 699         /*
 700          * If there's no pager, then we can't clean the page.  This should
 701          * never happen since this should be a copy object and therefore not
 702          * an external object, so the pager should always be there.
 703          */
 704
 705         pager = object->pager;
 706
 707         if (pager == MEMORY_OBJECT_NULL) {
 708                 VM_PAGE_FREE(m);
 709                 panic("missing pager for copy object");
 710                 return;
 711         }
 712
 713         /* set the page for future call to vm_fault_list_request */
 714         vm_object_paging_begin(object);
 715         holding_page = NULL;
 716         vm_page_lock_queues();
 717         pmap_clear_modify(m->phys_page);
 718         m->dirty = TRUE;
 719         m->busy = TRUE;
 720         m->list_req_pending = TRUE;
 721         m->cleaning = TRUE;
 722         m->pageout = TRUE;
 723         vm_page_wire(m);
 724         vm_page_unlock_queues();
 725         vm_object_unlock(object);
 726
 727         /*
 728          *      Write the data to its pager.
 729          *      Note that the data is passed by naming the new object,
 730          *      not a virtual address; the pager interface has been
 731          *      manipulated to use the "internal memory" data type.
 732          *      [The object reference from its allocation is donated
 733          *      to the eventual recipient.]
 734          */
 735         memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
 736
 737         vm_object_lock(object);
 738         vm_object_paging_end(object);
 739 }
 740
 741 #if     MACH_CLUSTER_STATS
 742 #define MAXCLUSTERPAGES 16
 743 struct {
 744         unsigned long pages_in_cluster;
 745         unsigned long pages_at_higher_offsets;
 746         unsigned long pages_at_lower_offsets;
 747 } cluster_stats[MAXCLUSTERPAGES];
 748 #endif  /* MACH_CLUSTER_STATS */
 749
 750
 751 /*
 752  * vm_pageout_cluster:
 753  *
 754  * Given a page, queue it to the appropriate I/O thread,
 755  * which will page it out and attempt to clean adjacent pages
 756  * in the same operation.
 757  *
 758  * The page must be busy, and the object and queues locked. We will take a
 759  * paging reference to prevent deallocation or collapse when we
 760  * release the object lock back at the call site.  The I/O thread
 761  * is responsible for consuming this reference
 762  *
 763  * The page must not be on any pageout queue.
 764  */
 765
 766 void
 767 vm_pageout_cluster(vm_page_t m)
 768 {
 769         vm_object_t     object = m->object;
 770         struct          vm_pageout_queue *q;
 771
 772
 773         XPR(XPR_VM_PAGEOUT,
 774                 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
 775                 (integer_t)object, m->offset, (integer_t)m, 0, 0);
 776
 777         /*
 778          * Only a certain kind of page is appreciated here.
 779          */
 780         assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
 781         assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
 782         assert(!m->throttled);
 783
 784         /*
 785          * protect the object from collapse -
 786          * locking in the object's paging_offset.
 787          */
 788         vm_object_paging_begin(object);
 789
 790         /*
 791          * set the page for future call to vm_fault_list_request
 792          * page should already be marked busy
 793          */
 794         vm_page_wire(m);
 795         m->list_req_pending = TRUE;
 796         m->cleaning = TRUE;
 797         m->pageout = TRUE;
 798         m->laundry = TRUE;
 799
 800         if (object->internal == TRUE)
 801                 q = &vm_pageout_queue_internal;
 802         else
 803                 q = &vm_pageout_queue_external;
 804         q->pgo_laundry++;
 805
 806         m->pageout_queue = TRUE;
 807         queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
 808
 809         if (q->pgo_idle == TRUE) {
 810                 q->pgo_idle = FALSE;
 811                 thread_wakeup((event_t) &q->pgo_pending);
 812         }
 813 }
 814
 815
 816 unsigned long vm_pageout_throttle_up_count = 0;
 817
 818 /*
 819  * A page is back from laundry.  See if there are some pages waiting to
 820  * go to laundry and if we can let some of them go now.
 821  *
 822  * Object and page queues must be locked.
 823  */
 824 void
 825 vm_pageout_throttle_up(
 826         vm_page_t       m)
 827 {
 828         struct vm_pageout_queue *q;
 829
 830         vm_pageout_throttle_up_count++;
 831
 832         assert(m->laundry);
 833         assert(m->object != VM_OBJECT_NULL);
 834         assert(m->object != kernel_object);
 835
 836         if (m->object->internal == TRUE)
 837                 q = &vm_pageout_queue_internal;
 838         else
 839                 q = &vm_pageout_queue_external;
 840
 841         m->laundry = FALSE;
 842         q->pgo_laundry--;
 843
 844         if (q->pgo_throttled == TRUE) {
 845                 q->pgo_throttled = FALSE;
 846                 thread_wakeup((event_t) &q->pgo_laundry);
 847         }
 848 }
 849
 850
 851 /*
 852  *      vm_pageout_scan does the dirty work for the pageout daemon.
 853  *      It returns with vm_page_queue_free_lock held and
 854  *      vm_page_free_wanted == 0.
 855  */
 856
 857 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT  (3 * MAX_UPL_TRANSFER)
 858
 859 #define FCS_IDLE                0
 860 #define FCS_DELAYED             1
 861 #define FCS_DEADLOCK_DETECTED   2
 862
 863 struct flow_control {
 864         int             state;
 865         mach_timespec_t ts;
 866 };
 867
 868 void
 869 vm_pageout_scan(void)
 870 {
 871         unsigned int loop_count = 0;
 872         unsigned int inactive_burst_count = 0;
 873         unsigned int active_burst_count = 0;
 874         unsigned int reactivated_this_call;
 875         unsigned int reactivate_limit;
 876         vm_page_t   local_freeq = NULL;
 877         int         local_freed = 0;
 878         int         delayed_unlock;
 879         int         need_internal_inactive = 0;
 880         int         refmod_state = 0;
 881         int     vm_pageout_deadlock_target = 0;
 882         struct  vm_pageout_queue *iq;
 883         struct  vm_pageout_queue *eq;
 884         struct  vm_speculative_age_q *sq;
 885         struct  flow_control    flow_control;
 886         boolean_t inactive_throttled = FALSE;
 887         boolean_t try_failed;
 888         mach_timespec_t         ts;
 889         unsigned int msecs = 0;
 890         vm_object_t     object;
 891         vm_object_t     last_object_tried;
 892         int     zf_ratio;
 893         int     zf_run_count;
 894         uint32_t        catch_up_count = 0;
 895         uint32_t        inactive_reclaim_run;
 896         boolean_t       forced_reclaim;
 897
 898         flow_control.state = FCS_IDLE;
 899         iq = &vm_pageout_queue_internal;
 900         eq = &vm_pageout_queue_external;
 901         sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
 902
 903
 904         XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
 905
 906
 907         vm_page_lock_queues();
 908         delayed_unlock = 1;     /* must be nonzero if Qs are locked, 0 if unlocked */
 909
 910         /*
 911          *      Calculate the max number of referenced pages on the inactive
 912          *      queue that we will reactivate.
 913          */
 914         reactivated_this_call = 0;
 915         reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
 916                                                     vm_page_inactive_count);
 917         inactive_reclaim_run = 0;
 918
 919
 920 /*???*/ /*
 921          *      We want to gradually dribble pages from the active queue
 922          *      to the inactive queue.  If we let the inactive queue get
 923          *      very small, and then suddenly dump many pages into it,
 924          *      those pages won't get a sufficient chance to be referenced
 925          *      before we start taking them from the inactive queue.
 926          *
 927          *      We must limit the rate at which we send pages to the pagers.
 928          *      data_write messages consume memory, for message buffers and
 929          *      for map-copy objects.  If we get too far ahead of the pagers,
 930          *      we can potentially run out of memory.
 931          *
 932          *      We can use the laundry count to limit directly the number
 933          *      of pages outstanding to the default pager.  A similar
 934          *      strategy for external pagers doesn't work, because
 935          *      external pagers don't have to deallocate the pages sent them,
 936          *      and because we might have to send pages to external pagers
 937          *      even if they aren't processing writes.  So we also
 938          *      use a burst count to limit writes to external pagers.
 939          *
 940          *      When memory is very tight, we can't rely on external pagers to
 941          *      clean pages.  They probably aren't running, because they
 942          *      aren't vm-privileged.  If we kept sending dirty pages to them,
 943          *      we could exhaust the free list.
 944          */
 945
 946
 947 Restart:
 948         assert(delayed_unlock!=0);
 949
 950         /*
 951          *      A page is "zero-filled" if it was not paged in from somewhere,
 952          *      and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
 953          *      Recalculate the zero-filled page ratio.  We use this to apportion
 954          *      victimized pages between the normal and zero-filled inactive
 955          *      queues according to their relative abundance in memory.  Thus if a task
 956          *      is flooding memory with zf pages, we begin to hunt them down.
 957          *      It would be better to throttle greedy tasks at a higher level,
 958          *      but at the moment mach vm cannot do this.
 959          */
 960         {
 961                 uint32_t  total  = vm_page_active_count + vm_page_inactive_count;
 962                 uint32_t  normal = total - vm_zf_count;
 963
 964                 /* zf_ratio is the number of zf pages we victimize per normal page */
 965
 966                 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
 967                         zf_ratio = 0;
 968                 else if ((vm_zf_count <= normal) || (normal == 0))
 969                         zf_ratio = 1;
 970                 else
 971                         zf_ratio = vm_zf_count / normal;
 972
 973                 zf_run_count = 0;
 974         }
 975
 976         /*
 977          *      Recalculate vm_page_inactivate_target.
 978          */
 979         vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
 980                                                           vm_page_inactive_count +
 981                                                           vm_page_speculative_count);
 982         /*
 983          * don't want to wake the pageout_scan thread up everytime we fall below
 984          * the targets... set a low water mark at 0.25% below the target
 985          */
 986         vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
 987
 988         vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
 989                                                                 vm_page_inactive_count);
 990         object = NULL;
 991         last_object_tried = NULL;
 992         try_failed = FALSE;
 993
 994         if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
 995                 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
 996         else
 997                 catch_up_count = 0;
 998
 999         for (;;) {
1000                 vm_page_t m;
1001
1002                 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1003
1004                 if (delayed_unlock == 0) {
1005                         vm_page_lock_queues();
1006                         delayed_unlock = 1;
1007                 }
1008
1009                 /*
1010                  *      Don't sweep through active queue more than the throttle
1011                  *      which should be kept relatively low
1012                  */
1013                 active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
1014
1015                 /*
1016                  *      Move pages from active to inactive.
1017                  */
1018                 if (need_internal_inactive == 0 && (vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1019                         goto done_moving_active_pages;
1020
1021                 while (!queue_empty(&vm_page_queue_active) &&
1022                        (need_internal_inactive || active_burst_count)) {
1023
1024                         if (active_burst_count)
1025                                active_burst_count--;
1026
1027                         vm_pageout_active++;
1028
1029                         m = (vm_page_t) queue_first(&vm_page_queue_active);
1030
1031                         assert(m->active && !m->inactive);
1032                         assert(!m->laundry);
1033                         assert(m->object != kernel_object);
1034                         assert(m->phys_page != vm_page_guard_addr);
1035
1036                         DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1037
1038                         /*
1039                          * Try to lock object; since we've already got the
1040                          * page queues lock, we can only 'try' for this one.
1041                          * if the 'try' fails, we need to do a mutex_pause
1042                          * to allow the owner of the object lock a chance to
1043                          * run... otherwise, we're likely to trip over this
1044                          * object in the same state as we work our way through
1045                          * the queue... clumps of pages associated with the same
1046                          * object are fairly typical on the inactive and active queues
1047                          */
1048                         if (m->object != object) {
1049                                 if (object != NULL) {
1050                                         vm_object_unlock(object);
1051                                         object = NULL;
1052                                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1053                                 }
1054                                 if (!vm_object_lock_try_scan(m->object)) {
1055                                         /*
1056                                          * move page to end of active queue and continue
1057                                          */
1058                                         queue_remove(&vm_page_queue_active, m,
1059                                                      vm_page_t, pageq);
1060                                         queue_enter(&vm_page_queue_active, m,
1061                                                     vm_page_t, pageq);
1062
1063                                         try_failed = TRUE;
1064
1065                                         m = (vm_page_t) queue_first(&vm_page_queue_active);
1066                                         /*
1067                                          * this is the next object we're going to be interested in
1068                                          * try to make sure its available after the mutex_yield
1069                                          * returns control
1070                                          */
1071                                         vm_pageout_scan_wants_object = m->object;
1072
1073                                         goto done_with_activepage;
1074                                 }
1075                                 object = m->object;
1076
1077                                 try_failed = FALSE;
1078                         }
1079
1080                         /*
1081                          * if the page is BUSY, then we pull it
1082                          * off the active queue and leave it alone.
1083                          * when BUSY is cleared, it will get stuck
1084                          * back on the appropriate queue
1085                          */
1086                         if (m->busy) {
1087                                 queue_remove(&vm_page_queue_active, m,
1088                                              vm_page_t, pageq);
1089                                 m->pageq.next = NULL;
1090                                 m->pageq.prev = NULL;
1091
1092                                 if (!m->fictitious)
1093                                         vm_page_active_count--;
1094                                 m->active = FALSE;
1095
1096                                 goto done_with_activepage;
1097                         }
1098
1099                         /*
1100                          *      Deactivate the page while holding the object
1101                          *      locked, so we know the page is still not busy.
1102                          *      This should prevent races between pmap_enter
1103                          *      and pmap_clear_reference.  The page might be
1104                          *      absent or fictitious, but vm_page_deactivate
1105                          *      can handle that.
1106                          */
1107                         vm_page_deactivate(m);
1108
1109                         if (need_internal_inactive) {
1110                                 vm_pageout_scan_active_throttle_success++;
1111                                 need_internal_inactive--;
1112                         }
1113 done_with_activepage:
1114                         if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1115
1116                                 if (object != NULL) {
1117                                         vm_object_unlock(object);
1118                                         object = NULL;
1119                                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1120                                 }
1121                                 if (local_freeq) {
1122                                         vm_page_free_list(local_freeq);
1123
1124                                         local_freeq = NULL;
1125                                         local_freed = 0;
1126                                 }
1127                                 mutex_yield(&vm_page_queue_lock);
1128
1129                                 delayed_unlock = 1;
1130
1131                                 /*
1132                                  * continue the while loop processing
1133                                  * the active queue... need to hold
1134                                  * the page queues lock
1135                                  */
1136                         }
1137                 }
1138
1139
1140
1141                 /**********************************************************************
1142                  * above this point we're playing with the active queue
1143                  * below this point we're playing with the throttling mechanisms
1144                  * and the inactive queue
1145                  **********************************************************************/
1146
1147 done_moving_active_pages:
1148
1149                 /*
1150                  *      We are done if we have met our target *and*
1151                  *      nobody is still waiting for a page.
1152                  */
1153                 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1154                         if (object != NULL) {
1155                                 vm_object_unlock(object);
1156                                 object = NULL;
1157                         }
1158                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1159
1160                         if (local_freeq) {
1161                                 vm_page_free_list(local_freeq);
1162
1163                                 local_freeq = NULL;
1164                                 local_freed = 0;
1165                         }
1166                         /*
1167                          * inactive target still not met... keep going
1168                          * until we get the queues balanced
1169                          */
1170
1171                         /*
1172                          *      Recalculate vm_page_inactivate_target.
1173                          */
1174                         vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1175                                                                           vm_page_inactive_count +
1176                                                                           vm_page_speculative_count);
1177
1178 #ifndef CONFIG_EMBEDDED
1179                         /*
1180                          * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying
1181                          *      to balance the queues
1182                          */
1183                         if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1184                             !queue_empty(&vm_page_queue_active))
1185                                 continue;
1186 #endif
1187
1188                         mutex_lock(&vm_page_queue_free_lock);
1189
1190                         if ((vm_page_free_count >= vm_page_free_target) &&
1191                             (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1192
1193                                 vm_page_unlock_queues();
1194
1195                                 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1196
1197                                 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1198
1199                                 return;
1200                         }
1201                         mutex_unlock(&vm_page_queue_free_lock);
1202                 }
1203                 /*
1204                  * Before anything, we check if we have any ripe volatile objects around.
1205                  * If so, purge the first and see what it gives us.
1206                  */
1207                 assert (available_for_purge>=0);
1208                 if (available_for_purge)
1209                 {
1210                         if (object != NULL) {
1211                                 vm_object_unlock(object);
1212                                 object = NULL;
1213                         }
1214                         vm_purgeable_object_purge_one();
1215                         continue;
1216                 }
1217
1218                 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1219                         /*
1220                          * try to pull pages from the aging bins
1221                          * see vm_page.h for an explanation of how
1222                          * this mechanism works
1223                          */
1224                         struct vm_speculative_age_q     *aq;
1225                         mach_timespec_t ts_fully_aged;
1226                         boolean_t       can_steal = FALSE;
1227
1228                         aq = &vm_page_queue_speculative[speculative_steal_index];
1229
1230                         while (queue_empty(&aq->age_q)) {
1231
1232                                 speculative_steal_index++;
1233
1234                                 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1235                                         speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1236
1237                                 aq = &vm_page_queue_speculative[speculative_steal_index];
1238                         }
1239                         if (vm_page_speculative_count > vm_page_speculative_target)
1240                                 can_steal = TRUE;
1241                         else {
1242                                 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1243                                 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1244                                                       * 1000 * NSEC_PER_USEC;
1245
1246                                 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1247
1248                                 clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
1249
1250                                 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1251                                         can_steal = TRUE;
1252                         }
1253                         if (can_steal == TRUE)
1254                                 vm_page_speculate_ageit(aq);
1255                 }
1256
1257                 /*
1258                  * Sometimes we have to pause:
1259                  *      1) No inactive pages - nothing to do.
1260                  *      2) Flow control - default pageout queue is full
1261                  *      3) Loop control - no acceptable pages found on the inactive queue
1262                  *         within the last vm_pageout_burst_inactive_throttle iterations
1263                  */
1264                 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1265                     (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1266                         vm_pageout_scan_empty_throttle++;
1267                         msecs = vm_pageout_empty_wait;
1268                         goto vm_pageout_scan_delay;
1269
1270                 } else if (inactive_burst_count >=
1271                            MIN(vm_pageout_burst_inactive_throttle,
1272                                (vm_page_inactive_count +
1273                                 vm_page_speculative_count))) {
1274                         vm_pageout_scan_burst_throttle++;
1275                         msecs = vm_pageout_burst_wait;
1276                         goto vm_pageout_scan_delay;
1277
1278                 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1279
1280                         switch (flow_control.state) {
1281
1282                         case FCS_IDLE:
1283 reset_deadlock_timer:
1284                                 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1285                                 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1286                                 clock_get_system_nanotime(&flow_control.ts.tv_sec,
1287                                                           (unsigned *)&flow_control.ts.tv_nsec);
1288                                 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1289
1290                                 flow_control.state = FCS_DELAYED;
1291                                 msecs = vm_pageout_deadlock_wait;
1292
1293                                 break;
1294
1295                         case FCS_DELAYED:
1296                                 clock_get_system_nanotime(&ts.tv_sec,
1297                                                           (unsigned *)&ts.tv_nsec);
1298
1299                                 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1300                                         /*
1301                                          * the pageout thread for the default pager is potentially
1302                                          * deadlocked since the
1303                                          * default pager queue has been throttled for more than the
1304                                          * allowable time... we need to move some clean pages or dirty
1305                                          * pages belonging to the external pagers if they aren't throttled
1306                                          * vm_page_free_wanted represents the number of threads currently
1307                                          * blocked waiting for pages... we'll move one page for each of
1308                                          * these plus a fixed amount to break the logjam... once we're done
1309                                          * moving this number of pages, we'll re-enter the FSC_DELAYED state
1310                                          * with a new timeout target since we have no way of knowing
1311                                          * whether we've broken the deadlock except through observation
1312                                          * of the queue associated with the default pager... we need to
1313                                          * stop moving pages and allow the system to run to see what
1314                                          * state it settles into.
1315                                          */
1316                                         vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1317                                         vm_pageout_scan_deadlock_detected++;
1318                                         flow_control.state = FCS_DEADLOCK_DETECTED;
1319
1320                                         thread_wakeup((event_t) &vm_pageout_garbage_collect);
1321                                         goto consider_inactive;
1322                                 }
1323                                 /*
1324                                  * just resniff instead of trying
1325                                  * to compute a new delay time... we're going to be
1326                                  * awakened immediately upon a laundry completion,
1327                                  * so we won't wait any longer than necessary
1328                                  */
1329                                 msecs = vm_pageout_idle_wait;
1330                                 break;
1331
1332                         case FCS_DEADLOCK_DETECTED:
1333                                 if (vm_pageout_deadlock_target)
1334                                         goto consider_inactive;
1335                                 goto reset_deadlock_timer;
1336
1337                         }
1338                         vm_pageout_scan_throttle++;
1339                         iq->pgo_throttled = TRUE;
1340 vm_pageout_scan_delay:
1341                         if (object != NULL) {
1342                                 vm_object_unlock(object);
1343                                 object = NULL;
1344                         }
1345                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1346
1347                         if (local_freeq) {
1348                                 vm_page_free_list(local_freeq);
1349
1350                                 local_freeq = NULL;
1351                                 local_freed = 0;
1352                         }
1353 #if CONFIG_EMBEDDED
1354                         {
1355                         int percent_avail;
1356
1357                         /*
1358                          * Decide if we need to send a memory status notification.
1359                          */
1360                         percent_avail =
1361                                 (vm_page_active_count + vm_page_inactive_count +
1362                                  vm_page_speculative_count + vm_page_free_count +
1363                                  (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1364                                 atop_64(max_mem);
1365                         if (percent_avail >= (kern_memorystatus_level + 5) ||
1366                             percent_avail <= (kern_memorystatus_level - 5)) {
1367                                 kern_memorystatus_level = percent_avail;
1368                                 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1369                         }
1370                         }
1371 #endif
1372                         assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1373
1374                         counter(c_vm_pageout_scan_block++);
1375
1376                         vm_page_unlock_queues();
1377
1378                         assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1379
1380                         thread_block(THREAD_CONTINUE_NULL);
1381
1382                         vm_page_lock_queues();
1383                         delayed_unlock = 1;
1384
1385                         iq->pgo_throttled = FALSE;
1386
1387                         if (loop_count >= vm_page_inactive_count)
1388                                 loop_count = 0;
1389                         inactive_burst_count = 0;
1390
1391                         goto Restart;
1392                         /*NOTREACHED*/
1393                 }
1394
1395
1396                 flow_control.state = FCS_IDLE;
1397 consider_inactive:
1398                 loop_count++;
1399                 inactive_burst_count++;
1400                 vm_pageout_inactive++;
1401
1402                 /* Choose a victim. */
1403
1404                 while (1) {
1405                         m = NULL;
1406
1407                         /*
1408                          * the most eligible pages are ones that were throttled because the
1409                          * pager wasn't ready at the time.  If a pager is ready now,
1410                          * see if one of these is useful.
1411                          */
1412                         if (!VM_PAGE_Q_THROTTLED(iq) && !queue_empty(&vm_page_queue_throttled)) {
1413                                 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
1414                                 break;
1415                         }
1416
1417                         /*
1418                          * The second most eligible pages are ones we paged in speculatively,
1419                          * but which have not yet been touched.
1420                          */
1421                         if ( !queue_empty(&sq->age_q) ) {
1422                                 m = (vm_page_t) queue_first(&sq->age_q);
1423                                 break;
1424                         }
1425                         /*
1426                          * Time for a zero-filled inactive page?
1427                          */
1428                         if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1429                              queue_empty(&vm_page_queue_inactive)) {
1430                                 if ( !queue_empty(&vm_page_queue_zf) ) {
1431                                         m = (vm_page_t) queue_first(&vm_page_queue_zf);
1432                                         zf_run_count++;
1433                                         break;
1434                                 }
1435                         }
1436                         /*
1437                          * It's either a normal inactive page or nothing.
1438                          */
1439                         if ( !queue_empty(&vm_page_queue_inactive) ) {
1440                                 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1441                                 zf_run_count = 0;
1442                                 break;
1443                         }
1444
1445                         panic("vm_pageout: no victim");
1446                 }
1447
1448                 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1449                 assert(!m->laundry);
1450                 assert(m->object != kernel_object);
1451                 assert(m->phys_page != vm_page_guard_addr);
1452
1453                 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1454
1455                 /*
1456                  * check to see if we currently are working
1457                  * with the same object... if so, we've
1458                  * already got the lock
1459                  */
1460                 if (m->object != object) {
1461                         /*
1462                          * the object associated with candidate page is
1463                          * different from the one we were just working
1464                          * with... dump the lock if we still own it
1465                          */
1466                         if (object != NULL) {
1467                                 vm_object_unlock(object);
1468                                 object = NULL;
1469                                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1470                         }
1471                         /*
1472                          * Try to lock object; since we've alread got the
1473                          * page queues lock, we can only 'try' for this one.
1474                          * if the 'try' fails, we need to do a mutex_pause
1475                          * to allow the owner of the object lock a chance to
1476                          * run... otherwise, we're likely to trip over this
1477                          * object in the same state as we work our way through
1478                          * the queue... clumps of pages associated with the same
1479                          * object are fairly typical on the inactive and active queues
1480                          */
1481                         if (!vm_object_lock_try_scan(m->object)) {
1482                                 /*
1483                                  *      Move page to end and continue.
1484                                  *      Don't re-issue ticket
1485                                  */
1486                                 if (m->zero_fill) {
1487                                         queue_remove(&vm_page_queue_zf, m,
1488                                                      vm_page_t, pageq);
1489                                         queue_enter(&vm_page_queue_zf, m,
1490                                                     vm_page_t, pageq);
1491                                 } else if (m->speculative) {
1492                                         remque(&m->pageq);
1493                                         m->speculative = FALSE;
1494                                         vm_page_speculative_count--;
1495
1496                                         /*
1497                                          * move to the tail of the inactive queue
1498                                          * to get it out of the way... the speculative
1499                                          * queue is generally too small to depend
1500                                          * on there being enough pages from other
1501                                          * objects to make cycling it back on the
1502                                          * same queue a winning proposition
1503                                          */
1504                                         queue_enter(&vm_page_queue_inactive, m,
1505                                                     vm_page_t, pageq);
1506                                         m->inactive = TRUE;
1507                                         vm_page_inactive_count++;
1508                                         token_new_pagecount++;
1509                                 }  else if (m->throttled) {
1510                                         queue_remove(&vm_page_queue_throttled, m,
1511                                                      vm_page_t, pageq);
1512                                         m->throttled = FALSE;
1513                                         vm_page_throttled_count--;
1514
1515                                         /*
1516                                          * not throttled any more, so can stick
1517                                          * it on the inactive queue.
1518                                          */
1519                                         queue_enter(&vm_page_queue_inactive, m,
1520                                                     vm_page_t, pageq);
1521                                         m->inactive = TRUE;
1522                                         vm_page_inactive_count++;
1523                                         token_new_pagecount++;
1524                                 } else {
1525                                         queue_remove(&vm_page_queue_inactive, m,
1526                                                      vm_page_t, pageq);
1527 #if MACH_ASSERT
1528                                         vm_page_inactive_count--;       /* balance for purgeable queue asserts */
1529 #endif
1530                                         vm_purgeable_q_advance_all();
1531
1532                                         queue_enter(&vm_page_queue_inactive, m,
1533                                                     vm_page_t, pageq);
1534 #if MACH_ASSERT
1535                                         vm_page_inactive_count++;       /* balance for purgeable queue asserts */
1536 #endif
1537                                         token_new_pagecount++;
1538                                 }
1539                                 pmap_clear_reference(m->phys_page);
1540                                 m->reference = FALSE;
1541
1542                                 vm_pageout_inactive_nolock++;
1543
1544                                 if ( !queue_empty(&sq->age_q) )
1545                                         m = (vm_page_t) queue_first(&sq->age_q);
1546                                 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1547                                           queue_empty(&vm_page_queue_inactive)) {
1548                                         if ( !queue_empty(&vm_page_queue_zf) )
1549                                                 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1550                                 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1551                                         m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1552                                 }
1553                                 /*
1554                                  * this is the next object we're going to be interested in
1555                                  * try to make sure its available after the mutex_yield
1556                                  * returns control
1557                                  */
1558                                 vm_pageout_scan_wants_object = m->object;
1559
1560                                 /*
1561                                  * force us to dump any collected free pages
1562                                  * and to pause before moving on
1563                                  */
1564                                 try_failed = TRUE;
1565
1566                                 goto done_with_inactivepage;
1567                         }
1568                         object = m->object;
1569                         vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1570
1571                         try_failed = FALSE;
1572                 }
1573
1574                 /*
1575                  *      Paging out pages of external objects which
1576                  *      are currently being created must be avoided.
1577                  *      The pager may claim for memory, thus leading to a
1578                  *      possible dead lock between it and the pageout thread,
1579                  *      if such pages are finally chosen. The remaining assumption
1580                  *      is that there will finally be enough available pages in the
1581                  *      inactive pool to page out in order to satisfy all memory
1582                  *      claimed by the thread which concurrently creates the pager.
1583                  */
1584                 if (!object->pager_initialized && object->pager_created) {
1585                         /*
1586                          *      Move page to end and continue, hoping that
1587                          *      there will be enough other inactive pages to
1588                          *      page out so that the thread which currently
1589                          *      initializes the pager will succeed.
1590                          *      Don't re-grant the ticket, the page should
1591                          *      pulled from the queue and paged out whenever
1592                          *      one of its logically adjacent fellows is
1593                          *      targeted.
1594                          *
1595                          *      Pages found on the speculative list can never be
1596                          *      in this state... they always have a pager associated
1597                          *      with them.
1598                          */
1599                         assert(!m->speculative);
1600
1601                         if (m->zero_fill) {
1602                                 queue_remove(&vm_page_queue_zf, m,
1603                                              vm_page_t, pageq);
1604                                 queue_enter(&vm_page_queue_zf, m,
1605                                             vm_page_t, pageq);
1606                         } else {
1607                                 queue_remove(&vm_page_queue_inactive, m,
1608                                              vm_page_t, pageq);
1609 #if MACH_ASSERT
1610                                 vm_page_inactive_count--;       /* balance for purgeable queue asserts */
1611 #endif
1612                                 vm_purgeable_q_advance_all();
1613
1614                                 queue_enter(&vm_page_queue_inactive, m,
1615                                             vm_page_t, pageq);
1616 #if MACH_ASSERT
1617                                 vm_page_inactive_count++;       /* balance for purgeable queue asserts */
1618 #endif
1619                                 token_new_pagecount++;
1620                         }
1621                         vm_pageout_inactive_avoid++;
1622
1623                         goto done_with_inactivepage;
1624                 }
1625                 /*
1626                  *      Remove the page from its list.
1627                  */
1628                 if (m->speculative) {
1629                         remque(&m->pageq);
1630                         m->speculative = FALSE;
1631                         vm_page_speculative_count--;
1632                 } else if (m->throttled) {
1633                         queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1634                         m->throttled = FALSE;
1635                         vm_page_throttled_count--;
1636                 } else {
1637                         if (m->zero_fill) {
1638                                 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1639                                 vm_zf_queue_count--;
1640                         } else {
1641                                 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1642                         }
1643                         m->inactive = FALSE;
1644                         if (!m->fictitious)
1645                                 vm_page_inactive_count--;
1646                                 vm_purgeable_q_advance_all();
1647                 }
1648
1649                 /* If the object is empty, the page must be reclaimed even if dirty or used. */
1650                 /* If the page belongs to a volatile object, we stick it back on. */
1651                 if (object->copy == VM_OBJECT_NULL) {
1652                         if(object->purgable == VM_PURGABLE_EMPTY && !m->cleaning) {
1653                                 m->busy = TRUE;
1654                                 if (m->pmapped == TRUE) {
1655                                         /* unmap the page */
1656                                         refmod_state = pmap_disconnect(m->phys_page);
1657                                         if (refmod_state & VM_MEM_MODIFIED) {
1658                                                 m->dirty = TRUE;
1659                                         }
1660                                 }
1661                                 if (m->dirty || m->precious) {
1662                                         /* we saved the cost of cleaning this page ! */
1663                                         vm_page_purged_count++;
1664                                 }
1665                                 goto reclaim_page;
1666                         }
1667                         if (object->purgable == VM_PURGABLE_VOLATILE) {
1668                                 /* if it's wired, we can't put it on our queue */
1669                                 assert(m->wire_count == 0);
1670                                 /* just stick it back on! */
1671                                 goto reactivate_page;
1672                         }
1673                 }
1674                 m->pageq.next = NULL;
1675                 m->pageq.prev = NULL;
1676
1677                 if ( !m->fictitious && catch_up_count)
1678                         catch_up_count--;
1679
1680                 /*
1681                  * ENCRYPTED SWAP:
1682                  * if this page has already been picked up as part of a
1683                  * page-out cluster, it will be busy because it is being
1684                  * encrypted (see vm_object_upl_request()).  But we still
1685                  * want to demote it from "clean-in-place" (aka "adjacent")
1686                  * to "clean-and-free" (aka "target"), so let's ignore its
1687                  * "busy" bit here and proceed to check for "cleaning" a
1688                  * little bit below...
1689                  */
1690                 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1691                         /*
1692                          *      Somebody is already playing with this page.
1693                          *      Leave it off the pageout queues.
1694                          *
1695                          */
1696                         vm_pageout_inactive_busy++;
1697
1698                         goto done_with_inactivepage;
1699                 }
1700
1701                 /*
1702                  *      If it's absent or in error, we can reclaim the page.
1703                  */
1704
1705                 if (m->absent || m->error) {
1706                         vm_pageout_inactive_absent++;
1707 reclaim_page:
1708                         if (vm_pageout_deadlock_target) {
1709                                 vm_pageout_scan_inactive_throttle_success++;
1710                                 vm_pageout_deadlock_target--;
1711                         }
1712
1713                         DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1714
1715                         if (m->object->internal) {
1716                                 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1717                         } else {
1718                                 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1719                         }
1720
1721                         vm_page_free_prepare(m);
1722
1723                         assert(m->pageq.next == NULL &&
1724                                m->pageq.prev == NULL);
1725                         m->pageq.next = (queue_entry_t)local_freeq;
1726                         local_freeq = m;
1727                         local_freed++;
1728
1729                         inactive_burst_count = 0;
1730
1731                         goto done_with_inactivepage;
1732                 }
1733
1734                 assert(!m->private);
1735                 assert(!m->fictitious);
1736
1737                 /*
1738                  *      If already cleaning this page in place, convert from
1739                  *      "adjacent" to "target". We can leave the page mapped,
1740                  *      and vm_pageout_object_terminate will determine whether
1741                  *      to free or reactivate.
1742                  */
1743
1744                 if (m->cleaning) {
1745                         m->busy = TRUE;
1746                         m->pageout = TRUE;
1747                         m->dump_cleaning = TRUE;
1748                         vm_page_wire(m);
1749
1750                         CLUSTER_STAT(vm_pageout_cluster_conversions++);
1751
1752                         inactive_burst_count = 0;
1753
1754                         goto done_with_inactivepage;
1755                 }
1756
1757                 /*
1758                  *      If it's being used, reactivate.
1759                  *      (Fictitious pages are either busy or absent.)
1760                  *      First, update the reference and dirty bits
1761                  *      to make sure the page is unreferenced.
1762                  */
1763                 refmod_state = -1;
1764
1765                 if (m->reference == FALSE && m->pmapped == TRUE) {
1766                         refmod_state = pmap_get_refmod(m->phys_page);
1767
1768                         if (refmod_state & VM_MEM_REFERENCED)
1769                                 m->reference = TRUE;
1770                         if (refmod_state & VM_MEM_MODIFIED)
1771                                 m->dirty = TRUE;
1772                 }
1773                 if (m->reference && !m->no_cache) {
1774                         /*
1775                          * The page we pulled off the inactive list has
1776                          * been referenced.  It is possible for other
1777                          * processors to be touching pages faster than we
1778                          * can clear the referenced bit and traverse the
1779                          * inactive queue, so we limit the number of
1780                          * reactivations.
1781                          */
1782                         if (++reactivated_this_call >= reactivate_limit) {
1783                                 vm_pageout_reactivation_limit_exceeded++;
1784                         } else if (catch_up_count) {
1785                                 vm_pageout_catch_ups++;
1786                         } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
1787                                 vm_pageout_inactive_force_reclaim++;
1788                         } else {
1789                                 /*
1790                                  * The page was being used, so put back on active list.
1791                                  */
1792 reactivate_page:
1793                                 vm_page_activate(m);
1794                                 VM_STAT_INCR(reactivations);
1795
1796                                 vm_pageout_inactive_used++;
1797                                 inactive_burst_count = 0;
1798
1799                                 goto done_with_inactivepage;
1800                         }
1801                         /*
1802                          * Make sure we call pmap_get_refmod() if it
1803                          * wasn't already called just above, to update
1804                          * the dirty bit.
1805                          */
1806                         if ((refmod_state == -1) && !m->dirty && m->pmapped) {
1807                                 refmod_state = pmap_get_refmod(m->phys_page);
1808                                 if (refmod_state & VM_MEM_MODIFIED)
1809                                         m->dirty = TRUE;
1810                         }
1811                         forced_reclaim = TRUE;
1812                 } else {
1813                         forced_reclaim = FALSE;
1814                 }
1815
1816                 XPR(XPR_VM_PAGEOUT,
1817                 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1818                 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1819
1820                 /*
1821                  * we've got a candidate page to steal...
1822                  *
1823                  * m->dirty is up to date courtesy of the
1824                  * preceding check for m->reference... if
1825                  * we get here, then m->reference had to be
1826                  * FALSE (or possibly "reactivate_limit" was
1827                  * exceeded), but in either case we called
1828                  * pmap_get_refmod() and updated both
1829                  * m->reference and m->dirty
1830                  *
1831                  * if it's dirty or precious we need to
1832                  * see if the target queue is throtttled
1833                  * it if is, we need to skip over it by moving it back
1834                  * to the end of the inactive queue
1835                  */
1836                 inactive_throttled = FALSE;
1837
1838                 if (m->dirty || m->precious) {
1839                         if (object->internal) {
1840                                 if (VM_PAGE_Q_THROTTLED(iq))
1841                                         inactive_throttled = TRUE;
1842                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
1843                                 inactive_throttled = TRUE;
1844                         }
1845                 }
1846                 if (inactive_throttled == TRUE) {
1847 throttle_inactive:
1848                         if (!IP_VALID(memory_manager_default) &&
1849                                 object->internal &&
1850                                 (object->purgable == VM_PURGABLE_DENY ||
1851                                  object->purgable == VM_PURGABLE_NONVOLATILE ||
1852                                  object->purgable == VM_PURGABLE_VOLATILE )) {
1853                                 queue_enter(&vm_page_queue_throttled, m,
1854                                             vm_page_t, pageq);
1855                                 m->throttled = TRUE;
1856                                 vm_page_throttled_count++;
1857                         } else {
1858                                 if (m->zero_fill) {
1859                                         queue_enter(&vm_page_queue_zf, m,
1860                                                     vm_page_t, pageq);
1861                                         vm_zf_queue_count++;
1862                                 } else
1863                                         queue_enter(&vm_page_queue_inactive, m,
1864                                                     vm_page_t, pageq);
1865                                 m->inactive = TRUE;
1866                                 if (!m->fictitious) {
1867                                         vm_page_inactive_count++;
1868                                         token_new_pagecount++;
1869                                 }
1870                         }
1871                         vm_pageout_scan_inactive_throttled++;
1872                         goto done_with_inactivepage;
1873                 }
1874
1875                 /*
1876                  * we've got a page that we can steal...
1877                  * eliminate all mappings and make sure
1878                  * we have the up-to-date modified state
1879                  * first take the page BUSY, so that no new
1880                  * mappings can be made
1881                  */
1882                 m->busy = TRUE;
1883
1884                 /*
1885                  * if we need to do a pmap_disconnect then we
1886                  * need to re-evaluate m->dirty since the pmap_disconnect
1887                  * provides the true state atomically... the
1888                  * page was still mapped up to the pmap_disconnect
1889                  * and may have been dirtied at the last microsecond
1890                  *
1891                  * we also check for the page being referenced 'late'
1892                  * if it was, we first need to do a WAKEUP_DONE on it
1893                  * since we already set m->busy = TRUE, before
1894                  * going off to reactivate it
1895                  *
1896                  * Note that if 'pmapped' is FALSE then the page is not
1897                  * and has not been in any map, so there is no point calling
1898                  * pmap_disconnect().  m->dirty and/or m->reference could
1899                  * have been set in anticipation of likely usage of the page.
1900                  */
1901                 if (m->pmapped == TRUE) {
1902                         refmod_state = pmap_disconnect(m->phys_page);
1903
1904                         if (refmod_state & VM_MEM_MODIFIED)
1905                                 m->dirty = TRUE;
1906                         if (refmod_state & VM_MEM_REFERENCED) {
1907
1908                                 /* If m->reference is already set, this page must have
1909                                  * already failed the reactivate_limit test, so don't
1910                                  * bump the counts twice.
1911                                  */
1912                                 if ( ! m->reference ) {
1913                                         m->reference = TRUE;
1914                                         if (forced_reclaim ||
1915                                             ++reactivated_this_call >= reactivate_limit)
1916                                                 vm_pageout_reactivation_limit_exceeded++;
1917                                         else {
1918                                                 PAGE_WAKEUP_DONE(m);
1919                                                 goto reactivate_page;
1920                                         }
1921                                 }
1922                         }
1923                 }
1924                 /*
1925                  * reset our count of pages that have been reclaimed
1926                  * since the last page was 'stolen'
1927                  */
1928                 inactive_reclaim_run = 0;
1929
1930                 /*
1931                  *      If it's clean and not precious, we can free the page.
1932                  */
1933                 if (!m->dirty && !m->precious) {
1934                         vm_pageout_inactive_clean++;
1935                         goto reclaim_page;
1936                 }
1937
1938                 /*
1939                  * The page may have been dirtied since the last check
1940                  * for a throttled target queue (which may have been skipped
1941                  * if the page was clean then).  With the dirty page
1942                  * disconnected here, we can make one final check.
1943                  */
1944                 {
1945                         boolean_t disconnect_throttled = FALSE;
1946                         if (object->internal) {
1947                                 if (VM_PAGE_Q_THROTTLED(iq))
1948                                         disconnect_throttled = TRUE;
1949                         } else if (VM_PAGE_Q_THROTTLED(eq)) {
1950                                 disconnect_throttled = TRUE;
1951                         }
1952
1953                         if (disconnect_throttled == TRUE) {
1954                                 PAGE_WAKEUP_DONE(m);
1955                                 goto throttle_inactive;
1956                         }
1957                 }
1958
1959                 vm_pageout_cluster(m);
1960
1961                 vm_pageout_inactive_dirty++;
1962
1963                 inactive_burst_count = 0;
1964
1965 done_with_inactivepage:
1966                 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1967
1968                         if (object != NULL) {
1969                                 vm_object_unlock(object);
1970                                 object = NULL;
1971                                 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1972                         }
1973                         if (local_freeq) {
1974                                 vm_page_free_list(local_freeq);
1975
1976                                 local_freeq = NULL;
1977                                 local_freed = 0;
1978                         }
1979                         mutex_yield(&vm_page_queue_lock);
1980
1981                         delayed_unlock = 1;
1982                 }
1983                 /*
1984                  * back to top of pageout scan loop
1985                  */
1986         }
1987 }
1988
1989
1990 int vm_page_free_count_init;
1991
1992 void
1993 vm_page_free_reserve(
1994         int pages)
1995 {
1996         int             free_after_reserve;
1997
1998         vm_page_free_reserved += pages;
1999
2000         free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2001
2002         vm_page_free_min = vm_page_free_reserved +
2003                 VM_PAGE_FREE_MIN(free_after_reserve);
2004
2005         if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2006                 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2007
2008         vm_page_free_target = vm_page_free_reserved +
2009                 VM_PAGE_FREE_TARGET(free_after_reserve);
2010
2011         if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2012                 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2013
2014         if (vm_page_free_target < vm_page_free_min + 5)
2015                 vm_page_free_target = vm_page_free_min + 5;
2016
2017 }
2018
2019 /*
2020  *      vm_pageout is the high level pageout daemon.
2021  */
2022
2023 void
2024 vm_pageout_continue(void)
2025 {
2026         DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2027         vm_pageout_scan_event_counter++;
2028         vm_pageout_scan();
2029         /* we hold vm_page_queue_free_lock now */
2030         assert(vm_page_free_wanted == 0);
2031         assert(vm_page_free_wanted_privileged == 0);
2032         assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2033         mutex_unlock(&vm_page_queue_free_lock);
2034
2035         counter(c_vm_pageout_block++);
2036         thread_block((thread_continue_t)vm_pageout_continue);
2037         /*NOTREACHED*/
2038 }
2039
2040
2041 /*
2042  * must be called with the
2043  * queues and object locks held
2044  */
2045 static void
2046 vm_pageout_queue_steal(vm_page_t m)
2047 {
2048         struct vm_pageout_queue *q;
2049
2050         if (m->object->internal == TRUE)
2051                 q = &vm_pageout_queue_internal;
2052         else
2053                 q = &vm_pageout_queue_external;
2054
2055         m->laundry = FALSE;
2056         m->pageout_queue = FALSE;
2057         queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
2058
2059         m->pageq.next = NULL;
2060         m->pageq.prev = NULL;
2061
2062         vm_object_paging_end(m->object);
2063
2064         q->pgo_laundry--;
2065 }
2066
2067
2068 #ifdef FAKE_DEADLOCK
2069
2070 #define FAKE_COUNT      5000
2071
2072 int internal_count = 0;
2073 int fake_deadlock = 0;
2074
2075 #endif
2076
2077 static void
2078 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2079 {
2080         vm_page_t       m = NULL;
2081         vm_object_t     object;
2082         boolean_t       need_wakeup;
2083         memory_object_t pager;
2084         thread_t        self = current_thread();
2085
2086         if ((vm_pageout_internal_iothread != THREAD_NULL)
2087             && (self == vm_pageout_external_iothread )
2088             && (self->options & TH_OPT_VMPRIV))
2089                 self->options &= ~TH_OPT_VMPRIV;
2090
2091         vm_page_lockspin_queues();
2092
2093         while ( !queue_empty(&q->pgo_pending) ) {
2094
2095                    q->pgo_busy = TRUE;
2096                    queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2097                    m->pageout_queue = FALSE;
2098                    vm_page_unlock_queues();
2099
2100                    m->pageq.next = NULL;
2101                    m->pageq.prev = NULL;
2102 #ifdef FAKE_DEADLOCK
2103                    if (q == &vm_pageout_queue_internal) {
2104                            vm_offset_t addr;
2105                            int  pg_count;
2106
2107                            internal_count++;
2108
2109                            if ((internal_count == FAKE_COUNT)) {
2110
2111                                    pg_count = vm_page_free_count + vm_page_free_reserved;
2112
2113                                    if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2114                                            kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2115                                    }
2116                                    internal_count = 0;
2117                                    fake_deadlock++;
2118                            }
2119                    }
2120 #endif
2121                    object = m->object;
2122
2123                    vm_object_lock(object);
2124
2125                    if (!object->pager_initialized) {
2126
2127                            /*
2128                             *   If there is no memory object for the page, create
2129                             *   one and hand it to the default pager.
2130                             */
2131
2132                            if (!object->pager_initialized)
2133                                    vm_object_collapse(object,
2134                                                       (vm_object_offset_t) 0,
2135                                                       TRUE);
2136                            if (!object->pager_initialized)
2137                                    vm_object_pager_create(object);
2138                            if (!object->pager_initialized) {
2139                                    /*
2140                                     *   Still no pager for the object.
2141                                     *   Reactivate the page.
2142                                     *
2143                                     *   Should only happen if there is no
2144                                     *   default pager.
2145                                     */
2146                                    m->list_req_pending = FALSE;
2147                                    m->cleaning = FALSE;
2148                                    m->pageout = FALSE;
2149
2150                                    vm_page_lockspin_queues();
2151                                    vm_page_unwire(m);
2152                                    vm_pageout_throttle_up(m);
2153                                    vm_pageout_dirty_no_pager++;
2154                                    vm_page_activate(m);
2155                                    vm_page_unlock_queues();
2156
2157                                    /*
2158                                     *   And we are done with it.
2159                                     */
2160                                    PAGE_WAKEUP_DONE(m);
2161
2162                                    vm_object_paging_end(object);
2163                                    vm_object_unlock(object);
2164
2165                                    vm_page_lockspin_queues();
2166                                    continue;
2167                            }
2168                    }
2169                    pager = object->pager;
2170                    if (pager == MEMORY_OBJECT_NULL) {
2171                            /*
2172                             * This pager has been destroyed by either
2173                             * memory_object_destroy or vm_object_destroy, and
2174                             * so there is nowhere for the page to go.
2175                             * Just free the page... VM_PAGE_FREE takes
2176                             * care of cleaning up all the state...
2177                             * including doing the vm_pageout_throttle_up
2178                             */
2179
2180                            VM_PAGE_FREE(m);
2181
2182                            vm_object_paging_end(object);
2183                            vm_object_unlock(object);
2184
2185                            vm_page_lockspin_queues();
2186                            continue;
2187                    }
2188                    vm_object_unlock(object);
2189                    /*
2190                     * we expect the paging_in_progress reference to have
2191                     * already been taken on the object before it was added
2192                     * to the appropriate pageout I/O queue... this will
2193                     * keep the object from being terminated and/or the
2194                     * paging_offset from changing until the I/O has
2195                     * completed... therefore no need to lock the object to
2196                     * pull the paging_offset from it.
2197                     *
2198                     * Send the data to the pager.
2199                     * any pageout clustering happens there
2200                     */
2201                    memory_object_data_return(pager,
2202                                              m->offset + object->paging_offset,
2203                                              PAGE_SIZE,
2204                                              NULL,
2205                                              NULL,
2206                                              FALSE,
2207                                              FALSE,
2208                                              0);
2209
2210                    vm_object_lock(object);
2211                    vm_object_paging_end(object);
2212                    vm_object_unlock(object);
2213
2214                    vm_page_lockspin_queues();
2215         }
2216         assert_wait((event_t) q, THREAD_UNINT);
2217
2218
2219         if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2220                 q->pgo_throttled = FALSE;
2221                 need_wakeup = TRUE;
2222         } else
2223                 need_wakeup = FALSE;
2224
2225         q->pgo_busy = FALSE;
2226         q->pgo_idle = TRUE;
2227         vm_page_unlock_queues();
2228
2229         if (need_wakeup == TRUE)
2230                 thread_wakeup((event_t) &q->pgo_laundry);
2231
2232         thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2233         /*NOTREACHED*/
2234 }
2235
2236
2237 static void
2238 vm_pageout_iothread_external(void)
2239 {
2240         thread_t        self = current_thread();
2241
2242         self->options |= TH_OPT_VMPRIV;
2243
2244         vm_pageout_iothread_continue(&vm_pageout_queue_external);
2245         /*NOTREACHED*/
2246 }
2247
2248
2249 static void
2250 vm_pageout_iothread_internal(void)
2251 {
2252         thread_t        self = current_thread();
2253
2254         self->options |= TH_OPT_VMPRIV;
2255
2256         vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2257         /*NOTREACHED*/
2258 }
2259
2260 static void
2261 vm_pageout_garbage_collect(int collect)
2262 {
2263         if (collect) {
2264                 stack_collect();
2265
2266                 /*
2267                  * consider_zone_gc should be last, because the other operations
2268                  * might return memory to zones.
2269                  */
2270                 consider_machine_collect();
2271                 consider_zone_gc();
2272
2273                 consider_machine_adjust();
2274         }
2275
2276         assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2277
2278         thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2279         /*NOTREACHED*/
2280 }
2281
2282
2283
2284 void
2285 vm_pageout(void)
2286 {
2287         thread_t        self = current_thread();
2288         thread_t        thread;
2289         kern_return_t   result;
2290         spl_t           s;
2291
2292         /*
2293          * Set thread privileges.
2294          */
2295         s = splsched();
2296         thread_lock(self);
2297         self->priority = BASEPRI_PREEMPT - 1;
2298         set_sched_pri(self, self->priority);
2299         thread_unlock(self);
2300
2301         if (!self->reserved_stack)
2302                 self->reserved_stack = self->kernel_stack;
2303
2304         splx(s);
2305
2306         /*
2307          *      Initialize some paging parameters.
2308          */
2309
2310         if (vm_pageout_idle_wait == 0)
2311                 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2312
2313         if (vm_pageout_burst_wait == 0)
2314                 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2315
2316         if (vm_pageout_empty_wait == 0)
2317                 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2318
2319         if (vm_pageout_deadlock_wait == 0)
2320                 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2321
2322         if (vm_pageout_deadlock_relief == 0)
2323                 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2324
2325         if (vm_pageout_inactive_relief == 0)
2326                 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2327
2328         if (vm_pageout_burst_active_throttle == 0)
2329                 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2330
2331         if (vm_pageout_burst_inactive_throttle == 0)
2332                 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2333
2334         /*
2335          * Set kernel task to low backing store privileged
2336          * status
2337          */
2338         task_lock(kernel_task);
2339         kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2340         task_unlock(kernel_task);
2341
2342         vm_page_free_count_init = vm_page_free_count;
2343
2344         /*
2345          * even if we've already called vm_page_free_reserve
2346          * call it again here to insure that the targets are
2347          * accurately calculated (it uses vm_page_free_count_init)
2348          * calling it with an arg of 0 will not change the reserve
2349          * but will re-calculate free_min and free_target
2350          */
2351         if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2352                 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2353         } else
2354                 vm_page_free_reserve(0);
2355
2356
2357         queue_init(&vm_pageout_queue_external.pgo_pending);
2358         vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2359         vm_pageout_queue_external.pgo_laundry = 0;
2360         vm_pageout_queue_external.pgo_idle = FALSE;
2361         vm_pageout_queue_external.pgo_busy = FALSE;
2362         vm_pageout_queue_external.pgo_throttled = FALSE;
2363
2364         queue_init(&vm_pageout_queue_internal.pgo_pending);
2365         vm_pageout_queue_internal.pgo_maxlaundry = 0;
2366         vm_pageout_queue_internal.pgo_laundry = 0;
2367         vm_pageout_queue_internal.pgo_idle = FALSE;
2368         vm_pageout_queue_internal.pgo_busy = FALSE;
2369         vm_pageout_queue_internal.pgo_throttled = FALSE;
2370
2371
2372         /* internal pageout thread started when default pager registered first time */
2373         /* external pageout and garbage collection threads started here */
2374
2375         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2376                                               BASEPRI_PREEMPT - 1,
2377                                               &vm_pageout_external_iothread);
2378         if (result != KERN_SUCCESS)
2379                 panic("vm_pageout_iothread_external: create failed");
2380
2381         thread_deallocate(vm_pageout_external_iothread);
2382
2383         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2384                                               MINPRI_KERNEL,
2385                                               &thread);
2386         if (result != KERN_SUCCESS)
2387                 panic("vm_pageout_garbage_collect: create failed");
2388
2389         thread_deallocate(thread);
2390
2391         vm_object_reaper_init();
2392
2393
2394         vm_pageout_continue();
2395
2396         /*
2397          * Unreached code!
2398          *
2399          * The vm_pageout_continue() call above never returns, so the code below is never
2400          * executed.  We take advantage of this to declare several DTrace VM related probe
2401          * points that our kernel doesn't have an analog for.  These are probe points that
2402          * exist in Solaris and are in the DTrace documentation, so people may have written
2403          * scripts that use them.  Declaring the probe points here means their scripts will
2404          * compile and execute which we want for portability of the scripts, but since this
2405          * section of code is never reached, the probe points will simply never fire.  Yes,
2406          * this is basically a hack.  The problem is the DTrace probe points were chosen with
2407          * Solaris specific VM events in mind, not portability to different VM implementations.
2408          */
2409
2410         DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2411         DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2412         DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2413         DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2414         DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2415         DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2416         DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2417         /*NOTREACHED*/
2418 }
2419
2420 kern_return_t
2421 vm_pageout_internal_start(void)
2422 {
2423         kern_return_t result;
2424
2425         vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2426         result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2427         if (result == KERN_SUCCESS)
2428                 thread_deallocate(vm_pageout_internal_iothread);
2429         return result;
2430 }
2431
2432 #define UPL_DELAYED_UNLOCK_LIMIT  (MAX_UPL_TRANSFER / 2)
2433
2434 static upl_t
2435 upl_create(int type, int flags, upl_size_t size)
2436 {
2437         upl_t   upl;
2438         int     page_field_size = 0;
2439         int     upl_flags = 0;
2440         int     upl_size  = sizeof(struct upl);
2441
2442         if (type & UPL_CREATE_LITE) {
2443                 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2444                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2445
2446                 upl_flags |= UPL_LITE;
2447         }
2448         if (type & UPL_CREATE_INTERNAL) {
2449                 upl_size += sizeof(struct upl_page_info) * (size/PAGE_SIZE);
2450
2451                 upl_flags |= UPL_INTERNAL;
2452         }
2453         upl = (upl_t)kalloc(upl_size + page_field_size);
2454
2455         if (page_field_size)
2456                 bzero((char *)upl + upl_size, page_field_size);
2457
2458         upl->flags = upl_flags | flags;
2459         upl->src_object = NULL;
2460         upl->kaddr = (vm_offset_t)0;
2461         upl->size = 0;
2462         upl->map_object = NULL;
2463         upl->ref_count = 1;
2464         upl->highest_page = 0;
2465         upl_lock_init(upl);
2466 #ifdef UPL_DEBUG
2467         upl->ubc_alias1 = 0;
2468         upl->ubc_alias2 = 0;
2469 #endif /* UPL_DEBUG */
2470         return(upl);
2471 }
2472
2473 static void
2474 upl_destroy(upl_t upl)
2475 {
2476         int     page_field_size;  /* bit field in word size buf */
2477         int     size;
2478
2479 #ifdef UPL_DEBUG
2480         {
2481                 vm_object_t     object;
2482
2483                 if (upl->flags & UPL_SHADOWED) {
2484                         object = upl->map_object->shadow;
2485                 } else {
2486                         object = upl->map_object;
2487                 }
2488                 vm_object_lock(object);
2489                 queue_remove(&object->uplq, upl, upl_t, uplq);
2490                 vm_object_unlock(object);
2491         }
2492 #endif /* UPL_DEBUG */
2493         /*
2494          * drop a reference on the map_object whether or
2495          * not a pageout object is inserted
2496          */
2497         if (upl->flags & UPL_SHADOWED)
2498                 vm_object_deallocate(upl->map_object);
2499
2500         if (upl->flags & UPL_DEVICE_MEMORY)
2501                 size = PAGE_SIZE;
2502         else
2503                 size = upl->size;
2504         page_field_size = 0;
2505
2506         if (upl->flags & UPL_LITE) {
2507                 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2508                 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2509         }
2510         if (upl->flags & UPL_INTERNAL) {
2511                 kfree(upl,
2512                       sizeof(struct upl) +
2513                       (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2514                       + page_field_size);
2515         } else {
2516                 kfree(upl, sizeof(struct upl) + page_field_size);
2517         }
2518 }
2519
2520 void uc_upl_dealloc(upl_t upl);
2521 __private_extern__ void
2522 uc_upl_dealloc(upl_t upl)
2523 {
2524         if (--upl->ref_count == 0)
2525                 upl_destroy(upl);
2526 }
2527
2528 void
2529 upl_deallocate(upl_t upl)
2530 {
2531         if (--upl->ref_count == 0)
2532                 upl_destroy(upl);
2533 }
2534
2535 /*
2536  * Statistics about UPL enforcement of copy-on-write obligations.
2537  */
2538 unsigned long upl_cow = 0;
2539 unsigned long upl_cow_again = 0;
2540 unsigned long upl_cow_contiguous = 0;
2541 unsigned long upl_cow_pages = 0;
2542 unsigned long upl_cow_again_pages = 0;
2543 unsigned long upl_cow_contiguous_pages = 0;
2544
2545 /*
2546  *      Routine:        vm_object_upl_request
2547  *      Purpose:
2548  *              Cause the population of a portion of a vm_object.
2549  *              Depending on the nature of the request, the pages
2550  *              returned may be contain valid data or be uninitialized.
2551  *              A page list structure, listing the physical pages
2552  *              will be returned upon request.
2553  *              This function is called by the file system or any other
2554  *              supplier of backing store to a pager.
2555  *              IMPORTANT NOTE: The caller must still respect the relationship
2556  *              between the vm_object and its backing memory object.  The
2557  *              caller MUST NOT substitute changes in the backing file
2558  *              without first doing a memory_object_lock_request on the
2559  *              target range unless it is know that the pages are not
2560  *              shared with another entity at the pager level.
2561  *              Copy_in_to:
2562  *                      if a page list structure is present
2563  *                      return the mapped physical pages, where a
2564  *                      page is not present, return a non-initialized
2565  *                      one.  If the no_sync bit is turned on, don't
2566  *                      call the pager unlock to synchronize with other
2567  *                      possible copies of the page. Leave pages busy
2568  *                      in the original object, if a page list structure
2569  *                      was specified.  When a commit of the page list
2570  *                      pages is done, the dirty bit will be set for each one.
2571  *              Copy_out_from:
2572  *                      If a page list structure is present, return
2573  *                      all mapped pages.  Where a page does not exist
2574  *                      map a zero filled one. Leave pages busy in
2575  *                      the original object.  If a page list structure
2576  *                      is not specified, this call is a no-op.
2577  *
2578  *              Note:  access of default pager objects has a rather interesting
2579  *              twist.  The caller of this routine, presumably the file system
2580  *              page cache handling code, will never actually make a request
2581  *              against a default pager backed object.  Only the default
2582  *              pager will make requests on backing store related vm_objects
2583  *              In this way the default pager can maintain the relationship
2584  *              between backing store files (abstract memory objects) and
2585  *              the vm_objects (cache objects), they support.
2586  *
2587  */
2588
2589 __private_extern__ kern_return_t
2590 vm_object_upl_request(
2591         vm_object_t             object,
2592         vm_object_offset_t      offset,
2593         upl_size_t              size,
2594         upl_t                   *upl_ptr,
2595         upl_page_info_array_t   user_page_list,
2596         unsigned int            *page_list_count,
2597         int                     cntrl_flags)
2598 {
2599         vm_page_t               dst_page = VM_PAGE_NULL;
2600         vm_object_offset_t      dst_offset;
2601         upl_size_t              xfer_size;
2602         boolean_t               dirty;
2603         boolean_t               hw_dirty;
2604         upl_t                   upl = NULL;
2605         unsigned int            entry;
2606 #if MACH_CLUSTER_STATS
2607         boolean_t               encountered_lrp = FALSE;
2608 #endif
2609         vm_page_t               alias_page = NULL;
2610         int                     refmod_state = 0;
2611         wpl_array_t             lite_list = NULL;
2612         vm_object_t             last_copy_object;
2613         int                     delayed_unlock = 0;
2614         int                     j;
2615
2616         if (cntrl_flags & ~UPL_VALID_FLAGS) {
2617                 /*
2618                  * For forward compatibility's sake,
2619                  * reject any unknown flag.
2620                  */
2621                 return KERN_INVALID_VALUE;
2622         }
2623         if ( (!object->internal) && (object->paging_offset != 0) )
2624                 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2625         if (object->phys_contiguous)
2626                 panic("vm_object_upl_request: contiguous object specified\n");
2627
2628
2629         if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2630                 size = MAX_UPL_SIZE * PAGE_SIZE;
2631
2632         if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2633                 *page_list_count = MAX_UPL_SIZE;
2634
2635         if (cntrl_flags & UPL_SET_INTERNAL) {
2636                 if (cntrl_flags & UPL_SET_LITE) {
2637
2638                         upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2639
2640                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2641                         lite_list = (wpl_array_t)
2642                                         (((uintptr_t)user_page_list) +
2643                                         ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2644                 } else {
2645                         upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
2646
2647                         user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2648                 }
2649         } else {
2650                 if (cntrl_flags & UPL_SET_LITE) {
2651
2652                         upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
2653
2654                         lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
2655                 } else {
2656                         upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
2657                 }
2658         }
2659         *upl_ptr = upl;
2660
2661         if (user_page_list)
2662                 user_page_list[0].device = FALSE;
2663
2664         if (cntrl_flags & UPL_SET_LITE) {
2665                 upl->map_object = object;
2666         } else {
2667                 upl->map_object = vm_object_allocate(size);
2668                 /*
2669                  * No neeed to lock the new object: nobody else knows
2670                  * about it yet, so it's all ours so far.
2671                  */
2672                 upl->map_object->shadow = object;
2673                 upl->map_object->pageout = TRUE;
2674                 upl->map_object->can_persist = FALSE;
2675                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2676                 upl->map_object->shadow_offset = offset;
2677                 upl->map_object->wimg_bits = object->wimg_bits;
2678
2679                 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2680
2681                 upl->flags |= UPL_SHADOWED;
2682         }
2683         /*
2684          * ENCRYPTED SWAP:
2685          * Just mark the UPL as "encrypted" here.
2686          * We'll actually encrypt the pages later,
2687          * in upl_encrypt(), when the caller has
2688          * selected which pages need to go to swap.
2689          */
2690         if (cntrl_flags & UPL_ENCRYPT)
2691                 upl->flags |= UPL_ENCRYPTED;
2692
2693         if (cntrl_flags & UPL_FOR_PAGEOUT)
2694                 upl->flags |= UPL_PAGEOUT;
2695
2696         vm_object_lock(object);
2697         vm_object_paging_begin(object);
2698
2699         /*
2700          * we can lock in the paging_offset once paging_in_progress is set
2701          */
2702         upl->size = size;
2703         upl->offset = offset + object->paging_offset;
2704
2705 #ifdef UPL_DEBUG
2706         queue_enter(&object->uplq, upl, upl_t, uplq);
2707 #endif /* UPL_DEBUG */
2708
2709         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
2710                 /*
2711                  * Honor copy-on-write obligations
2712                  *
2713                  * The caller is gathering these pages and
2714                  * might modify their contents.  We need to
2715                  * make sure that the copy object has its own
2716                  * private copies of these pages before we let
2717                  * the caller modify them.
2718                  */
2719                 vm_object_update(object,
2720                                  offset,
2721                                  size,
2722                                  NULL,
2723                                  NULL,
2724                                  FALSE, /* should_return */
2725                                  MEMORY_OBJECT_COPY_SYNC,
2726                                  VM_PROT_NO_CHANGE);
2727                 upl_cow++;
2728                 upl_cow_pages += size >> PAGE_SHIFT;
2729         }
2730         /*
2731          * remember which copy object we synchronized with
2732          */
2733         last_copy_object = object->copy;
2734         entry = 0;
2735
2736         xfer_size = size;
2737         dst_offset = offset;
2738
2739         while (xfer_size) {
2740
2741                 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2742                         if (delayed_unlock) {
2743                                 delayed_unlock = 0;
2744                                 vm_page_unlock_queues();
2745                         }
2746                         vm_object_unlock(object);
2747                         VM_PAGE_GRAB_FICTITIOUS(alias_page);
2748                         goto relock;
2749                 }
2750                 if (delayed_unlock == 0) {
2751                         /*
2752                          * pageout_scan takes the vm_page_lock_queues first
2753                          * then tries for the object lock... to avoid what
2754                          * is effectively a lock inversion, we'll go to the
2755                          * trouble of taking them in that same order... otherwise
2756                          * if this object contains the majority of the pages resident
2757                          * in the UBC (or a small set of large objects actively being
2758                          * worked on contain the majority of the pages), we could
2759                          * cause the pageout_scan thread to 'starve' in its attempt
2760                          * to find pages to move to the free queue, since it has to
2761                          * successfully acquire the object lock of any candidate page
2762                          * before it can steal/clean it.
2763                          */
2764                         vm_object_unlock(object);
2765 relock:
2766                         for (j = 0; ; j++) {
2767                                 vm_page_lock_queues();
2768
2769                                 if (vm_object_lock_try(object))
2770                                         break;
2771                                 vm_page_unlock_queues();
2772                                 mutex_pause(j);
2773                         }
2774                         delayed_unlock = 1;
2775                 }
2776                 if (cntrl_flags & UPL_COPYOUT_FROM) {
2777                         upl->flags |= UPL_PAGE_SYNC_DONE;
2778
2779                         if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2780                                 dst_page->fictitious ||
2781                                 dst_page->absent ||
2782                                 dst_page->error ||
2783                                (dst_page->wire_count && !dst_page->pageout && !dst_page->list_req_pending)) {
2784
2785                                 if (user_page_list)
2786                                         user_page_list[entry].phys_addr = 0;
2787
2788                                 goto delay_unlock_queues;
2789                         }
2790                         /*
2791                          * grab this up front...
2792                          * a high percentange of the time we're going to
2793                          * need the hardware modification state a bit later
2794                          * anyway... so we can eliminate an extra call into
2795                          * the pmap layer by grabbing it here and recording it
2796                          */
2797                         if (dst_page->pmapped)
2798                                 refmod_state = pmap_get_refmod(dst_page->phys_page);
2799                         else
2800                                 refmod_state = 0;
2801
2802                         if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
2803                                 /*
2804                                  * page is on inactive list and referenced...
2805                                  * reactivate it now... this gets it out of the
2806                                  * way of vm_pageout_scan which would have to
2807                                  * reactivate it upon tripping over it
2808                                  */
2809                                 vm_page_activate(dst_page);
2810                                 VM_STAT_INCR(reactivations);
2811                         }
2812                         if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2813                                 /*
2814                                  * we're only asking for DIRTY pages to be returned
2815                                  */
2816                                 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2817                                         /*
2818                                          * if we were the page stolen by vm_pageout_scan to be
2819                                          * cleaned (as opposed to a buddy being clustered in
2820                                          * or this request is not being driven by a PAGEOUT cluster
2821                                          * then we only need to check for the page being dirty or
2822                                          * precious to decide whether to return it
2823                                          */
2824                                         if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
2825                                                 goto check_busy;
2826                                         goto dont_return;
2827                                 }
2828                                 /*
2829                                  * this is a request for a PAGEOUT cluster and this page
2830                                  * is merely along for the ride as a 'buddy'... not only
2831                                  * does it have to be dirty to be returned, but it also
2832                                  * can't have been referenced recently... note that we've
2833                                  * already filtered above based on whether this page is
2834                                  * currently on the inactive queue or it meets the page
2835                                  * ticket (generation count) check
2836                                  */
2837                                 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2838                                      ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2839                                         goto check_busy;
2840                                 }
2841 dont_return:
2842                                 /*
2843                                  * if we reach here, we're not to return
2844                                  * the page... go on to the next one
2845                                  */
2846                                 if (user_page_list)
2847                                         user_page_list[entry].phys_addr = 0;
2848
2849                                 goto delay_unlock_queues;
2850                         }
2851 check_busy:
2852                         if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
2853                                 if (cntrl_flags & UPL_NOBLOCK) {
2854                                         if (user_page_list)
2855                                                 user_page_list[entry].phys_addr = 0;
2856
2857                                         goto delay_unlock_queues;
2858                                 }
2859                                 /*
2860                                  * someone else is playing with the
2861                                  * page.  We will have to wait.
2862                                  */
2863                                 delayed_unlock = 0;
2864                                 vm_page_unlock_queues();
2865
2866                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2867
2868                                 continue;
2869                         }
2870                         /*
2871                          * Someone else already cleaning the page?
2872                          */
2873                         if ((dst_page->cleaning || dst_page->absent || dst_page->wire_count != 0) && !dst_page->list_req_pending) {
2874                                 if (user_page_list)
2875                                         user_page_list[entry].phys_addr = 0;
2876
2877                                 goto delay_unlock_queues;
2878                         }
2879                         /*
2880                          * ENCRYPTED SWAP:
2881                          * The caller is gathering this page and might
2882                          * access its contents later on.  Decrypt the
2883                          * page before adding it to the UPL, so that
2884                          * the caller never sees encrypted data.
2885                          */
2886                         if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
2887                                 int  was_busy;
2888
2889                                 delayed_unlock = 0;
2890                                 vm_page_unlock_queues();
2891                                 /*
2892                                  * save the current state of busy
2893                                  * mark page as busy while decrypt
2894                                  * is in progress since it will drop
2895                                  * the object lock...
2896                                  */
2897                                 was_busy = dst_page->busy;
2898                                 dst_page->busy = TRUE;
2899
2900                                 vm_page_decrypt(dst_page, 0);
2901                                 vm_page_decrypt_for_upl_counter++;
2902                                 /*
2903                                  * restore to original busy state
2904                                  */
2905                                 dst_page->busy = was_busy;
2906
2907                                 vm_page_lock_queues();
2908                                 delayed_unlock = 1;
2909                         }
2910                         if (dst_page->pageout_queue == TRUE)
2911                                 /*
2912                                  * we've buddied up a page for a clustered pageout
2913                                  * that has already been moved to the pageout
2914                                  * queue by pageout_scan... we need to remove
2915                                  * it from the queue and drop the laundry count
2916                                  * on that queue
2917                                  */
2918                                 vm_pageout_queue_steal(dst_page);
2919 #if MACH_CLUSTER_STATS
2920                         /*
2921                          * pageout statistics gathering.  count
2922                          * all the pages we will page out that
2923                          * were not counted in the initial
2924                          * vm_pageout_scan work
2925                          */
2926                         if (dst_page->list_req_pending)
2927                                 encountered_lrp = TRUE;
2928                         if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
2929                                 if (encountered_lrp)
2930                                         CLUSTER_STAT(pages_at_higher_offsets++;)
2931                                 else
2932                                         CLUSTER_STAT(pages_at_lower_offsets++;)
2933                         }
2934 #endif
2935                         /*
2936                          * Turn off busy indication on pending
2937                          * pageout.  Note: we can only get here
2938                          * in the request pending case.
2939                          */
2940                         dst_page->list_req_pending = FALSE;
2941                         dst_page->busy = FALSE;
2942
2943                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
2944                         dirty = hw_dirty ? TRUE : dst_page->dirty;
2945
2946                         if (dst_page->phys_page > upl->highest_page)
2947                                 upl->highest_page = dst_page->phys_page;
2948
2949                         if (cntrl_flags & UPL_SET_LITE) {
2950                                 int     pg_num;
2951
2952                                 pg_num = (dst_offset-offset)/PAGE_SIZE;
2953                                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
2954
2955                                 if (hw_dirty)
2956                                         pmap_clear_modify(dst_page->phys_page);
2957
2958                                 /*
2959                                  * Mark original page as cleaning
2960                                  * in place.
2961                                  */
2962                                 dst_page->cleaning = TRUE;
2963                                 dst_page->precious = FALSE;
2964                         } else {
2965                                 /*
2966                                  * use pageclean setup, it is more
2967                                  * convenient even for the pageout
2968                                  * cases here
2969                                  */
2970                                 vm_object_lock(upl->map_object);
2971                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
2972                                 vm_object_unlock(upl->map_object);
2973
2974                                 alias_page->absent = FALSE;
2975                                 alias_page = NULL;
2976                         }
2977 #if     MACH_PAGEMAP
2978                         /*
2979                          * Record that this page has been
2980                          * written out
2981                          */
2982                         vm_external_state_set(object->existence_map, dst_page->offset);
2983 #endif  /*MACH_PAGEMAP*/
2984                         dst_page->dirty = dirty;
2985
2986                         if (!dirty)
2987                                 dst_page->precious = TRUE;
2988
2989                         if (dst_page->pageout)
2990                                 dst_page->busy = TRUE;
2991
2992                         if ( (cntrl_flags & UPL_ENCRYPT) ) {
2993                                 /*
2994                                  * ENCRYPTED SWAP:
2995                                  * We want to deny access to the target page
2996                                  * because its contents are about to be
2997                                  * encrypted and the user would be very
2998                                  * confused to see encrypted data instead
2999                                  * of their data.
3000                                  * We also set "encrypted_cleaning" to allow
3001                                  * vm_pageout_scan() to demote that page
3002                                  * from "adjacent/clean-in-place" to
3003                                  * "target/clean-and-free" if it bumps into
3004                                  * this page during its scanning while we're
3005                                  * still processing this cluster.
3006                                  */
3007                                 dst_page->busy = TRUE;
3008                                 dst_page->encrypted_cleaning = TRUE;
3009                         }
3010                         if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3011                                 /*
3012                                  * deny access to the target page
3013                                  * while it is being worked on
3014                                  */
3015                                 if ((!dst_page->pageout) && (dst_page->wire_count == 0)) {
3016                                         dst_page->busy = TRUE;
3017                                         dst_page->pageout = TRUE;
3018                                         vm_page_wire(dst_page);
3019                                 }
3020                         }
3021                 } else {
3022                         if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3023                                 /*
3024                                  * Honor copy-on-write obligations
3025                                  *
3026                                  * The copy object has changed since we
3027                                  * last synchronized for copy-on-write.
3028                                  * Another copy object might have been
3029                                  * inserted while we released the object's
3030                                  * lock.  Since someone could have seen the
3031                                  * original contents of the remaining pages
3032                                  * through that new object, we have to
3033                                  * synchronize with it again for the remaining
3034                                  * pages only.  The previous pages are "busy"
3035                                  * so they can not be seen through the new
3036                                  * mapping.  The new mapping will see our
3037                                  * upcoming changes for those previous pages,
3038                                  * but that's OK since they couldn't see what
3039                                  * was there before.  It's just a race anyway
3040                                  * and there's no guarantee of consistency or
3041                                  * atomicity.  We just don't want new mappings
3042                                  * to see both the *before* and *after* pages.
3043                                  */
3044                                 if (object->copy != VM_OBJECT_NULL) {
3045                                         delayed_unlock = 0;
3046                                         vm_page_unlock_queues();
3047
3048                                         vm_object_update(
3049                                                 object,
3050                                                 dst_offset,/* current offset */
3051                                                 xfer_size, /* remaining size */
3052                                                 NULL,
3053                                                 NULL,
3054                                                 FALSE,     /* should_return */
3055                                                 MEMORY_OBJECT_COPY_SYNC,
3056                                                 VM_PROT_NO_CHANGE);
3057
3058                                         upl_cow_again++;
3059                                         upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3060
3061                                         vm_page_lock_queues();
3062                                         delayed_unlock = 1;
3063                                 }
3064                                 /*
3065                                  * remember the copy object we synced with
3066                                  */
3067                                 last_copy_object = object->copy;
3068                         }
3069                         dst_page = vm_page_lookup(object, dst_offset);
3070
3071                         if (dst_page != VM_PAGE_NULL) {
3072                                 if ( !(dst_page->list_req_pending) ) {
3073                                         if ((cntrl_flags & UPL_RET_ONLY_ABSENT) && !dst_page->absent) {
3074                                                 /*
3075                                                  * skip over pages already present in the cache
3076                                                  */
3077                                                 if (user_page_list)
3078                                                         user_page_list[entry].phys_addr = 0;
3079
3080                                                 goto delay_unlock_queues;
3081                                         }
3082                                         if (dst_page->cleaning) {
3083                                                 /*
3084                                                  * someone else is writing to the page... wait...
3085                                                  */
3086                                                 delayed_unlock = 0;
3087                                                 vm_page_unlock_queues();
3088
3089                                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3090
3091                                                 continue;
3092                                         }
3093                                 } else {
3094                                         if (dst_page->fictitious &&
3095                                             dst_page->phys_page == vm_page_fictitious_addr) {
3096                                                 assert( !dst_page->speculative);
3097                                                 /*
3098                                                  * dump the fictitious page
3099                                                  */
3100                                                 dst_page->list_req_pending = FALSE;
3101
3102                                                 vm_page_free(dst_page);
3103
3104                                                 dst_page = NULL;
3105                                         } else if (dst_page->absent) {
3106                                                 /*
3107                                                  * the default_pager case
3108                                                  */
3109                                                 dst_page->list_req_pending = FALSE;
3110                                                 dst_page->busy = FALSE;
3111                                         }
3112                                 }
3113                         }
3114                         if (dst_page == VM_PAGE_NULL) {
3115                                 if (object->private) {
3116                                         /*
3117                                          * This is a nasty wrinkle for users
3118                                          * of upl who encounter device or
3119                                          * private memory however, it is
3120                                          * unavoidable, only a fault can
3121                                          * resolve the actual backing
3122                                          * physical page by asking the
3123                                          * backing device.
3124                                          */
3125                                         if (user_page_list)
3126                                                 user_page_list[entry].phys_addr = 0;
3127
3128                                         goto delay_unlock_queues;
3129                                 }
3130                                 /*
3131                                  * need to allocate a page
3132                                  */
3133                                 dst_page = vm_page_grab();
3134
3135                                 if (dst_page == VM_PAGE_NULL) {
3136                                         if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3137                                                /*
3138                                                 * we don't want to stall waiting for pages to come onto the free list
3139                                                 * while we're already holding absent pages in this UPL
3140                                                 * the caller will deal with the empty slots
3141                                                 */
3142                                                 if (user_page_list)
3143                                                         user_page_list[entry].phys_addr = 0;
3144
3145                                                 goto try_next_page;
3146                                         }
3147                                         /*
3148                                          * no pages available... wait
3149                                          * then try again for the same
3150                                          * offset...
3151                                          */
3152                                         delayed_unlock = 0;
3153                                         vm_page_unlock_queues();
3154
3155                                         vm_object_unlock(object);
3156                                         VM_PAGE_WAIT();
3157
3158                                         /*
3159                                          * pageout_scan takes the vm_page_lock_queues first
3160                                          * then tries for the object lock... to avoid what
3161                                          * is effectively a lock inversion, we'll go to the
3162                                          * trouble of taking them in that same order... otherwise
3163                                          * if this object contains the majority of the pages resident
3164                                          * in the UBC (or a small set of large objects actively being
3165                                          * worked on contain the majority of the pages), we could
3166                                          * cause the pageout_scan thread to 'starve' in its attempt
3167                                          * to find pages to move to the free queue, since it has to
3168                                          * successfully acquire the object lock of any candidate page
3169                                          * before it can steal/clean it.
3170                                          */
3171                                         for (j = 0; ; j++) {
3172                                                 vm_page_lock_queues();
3173
3174                                                 if (vm_object_lock_try(object))
3175                                                         break;
3176                                                 vm_page_unlock_queues();
3177                                                 mutex_pause(j);
3178                                         }
3179                                         delayed_unlock = 1;
3180
3181                                         continue;
3182                                 }
3183                                 vm_page_insert_internal(dst_page, object, dst_offset, TRUE);
3184
3185                                 dst_page->absent = TRUE;
3186                                 dst_page->busy = FALSE;
3187
3188                                 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3189                                         /*
3190                                          * if UPL_RET_ONLY_ABSENT was specified,
3191                                          * than we're definitely setting up a
3192                                          * upl for a clustered read/pagein
3193                                          * operation... mark the pages as clustered
3194                                          * so upl_commit_range can put them on the
3195                                          * speculative list
3196                                          */
3197                                         dst_page->clustered = TRUE;
3198                                 }
3199                         }
3200                         /*
3201                          * ENCRYPTED SWAP:
3202                          */
3203                         if (cntrl_flags & UPL_ENCRYPT) {
3204                                 /*
3205                                  * The page is going to be encrypted when we
3206                                  * get it from the pager, so mark it so.
3207                                  */
3208                                 dst_page->encrypted = TRUE;
3209                         } else {
3210                                 /*
3211                                  * Otherwise, the page will not contain
3212                                  * encrypted data.
3213                                  */
3214                                 dst_page->encrypted = FALSE;
3215                         }
3216                         dst_page->overwriting = TRUE;
3217
3218                         if (dst_page->fictitious) {
3219                                 panic("need corner case for fictitious page");
3220                         }
3221                         if (dst_page->busy) {
3222                                 /*
3223                                  * someone else is playing with the
3224                                  * page.  We will have to wait.
3225                                  */
3226                                 delayed_unlock = 0;
3227                                 vm_page_unlock_queues();
3228
3229                                 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3230
3231                                 continue;
3232                         }
3233                         if (dst_page->pmapped) {
3234                                 if ( !(cntrl_flags & UPL_FILE_IO))
3235                                         /*
3236                                          * eliminate all mappings from the
3237                                          * original object and its prodigy
3238                                          */
3239                                         refmod_state = pmap_disconnect(dst_page->phys_page);
3240                                 else
3241                                         refmod_state = pmap_get_refmod(dst_page->phys_page);
3242                         } else
3243                                 refmod_state = 0;
3244
3245                         hw_dirty = refmod_state & VM_MEM_MODIFIED;
3246                         dirty = hw_dirty ? TRUE : dst_page->dirty;
3247
3248                         if (cntrl_flags & UPL_SET_LITE) {
3249                                 int     pg_num;
3250
3251                                 pg_num = (dst_offset-offset)/PAGE_SIZE;
3252                                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3253
3254                                 if (hw_dirty)
3255                                         pmap_clear_modify(dst_page->phys_page);
3256
3257                                 /*
3258                                  * Mark original page as cleaning
3259                                  * in place.
3260                                  */
3261                                 dst_page->cleaning = TRUE;
3262                                 dst_page->precious = FALSE;
3263                         } else {
3264                                 /*
3265                                  * use pageclean setup, it is more
3266                                  * convenient even for the pageout
3267                                  * cases here
3268                                  */
3269                                 vm_object_lock(upl->map_object);
3270                                 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3271                                 vm_object_unlock(upl->map_object);
3272
3273                                 alias_page->absent = FALSE;
3274                                 alias_page = NULL;
3275                         }
3276
3277                         if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3278                                 /*
3279                                  * clean in place for read implies
3280                                  * that a write will be done on all
3281                                  * the pages that are dirty before
3282                                  * a upl commit is done.  The caller
3283                                  * is obligated to preserve the
3284                                  * contents of all pages marked dirty
3285                                  */
3286                                 upl->flags |= UPL_CLEAR_DIRTY;
3287                         }
3288                         dst_page->dirty = dirty;
3289
3290                         if (!dirty)
3291                                 dst_page->precious = TRUE;
3292
3293                         if (dst_page->wire_count == 0) {
3294                                 /*
3295                                  * deny access to the target page while
3296                                  * it is being worked on
3297                                  */
3298                                 dst_page->busy = TRUE;
3299                         } else
3300                                 vm_page_wire(dst_page);
3301
3302                         if (dst_page->clustered) {
3303                                 /*
3304                                  * expect the page not to be used
3305                                  * since it's coming in as part
3306                                  * of a speculative cluster...
3307                                  * pages that are 'consumed' will
3308                                  * get a hardware reference
3309                                  */
3310                                 dst_page->reference = FALSE;
3311                         } else {
3312                                 /*
3313                                  * expect the page to be used
3314                                  */
3315                                 dst_page->reference = TRUE;
3316                         }
3317                         dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3318                 }
3319                 if (dst_page->phys_page > upl->highest_page)
3320                         upl->highest_page = dst_page->phys_page;
3321                 if (user_page_list) {
3322                         user_page_list[entry].phys_addr = dst_page->phys_page;
3323                         user_page_list[entry].pageout   = dst_page->pageout;
3324                         user_page_list[entry].absent    = dst_page->absent;
3325                         user_page_list[entry].dirty     = dst_page->dirty;
3326                         user_page_list[entry].precious  = dst_page->precious;
3327                         user_page_list[entry].device    = FALSE;
3328                         if (dst_page->clustered == TRUE)
3329                                 user_page_list[entry].speculative = dst_page->speculative;
3330                         else
3331                                 user_page_list[entry].speculative = FALSE;
3332                         user_page_list[entry].cs_validated = dst_page->cs_validated;
3333                         user_page_list[entry].cs_tainted = dst_page->cs_tainted;
3334                 }
3335                 /*
3336                  * if UPL_RET_ONLY_ABSENT is set, then
3337                  * we are working with a fresh page and we've
3338                  * just set the clustered flag on it to
3339                  * indicate that it was drug in as part of a
3340                  * speculative cluster... so leave it alone
3341                  */
3342                 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3343                         /*
3344                          * someone is explicitly grabbing this page...
3345                          * update clustered and speculative state
3346                          *
3347                          */
3348                         VM_PAGE_CONSUME_CLUSTERED(dst_page);
3349                 }
3350 delay_unlock_queues:
3351                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
3352                         /*
3353                          * pageout_scan takes the vm_page_lock_queues first
3354                          * then tries for the object lock... to avoid what
3355                          * is effectively a lock inversion, we'll go to the
3356                          * trouble of taking them in that same order... otherwise
3357                          * if this object contains the majority of the pages resident
3358                          * in the UBC (or a small set of large objects actively being
3359                          * worked on contain the majority of the pages), we could
3360                          * cause the pageout_scan thread to 'starve' in its attempt
3361                          * to find pages to move to the free queue, since it has to
3362                          * successfully acquire the object lock of any candidate page
3363                          * before it can steal/clean it.
3364                          */
3365                         vm_object_unlock(object);
3366                         mutex_yield(&vm_page_queue_lock);
3367
3368                         for (j = 0; ; j++) {
3369                                 if (vm_object_lock_try(object))
3370                                         break;
3371                                 vm_page_unlock_queues();
3372                                 mutex_pause(j);
3373                                 vm_page_lock_queues();
3374                         }
3375                         delayed_unlock = 1;
3376                 }
3377 try_next_page:
3378                 entry++;
3379                 dst_offset += PAGE_SIZE_64;
3380                 xfer_size -= PAGE_SIZE;
3381         }
3382         if (alias_page != NULL) {
3383                 if (delayed_unlock == 0) {
3384                         vm_page_lock_queues();
3385                         delayed_unlock = 1;
3386                 }
3387                 vm_page_free(alias_page);
3388         }
3389         if (delayed_unlock)
3390                 vm_page_unlock_queues();
3391
3392         if (page_list_count != NULL) {
3393                 if (upl->flags & UPL_INTERNAL)
3394                         *page_list_count = 0;
3395                 else if (*page_list_count > entry)
3396                         *page_list_count = entry;
3397         }
3398         vm_object_unlock(object);
3399
3400         return KERN_SUCCESS;
3401 }
3402
3403 /* JMM - Backward compatability for now */
3404 kern_return_t
3405 vm_fault_list_request(                  /* forward */
3406         memory_object_control_t         control,
3407         vm_object_offset_t      offset,
3408         upl_size_t              size,
3409         upl_t                   *upl_ptr,
3410         upl_page_info_t         **user_page_list_ptr,
3411         unsigned int            page_list_count,
3412         int                     cntrl_flags);
3413 kern_return_t
3414 vm_fault_list_request(
3415         memory_object_control_t         control,
3416         vm_object_offset_t      offset,
3417         upl_size_t              size,
3418         upl_t                   *upl_ptr,
3419         upl_page_info_t         **user_page_list_ptr,
3420         unsigned int            page_list_count,
3421         int                     cntrl_flags)
3422 {
3423         unsigned int            local_list_count;
3424         upl_page_info_t         *user_page_list;
3425         kern_return_t           kr;
3426
3427         if (user_page_list_ptr != NULL) {
3428                 local_list_count = page_list_count;
3429                 user_page_list = *user_page_list_ptr;
3430         } else {
3431                 local_list_count = 0;
3432                 user_page_list = NULL;
3433         }
3434         kr =  memory_object_upl_request(control,
3435                                 offset,
3436                                 size,
3437                                 upl_ptr,
3438                                 user_page_list,
3439                                 &local_list_count,
3440                                 cntrl_flags);
3441
3442         if(kr != KERN_SUCCESS)
3443                 return kr;
3444
3445         if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3446                 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3447         }
3448
3449         return KERN_SUCCESS;
3450 }
3451
3452
3453
3454 /*
3455  *      Routine:        vm_object_super_upl_request
3456  *      Purpose:
3457  *              Cause the population of a portion of a vm_object
3458  *              in much the same way as memory_object_upl_request.
3459  *              Depending on the nature of the request, the pages
3460  *              returned may be contain valid data or be uninitialized.
3461  *              However, the region may be expanded up to the super
3462  *              cluster size provided.
3463  */
3464
3465 __private_extern__ kern_return_t
3466 vm_object_super_upl_request(
3467         vm_object_t object,
3468         vm_object_offset_t      offset,
3469         upl_size_t              size,
3470         upl_size_t              super_cluster,
3471         upl_t                   *upl,
3472         upl_page_info_t         *user_page_list,
3473         unsigned int            *page_list_count,
3474         int                     cntrl_flags)
3475 {
3476         if (object->paging_offset > offset)
3477                 return KERN_FAILURE;
3478
3479         assert(object->paging_in_progress);
3480         offset = offset - object->paging_offset;
3481
3482         if (super_cluster > size) {
3483
3484                 vm_object_offset_t      base_offset;
3485                 upl_size_t              super_size;
3486
3487                 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3488                 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3489                 super_size = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3490
3491                 if (offset > (base_offset + super_size)) {
3492                         panic("vm_object_super_upl_request: Missed target pageout"
3493                               " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3494                               offset, base_offset, super_size, super_cluster,
3495                               size, object->paging_offset);
3496                 }
3497                 /*
3498                  * apparently there is a case where the vm requests a
3499                  * page to be written out who's offset is beyond the
3500                  * object size
3501                  */
3502                 if ((offset + size) > (base_offset + super_size))
3503                         super_size = (offset + size) - base_offset;
3504
3505                 offset = base_offset;
3506                 size = super_size;
3507         }
3508         return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3509 }
3510
3511
3512 kern_return_t
3513 vm_map_create_upl(
3514         vm_map_t                map,
3515         vm_map_address_t        offset,
3516         upl_size_t              *upl_size,
3517         upl_t                   *upl,
3518         upl_page_info_array_t   page_list,
3519         unsigned int            *count,
3520         int                     *flags)
3521 {
3522         vm_map_entry_t  entry;
3523         int             caller_flags;
3524         int             force_data_sync;
3525         int             sync_cow_data;
3526         vm_object_t     local_object;
3527         vm_map_offset_t local_offset;
3528         vm_map_offset_t local_start;
3529         kern_return_t   ret;
3530
3531         caller_flags = *flags;
3532
3533         if (caller_flags & ~UPL_VALID_FLAGS) {
3534                 /*
3535                  * For forward compatibility's sake,
3536                  * reject any unknown flag.
3537                  */
3538                 return KERN_INVALID_VALUE;
3539         }
3540         force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3541         sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3542
3543         if (upl == NULL)
3544                 return KERN_INVALID_ARGUMENT;
3545
3546 REDISCOVER_ENTRY:
3547         vm_map_lock(map);
3548
3549         if (vm_map_lookup_entry(map, offset, &entry)) {
3550
3551                 if ((entry->vme_end - offset) < *upl_size)
3552                         *upl_size = entry->vme_end - offset;
3553
3554                 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3555                         *flags = 0;
3556
3557                         if (entry->object.vm_object != VM_OBJECT_NULL) {
3558                                 if (entry->object.vm_object->private)
3559                                         *flags = UPL_DEV_MEMORY;
3560
3561                                 if (entry->object.vm_object->phys_contiguous)
3562                                         *flags |= UPL_PHYS_CONTIG;
3563                         }
3564                         vm_map_unlock(map);
3565
3566                         return KERN_SUCCESS;
3567                 }
3568                 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3569                         if ((*upl_size/page_size) > MAX_UPL_SIZE)
3570                                 *upl_size = MAX_UPL_SIZE * page_size;
3571                 }
3572                 /*
3573                  *      Create an object if necessary.
3574                  */
3575                 if (entry->object.vm_object == VM_OBJECT_NULL) {
3576                         entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3577                         entry->offset = 0;
3578                 }
3579                 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3580                         if (!(entry->protection & VM_PROT_WRITE)) {
3581                                 vm_map_unlock(map);
3582                                 return KERN_PROTECTION_FAILURE;
3583                         }
3584                         if (entry->needs_copy)  {
3585                                 vm_map_t                local_map;
3586                                 vm_object_t             object;
3587                                 vm_object_offset_t      new_offset;
3588                                 vm_prot_t               prot;
3589                                 boolean_t               wired;
3590                                 vm_map_version_t        version;
3591                                 vm_map_t                real_map;
3592
3593                                 local_map = map;
3594                                 vm_map_lock_write_to_read(map);
3595
3596                                 if (vm_map_lookup_locked(&local_map,
3597                                                          offset, VM_PROT_WRITE,
3598                                                          OBJECT_LOCK_EXCLUSIVE,
3599                                                          &version, &object,
3600                                                          &new_offset, &prot, &wired,
3601                                                          NULL,
3602                                                          &real_map)) {
3603                                         vm_map_unlock(local_map);
3604                                         return KERN_FAILURE;
3605                                 }
3606                                 if (real_map != map)
3607                                         vm_map_unlock(real_map);
3608                                 vm_object_unlock(object);
3609                                 vm_map_unlock(local_map);
3610
3611                                 goto REDISCOVER_ENTRY;
3612                         }
3613                 }
3614                 if (entry->is_sub_map) {
3615                         vm_map_t        submap;
3616
3617                         submap = entry->object.sub_map;
3618                         local_start = entry->vme_start;
3619                         local_offset = entry->offset;
3620
3621                         vm_map_reference(submap);
3622                         vm_map_unlock(map);
3623
3624                         ret = vm_map_create_upl(submap,
3625                                                 local_offset + (offset - local_start),
3626                                                 upl_size, upl, page_list, count, flags);
3627                         vm_map_deallocate(submap);
3628
3629                         return ret;
3630                 }
3631                 if (sync_cow_data) {
3632                         if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3633                                 local_object = entry->object.vm_object;
3634                                 local_start = entry->vme_start;
3635                                 local_offset = entry->offset;
3636
3637                                 vm_object_reference(local_object);
3638                                 vm_map_unlock(map);
3639
3640                                 if (entry->object.vm_object->shadow && entry->object.vm_object->copy) {
3641                                         vm_object_lock_request(
3642                                                                local_object->shadow,
3643                                                                (vm_object_offset_t)
3644                                                                ((offset - local_start) +
3645                                                                 local_offset) +
3646                                                                local_object->shadow_offset,
3647                                                                *upl_size, FALSE,
3648                                                                MEMORY_OBJECT_DATA_SYNC,
3649                                                                VM_PROT_NO_CHANGE);
3650                                 }
3651                                 sync_cow_data = FALSE;
3652                                 vm_object_deallocate(local_object);
3653
3654                                 goto REDISCOVER_ENTRY;
3655                         }
3656                 }
3657                 if (force_data_sync) {
3658                         local_object = entry->object.vm_object;
3659                         local_start = entry->vme_start;
3660                         local_offset = entry->offset;
3661
3662                         vm_object_reference(local_object);
3663                         vm_map_unlock(map);
3664
3665                         vm_object_lock_request(
3666                                                local_object,
3667                                                (vm_object_offset_t)
3668                                                ((offset - local_start) + local_offset),
3669                                                (vm_object_size_t)*upl_size, FALSE,
3670                                                MEMORY_OBJECT_DATA_SYNC,
3671                                                VM_PROT_NO_CHANGE);
3672
3673                         force_data_sync = FALSE;
3674                         vm_object_deallocate(local_object);
3675
3676                         goto REDISCOVER_ENTRY;
3677                 }
3678                 if (entry->object.vm_object->private)
3679                         *flags = UPL_DEV_MEMORY;
3680                 else
3681                         *flags = 0;
3682
3683                 if (entry->object.vm_object->phys_contiguous)
3684                         *flags |= UPL_PHYS_CONTIG;
3685
3686                 local_object = entry->object.vm_object;
3687                 local_offset = entry->offset;
3688                 local_start = entry->vme_start;
3689
3690                 vm_object_reference(local_object);
3691                 vm_map_unlock(map);
3692
3693                 ret = vm_object_iopl_request(local_object,
3694                                               (vm_object_offset_t) ((offset - local_start) + local_offset),
3695                                               *upl_size,
3696                                               upl,
3697                                               page_list,
3698                                               count,
3699                                               caller_flags);
3700                 vm_object_deallocate(local_object);
3701
3702                 return(ret);
3703         }
3704         vm_map_unlock(map);
3705
3706         return(KERN_FAILURE);
3707 }
3708
3709 /*
3710  * Internal routine to enter a UPL into a VM map.
3711  *
3712  * JMM - This should just be doable through the standard
3713  * vm_map_enter() API.
3714  */
3715 kern_return_t
3716 vm_map_enter_upl(
3717         vm_map_t                map,
3718         upl_t                   upl,
3719         vm_map_offset_t *dst_addr)
3720 {
3721         vm_map_size_t           size;
3722         vm_object_offset_t      offset;
3723         vm_map_offset_t         addr;
3724         vm_page_t               m;
3725         kern_return_t           kr;
3726
3727         if (upl == UPL_NULL)
3728                 return KERN_INVALID_ARGUMENT;
3729
3730         upl_lock(upl);
3731
3732         /*
3733          * check to see if already mapped
3734          */
3735         if (UPL_PAGE_LIST_MAPPED & upl->flags) {
3736                 upl_unlock(upl);
3737                 return KERN_FAILURE;
3738         }
3739
3740         if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3741                                                (upl->map_object->phys_contiguous))) {
3742                 vm_object_t             object;
3743                 vm_page_t               alias_page;
3744                 vm_object_offset_t      new_offset;
3745                 int                     pg_num;
3746                 wpl_array_t             lite_list;
3747
3748                 if (upl->flags & UPL_INTERNAL) {
3749                         lite_list = (wpl_array_t)
3750                                 ((((uintptr_t)upl) + sizeof(struct upl))
3751                                  + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3752                 } else {
3753                         lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
3754                 }
3755                 object = upl->map_object;
3756                 upl->map_object = vm_object_allocate(upl->size);
3757
3758                 vm_object_lock(upl->map_object);
3759
3760                 upl->map_object->shadow = object;
3761                 upl->map_object->pageout = TRUE;
3762                 upl->map_object->can_persist = FALSE;
3763                 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3764                 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
3765                 upl->map_object->wimg_bits = object->wimg_bits;
3766                 offset = upl->map_object->shadow_offset;
3767                 new_offset = 0;
3768                 size = upl->size;
3769
3770                 upl->flags |= UPL_SHADOWED;
3771
3772                 while (size) {
3773                         pg_num = (new_offset)/PAGE_SIZE;
3774
3775                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3776
3777                                 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3778
3779                                 vm_object_lock(object);
3780
3781                                 m = vm_page_lookup(object, offset);
3782                                 if (m == VM_PAGE_NULL) {
3783                                         panic("vm_upl_map: page missing\n");
3784                                 }
3785
3786                                 /*
3787                                  * Convert the fictitious page to a private
3788                                  * shadow of the real page.
3789                                  */
3790                                 assert(alias_page->fictitious);
3791                                 alias_page->fictitious = FALSE;
3792                                 alias_page->private = TRUE;
3793                                 alias_page->pageout = TRUE;
3794                                 /*
3795                                  * since m is a page in the upl it must
3796                                  * already be wired or BUSY, so it's
3797                                  * safe to assign the underlying physical
3798                                  * page to the alias
3799                                  */
3800                                 alias_page->phys_page = m->phys_page;
3801
3802                                 vm_object_unlock(object);
3803
3804                                 vm_page_lockspin_queues();
3805                                 vm_page_wire(alias_page);
3806                                 vm_page_unlock_queues();
3807
3808                                 /*
3809                                  * ENCRYPTED SWAP:
3810                                  * The virtual page ("m") has to be wired in some way
3811                                  * here or its physical page ("m->phys_page") could
3812                                  * be recycled at any time.
3813                                  * Assuming this is enforced by the caller, we can't
3814                                  * get an encrypted page here.  Since the encryption
3815                                  * key depends on the VM page's "pager" object and
3816                                  * the "paging_offset", we couldn't handle 2 pageable
3817                                  * VM pages (with different pagers and paging_offsets)
3818                                  * sharing the same physical page:  we could end up
3819                                  * encrypting with one key (via one VM page) and
3820                                  * decrypting with another key (via the alias VM page).
3821                                  */
3822                                 ASSERT_PAGE_DECRYPTED(m);
3823
3824                                 vm_page_insert(alias_page, upl->map_object, new_offset);
3825
3826                                 assert(!alias_page->wanted);
3827                                 alias_page->busy = FALSE;
3828                                 alias_page->absent = FALSE;
3829                         }
3830                         size -= PAGE_SIZE;
3831                         offset += PAGE_SIZE_64;
3832                         new_offset += PAGE_SIZE_64;
3833                 }
3834                 vm_object_unlock(upl->map_object);
3835         }
3836         if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3837                 offset = upl->offset - upl->map_object->paging_offset;
3838         else
3839                 offset = 0;
3840         size = upl->size;
3841
3842         vm_object_reference(upl->map_object);
3843
3844         *dst_addr = 0;
3845         /*
3846          * NEED A UPL_MAP ALIAS
3847          */
3848         kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3849                           VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3850                           VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3851
3852         if (kr != KERN_SUCCESS) {
3853                 upl_unlock(upl);
3854                 return(kr);
3855         }
3856         vm_object_lock(upl->map_object);
3857
3858         for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
3859                 m = vm_page_lookup(upl->map_object, offset);
3860
3861                 if (m) {
3862                         unsigned int    cache_attr;
3863                         cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3864
3865                         m->pmapped = TRUE;
3866                         m->wpmapped = TRUE;
3867
3868                         PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
3869                 }
3870                 offset += PAGE_SIZE_64;
3871         }
3872         vm_object_unlock(upl->map_object);
3873
3874         /*
3875          * hold a reference for the mapping
3876          */
3877         upl->ref_count++;
3878         upl->flags |= UPL_PAGE_LIST_MAPPED;
3879         upl->kaddr = *dst_addr;
3880         upl_unlock(upl);
3881
3882         return KERN_SUCCESS;
3883 }
3884
3885 /*
3886  * Internal routine to remove a UPL mapping from a VM map.
3887  *
3888  * XXX - This should just be doable through a standard
3889  * vm_map_remove() operation.  Otherwise, implicit clean-up
3890  * of the target map won't be able to correctly remove
3891  * these (and release the reference on the UPL).  Having
3892  * to do this means we can't map these into user-space
3893  * maps yet.
3894  */
3895 kern_return_t
3896 vm_map_remove_upl(
3897         vm_map_t        map,
3898         upl_t           upl)
3899 {
3900         vm_address_t    addr;
3901         upl_size_t      size;
3902
3903         if (upl == UPL_NULL)
3904                 return KERN_INVALID_ARGUMENT;
3905
3906         upl_lock(upl);
3907
3908         if (upl->flags & UPL_PAGE_LIST_MAPPED) {
3909                 addr = upl->kaddr;
3910                 size = upl->size;
3911
3912                 assert(upl->ref_count > 1);
3913                 upl->ref_count--;               /* removing mapping ref */
3914
3915                 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3916                 upl->kaddr = (vm_offset_t) 0;
3917                 upl_unlock(upl);
3918
3919                 vm_map_remove(map,
3920                               vm_map_trunc_page(addr),
3921                               vm_map_round_page(addr + size),
3922                               VM_MAP_NO_FLAGS);
3923
3924                 return KERN_SUCCESS;
3925         }
3926         upl_unlock(upl);
3927
3928         return KERN_FAILURE;
3929 }
3930
3931 kern_return_t
3932 upl_commit_range(
3933         upl_t                   upl,
3934         upl_offset_t            offset,
3935         upl_size_t              size,
3936         int                     flags,
3937         upl_page_info_t         *page_list,
3938         mach_msg_type_number_t  count,
3939         boolean_t               *empty)
3940 {
3941         upl_size_t              xfer_size;
3942         vm_object_t             shadow_object;
3943         vm_object_t             object;
3944         vm_object_offset_t      target_offset;
3945         int                     entry;
3946         wpl_array_t             lite_list;
3947         int                     occupied;
3948         int                     delayed_unlock = 0;
3949         int                     clear_refmod = 0;
3950         int                     pgpgout_count = 0;
3951         int                     j;
3952
3953         *empty = FALSE;
3954
3955         if (upl == UPL_NULL)
3956                 return KERN_INVALID_ARGUMENT;
3957
3958         if (count == 0)
3959                 page_list = NULL;
3960
3961         if (upl->flags & UPL_DEVICE_MEMORY)
3962                 xfer_size = 0;
3963         else if ((offset + size) <= upl->size)
3964                 xfer_size = size;
3965         else
3966                 return KERN_FAILURE;
3967
3968         upl_lock(upl);
3969
3970         if (upl->flags & UPL_ACCESS_BLOCKED) {
3971                 /*
3972                  * We used this UPL to block access to the pages by marking
3973                  * them "busy".  Now we need to clear the "busy" bit to allow
3974                  * access to these pages again.
3975                  */
3976                 flags |= UPL_COMMIT_ALLOW_ACCESS;
3977         }
3978         if (upl->flags & UPL_CLEAR_DIRTY)
3979                 flags |= UPL_COMMIT_CLEAR_DIRTY;
3980
3981         if (upl->flags & UPL_INTERNAL)
3982                 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
3983                                            + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3984         else
3985                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3986
3987         object = upl->map_object;
3988
3989         if (upl->flags & UPL_SHADOWED) {
3990                 vm_object_lock(object);
3991                 shadow_object = object->shadow;
3992         } else {
3993                 shadow_object = object;
3994         }
3995         entry = offset/PAGE_SIZE;
3996         target_offset = (vm_object_offset_t)offset;
3997
3998         /*
3999          * pageout_scan takes the vm_page_lock_queues first
4000          * then tries for the object lock... to avoid what
4001          * is effectively a lock inversion, we'll go to the
4002          * trouble of taking them in that same order... otherwise
4003          * if this object contains the majority of the pages resident
4004          * in the UBC (or a small set of large objects actively being
4005          * worked on contain the majority of the pages), we could
4006          * cause the pageout_scan thread to 'starve' in its attempt
4007          * to find pages to move to the free queue, since it has to
4008          * successfully acquire the object lock of any candidate page
4009          * before it can steal/clean it.
4010          */
4011         for (j = 0; ; j++) {
4012                 vm_page_lock_queues();
4013
4014                 if (vm_object_lock_try(shadow_object))
4015                         break;
4016                 vm_page_unlock_queues();
4017                 mutex_pause(j);
4018         }
4019         delayed_unlock = 1;
4020
4021         if (shadow_object->code_signed) {
4022                 /*
4023                  * CODE SIGNING:
4024                  * If the object is code-signed, do not let this UPL tell
4025                  * us if the pages are valid or not.  Let the pages be
4026                  * validated by VM the normal way (when they get mapped or
4027                  * copied).
4028                  */
4029                 flags &= ~UPL_COMMIT_CS_VALIDATED;
4030         }
4031         if (! page_list) {
4032                 /*
4033                  * No page list to get the code-signing info from !?
4034                  */
4035                 flags &= ~UPL_COMMIT_CS_VALIDATED;
4036         }
4037
4038         while (xfer_size) {
4039                 vm_page_t       t, m;
4040
4041                 m = VM_PAGE_NULL;
4042
4043                 if (upl->flags & UPL_LITE) {
4044                         int     pg_num;
4045
4046                         pg_num = target_offset/PAGE_SIZE;
4047
4048                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4049                                 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4050
4051                                 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4052                         }
4053                 }
4054                 if (upl->flags & UPL_SHADOWED) {
4055                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4056
4057                                 t->pageout = FALSE;
4058
4059                                 vm_page_free(t);
4060
4061                                 if (m == VM_PAGE_NULL)
4062                                         m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4063                         }
4064                 }
4065                 if (m == VM_PAGE_NULL) {
4066                         goto commit_next_page;
4067                 }
4068
4069                 clear_refmod = 0;
4070
4071                 if (flags & UPL_COMMIT_CS_VALIDATED) {
4072                         /*
4073                          * CODE SIGNING:
4074                          * Set the code signing bits according to
4075                          * what the UPL says they should be.
4076                          */
4077                         m->cs_validated = page_list[entry].cs_validated;
4078                         m->cs_tainted = page_list[entry].cs_tainted;
4079                 }
4080                 if (upl->flags & UPL_IO_WIRE) {
4081
4082                         vm_page_unwire(m);
4083
4084                         if (page_list)
4085                                 page_list[entry].phys_addr = 0;
4086
4087                         if (flags & UPL_COMMIT_SET_DIRTY)
4088                                 m->dirty = TRUE;
4089                         else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4090                                 m->dirty = FALSE;
4091                                 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4092                                     m->cs_validated && !m->cs_tainted) {
4093                                         /*
4094                                          * CODE SIGNING:
4095                                          * This page is no longer dirty
4096                                          * but could have been modified,
4097                                          * so it will need to be
4098                                          * re-validated.
4099                                          */
4100                                         m->cs_validated = FALSE;
4101                                         vm_cs_validated_resets++;
4102                                 }
4103                                 clear_refmod |= VM_MEM_MODIFIED;
4104                         }
4105
4106                         if (flags & UPL_COMMIT_INACTIVATE)
4107                                 vm_page_deactivate(m);
4108
4109                         if (clear_refmod)
4110                                 pmap_clear_refmod(m->phys_page, clear_refmod);
4111
4112                         if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4113                                 /*
4114                                  * We blocked access to the pages in this UPL.
4115                                  * Clear the "busy" bit and wake up any waiter
4116                                  * for this page.
4117                                  */
4118                                 PAGE_WAKEUP_DONE(m);
4119                         }
4120                         goto commit_next_page;
4121                 }
4122                 /*
4123                  * make sure to clear the hardware
4124                  * modify or reference bits before
4125                  * releasing the BUSY bit on this page
4126                  * otherwise we risk losing a legitimate
4127                  * change of state
4128                  */
4129                 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4130                         m->dirty = FALSE;
4131
4132                         if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4133                             m->cs_validated && !m->cs_tainted) {
4134                                 /*
4135                                  * CODE SIGNING:
4136                                  * This page is no longer dirty
4137                                  * but could have been modified,
4138                                  * so it will need to be
4139                                  * re-validated.
4140                                  */
4141                                 m->cs_validated = FALSE;
4142 #if DEVELOPMENT || DEBUG
4143                                 vm_cs_validated_resets++;
4144 #endif
4145                         }
4146                         clear_refmod |= VM_MEM_MODIFIED;
4147                 }
4148                 if (clear_refmod)
4149                         pmap_clear_refmod(m->phys_page, clear_refmod);
4150
4151                 if (page_list) {
4152                         upl_page_info_t *p;
4153
4154                         p = &(page_list[entry]);
4155
4156                         if (p->phys_addr && p->pageout && !m->pageout) {
4157                                 m->busy = TRUE;
4158                                 m->pageout = TRUE;
4159                                 vm_page_wire(m);
4160                         } else if (p->phys_addr &&
4161                                    !p->pageout && m->pageout &&
4162                                    !m->dump_cleaning) {
4163                                 m->pageout = FALSE;
4164                                 m->absent = FALSE;
4165                                 m->overwriting = FALSE;
4166                                 vm_page_unwire(m);
4167
4168                                 PAGE_WAKEUP_DONE(m);
4169                         }
4170                         page_list[entry].phys_addr = 0;
4171                 }
4172                 m->dump_cleaning = FALSE;
4173
4174                 if (m->laundry)
4175                         vm_pageout_throttle_up(m);
4176
4177                 if (m->pageout) {
4178                         m->cleaning = FALSE;
4179                         m->encrypted_cleaning = FALSE;
4180                         m->pageout = FALSE;
4181 #if MACH_CLUSTER_STATS
4182                         if (m->wanted) vm_pageout_target_collisions++;
4183 #endif
4184                         m->dirty = FALSE;
4185
4186                         if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4187                             m->cs_validated && !m->cs_tainted) {
4188                                 /*
4189                                  * CODE SIGNING:
4190                                  * This page is no longer dirty
4191                                  * but could have been modified,
4192                                  * so it will need to be
4193                                  * re-validated.
4194                                  */
4195                                 m->cs_validated = FALSE;
4196 #if DEVELOPMENT || DEBUG
4197                                 vm_cs_validated_resets++;
4198 #endif
4199                         }
4200
4201                         if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))
4202                                 m->dirty = TRUE;
4203
4204                         if (m->dirty) {
4205                                 /*
4206                                  * page was re-dirtied after we started
4207                                  * the pageout... reactivate it since
4208                                  * we don't know whether the on-disk
4209                                  * copy matches what is now in memory
4210                                  */
4211                                 vm_page_unwire(m);
4212
4213                                 if (upl->flags & UPL_PAGEOUT) {
4214                                         CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4215                                         VM_STAT_INCR(reactivations);
4216                                         DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4217                                 }
4218                                 PAGE_WAKEUP_DONE(m);
4219                         } else {
4220                                 /*
4221                                  * page has been successfully cleaned
4222                                  * go ahead and free it for other use
4223                                  */
4224
4225                                 if (m->object->internal) {
4226                                         DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4227                                 } else {
4228                                         DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4229                                 }
4230
4231                                 vm_page_free(m);
4232
4233                                 if (upl->flags & UPL_PAGEOUT) {
4234                                         CLUSTER_STAT(vm_pageout_target_page_freed++;)
4235
4236                                         if (page_list[entry].dirty) {
4237                                                 VM_STAT_INCR(pageouts);
4238                                                 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4239                                                 pgpgout_count++;
4240                                         }
4241                                 }
4242                         }
4243                         goto commit_next_page;
4244                 }
4245 #if MACH_CLUSTER_STATS
4246                 if (m->wpmapped)
4247                         m->dirty = pmap_is_modified(m->phys_page);
4248
4249                 if (m->dirty)   vm_pageout_cluster_dirtied++;
4250                 else            vm_pageout_cluster_cleaned++;
4251                 if (m->wanted)  vm_pageout_cluster_collisions++;
4252 #endif
4253                 m->dirty = FALSE;
4254
4255                 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4256                     m->cs_validated && !m->cs_tainted) {
4257                         /*
4258                          * CODE SIGNING:
4259                          * This page is no longer dirty
4260                          * but could have been modified,
4261                          * so it will need to be
4262                          * re-validated.
4263                          */
4264                         m->cs_validated = FALSE;
4265 #if DEVELOPMENT || DEBUG
4266                         vm_cs_validated_resets++;
4267 #endif
4268                 }
4269
4270                 if ((m->busy) && (m->cleaning)) {
4271                         /*
4272                          * the request_page_list case
4273                          */
4274                         m->absent = FALSE;
4275                         m->overwriting = FALSE;
4276                         m->busy = FALSE;
4277                 } else if (m->overwriting) {
4278                         /*
4279                          * alternate request page list, write to
4280                          * page_list case.  Occurs when the original
4281                          * page was wired at the time of the list
4282                          * request
4283                          */
4284                         assert(m->wire_count != 0);
4285                         vm_page_unwire(m);/* reactivates */
4286                         m->overwriting = FALSE;
4287                 }
4288                 m->cleaning = FALSE;
4289                 m->encrypted_cleaning = FALSE;
4290
4291                 /*
4292                  * It is a part of the semantic of COPYOUT_FROM
4293                  * UPLs that a commit implies cache sync
4294                  * between the vm page and the backing store
4295                  * this can be used to strip the precious bit
4296                  * as well as clean
4297                  */
4298                 if (upl->flags & UPL_PAGE_SYNC_DONE)
4299                         m->precious = FALSE;
4300
4301                 if (flags & UPL_COMMIT_SET_DIRTY)
4302                         m->dirty = TRUE;
4303
4304                 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4305                         vm_page_deactivate(m);
4306                 } else if (!m->active && !m->inactive && !m->speculative) {
4307
4308                         if (m->clustered)
4309                                 vm_page_speculate(m, TRUE);
4310                         else if (m->reference)
4311                                 vm_page_activate(m);
4312                         else
4313                                 vm_page_deactivate(m);
4314                 }
4315                 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4316                         /*
4317                          * We blocked access to the pages in this URL.
4318                          * Clear the "busy" bit on this page before we
4319                          * wake up any waiter.
4320                          */
4321                         m->busy = FALSE;
4322                 }
4323                 /*
4324                  * Wakeup any thread waiting for the page to be un-cleaning.
4325                  */
4326                 PAGE_WAKEUP(m);
4327
4328 commit_next_page:
4329                 target_offset += PAGE_SIZE_64;
4330                 xfer_size -= PAGE_SIZE;
4331                 entry++;
4332
4333                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4334                         /*
4335                          * pageout_scan takes the vm_page_lock_queues first
4336                          * then tries for the object lock... to avoid what
4337                          * is effectively a lock inversion, we'll go to the
4338                          * trouble of taking them in that same order... otherwise
4339                          * if this object contains the majority of the pages resident
4340                          * in the UBC (or a small set of large objects actively being
4341                          * worked on contain the majority of the pages), we could
4342                          * cause the pageout_scan thread to 'starve' in its attempt
4343                          * to find pages to move to the free queue, since it has to
4344                          * successfully acquire the object lock of any candidate page
4345                          * before it can steal/clean it.
4346                          */
4347                         vm_object_unlock(shadow_object);
4348                         mutex_yield(&vm_page_queue_lock);
4349
4350                         for (j = 0; ; j++) {
4351                                 if (vm_object_lock_try(shadow_object))
4352                                         break;
4353                                 vm_page_unlock_queues();
4354                                 mutex_pause(j);
4355                                 vm_page_lock_queues();
4356                         }
4357                         delayed_unlock = 1;
4358                 }
4359         }
4360         if (delayed_unlock)
4361                 vm_page_unlock_queues();
4362
4363         occupied = 1;
4364
4365         if (upl->flags & UPL_DEVICE_MEMORY)  {
4366                 occupied = 0;
4367         } else if (upl->flags & UPL_LITE) {
4368                 int     pg_num;
4369                 int     i;
4370
4371                 pg_num = upl->size/PAGE_SIZE;
4372                 pg_num = (pg_num + 31) >> 5;
4373                 occupied = 0;
4374
4375                 for (i = 0; i < pg_num; i++) {
4376                         if (lite_list[i] != 0) {
4377                                 occupied = 1;
4378                                 break;
4379                         }
4380                 }
4381         } else {
4382                 if (queue_empty(&upl->map_object->memq))
4383                         occupied = 0;
4384         }
4385         if (occupied == 0) {
4386                 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4387                         *empty = TRUE;
4388
4389                 if (object == shadow_object) {
4390                         /*
4391                          * this is not a paging object
4392                          * so we need to drop the paging reference
4393                          * that was taken when we created the UPL
4394                          * against this object
4395                          */
4396                         vm_object_paging_end(shadow_object);
4397                 } else {
4398                          /*
4399                           * we dontated the paging reference to
4400                           * the map object... vm_pageout_object_terminate
4401                           * will drop this reference
4402                           */
4403                 }
4404         }
4405         vm_object_unlock(shadow_object);
4406         if (object != shadow_object)
4407                 vm_object_unlock(object);
4408         upl_unlock(upl);
4409
4410         if (pgpgout_count) {
4411                 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
4412         }
4413
4414         return KERN_SUCCESS;
4415 }
4416
4417 kern_return_t
4418 upl_abort_range(
4419         upl_t                   upl,
4420         upl_offset_t            offset,
4421         upl_size_t              size,
4422         int                     error,
4423         boolean_t               *empty)
4424 {
4425         upl_size_t              xfer_size;
4426         vm_object_t             shadow_object;
4427         vm_object_t             object;
4428         vm_object_offset_t      target_offset;
4429         int                     entry;
4430         wpl_array_t             lite_list;
4431         int                     occupied;
4432         int                     delayed_unlock = 0;
4433         int                     j;
4434
4435         *empty = FALSE;
4436
4437         if (upl == UPL_NULL)
4438                 return KERN_INVALID_ARGUMENT;
4439
4440         if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
4441                 return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
4442
4443         if (upl->flags & UPL_DEVICE_MEMORY)
4444                 xfer_size = 0;
4445         else if ((offset + size) <= upl->size)
4446                 xfer_size = size;
4447         else
4448                 return KERN_FAILURE;
4449
4450         upl_lock(upl);
4451
4452         if (upl->flags & UPL_INTERNAL) {
4453                 lite_list = (wpl_array_t)
4454                         ((((uintptr_t)upl) + sizeof(struct upl))
4455                         + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4456         } else {
4457                 lite_list = (wpl_array_t)
4458                         (((uintptr_t)upl) + sizeof(struct upl));
4459         }
4460         object = upl->map_object;
4461
4462         if (upl->flags & UPL_SHADOWED) {
4463                 vm_object_lock(object);
4464                 shadow_object = object->shadow;
4465         } else
4466                 shadow_object = object;
4467
4468         entry = offset/PAGE_SIZE;
4469         target_offset = (vm_object_offset_t)offset;
4470
4471         /*
4472          * pageout_scan takes the vm_page_lock_queues first
4473          * then tries for the object lock... to avoid what
4474          * is effectively a lock inversion, we'll go to the
4475          * trouble of taking them in that same order... otherwise
4476          * if this object contains the majority of the pages resident
4477          * in the UBC (or a small set of large objects actively being
4478          * worked on contain the majority of the pages), we could
4479          * cause the pageout_scan thread to 'starve' in its attempt
4480          * to find pages to move to the free queue, since it has to
4481          * successfully acquire the object lock of any candidate page
4482          * before it can steal/clean it.
4483          */
4484         for (j = 0; ; j++) {
4485                 vm_page_lock_queues();
4486
4487                 if (vm_object_lock_try(shadow_object))
4488                         break;
4489                 vm_page_unlock_queues();
4490                 mutex_pause(j);
4491         }
4492         delayed_unlock = 1;
4493
4494         while (xfer_size) {
4495                 vm_page_t       t, m;
4496
4497                 m = VM_PAGE_NULL;
4498
4499                 if (upl->flags & UPL_LITE) {
4500                         int     pg_num;
4501                         pg_num = target_offset/PAGE_SIZE;
4502
4503                         if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4504                                 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4505
4506                                 m = vm_page_lookup(shadow_object, target_offset +
4507                                                    (upl->offset - shadow_object->paging_offset));
4508                         }
4509                 }
4510                 if (upl->flags & UPL_SHADOWED) {
4511                         if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4512                                 t->pageout = FALSE;
4513
4514                                 vm_page_free(t);
4515
4516                                 if (m == VM_PAGE_NULL)
4517                                         m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4518                         }
4519                 }
4520                 if (m != VM_PAGE_NULL) {
4521
4522                         if (m->absent) {
4523                                 boolean_t must_free = TRUE;
4524
4525                                 m->clustered = FALSE;
4526                                 /*
4527                                  * COPYOUT = FALSE case
4528                                  * check for error conditions which must
4529                                  * be passed back to the pages customer
4530                                  */
4531                                 if (error & UPL_ABORT_RESTART) {
4532                                         m->restart = TRUE;
4533                                         m->absent = FALSE;
4534                                         m->error = TRUE;
4535                                         m->unusual = TRUE;
4536                                         must_free = FALSE;
4537                                 } else if (error & UPL_ABORT_UNAVAILABLE) {
4538                                         m->restart = FALSE;
4539                                         m->unusual = TRUE;
4540                                         must_free = FALSE;
4541                                 } else if (error & UPL_ABORT_ERROR) {
4542                                         m->restart = FALSE;
4543                                         m->absent = FALSE;
4544                                         m->error = TRUE;
4545                                         m->unusual = TRUE;
4546                                         must_free = FALSE;
4547                                 }
4548
4549                                 /*
4550                                  * ENCRYPTED SWAP:
4551                                  * If the page was already encrypted,
4552                                  * we don't really need to decrypt it
4553                                  * now.  It will get decrypted later,
4554                                  * on demand, as soon as someone needs
4555                                  * to access its contents.
4556                                  */
4557
4558                                 m->cleaning = FALSE;
4559                                 m->encrypted_cleaning = FALSE;
4560                                 m->overwriting = FALSE;
4561                                 PAGE_WAKEUP_DONE(m);
4562
4563                                 if (must_free == TRUE)
4564                                         vm_page_free(m);
4565                                 else
4566                                         vm_page_activate(m);
4567                         } else {
4568                                 /*
4569                                  * Handle the trusted pager throttle.
4570                                  */
4571                                 if (m->laundry)
4572                                         vm_pageout_throttle_up(m);
4573
4574                                 if (m->pageout) {
4575                                         assert(m->busy);
4576                                         assert(m->wire_count == 1);
4577                                         m->pageout = FALSE;
4578                                         vm_page_unwire(m);
4579                                 }
4580                                 m->dump_cleaning = FALSE;
4581                                 m->cleaning = FALSE;
4582                                 m->encrypted_cleaning = FALSE;
4583                                 m->overwriting = FALSE;
4584 #if     MACH_PAGEMAP
4585                                 vm_external_state_clr(m->object->existence_map, m->offset);
4586 #endif  /* MACH_PAGEMAP */
4587                                 if (error & UPL_ABORT_DUMP_PAGES) {
4588                                         pmap_disconnect(m->phys_page);
4589                                         vm_page_free(m);
4590                                 } else {
4591                                         if (error & UPL_ABORT_REFERENCE) {
4592                                                 /*
4593                                                  * we've been told to explictly
4594                                                  * reference this page... for
4595                                                  * file I/O, this is done by
4596                                                  * implementing an LRU on the inactive q
4597                                                  */
4598                                                 vm_page_lru(m);
4599                                         }
4600                                         PAGE_WAKEUP_DONE(m);
4601                                 }
4602                         }
4603                 }
4604                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4605                         /*
4606                          * pageout_scan takes the vm_page_lock_queues first
4607                          * then tries for the object lock... to avoid what
4608                          * is effectively a lock inversion, we'll go to the
4609                          * trouble of taking them in that same order... otherwise
4610                          * if this object contains the majority of the pages resident
4611                          * in the UBC (or a small set of large objects actively being
4612                          * worked on contain the majority of the pages), we could
4613                          * cause the pageout_scan thread to 'starve' in its attempt
4614                          * to find pages to move to the free queue, since it has to
4615                          * successfully acquire the object lock of any candidate page
4616                          * before it can steal/clean it.
4617                          */
4618                         vm_object_unlock(shadow_object);
4619                         mutex_yield(&vm_page_queue_lock);
4620
4621                         for (j = 0; ; j++) {
4622                                 if (vm_object_lock_try(shadow_object))
4623                                         break;
4624                                 vm_page_unlock_queues();
4625                                 mutex_pause(j);
4626                                 vm_page_lock_queues();
4627                         }
4628                         delayed_unlock = 1;
4629                 }
4630                 target_offset += PAGE_SIZE_64;
4631                 xfer_size -= PAGE_SIZE;
4632                 entry++;
4633         }
4634         if (delayed_unlock)
4635                 vm_page_unlock_queues();
4636
4637         occupied = 1;
4638
4639         if (upl->flags & UPL_DEVICE_MEMORY)  {
4640                 occupied = 0;
4641         } else if (upl->flags & UPL_LITE) {
4642                 int     pg_num;
4643                 int     i;
4644
4645                 pg_num = upl->size/PAGE_SIZE;
4646                 pg_num = (pg_num + 31) >> 5;
4647                 occupied = 0;
4648
4649                 for (i = 0; i < pg_num; i++) {
4650                         if (lite_list[i] != 0) {
4651                                 occupied = 1;
4652                                 break;
4653                         }
4654                 }
4655         } else {
4656                 if (queue_empty(&upl->map_object->memq))
4657                         occupied = 0;
4658         }
4659         if (occupied == 0) {
4660                 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4661                         *empty = TRUE;
4662
4663                 if (object == shadow_object) {
4664                         /*
4665                          * this is not a paging object
4666                          * so we need to drop the paging reference
4667                          * that was taken when we created the UPL
4668                          * against this object
4669                          */
4670                         vm_object_paging_end(shadow_object);
4671                 } else {
4672                          /*
4673                           * we dontated the paging reference to
4674                           * the map object... vm_pageout_object_terminate
4675                           * will drop this reference
4676                           */
4677                 }
4678         }
4679         vm_object_unlock(shadow_object);
4680         if (object != shadow_object)
4681                 vm_object_unlock(object);
4682         upl_unlock(upl);
4683
4684         return KERN_SUCCESS;
4685 }
4686
4687
4688 kern_return_t
4689 upl_abort(
4690         upl_t   upl,
4691         int     error)
4692 {
4693         boolean_t       empty;
4694
4695         return upl_abort_range(upl, 0, upl->size, error, &empty);
4696 }
4697
4698
4699 /* an option on commit should be wire */
4700 kern_return_t
4701 upl_commit(
4702         upl_t                   upl,
4703         upl_page_info_t         *page_list,
4704         mach_msg_type_number_t  count)
4705 {
4706         boolean_t       empty;
4707
4708         return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
4709 }
4710
4711
4712 kern_return_t
4713 vm_object_iopl_request(
4714         vm_object_t             object,
4715         vm_object_offset_t      offset,
4716         upl_size_t              size,
4717         upl_t                   *upl_ptr,
4718         upl_page_info_array_t   user_page_list,
4719         unsigned int            *page_list_count,
4720         int                     cntrl_flags)
4721 {
4722         vm_page_t               dst_page;
4723         vm_object_offset_t      dst_offset;
4724         upl_size_t              xfer_size;
4725         upl_t                   upl = NULL;
4726         unsigned int            entry;
4727         wpl_array_t             lite_list = NULL;
4728         int                     delayed_unlock = 0;
4729         int                     no_zero_fill = FALSE;
4730         u_int32_t               psize;
4731         kern_return_t           ret;
4732         vm_prot_t               prot;
4733         struct vm_object_fault_info fault_info;
4734
4735
4736         if (cntrl_flags & ~UPL_VALID_FLAGS) {
4737                 /*
4738                  * For forward compatibility's sake,
4739                  * reject any unknown flag.
4740                  */
4741                 return KERN_INVALID_VALUE;
4742         }
4743         if (vm_lopage_poolsize == 0)
4744                 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4745
4746         if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4747                 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4748                         return KERN_INVALID_VALUE;
4749
4750                 if (object->phys_contiguous) {
4751                         if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4752                                 return KERN_INVALID_ADDRESS;
4753
4754                         if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4755                                 return KERN_INVALID_ADDRESS;
4756                 }
4757         }
4758
4759         if (cntrl_flags & UPL_ENCRYPT) {
4760                 /*
4761                  * ENCRYPTED SWAP:
4762                  * The paging path doesn't use this interface,
4763                  * so we don't support the UPL_ENCRYPT flag
4764                  * here.  We won't encrypt the pages.
4765                  */
4766                 assert(! (cntrl_flags & UPL_ENCRYPT));
4767         }
4768         if (cntrl_flags & UPL_NOZEROFILL)
4769                 no_zero_fill = TRUE;
4770
4771         if (cntrl_flags & UPL_COPYOUT_FROM)
4772                 prot = VM_PROT_READ;
4773         else
4774                 prot = VM_PROT_READ | VM_PROT_WRITE;
4775
4776         if (((size/page_size) > MAX_UPL_SIZE) && !object->phys_contiguous)
4777                 size = MAX_UPL_SIZE * page_size;
4778
4779         if (cntrl_flags & UPL_SET_INTERNAL) {
4780                 if (page_list_count != NULL)
4781                         *page_list_count = MAX_UPL_SIZE;
4782         }
4783         if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4784             ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
4785                 return KERN_INVALID_ARGUMENT;
4786
4787         if ((!object->internal) && (object->paging_offset != 0))
4788                 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4789
4790
4791         if (object->phys_contiguous)
4792                 psize = PAGE_SIZE;
4793         else
4794                 psize = size;
4795
4796         if (cntrl_flags & UPL_SET_INTERNAL) {
4797                 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4798
4799                 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4800                 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
4801                                            ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
4802         } else {
4803                 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4804
4805                 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4806         }
4807         if (user_page_list)
4808                 user_page_list[0].device = FALSE;
4809         *upl_ptr = upl;
4810
4811         upl->map_object = object;
4812         upl->size = size;
4813
4814         vm_object_lock(object);
4815         vm_object_paging_begin(object);
4816         /*
4817          * paging in progress also protects the paging_offset
4818          */
4819         upl->offset = offset + object->paging_offset;
4820
4821         if (object->phys_contiguous) {
4822 #ifdef UPL_DEBUG
4823                 queue_enter(&object->uplq, upl, upl_t, uplq);
4824 #endif /* UPL_DEBUG */
4825
4826                 vm_object_unlock(object);
4827
4828                 /*
4829                  * don't need any shadow mappings for this one
4830                  * since it is already I/O memory
4831                  */
4832                 upl->flags |= UPL_DEVICE_MEMORY;
4833
4834                 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4835
4836                 if (user_page_list) {
4837                         user_page_list[0].phys_addr = (offset + object->shadow_offset)>>PAGE_SHIFT;
4838                         user_page_list[0].device = TRUE;
4839                 }
4840                 if (page_list_count != NULL) {
4841                         if (upl->flags & UPL_INTERNAL)
4842                                 *page_list_count = 0;
4843                         else
4844                                 *page_list_count = 1;
4845                 }
4846                 return KERN_SUCCESS;
4847         }
4848         /*
4849          * Protect user space from future COW operations
4850          */
4851         object->true_share = TRUE;
4852
4853         if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4854                 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4855
4856 #ifdef UPL_DEBUG
4857         queue_enter(&object->uplq, upl, upl_t, uplq);
4858 #endif /* UPL_DEBUG */
4859
4860         if (cntrl_flags & UPL_BLOCK_ACCESS) {
4861                 /*
4862                  * The user requested that access to the pages in this URL
4863                  * be blocked until the UPL is commited or aborted.
4864                  */
4865                 upl->flags |= UPL_ACCESS_BLOCKED;
4866         }
4867         entry = 0;
4868
4869         xfer_size = size;
4870         dst_offset = offset;
4871
4872         fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
4873         fault_info.user_tag  = 0;
4874         fault_info.lo_offset = offset;
4875         fault_info.hi_offset = offset + xfer_size;
4876         fault_info.no_cache  = FALSE;
4877
4878         while (xfer_size) {
4879                 vm_fault_return_t       result;
4880                 int                     pg_num;
4881
4882                 dst_page = vm_page_lookup(object, dst_offset);
4883
4884                 /*
4885                  * ENCRYPTED SWAP:
4886                  * If the page is encrypted, we need to decrypt it,
4887                  * so force a soft page fault.
4888                  */
4889                 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4890                     (dst_page->encrypted) ||
4891                     (dst_page->unusual && (dst_page->error ||
4892                                            dst_page->restart ||
4893                                            dst_page->absent ||
4894                                            dst_page->fictitious))) {
4895
4896                    do {
4897                         vm_page_t       top_page;
4898                         kern_return_t   error_code;
4899                         int             interruptible;
4900
4901                         if (delayed_unlock) {
4902                                 delayed_unlock = 0;
4903                                 vm_page_unlock_queues();
4904                         }
4905                         if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
4906                                 interruptible = THREAD_ABORTSAFE;
4907                         else
4908                                 interruptible = THREAD_UNINT;
4909
4910                         fault_info.interruptible = interruptible;
4911                         fault_info.cluster_size = xfer_size;
4912
4913                         result = vm_fault_page(object, dst_offset,
4914                                                prot | VM_PROT_WRITE, FALSE,
4915                                                &prot, &dst_page, &top_page,
4916                                                (int *)0,
4917                                                &error_code, no_zero_fill,
4918                                                FALSE, &fault_info);
4919
4920                         switch (result) {
4921
4922                         case VM_FAULT_SUCCESS:
4923
4924                                 PAGE_WAKEUP_DONE(dst_page);
4925                                 /*
4926                                  *      Release paging references and
4927                                  *      top-level placeholder page, if any.
4928                                  */
4929                                 if (top_page != VM_PAGE_NULL) {
4930                                         vm_object_t local_object;
4931
4932                                         local_object = top_page->object;
4933
4934                                         if (top_page->object != dst_page->object) {
4935                                                 vm_object_lock(local_object);
4936                                                 VM_PAGE_FREE(top_page);
4937                                                 vm_object_paging_end(local_object);
4938                                                 vm_object_unlock(local_object);
4939                                         } else {
4940                                                 VM_PAGE_FREE(top_page);
4941                                                 vm_object_paging_end(local_object);
4942                                         }
4943                                 }
4944                                 break;
4945
4946                         case VM_FAULT_RETRY:
4947                                 vm_object_lock(object);
4948                                 vm_object_paging_begin(object);
4949                                 break;
4950
4951                         case VM_FAULT_FICTITIOUS_SHORTAGE:
4952                                 vm_page_more_fictitious();
4953
4954                                 vm_object_lock(object);
4955                                 vm_object_paging_begin(object);
4956                                 break;
4957
4958                         case VM_FAULT_MEMORY_SHORTAGE:
4959                                 if (vm_page_wait(interruptible)) {
4960                                         vm_object_lock(object);
4961                                         vm_object_paging_begin(object);
4962                                         break;
4963                                 }
4964                                 /* fall thru */
4965
4966                         case VM_FAULT_INTERRUPTED:
4967                                 error_code = MACH_SEND_INTERRUPTED;
4968                         case VM_FAULT_MEMORY_ERROR:
4969                                 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
4970
4971                                 vm_object_lock(object);
4972                                 vm_object_paging_begin(object);
4973                                 goto return_err;
4974                         }
4975                    } while (result != VM_FAULT_SUCCESS);
4976                 }
4977
4978                 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4979                      dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4980                         vm_page_t       low_page;
4981                         int             refmod;
4982
4983                         /*
4984                          * support devices that can't DMA above 32 bits
4985                          * by substituting pages from a pool of low address
4986                          * memory for any pages we find above the 4G mark
4987                          * can't substitute if the page is already wired because
4988                          * we don't know whether that physical address has been
4989                          * handed out to some other 64 bit capable DMA device to use
4990                          */
4991                         if (dst_page->wire_count) {
4992                                 ret = KERN_PROTECTION_FAILURE;
4993                                 goto return_err;
4994                         }
4995                         if (delayed_unlock) {
4996                                 delayed_unlock = 0;
4997                                 vm_page_unlock_queues();
4998                         }
4999                         low_page = vm_page_grablo();
5000
5001                         if (low_page == VM_PAGE_NULL) {
5002                                 ret = KERN_RESOURCE_SHORTAGE;
5003                                 goto return_err;
5004                         }
5005                         /*
5006                          * from here until the vm_page_replace completes
5007                          * we musn't drop the object lock... we don't
5008                          * want anyone refaulting this page in and using
5009                          * it after we disconnect it... we want the fault
5010                          * to find the new page being substituted.
5011                          */
5012                         if (dst_page->pmapped)
5013                                 refmod = pmap_disconnect(dst_page->phys_page);
5014                         else
5015                                 refmod = 0;
5016                         vm_page_copy(dst_page, low_page);
5017
5018                         low_page->reference = dst_page->reference;
5019                         low_page->dirty     = dst_page->dirty;
5020
5021                         if (refmod & VM_MEM_REFERENCED)
5022                                 low_page->reference = TRUE;
5023                         if (refmod & VM_MEM_MODIFIED)
5024                                 low_page->dirty = TRUE;
5025
5026                         vm_page_lock_queues();
5027                         vm_page_replace(low_page, object, dst_offset);
5028                         /*
5029                          * keep the queue lock since we're going to
5030                          * need it immediately
5031                          */
5032                         delayed_unlock = 1;
5033
5034                         dst_page = low_page;
5035                         /*
5036                          * vm_page_grablo returned the page marked
5037                          * BUSY... we don't need a PAGE_WAKEUP_DONE
5038                          * here, because we've never dropped the object lock
5039                          */
5040                         dst_page->busy = FALSE;
5041                 }
5042                 if (delayed_unlock == 0)
5043                         vm_page_lock_queues();
5044
5045                 vm_page_wire(dst_page);
5046
5047                 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5048                         /*
5049                          * Mark the page "busy" to block any future page fault
5050                          * on this page.  We'll also remove the mapping
5051                          * of all these pages before leaving this routine.
5052                          */
5053                         assert(!dst_page->fictitious);
5054                         dst_page->busy = TRUE;
5055                 }
5056                 pg_num = (dst_offset-offset)/PAGE_SIZE;
5057                 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5058
5059                 /*
5060                  * expect the page to be used
5061                  * page queues lock must be held to set 'reference'
5062                  */
5063                 dst_page->reference = TRUE;
5064
5065                 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5066                         dst_page->dirty = TRUE;
5067
5068                 if (dst_page->phys_page > upl->highest_page)
5069                         upl->highest_page = dst_page->phys_page;
5070
5071                 if (user_page_list) {
5072                         user_page_list[entry].phys_addr = dst_page->phys_page;
5073                         user_page_list[entry].pageout   = dst_page->pageout;
5074                         user_page_list[entry].absent    = dst_page->absent;
5075                         user_page_list[entry].dirty     = dst_page->dirty;
5076                         user_page_list[entry].precious  = dst_page->precious;
5077                         user_page_list[entry].device    = FALSE;
5078                         if (dst_page->clustered == TRUE)
5079                                 user_page_list[entry].speculative = dst_page->speculative;
5080                         else
5081                                 user_page_list[entry].speculative = FALSE;
5082                         user_page_list[entry].cs_validated = dst_page->cs_validated;
5083                         user_page_list[entry].cs_tainted = dst_page->cs_tainted;
5084                 }
5085                 /*
5086                  * someone is explicitly grabbing this page...
5087                  * update clustered and speculative state
5088                  *
5089                  */
5090                 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5091
5092                 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
5093                         mutex_yield(&vm_page_queue_lock);
5094                         delayed_unlock = 1;
5095                 }
5096                 entry++;
5097                 dst_offset += PAGE_SIZE_64;
5098                 xfer_size -= PAGE_SIZE;
5099         }
5100         if (delayed_unlock)
5101                 vm_page_unlock_queues();
5102
5103         if (page_list_count != NULL) {
5104                 if (upl->flags & UPL_INTERNAL)
5105                         *page_list_count = 0;
5106                 else if (*page_list_count > entry)
5107                         *page_list_count = entry;
5108         }
5109         vm_object_unlock(object);
5110
5111         if (cntrl_flags & UPL_BLOCK_ACCESS) {
5112                 /*
5113                  * We've marked all the pages "busy" so that future
5114                  * page faults will block.
5115                  * Now remove the mapping for these pages, so that they
5116                  * can't be accessed without causing a page fault.
5117                  */
5118                 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5119                                        PMAP_NULL, 0, VM_PROT_NONE);
5120         }
5121         return KERN_SUCCESS;
5122
5123 return_err:
5124         if (delayed_unlock)
5125                 vm_page_unlock_queues();
5126
5127         for (; offset < dst_offset; offset += PAGE_SIZE) {
5128                 dst_page = vm_page_lookup(object, offset);
5129
5130                 if (dst_page == VM_PAGE_NULL)
5131                         panic("vm_object_iopl_request: Wired pages missing. \n");
5132
5133                 vm_page_lockspin_queues();
5134                 vm_page_unwire(dst_page);
5135                 vm_page_unlock_queues();
5136
5137                 VM_STAT_INCR(reactivations);
5138         }
5139         vm_object_paging_end(object);
5140         vm_object_unlock(object);
5141         upl_destroy(upl);
5142
5143         return ret;
5144 }
5145
5146 kern_return_t
5147 upl_transpose(
5148         upl_t           upl1,
5149         upl_t           upl2)
5150 {
5151         kern_return_t           retval;
5152         boolean_t               upls_locked;
5153         vm_object_t             object1, object2;
5154
5155         if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5156                 return KERN_INVALID_ARGUMENT;
5157         }
5158
5159         upls_locked = FALSE;
5160
5161         /*
5162          * Since we need to lock both UPLs at the same time,
5163          * avoid deadlocks by always taking locks in the same order.
5164          */
5165         if (upl1 < upl2) {
5166                 upl_lock(upl1);
5167                 upl_lock(upl2);
5168         } else {
5169                 upl_lock(upl2);
5170                 upl_lock(upl1);
5171         }
5172         upls_locked = TRUE;     /* the UPLs will need to be unlocked */
5173
5174         object1 = upl1->map_object;
5175         object2 = upl2->map_object;
5176
5177         if (upl1->offset != 0 || upl2->offset != 0 ||
5178             upl1->size != upl2->size) {
5179                 /*
5180                  * We deal only with full objects, not subsets.
5181                  * That's because we exchange the entire backing store info
5182                  * for the objects: pager, resident pages, etc...  We can't do
5183                  * only part of it.
5184                  */
5185                 retval = KERN_INVALID_VALUE;
5186                 goto done;
5187         }
5188
5189         /*
5190          * Tranpose the VM objects' backing store.
5191          */
5192         retval = vm_object_transpose(object1, object2,
5193                                      (vm_object_size_t) upl1->size);
5194
5195         if (retval == KERN_SUCCESS) {
5196                 /*
5197                  * Make each UPL point to the correct VM object, i.e. the
5198                  * object holding the pages that the UPL refers to...
5199                  */
5200 #ifdef UPL_DEBUG
5201                 queue_remove(&object1->uplq, upl1, upl_t, uplq);
5202                 queue_remove(&object2->uplq, upl2, upl_t, uplq);
5203 #endif
5204                 upl1->map_object = object2;
5205                 upl2->map_object = object1;
5206 #ifdef UPL_DEBUG
5207                 queue_enter(&object1->uplq, upl2, upl_t, uplq);
5208                 queue_enter(&object2->uplq, upl1, upl_t, uplq);
5209 #endif
5210         }
5211
5212 done:
5213         /*
5214          * Cleanup.
5215          */
5216         if (upls_locked) {
5217                 upl_unlock(upl1);
5218                 upl_unlock(upl2);
5219                 upls_locked = FALSE;
5220         }
5221
5222         return retval;
5223 }
5224
5225 /*
5226  * ENCRYPTED SWAP:
5227  *
5228  * Rationale:  the user might have some encrypted data on disk (via
5229  * FileVault or any other mechanism).  That data is then decrypted in
5230  * memory, which is safe as long as the machine is secure.  But that
5231  * decrypted data in memory could be paged out to disk by the default
5232  * pager.  The data would then be stored on disk in clear (not encrypted)
5233  * and it could be accessed by anyone who gets physical access to the
5234  * disk (if the laptop or the disk gets stolen for example).  This weakens
5235  * the security offered by FileVault.
5236  *
5237  * Solution:  the default pager will optionally request that all the
5238  * pages it gathers for pageout be encrypted, via the UPL interfaces,
5239  * before it sends this UPL to disk via the vnode_pageout() path.
5240  *
5241  * Notes:
5242  *
5243  * To avoid disrupting the VM LRU algorithms, we want to keep the
5244  * clean-in-place mechanisms, which allow us to send some extra pages to
5245  * swap (clustering) without actually removing them from the user's
5246  * address space.  We don't want the user to unknowingly access encrypted
5247  * data, so we have to actually remove the encrypted pages from the page
5248  * table.  When the user accesses the data, the hardware will fail to
5249  * locate the virtual page in its page table and will trigger a page
5250  * fault.  We can then decrypt the page and enter it in the page table
5251  * again.  Whenever we allow the user to access the contents of a page,
5252  * we have to make sure it's not encrypted.
5253  *
5254  *
5255  */
5256 /*
5257  * ENCRYPTED SWAP:
5258  * Reserve of virtual addresses in the kernel address space.
5259  * We need to map the physical pages in the kernel, so that we
5260  * can call the encryption/decryption routines with a kernel
5261  * virtual address.  We keep this pool of pre-allocated kernel
5262  * virtual addresses so that we don't have to scan the kernel's
5263  * virtaul address space each time we need to encrypt or decrypt
5264  * a physical page.
5265  * It would be nice to be able to encrypt and decrypt in physical
5266  * mode but that might not always be more efficient...
5267  */
5268 decl_simple_lock_data(,vm_paging_lock)
5269 #define VM_PAGING_NUM_PAGES     64
5270 vm_map_offset_t vm_paging_base_address = 0;
5271 boolean_t       vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5272 int             vm_paging_max_index = 0;
5273 int             vm_paging_page_waiter = 0;
5274 int             vm_paging_page_waiter_total = 0;
5275 unsigned long   vm_paging_no_kernel_page = 0;
5276 unsigned long   vm_paging_objects_mapped = 0;
5277 unsigned long   vm_paging_pages_mapped = 0;
5278 unsigned long   vm_paging_objects_mapped_slow = 0;
5279 unsigned long   vm_paging_pages_mapped_slow = 0;
5280
5281 void
5282 vm_paging_map_init(void)
5283 {
5284         kern_return_t   kr;
5285         vm_map_offset_t page_map_offset;
5286         vm_map_entry_t  map_entry;
5287
5288         assert(vm_paging_base_address == 0);
5289
5290         /*
5291          * Initialize our pool of pre-allocated kernel
5292          * virtual addresses.
5293          */
5294         page_map_offset = 0;
5295         kr = vm_map_find_space(kernel_map,
5296                                &page_map_offset,
5297                                VM_PAGING_NUM_PAGES * PAGE_SIZE,
5298                                0,
5299                                0,
5300                                &map_entry);
5301         if (kr != KERN_SUCCESS) {
5302                 panic("vm_paging_map_init: kernel_map full\n");
5303         }
5304         map_entry->object.vm_object = kernel_object;
5305         map_entry->offset =
5306                 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5307         vm_object_reference(kernel_object);
5308         vm_map_unlock(kernel_map);
5309
5310         assert(vm_paging_base_address == 0);
5311         vm_paging_base_address = page_map_offset;
5312 }
5313
5314 /*
5315  * ENCRYPTED SWAP:
5316  * vm_paging_map_object:
5317  *      Maps part of a VM object's pages in the kernel
5318  *      virtual address space, using the pre-allocated
5319  *      kernel virtual addresses, if possible.
5320  * Context:
5321  *      The VM object is locked.  This lock will get
5322  *      dropped and re-acquired though, so the caller
5323  *      must make sure the VM object is kept alive
5324  *      (by holding a VM map that has a reference
5325  *      on it, for example, or taking an extra reference).
5326  *      The page should also be kept busy to prevent
5327  *      it from being reclaimed.
5328  */
5329 kern_return_t
5330 vm_paging_map_object(
5331         vm_map_offset_t         *address,
5332         vm_page_t               page,
5333         vm_object_t             object,
5334         vm_object_offset_t      offset,
5335         vm_map_size_t           *size,
5336         vm_prot_t               protection,
5337         boolean_t               can_unlock_object)
5338 {
5339         kern_return_t           kr;
5340         vm_map_offset_t         page_map_offset;
5341         vm_map_size_t           map_size;
5342         vm_object_offset_t      object_offset;
5343         int                     i;
5344
5345
5346         if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5347                 assert(page->busy);
5348                 /*
5349                  * Use one of the pre-allocated kernel virtual addresses
5350                  * and just enter the VM page in the kernel address space
5351                  * at that virtual address.
5352                  */
5353                 simple_lock(&vm_paging_lock);
5354
5355                 /*
5356                  * Try and find an available kernel virtual address
5357                  * from our pre-allocated pool.
5358                  */
5359                 page_map_offset = 0;
5360                 for (;;) {
5361                         for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5362                                 if (vm_paging_page_inuse[i] == FALSE) {
5363                                         page_map_offset =
5364                                                 vm_paging_base_address +
5365                                                 (i * PAGE_SIZE);
5366                                         break;
5367                                 }
5368                         }
5369                         if (page_map_offset != 0) {
5370                                 /* found a space to map our page ! */
5371                                 break;
5372                         }
5373
5374                         if (can_unlock_object) {
5375                                 /*
5376                                  * If we can afford to unlock the VM object,
5377                                  * let's take the slow path now...
5378                                  */
5379                                 break;
5380                         }
5381                         /*
5382                          * We can't afford to unlock the VM object, so
5383                          * let's wait for a space to become available...
5384                          */
5385                         vm_paging_page_waiter_total++;
5386                         vm_paging_page_waiter++;
5387                         thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
5388                                                        &vm_paging_lock,
5389                                                        THREAD_UNINT);
5390                         vm_paging_page_waiter--;
5391                         /* ... and try again */
5392                 }
5393
5394                 if (page_map_offset != 0) {
5395                         /*
5396                          * We found a kernel virtual address;
5397                          * map the physical page to that virtual address.
5398                          */
5399                         if (i > vm_paging_max_index) {
5400                                 vm_paging_max_index = i;
5401                         }
5402                         vm_paging_page_inuse[i] = TRUE;
5403                         simple_unlock(&vm_paging_lock);
5404
5405                         if (page->pmapped == FALSE) {
5406                                 pmap_sync_page_data_phys(page->phys_page);
5407                         }
5408                         page->pmapped = TRUE;
5409
5410                         /*
5411                          * Keep the VM object locked over the PMAP_ENTER
5412                          * and the actual use of the page by the kernel,
5413                          * or this pmap mapping might get undone by a
5414                          * vm_object_pmap_protect() call...
5415                          */
5416                         PMAP_ENTER(kernel_pmap,
5417                                    page_map_offset,
5418                                    page,
5419                                    protection,
5420                                    ((int) page->object->wimg_bits &
5421                                     VM_WIMG_MASK),
5422                                    TRUE);
5423                         vm_paging_objects_mapped++;
5424                         vm_paging_pages_mapped++;
5425                         *address = page_map_offset;
5426
5427                         /* all done and mapped, ready to use ! */
5428                         return KERN_SUCCESS;
5429                 }
5430
5431                 /*
5432                  * We ran out of pre-allocated kernel virtual
5433                  * addresses.  Just map the page in the kernel
5434                  * the slow and regular way.
5435                  */
5436                 vm_paging_no_kernel_page++;
5437                 simple_unlock(&vm_paging_lock);
5438         }
5439
5440         if (! can_unlock_object) {
5441                 return KERN_NOT_SUPPORTED;
5442         }
5443
5444         object_offset = vm_object_trunc_page(offset);
5445         map_size = vm_map_round_page(*size);
5446
5447         /*
5448          * Try and map the required range of the object
5449          * in the kernel_map
5450          */
5451
5452         vm_object_reference_locked(object);     /* for the map entry */
5453         vm_object_unlock(object);
5454
5455         kr = vm_map_enter(kernel_map,
5456                           address,
5457                           map_size,
5458                           0,
5459                           VM_FLAGS_ANYWHERE,
5460                           object,
5461                           object_offset,
5462                           FALSE,
5463                           protection,
5464                           VM_PROT_ALL,
5465                           VM_INHERIT_NONE);
5466         if (kr != KERN_SUCCESS) {
5467                 *address = 0;
5468                 *size = 0;
5469                 vm_object_deallocate(object);   /* for the map entry */
5470                 vm_object_lock(object);
5471                 return kr;
5472         }
5473
5474         *size = map_size;
5475
5476         /*
5477          * Enter the mapped pages in the page table now.
5478          */
5479         vm_object_lock(object);
5480         /*
5481          * VM object must be kept locked from before PMAP_ENTER()
5482          * until after the kernel is done accessing the page(s).
5483          * Otherwise, the pmap mappings in the kernel could be
5484          * undone by a call to vm_object_pmap_protect().
5485          */
5486
5487         for (page_map_offset = 0;
5488              map_size != 0;
5489              map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5490                 unsigned int    cache_attr;
5491
5492                 page = vm_page_lookup(object, offset + page_map_offset);
5493                 if (page == VM_PAGE_NULL) {
5494                         printf("vm_paging_map_object: no page !?");
5495                         vm_object_unlock(object);
5496                         kr = vm_map_remove(kernel_map, *address, *size,
5497                                            VM_MAP_NO_FLAGS);
5498                         assert(kr == KERN_SUCCESS);
5499                         *address = 0;
5500                         *size = 0;
5501                         vm_object_lock(object);
5502                         return KERN_MEMORY_ERROR;
5503                 }
5504                 if (page->pmapped == FALSE) {
5505                         pmap_sync_page_data_phys(page->phys_page);
5506                 }
5507                 page->pmapped = TRUE;
5508                 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5509
5510                 //assert(pmap_verify_free(page->phys_page));
5511                 PMAP_ENTER(kernel_pmap,
5512                            *address + page_map_offset,
5513                            page,
5514                            protection,
5515                            cache_attr,
5516                            TRUE);
5517         }
5518
5519         vm_paging_objects_mapped_slow++;
5520         vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5521
5522         return KERN_SUCCESS;
5523 }
5524
5525 /*
5526  * ENCRYPTED SWAP:
5527  * vm_paging_unmap_object:
5528  *      Unmaps part of a VM object's pages from the kernel
5529  *      virtual address space.
5530  * Context:
5531  *      The VM object is locked.  This lock will get
5532  *      dropped and re-acquired though.
5533  */
5534 void
5535 vm_paging_unmap_object(
5536         vm_object_t     object,
5537         vm_map_offset_t start,
5538         vm_map_offset_t end)
5539 {
5540         kern_return_t   kr;
5541         int             i;
5542
5543         if ((vm_paging_base_address == 0) ||
5544             (start < vm_paging_base_address) ||
5545             (end > (vm_paging_base_address
5546                      + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5547                 /*
5548                  * We didn't use our pre-allocated pool of
5549                  * kernel virtual address.  Deallocate the
5550                  * virtual memory.
5551                  */
5552                 if (object != VM_OBJECT_NULL) {
5553                         vm_object_unlock(object);
5554                 }
5555                 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5556                 if (object != VM_OBJECT_NULL) {
5557                         vm_object_lock(object);
5558                 }
5559                 assert(kr == KERN_SUCCESS);
5560         } else {
5561                 /*
5562                  * We used a kernel virtual address from our
5563                  * pre-allocated pool.  Put it back in the pool
5564                  * for next time.
5565                  */
5566                 assert(end - start == PAGE_SIZE);
5567                 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5568
5569                 /* undo the pmap mapping */
5570                 pmap_remove(kernel_pmap, start, end);
5571
5572                 simple_lock(&vm_paging_lock);
5573                 vm_paging_page_inuse[i] = FALSE;
5574                 if (vm_paging_page_waiter) {
5575                         thread_wakeup(&vm_paging_page_waiter);
5576                 }
5577                 simple_unlock(&vm_paging_lock);
5578         }
5579 }
5580
5581 #if CRYPTO
5582 /*
5583  * Encryption data.
5584  * "iv" is the "initial vector".  Ideally, we want to
5585  * have a different one for each page we encrypt, so that
5586  * crackers can't find encryption patterns too easily.
5587  */
5588 #define SWAP_CRYPT_AES_KEY_SIZE 128     /* XXX 192 and 256 don't work ! */
5589 boolean_t               swap_crypt_ctx_initialized = FALSE;
5590 aes_32t                 swap_crypt_key[8]; /* big enough for a 256 key */
5591 aes_ctx                 swap_crypt_ctx;
5592 const unsigned char     swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5593
5594 #if DEBUG
5595 boolean_t               swap_crypt_ctx_tested = FALSE;
5596 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5597 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5598 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5599 #endif /* DEBUG */
5600
5601 extern u_long random(void);
5602
5603 /*
5604  * Initialize the encryption context: key and key size.
5605  */
5606 void swap_crypt_ctx_initialize(void); /* forward */
5607 void
5608 swap_crypt_ctx_initialize(void)
5609 {
5610         unsigned int    i;
5611
5612         /*
5613          * No need for locking to protect swap_crypt_ctx_initialized
5614          * because the first use of encryption will come from the
5615          * pageout thread (we won't pagein before there's been a pageout)
5616          * and there's only one pageout thread.
5617          */
5618         if (swap_crypt_ctx_initialized == FALSE) {
5619                 for (i = 0;
5620                      i < (sizeof (swap_crypt_key) /
5621                           sizeof (swap_crypt_key[0]));
5622                      i++) {
5623                         swap_crypt_key[i] = random();
5624                 }
5625                 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5626                                 SWAP_CRYPT_AES_KEY_SIZE,
5627                                 &swap_crypt_ctx.encrypt);
5628                 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5629                                 SWAP_CRYPT_AES_KEY_SIZE,
5630                                 &swap_crypt_ctx.decrypt);
5631                 swap_crypt_ctx_initialized = TRUE;
5632         }
5633
5634 #if DEBUG
5635         /*
5636          * Validate the encryption algorithms.
5637          */
5638         if (swap_crypt_ctx_tested == FALSE) {
5639                 /* initialize */
5640                 for (i = 0; i < 4096; i++) {
5641                         swap_crypt_test_page_ref[i] = (char) i;
5642                 }
5643                 /* encrypt */
5644                 aes_encrypt_cbc(swap_crypt_test_page_ref,
5645                                 swap_crypt_null_iv,
5646                                 PAGE_SIZE / AES_BLOCK_SIZE,
5647                                 swap_crypt_test_page_encrypt,
5648                                 &swap_crypt_ctx.encrypt);
5649                 /* decrypt */
5650                 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5651                                 swap_crypt_null_iv,
5652                                 PAGE_SIZE / AES_BLOCK_SIZE,
5653                                 swap_crypt_test_page_decrypt,
5654                                 &swap_crypt_ctx.decrypt);
5655                 /* compare result with original */
5656                 for (i = 0; i < 4096; i ++) {
5657                         if (swap_crypt_test_page_decrypt[i] !=
5658                             swap_crypt_test_page_ref[i]) {
5659                                 panic("encryption test failed");
5660                         }
5661                 }
5662
5663                 /* encrypt again */
5664                 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5665                                 swap_crypt_null_iv,
5666                                 PAGE_SIZE / AES_BLOCK_SIZE,
5667                                 swap_crypt_test_page_decrypt,
5668                                 &swap_crypt_ctx.encrypt);
5669                 /* decrypt in place */
5670                 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5671                                 swap_crypt_null_iv,
5672                                 PAGE_SIZE / AES_BLOCK_SIZE,
5673                                 swap_crypt_test_page_decrypt,
5674                                 &swap_crypt_ctx.decrypt);
5675                 for (i = 0; i < 4096; i ++) {
5676                         if (swap_crypt_test_page_decrypt[i] !=
5677                             swap_crypt_test_page_ref[i]) {
5678                                 panic("in place encryption test failed");
5679                         }
5680                 }
5681
5682                 swap_crypt_ctx_tested = TRUE;
5683         }
5684 #endif /* DEBUG */
5685 }
5686
5687 /*
5688  * ENCRYPTED SWAP:
5689  * vm_page_encrypt:
5690  *      Encrypt the given page, for secure paging.
5691  *      The page might already be mapped at kernel virtual
5692  *      address "kernel_mapping_offset".  Otherwise, we need
5693  *      to map it.
5694  *
5695  * Context:
5696  *      The page's object is locked, but this lock will be released
5697  *      and re-acquired.
5698  *      The page is busy and not accessible by users (not entered in any pmap).
5699  */
5700 void
5701 vm_page_encrypt(
5702         vm_page_t       page,
5703         vm_map_offset_t kernel_mapping_offset)
5704 {
5705         kern_return_t           kr;
5706         vm_map_size_t           kernel_mapping_size;
5707         vm_offset_t             kernel_vaddr;
5708         union {
5709                 unsigned char   aes_iv[AES_BLOCK_SIZE];
5710                 struct {
5711                         memory_object_t         pager_object;
5712                         vm_object_offset_t      paging_offset;
5713                 } vm;
5714         } encrypt_iv;
5715
5716         if (! vm_pages_encrypted) {
5717                 vm_pages_encrypted = TRUE;
5718         }
5719
5720         assert(page->busy);
5721         assert(page->dirty || page->precious);
5722
5723         if (page->encrypted) {
5724                 /*
5725                  * Already encrypted: no need to do it again.
5726                  */
5727                 vm_page_encrypt_already_encrypted_counter++;
5728                 return;
5729         }
5730         ASSERT_PAGE_DECRYPTED(page);
5731
5732         /*
5733          * Take a paging-in-progress reference to keep the object
5734          * alive even if we have to unlock it (in vm_paging_map_object()
5735          * for example)...
5736          */
5737         vm_object_paging_begin(page->object);
5738
5739         if (kernel_mapping_offset == 0) {
5740                 /*
5741                  * The page hasn't already been mapped in kernel space
5742                  * by the caller.  Map it now, so that we can access
5743                  * its contents and encrypt them.
5744                  */
5745                 kernel_mapping_size = PAGE_SIZE;
5746                 kr = vm_paging_map_object(&kernel_mapping_offset,
5747                                           page,
5748                                           page->object,
5749                                           page->offset,
5750                                           &kernel_mapping_size,
5751                                           VM_PROT_READ | VM_PROT_WRITE,
5752                                           FALSE);
5753                 if (kr != KERN_SUCCESS) {
5754                         panic("vm_page_encrypt: "
5755                               "could not map page in kernel: 0x%x\n",
5756                               kr);
5757                 }
5758         } else {
5759                 kernel_mapping_size = 0;
5760         }
5761         kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5762
5763         if (swap_crypt_ctx_initialized == FALSE) {
5764                 swap_crypt_ctx_initialize();
5765         }
5766         assert(swap_crypt_ctx_initialized);
5767
5768         /*
5769          * Prepare an "initial vector" for the encryption.
5770          * We use the "pager" and the "paging_offset" for that
5771          * page to obfuscate the encrypted data a bit more and
5772          * prevent crackers from finding patterns that they could
5773          * use to break the key.
5774          */
5775         bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5776         encrypt_iv.vm.pager_object = page->object->pager;
5777         encrypt_iv.vm.paging_offset =
5778                 page->object->paging_offset + page->offset;
5779
5780         /* encrypt the "initial vector" */
5781         aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5782                         swap_crypt_null_iv,
5783                         1,
5784                         &encrypt_iv.aes_iv[0],
5785                         &swap_crypt_ctx.encrypt);
5786
5787         /*
5788          * Encrypt the page.
5789          */
5790         aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5791                         &encrypt_iv.aes_iv[0],
5792                         PAGE_SIZE / AES_BLOCK_SIZE,
5793                         (unsigned char *) kernel_vaddr,
5794                         &swap_crypt_ctx.encrypt);
5795
5796         vm_page_encrypt_counter++;
5797
5798         /*
5799          * Unmap the page from the kernel's address space,
5800          * if we had to map it ourselves.  Otherwise, let
5801          * the caller undo the mapping if needed.
5802          */
5803         if (kernel_mapping_size != 0) {
5804                 vm_paging_unmap_object(page->object,
5805                                        kernel_mapping_offset,
5806                                        kernel_mapping_offset + kernel_mapping_size);
5807         }
5808
5809         /*
5810          * Clear the "reference" and "modified" bits.
5811          * This should clean up any impact the encryption had
5812          * on them.
5813          * The page was kept busy and disconnected from all pmaps,
5814          * so it can't have been referenced or modified from user
5815          * space.
5816          * The software bits will be reset later after the I/O
5817          * has completed (in upl_commit_range()).
5818          */
5819         pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
5820
5821         page->encrypted = TRUE;
5822
5823         vm_object_paging_end(page->object);
5824 }
5825
5826 /*
5827  * ENCRYPTED SWAP:
5828  * vm_page_decrypt:
5829  *      Decrypt the given page.
5830  *      The page might already be mapped at kernel virtual
5831  *      address "kernel_mapping_offset".  Otherwise, we need
5832  *      to map it.
5833  *
5834  * Context:
5835  *      The page's VM object is locked but will be unlocked and relocked.
5836  *      The page is busy and not accessible by users (not entered in any pmap).
5837  */
5838 void
5839 vm_page_decrypt(
5840         vm_page_t       page,
5841         vm_map_offset_t kernel_mapping_offset)
5842 {
5843         kern_return_t           kr;
5844         vm_map_size_t           kernel_mapping_size;
5845         vm_offset_t             kernel_vaddr;
5846         union {
5847                 unsigned char   aes_iv[AES_BLOCK_SIZE];
5848                 struct {
5849                         memory_object_t         pager_object;
5850                         vm_object_offset_t      paging_offset;
5851                 } vm;
5852         } decrypt_iv;
5853
5854         assert(page->busy);
5855         assert(page->encrypted);
5856
5857         /*
5858          * Take a paging-in-progress reference to keep the object
5859          * alive even if we have to unlock it (in vm_paging_map_object()
5860          * for example)...
5861          */
5862         vm_object_paging_begin(page->object);
5863
5864         if (kernel_mapping_offset == 0) {
5865                 /*
5866                  * The page hasn't already been mapped in kernel space
5867                  * by the caller.  Map it now, so that we can access
5868                  * its contents and decrypt them.
5869                  */
5870                 kernel_mapping_size = PAGE_SIZE;
5871                 kr = vm_paging_map_object(&kernel_mapping_offset,
5872                                           page,
5873                                           page->object,
5874                                           page->offset,
5875                                           &kernel_mapping_size,
5876                                           VM_PROT_READ | VM_PROT_WRITE,
5877                                           FALSE);
5878                 if (kr != KERN_SUCCESS) {
5879                         panic("vm_page_decrypt: "
5880                               "could not map page in kernel: 0x%x\n",
5881                               kr);
5882                 }
5883         } else {
5884                 kernel_mapping_size = 0;
5885         }
5886         kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5887
5888         assert(swap_crypt_ctx_initialized);
5889
5890         /*
5891          * Prepare an "initial vector" for the decryption.
5892          * It has to be the same as the "initial vector" we
5893          * used to encrypt that page.
5894          */
5895         bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5896         decrypt_iv.vm.pager_object = page->object->pager;
5897         decrypt_iv.vm.paging_offset =
5898                 page->object->paging_offset + page->offset;
5899
5900         /* encrypt the "initial vector" */
5901         aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5902                         swap_crypt_null_iv,
5903                         1,
5904                         &decrypt_iv.aes_iv[0],
5905                         &swap_crypt_ctx.encrypt);
5906
5907         /*
5908          * Decrypt the page.
5909          */
5910         aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5911                         &decrypt_iv.aes_iv[0],
5912                         PAGE_SIZE / AES_BLOCK_SIZE,
5913                         (unsigned char *) kernel_vaddr,
5914                         &swap_crypt_ctx.decrypt);
5915         vm_page_decrypt_counter++;
5916
5917         /*
5918          * Unmap the page from the kernel's address space,
5919          * if we had to map it ourselves.  Otherwise, let
5920          * the caller undo the mapping if needed.
5921          */
5922         if (kernel_mapping_size != 0) {
5923                 vm_paging_unmap_object(page->object,
5924                                        kernel_vaddr,
5925                                        kernel_vaddr + PAGE_SIZE);
5926         }
5927
5928         /*
5929          * After decryption, the page is actually clean.
5930          * It was encrypted as part of paging, which "cleans"
5931          * the "dirty" pages.
5932          * Noone could access it after it was encrypted
5933          * and the decryption doesn't count.
5934          */
5935         page->dirty = FALSE;
5936         if (page->cs_validated && !page->cs_tainted) {
5937                 /*
5938                  * CODE SIGNING:
5939                  * This page is no longer dirty
5940                  * but could have been modified,
5941                  * so it will need to be
5942                  * re-validated.
5943                  */
5944                 page->cs_validated = FALSE;
5945                 vm_cs_validated_resets++;
5946         }
5947         pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
5948
5949         page->encrypted = FALSE;
5950
5951         /*
5952          * We've just modified the page's contents via the data cache and part
5953          * of the new contents might still be in the cache and not yet in RAM.
5954          * Since the page is now available and might get gathered in a UPL to
5955          * be part of a DMA transfer from a driver that expects the memory to
5956          * be coherent at this point, we have to flush the data cache.
5957          */
5958         pmap_sync_page_attributes_phys(page->phys_page);
5959         /*
5960          * Since the page is not mapped yet, some code might assume that it
5961          * doesn't need to invalidate the instruction cache when writing to
5962          * that page.  That code relies on "pmapped" being FALSE, so that the
5963          * caches get synchronized when the page is first mapped.
5964          */
5965         assert(pmap_verify_free(page->phys_page));
5966         page->pmapped = FALSE;
5967         page->wpmapped = FALSE;
5968
5969         vm_object_paging_end(page->object);
5970 }
5971
5972 unsigned long upl_encrypt_upls = 0;
5973 unsigned long upl_encrypt_pages = 0;
5974
5975 /*
5976  * ENCRYPTED SWAP:
5977  *
5978  * upl_encrypt:
5979  *      Encrypts all the pages in the UPL, within the specified range.
5980  *
5981  */
5982 void
5983 upl_encrypt(
5984         upl_t                   upl,
5985         upl_offset_t            crypt_offset,
5986         upl_size_t              crypt_size)
5987 {
5988         upl_size_t              upl_size;
5989         upl_offset_t            upl_offset;
5990         vm_object_t             upl_object;
5991         vm_page_t               page;
5992         vm_object_t             shadow_object;
5993         vm_object_offset_t      shadow_offset;
5994         vm_object_offset_t      paging_offset;
5995         vm_object_offset_t      base_offset;
5996
5997         upl_encrypt_upls++;
5998         upl_encrypt_pages += crypt_size / PAGE_SIZE;
5999
6000         upl_object = upl->map_object;
6001         upl_offset = upl->offset;
6002         upl_size = upl->size;
6003
6004         vm_object_lock(upl_object);
6005
6006         /*
6007          * Find the VM object that contains the actual pages.
6008          */
6009         if (upl_object->pageout) {
6010                 shadow_object = upl_object->shadow;
6011                 /*
6012                  * The offset in the shadow object is actually also
6013                  * accounted for in upl->offset.  It possibly shouldn't be
6014                  * this way, but for now don't account for it twice.
6015                  */
6016                 shadow_offset = 0;
6017                 assert(upl_object->paging_offset == 0); /* XXX ? */
6018                 vm_object_lock(shadow_object);
6019         } else {
6020                 shadow_object = upl_object;
6021                 shadow_offset = 0;
6022         }
6023
6024         paging_offset = shadow_object->paging_offset;
6025         vm_object_paging_begin(shadow_object);
6026
6027         if (shadow_object != upl_object)
6028                 vm_object_unlock(upl_object);
6029
6030
6031         base_offset = shadow_offset;
6032         base_offset += upl_offset;
6033         base_offset += crypt_offset;
6034         base_offset -= paging_offset;
6035
6036         assert(crypt_offset + crypt_size <= upl_size);
6037
6038         for (upl_offset = 0;
6039              upl_offset < crypt_size;
6040              upl_offset += PAGE_SIZE) {
6041                 page = vm_page_lookup(shadow_object,
6042                                       base_offset + upl_offset);
6043                 if (page == VM_PAGE_NULL) {
6044                         panic("upl_encrypt: "
6045                               "no page for (obj=%p,off=%lld+%d)!\n",
6046                               shadow_object,
6047                               base_offset,
6048                               upl_offset);
6049                 }
6050                 /*
6051                  * Disconnect the page from all pmaps, so that nobody can
6052                  * access it while it's encrypted.  After that point, all
6053                  * accesses to this page will cause a page fault and block
6054                  * while the page is busy being encrypted.  After the
6055                  * encryption completes, any access will cause a
6056                  * page fault and the page gets decrypted at that time.
6057                  */
6058                 pmap_disconnect(page->phys_page);
6059                 vm_page_encrypt(page, 0);
6060
6061                 if (shadow_object == vm_pageout_scan_wants_object) {
6062                         /*
6063                          * Give vm_pageout_scan() a chance to convert more
6064                          * pages from "clean-in-place" to "clean-and-free",
6065                          * if it's interested in the same pages we selected
6066                          * in this cluster.
6067                          */
6068                         vm_object_unlock(shadow_object);
6069                         vm_object_lock(shadow_object);
6070                 }
6071         }
6072
6073         vm_object_paging_end(shadow_object);
6074         vm_object_unlock(shadow_object);
6075 }
6076
6077 #else /* CRYPTO */
6078 void
6079 upl_encrypt(
6080         __unused upl_t                  upl,
6081         __unused upl_offset_t   crypt_offset,
6082         __unused upl_size_t     crypt_size)
6083 {
6084 }
6085
6086 void
6087 vm_page_encrypt(
6088         __unused vm_page_t              page,
6089         __unused vm_map_offset_t        kernel_mapping_offset)
6090 {
6091 }
6092
6093 void
6094 vm_page_decrypt(
6095         __unused vm_page_t              page,
6096         __unused vm_map_offset_t        kernel_mapping_offset)
6097 {
6098 }
6099
6100 #endif /* CRYPTO */
6101
6102 vm_size_t
6103 upl_get_internal_pagelist_offset(void)
6104 {
6105         return sizeof(struct upl);
6106 }
6107
6108 void
6109 upl_clear_dirty(
6110         upl_t           upl,
6111         boolean_t       value)
6112 {
6113         if (value) {
6114                 upl->flags |= UPL_CLEAR_DIRTY;
6115         } else {
6116                 upl->flags &= ~UPL_CLEAR_DIRTY;
6117         }
6118 }
6119
6120
6121 #ifdef MACH_BSD
6122
6123 boolean_t  upl_device_page(upl_page_info_t *upl)
6124 {
6125         return(UPL_DEVICE_PAGE(upl));
6126 }
6127 boolean_t  upl_page_present(upl_page_info_t *upl, int index)
6128 {
6129         return(UPL_PAGE_PRESENT(upl, index));
6130 }
6131 boolean_t  upl_speculative_page(upl_page_info_t *upl, int index)
6132 {
6133         return(UPL_SPECULATIVE_PAGE(upl, index));
6134 }
6135 boolean_t  upl_dirty_page(upl_page_info_t *upl, int index)
6136 {
6137         return(UPL_DIRTY_PAGE(upl, index));
6138 }
6139 boolean_t  upl_valid_page(upl_page_info_t *upl, int index)
6140 {
6141         return(UPL_VALID_PAGE(upl, index));
6142 }
6143 ppnum_t  upl_phys_page(upl_page_info_t *upl, int index)
6144 {
6145         return(UPL_PHYS_PAGE(upl, index));
6146 }
6147
6148
6149 void
6150 vm_countdirtypages(void)
6151 {
6152         vm_page_t m;
6153         int dpages;
6154         int pgopages;
6155         int precpages;
6156
6157
6158         dpages=0;
6159         pgopages=0;
6160         precpages=0;
6161
6162         vm_page_lock_queues();
6163         m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6164         do {
6165                 if (m ==(vm_page_t )0) break;
6166
6167                 if(m->dirty) dpages++;
6168                 if(m->pageout) pgopages++;
6169                 if(m->precious) precpages++;
6170
6171                 assert(m->object != kernel_object);
6172                 m = (vm_page_t) queue_next(&m->pageq);
6173                 if (m ==(vm_page_t )0) break;
6174
6175         } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
6176         vm_page_unlock_queues();
6177
6178         vm_page_lock_queues();
6179         m = (vm_page_t) queue_first(&vm_page_queue_throttled);
6180         do {
6181                 if (m ==(vm_page_t )0) break;
6182
6183                 dpages++;
6184                 assert(m->dirty);
6185                 assert(!m->pageout);
6186                 assert(m->object != kernel_object);
6187                 m = (vm_page_t) queue_next(&m->pageq);
6188                 if (m ==(vm_page_t )0) break;
6189
6190         } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
6191         vm_page_unlock_queues();
6192
6193         vm_page_lock_queues();
6194         m = (vm_page_t) queue_first(&vm_page_queue_zf);
6195         do {
6196                 if (m ==(vm_page_t )0) break;
6197
6198                 if(m->dirty) dpages++;
6199                 if(m->pageout) pgopages++;
6200                 if(m->precious) precpages++;
6201
6202                 assert(m->object != kernel_object);
6203                 m = (vm_page_t) queue_next(&m->pageq);
6204                 if (m ==(vm_page_t )0) break;
6205
6206         } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
6207         vm_page_unlock_queues();
6208
6209         printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6210
6211         dpages=0;
6212         pgopages=0;
6213         precpages=0;
6214
6215         vm_page_lock_queues();
6216         m = (vm_page_t) queue_first(&vm_page_queue_active);
6217
6218         do {
6219                 if(m == (vm_page_t )0) break;
6220                 if(m->dirty) dpages++;
6221                 if(m->pageout) pgopages++;
6222                 if(m->precious) precpages++;
6223
6224                 assert(m->object != kernel_object);
6225                 m = (vm_page_t) queue_next(&m->pageq);
6226                 if(m == (vm_page_t )0) break;
6227
6228         } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
6229         vm_page_unlock_queues();
6230
6231         printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6232
6233 }
6234 #endif /* MACH_BSD */
6235
6236 ppnum_t upl_get_highest_page(
6237                              upl_t                      upl)
6238 {
6239         return upl->highest_page;
6240 }
6241
6242 #ifdef UPL_DEBUG
6243 kern_return_t  upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6244 {
6245         upl->ubc_alias1 = alias1;
6246         upl->ubc_alias2 = alias2;
6247         return KERN_SUCCESS;
6248 }
6249 int  upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6250 {
6251         if(al)
6252                 *al = upl->ubc_alias1;
6253         if(al2)
6254                 *al2 = upl->ubc_alias2;
6255         return KERN_SUCCESS;
6256 }
6257 #endif /* UPL_DEBUG */
6258
6259
6260
6261 #if     MACH_KDB
6262 #include <ddb/db_output.h>
6263 #include <ddb/db_print.h>
6264 #include <vm/vm_print.h>
6265
6266 #define printf  kdbprintf
6267 void            db_pageout(void);
6268
6269 void
6270 db_vm(void)
6271 {
6272
6273         iprintf("VM Statistics:\n");
6274         db_indent += 2;
6275         iprintf("pages:\n");
6276         db_indent += 2;
6277         iprintf("activ %5d  inact %5d  free  %5d",
6278                 vm_page_active_count, vm_page_inactive_count,
6279                 vm_page_free_count);
6280         printf("   wire  %5d  gobbl %5d\n",
6281                vm_page_wire_count, vm_page_gobble_count);
6282         db_indent -= 2;
6283         iprintf("target:\n");
6284         db_indent += 2;
6285         iprintf("min   %5d  inact %5d  free  %5d",
6286                 vm_page_free_min, vm_page_inactive_target,
6287                 vm_page_free_target);
6288         printf("   resrv %5d\n", vm_page_free_reserved);
6289         db_indent -= 2;
6290         iprintf("pause:\n");
6291         db_pageout();
6292         db_indent -= 2;
6293 }
6294
6295 #if     MACH_COUNTERS
6296 extern int c_laundry_pages_freed;
6297 #endif  /* MACH_COUNTERS */
6298
6299 void
6300 db_pageout(void)
6301 {
6302         iprintf("Pageout Statistics:\n");
6303         db_indent += 2;
6304         iprintf("active %5d  inactv %5d\n",
6305                 vm_pageout_active, vm_pageout_inactive);
6306         iprintf("nolock %5d  avoid  %5d  busy   %5d  absent %5d\n",
6307                 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6308                 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6309         iprintf("used   %5d  clean  %5d  dirty  %5d\n",
6310                 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6311                 vm_pageout_inactive_dirty);
6312 #if     MACH_COUNTERS
6313         iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6314 #endif  /* MACH_COUNTERS */
6315 #if     MACH_CLUSTER_STATS
6316         iprintf("Cluster Statistics:\n");
6317         db_indent += 2;
6318         iprintf("dirtied   %5d   cleaned  %5d   collisions  %5d\n",
6319                 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6320                 vm_pageout_cluster_collisions);
6321         iprintf("clusters  %5d   conversions  %5d\n",
6322                 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6323         db_indent -= 2;
6324         iprintf("Target Statistics:\n");
6325         db_indent += 2;
6326         iprintf("collisions   %5d   page_dirtied  %5d   page_freed  %5d\n",
6327                 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6328                 vm_pageout_target_page_freed);
6329         db_indent -= 2;
6330 #endif  /* MACH_CLUSTER_STATS */
6331         db_indent -= 2;
6332 }
6333
6334 #endif  /* MACH_KDB */