[apple/xnu.git] / osfmk / vm / vm_pageout.c (xnu-1228.0.2)
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83 #include <mach/sdt.h>
84
85 #include <kern/kern_types.h>
86 #include <kern/counters.h>
87 #include <kern/host_statistics.h>
88 #include <kern/machine.h>
89 #include <kern/misc_protos.h>
90 #include <kern/thread.h>
91 #include <kern/xpr.h>
92 #include <kern/kalloc.h>
93
94 #include <machine/vm_tuning.h>
95
96 #if CONFIG_EMBEDDED
97 #include <sys/kern_memorystatus.h>
98 #endif
99
100 #include <vm/pmap.h>
101 #include <vm/vm_fault.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_protos.h> /* must be last */
107 #include <vm/memory_object.h>
108 #include <vm/vm_purgeable_internal.h>
109
110 /*
111 * ENCRYPTED SWAP:
112 */
113 #include <../bsd/crypto/aes/aes.h>
114
115
116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
117 #ifdef CONFIG_EMBEDDED
118 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 2048
119 #else
120 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
121 #endif
122 #endif
123
124 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
125 #ifdef CONFIG_EMBEDDED
126 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
127 #else
128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
129 #endif
130 #endif
131
132 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
133 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
134 #endif
135
136 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
137 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
138 #endif
139
140 #ifndef VM_PAGE_LAUNDRY_MAX
141 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
142 #endif /* VM_PAGE_LAUNDRY_MAX */
143
144 #ifndef VM_PAGEOUT_BURST_WAIT
145 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
146 #endif /* VM_PAGEOUT_BURST_WAIT */
147
148 #ifndef VM_PAGEOUT_EMPTY_WAIT
149 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
150 #endif /* VM_PAGEOUT_EMPTY_WAIT */
151
152 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
153 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
154 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
155
156 #ifndef VM_PAGEOUT_IDLE_WAIT
157 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
158 #endif /* VM_PAGEOUT_IDLE_WAIT */
159
160 #ifndef VM_PAGE_SPECULATIVE_TARGET
161 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
162 #endif /* VM_PAGE_SPECULATIVE_TARGET */
163
164 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
165 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
166 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
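/*
 * Worked example (illustrative, not part of the original source):
 * for an argument of 100000 pages, VM_PAGE_SPECULATIVE_TARGET gives
 * 100000 / 20 == 5000 pages (5%), and VM_PAGE_INACTIVE_HEALTHY_LIMIT
 * gives 100000 / 200 == 500 pages (0.5%), the threshold below which
 * the inactive+speculative queues are considered starved.
 */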
167
168
169 /*
170 * To obtain a reasonable LRU approximation, the inactive queue
171 * needs to be large enough to give pages on it a chance to be
172 * referenced a second time. This macro defines the fraction
173 * of active+inactive pages that should be inactive.
174 * The pageout daemon uses it to update vm_page_inactive_target.
175 *
176 * If vm_page_free_count falls below vm_page_free_target and
177 * vm_page_inactive_count is below vm_page_inactive_target,
178 * then the pageout daemon starts running.
179 */
180
181 #ifndef VM_PAGE_INACTIVE_TARGET
182 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
183 #endif /* VM_PAGE_INACTIVE_TARGET */
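/*
 * Worked example (illustrative, not part of the original source):
 * with the default ratio above, 120000 active+inactive pages yield
 *
 *	VM_PAGE_INACTIVE_TARGET(120000) == 120000 * 1 / 3 == 40000
 *
 * i.e. roughly a third of the reclaimable pages are kept inactive so
 * that each gets a chance at a second reference before being stolen.
 */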
184
185 /*
186 * Once the pageout daemon starts running, it keeps going
187 * until vm_page_free_count meets or exceeds vm_page_free_target.
188 */
189
190 #ifndef VM_PAGE_FREE_TARGET
191 #ifdef CONFIG_EMBEDDED
192 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
193 #else
194 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
195 #endif
196 #endif /* VM_PAGE_FREE_TARGET */
197
198 /*
199 * The pageout daemon always starts running once vm_page_free_count
200 * falls below vm_page_free_min.
201 */
202
203 #ifndef VM_PAGE_FREE_MIN
204 #ifdef CONFIG_EMBEDDED
205 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
206 #else
207 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
208 #endif
209 #endif /* VM_PAGE_FREE_MIN */
210
211 #define VM_PAGE_FREE_MIN_LIMIT 1500
212 #define VM_PAGE_FREE_TARGET_LIMIT 2000
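/*
 * Worked example (illustrative, not part of the original source):
 * on a non-embedded configuration, an argument of 100000 pages gives
 *
 *	VM_PAGE_FREE_TARGET(100000) == 15 + 100000 / 80  == 1265
 *	VM_PAGE_FREE_MIN(100000)    == 10 + 100000 / 100 == 1010
 *
 * The *_LIMIT constants above presumably act as upper bounds on the
 * computed values elsewhere in this file.
 */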
213
214
215 /*
216 * When vm_page_free_count falls below vm_page_free_reserved,
217 * only vm-privileged threads can allocate pages. vm-privilege
218 * allows the pageout daemon and default pager (and any other
219 * associated threads needed for default pageout) to continue
220 * operation by dipping into the reserved pool of pages.
221 */
222
223 #ifndef VM_PAGE_FREE_RESERVED
224 #define VM_PAGE_FREE_RESERVED(n) \
225 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
226 #endif /* VM_PAGE_FREE_RESERVED */
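/*
 * Worked example (illustrative, not part of the original source):
 * with VM_PAGE_LAUNDRY_MAX == 16 as defined above,
 *
 *	VM_PAGE_FREE_RESERVED(n) == (6 * 16) + n == 96 + n
 *
 * i.e. the reserved pool is 96 pages plus whatever the caller adds.
 */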
227
228 /*
229 * When we dequeue pages from the inactive list, they are
230 * reactivated (i.e., put back on the active queue) if referenced.
231 * However, it is possible to starve the free list if other
232 * processors are referencing pages faster than we can turn off
233 * the referenced bit. So we limit the number of reactivations
234 * we will make per call of vm_pageout_scan().
235 */
236 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
237 #ifndef VM_PAGE_REACTIVATE_LIMIT
238 #ifdef CONFIG_EMBEDDED
239 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
240 #else
241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
242 #endif
243 #endif /* VM_PAGE_REACTIVATE_LIMIT */
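/*
 * Worked example (illustrative, not part of the original source):
 * on a non-embedded configuration with avail == 500000 pages,
 *
 *	VM_PAGE_REACTIVATE_LIMIT(500000) == MAX(500000 / 20, 20000) == 25000
 *
 * Note that because MAX() is used here, VM_PAGE_REACTIVATE_LIMIT_MAX
 * acts as a floor on the per-call reactivation limit, not a ceiling.
 */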
244 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 100
245
246
247 /*
248 * must hold the page queues lock to
249 * manipulate this structure
250 */
251 struct vm_pageout_queue {
252 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
253 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
254 unsigned int pgo_maxlaundry;
255
256 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
257 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
258 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
259 :0;
260 };
261
262 #define VM_PAGE_Q_THROTTLED(q) \
263 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
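/*
 * Usage sketch (illustrative summary of how this file uses the queue):
 *
 *	if (VM_PAGE_Q_THROTTLED(q))
 *		... the pager cannot keep up: vm_pageout_scan() sets
 *		q->pgo_throttled and waits on &q->pgo_laundry ...
 *
 * vm_pageout_cluster() bumps pgo_laundry as it queues a page for the
 * iothread; vm_pageout_throttle_up() drops it when the page returns
 * from laundry and wakes the scan thread if pgo_throttled was set.
 */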
264
265
266 /*
267 * Exported variable used to broadcast the activation of the pageout scan.
268 * The Working Set code uses this to throttle its use of pmap removals. In
269 * this way, code which runs within memory in an uncontested context does
270 * not keep encountering soft faults.
271 */
272
273 unsigned int vm_pageout_scan_event_counter = 0;
274
275 /*
276 * Forward declarations for internal routines.
277 */
278
279 static void vm_pageout_garbage_collect(int);
280 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
281 static void vm_pageout_iothread_external(void);
282 static void vm_pageout_iothread_internal(void);
283 static void vm_pageout_queue_steal(vm_page_t);
284
285 extern void vm_pageout_continue(void);
286 extern void vm_pageout_scan(void);
287
288 static thread_t vm_pageout_external_iothread = THREAD_NULL;
289 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
290
291 unsigned int vm_pageout_reserved_internal = 0;
292 unsigned int vm_pageout_reserved_really = 0;
293
294 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
295 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
296 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
297 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
298 unsigned int vm_pageout_deadlock_relief = 0;
299 unsigned int vm_pageout_inactive_relief = 0;
300 unsigned int vm_pageout_burst_active_throttle = 0;
301 unsigned int vm_pageout_burst_inactive_throttle = 0;
302
303 /*
304 * Protection against zero fill flushing live working sets derived
305 * from existing backing store and files
306 */
307 unsigned int vm_accellerate_zf_pageout_trigger = 400;
308 unsigned int zf_queue_min_count = 100;
309 unsigned int vm_zf_count = 0;
310 unsigned int vm_zf_queue_count = 0;
311
312 /*
313 * These variables record the pageout daemon's actions:
314 * how many pages it looks at and what happens to those pages.
315 * No locking needed because only one thread modifies the variables.
316 */
317
318 unsigned int vm_pageout_active = 0; /* debugging */
319 unsigned int vm_pageout_inactive = 0; /* debugging */
320 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
321 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
322 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
323 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
324 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
325 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
326 unsigned int vm_pageout_inactive_used = 0; /* debugging */
327 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
328 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
329 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
330 unsigned int vm_pageout_purged_objects = 0; /* debugging */
331 unsigned int vm_stat_discard = 0; /* debugging */
332 unsigned int vm_stat_discard_sent = 0; /* debugging */
333 unsigned int vm_stat_discard_failure = 0; /* debugging */
334 unsigned int vm_stat_discard_throttle = 0; /* debugging */
335 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
336 unsigned int vm_pageout_catch_ups = 0; /* debugging */
337 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
338
339 unsigned int vm_pageout_scan_active_throttled = 0;
340 unsigned int vm_pageout_scan_inactive_throttled = 0;
341 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
342 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
343 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
344 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
345 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
346 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
347 /*
348 * Backing store throttle when BS is exhausted
349 */
350 unsigned int vm_backing_store_low = 0;
351
352 unsigned int vm_pageout_out_of_line = 0;
353 unsigned int vm_pageout_in_place = 0;
354
355 /*
356 * ENCRYPTED SWAP:
357 * counters and statistics...
358 */
359 unsigned long vm_page_decrypt_counter = 0;
360 unsigned long vm_page_decrypt_for_upl_counter = 0;
361 unsigned long vm_page_encrypt_counter = 0;
362 unsigned long vm_page_encrypt_abort_counter = 0;
363 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
364 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
365
366 struct vm_pageout_queue vm_pageout_queue_internal;
367 struct vm_pageout_queue vm_pageout_queue_external;
368
369 unsigned int vm_page_speculative_target = 0;
370
371 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
372
373
374 /*
375 * Routine: vm_backing_store_disable
376 * Purpose:
377 * Suspend non-privileged threads wishing to extend
378 * backing store when we are low on backing store
379 * (Synchronized by caller)
380 */
381 void
382 vm_backing_store_disable(
383 boolean_t disable)
384 {
385 if(disable) {
386 vm_backing_store_low = 1;
387 } else {
388 if(vm_backing_store_low) {
389 vm_backing_store_low = 0;
390 thread_wakeup((event_t) &vm_backing_store_low);
391 }
392 }
393 }
394
395
396 #if MACH_CLUSTER_STATS
397 unsigned long vm_pageout_cluster_dirtied = 0;
398 unsigned long vm_pageout_cluster_cleaned = 0;
399 unsigned long vm_pageout_cluster_collisions = 0;
400 unsigned long vm_pageout_cluster_clusters = 0;
401 unsigned long vm_pageout_cluster_conversions = 0;
402 unsigned long vm_pageout_target_collisions = 0;
403 unsigned long vm_pageout_target_page_dirtied = 0;
404 unsigned long vm_pageout_target_page_freed = 0;
405 #define CLUSTER_STAT(clause) clause
406 #else /* MACH_CLUSTER_STATS */
407 #define CLUSTER_STAT(clause)
408 #endif /* MACH_CLUSTER_STATS */
409
410 /*
411 * Routine: vm_pageout_object_terminate
412 * Purpose:
413 * Destroy the pageout_object, and perform all of the
414 * required cleanup actions.
415 *
416 * In/Out conditions:
417 * The object must be locked, and will be returned locked.
418 */
419 void
420 vm_pageout_object_terminate(
421 vm_object_t object)
422 {
423 vm_object_t shadow_object;
424
425 /*
426 * Deal with the deallocation (last reference) of a pageout object
427 * (used for cleaning-in-place) by dropping the paging references/
428 * freeing pages in the original object.
429 */
430
431 assert(object->pageout);
432 shadow_object = object->shadow;
433 vm_object_lock(shadow_object);
434
435 while (!queue_empty(&object->memq)) {
436 vm_page_t p, m;
437 vm_object_offset_t offset;
438
439 p = (vm_page_t) queue_first(&object->memq);
440
441 assert(p->private);
442 assert(p->pageout);
443 p->pageout = FALSE;
444 assert(!p->cleaning);
445
446 offset = p->offset;
447 VM_PAGE_FREE(p);
448 p = VM_PAGE_NULL;
449
450 m = vm_page_lookup(shadow_object,
451 offset + object->shadow_offset);
452
453 if(m == VM_PAGE_NULL)
454 continue;
455 assert(m->cleaning);
456 /* used as a trigger on upl_commit etc to recognize the */
457 /* pageout daemon's subsequent desire to page out a cleaning */
458 /* page. When the bit is on the upl commit code will */
459 /* respect the pageout bit in the target page over the */
460 /* caller's page list indication */
461 m->dump_cleaning = FALSE;
462
463 assert((m->dirty) || (m->precious) ||
464 (m->busy && m->cleaning));
465
466 /*
467 * Handle the trusted pager throttle.
468 * Also decrement the burst throttle (if external).
469 */
470 vm_page_lock_queues();
471 if (m->laundry) {
472 vm_pageout_throttle_up(m);
473 }
474
475 /*
476 * Handle the "target" page(s). These pages are to be freed if
477 * successfully cleaned. Target pages are always busy, and are
478 * wired exactly once. The initial target pages are not mapped,
479 * (so cannot be referenced or modified) but converted target
480 * pages may have been modified between the selection as an
481 * adjacent page and conversion to a target.
482 */
483 if (m->pageout) {
484 assert(m->busy);
485 assert(m->wire_count == 1);
486 m->cleaning = FALSE;
487 m->encrypted_cleaning = FALSE;
488 m->pageout = FALSE;
489 #if MACH_CLUSTER_STATS
490 if (m->wanted) vm_pageout_target_collisions++;
491 #endif
492 /*
493 * Revoke all access to the page. Since the object is
494 * locked, and the page is busy, this prevents the page
495 * from being dirtied after the pmap_disconnect() call
496 * returns.
497 *
498 * Since the page is left "dirty" but "not modified", we
499 * can detect whether the page was redirtied during
500 * pageout by checking the modify state.
501 */
502 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
503 m->dirty = TRUE;
504 else
505 m->dirty = FALSE;
506
507 if (m->dirty) {
508 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
509 vm_page_unwire(m);/* reactivates */
510 VM_STAT_INCR(reactivations);
511 PAGE_WAKEUP_DONE(m);
512 } else {
513 CLUSTER_STAT(vm_pageout_target_page_freed++;)
514 vm_page_free(m);/* clears busy, etc. */
515 }
516 vm_page_unlock_queues();
517 continue;
518 }
519 /*
520 * Handle the "adjacent" pages. These pages were cleaned in
521 * place, and should be left alone.
522 * If the page was referenced while being cleaned, make it
523 * active again; otherwise deactivate it.
524 */
525 if (!m->active && !m->inactive && !m->throttled && !m->private) {
526 if (m->reference)
527 vm_page_activate(m);
528 else
529 vm_page_deactivate(m);
530 }
531 if((m->busy) && (m->cleaning)) {
532
533 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
534 m->busy = FALSE;
535
536 /* We do not re-set m->dirty ! */
537 /* The page was busy so no extraneous activity */
538 /* could have occurred. COPY_INTO is a read into the */
539 /* new pages. CLEAN_IN_PLACE does actually write */
540 /* out the pages but handling outside of this code */
541 /* will take care of resetting dirty. We clear the */
542 /* modify however for the Programmed I/O case. */
543 pmap_clear_modify(m->phys_page);
544
545 m->absent = FALSE;
546 m->overwriting = FALSE;
547 } else if (m->overwriting) {
548 /* alternate request page list, write to page_list */
549 /* case. Occurs when the original page was wired */
550 /* at the time of the list request */
551 assert(m->wire_count != 0);
552 vm_page_unwire(m);/* reactivates */
553 m->overwriting = FALSE;
554 } else {
555 /*
556 * Set the dirty state according to whether or not the page was
557 * modified during the pageout. Note that we purposefully do
558 * NOT call pmap_clear_modify since the page is still mapped.
559 * If the page were to be dirtied between the 2 calls, this
560 * fact would be lost. This code is only necessary to
561 * maintain statistics, since the pmap module is always
562 * consulted if m->dirty is false.
563 */
564 #if MACH_CLUSTER_STATS
565 m->dirty = pmap_is_modified(m->phys_page);
566
567 if (m->dirty) vm_pageout_cluster_dirtied++;
568 else vm_pageout_cluster_cleaned++;
569 if (m->wanted) vm_pageout_cluster_collisions++;
570 #else
571 m->dirty = 0;
572 #endif
573 }
574 m->cleaning = FALSE;
575 m->encrypted_cleaning = FALSE;
576
577 /*
578 * Wakeup any thread waiting for the page to be un-cleaning.
579 */
580 PAGE_WAKEUP(m);
581 vm_page_unlock_queues();
582 }
583 /*
584 * Account for the paging reference taken in vm_paging_object_allocate.
585 */
586 vm_object_paging_end(shadow_object);
587 vm_object_unlock(shadow_object);
588
589 assert(object->ref_count == 0);
590 assert(object->paging_in_progress == 0);
591 assert(object->resident_page_count == 0);
592 return;
593 }
594
595 /*
596 * Routine: vm_pageclean_setup
597 *
598 * Purpose: setup a page to be cleaned (made non-dirty), but not
599 * necessarily flushed from the VM page cache.
600 * This is accomplished by cleaning in place.
601 *
602 * The page must not be busy, and the object and page
603 * queues must be locked.
604 *
605 */
606 void
607 vm_pageclean_setup(
608 vm_page_t m,
609 vm_page_t new_m,
610 vm_object_t new_object,
611 vm_object_offset_t new_offset)
612 {
613 assert(!m->busy);
614 #if 0
615 assert(!m->cleaning);
616 #endif
617
618 XPR(XPR_VM_PAGEOUT,
619 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
620 (integer_t)m->object, m->offset, (integer_t)m,
621 (integer_t)new_m, new_offset);
622
623 pmap_clear_modify(m->phys_page);
624
625 /*
626 * Mark original page as cleaning in place.
627 */
628 m->cleaning = TRUE;
629 m->dirty = TRUE;
630 m->precious = FALSE;
631
632 /*
633 * Convert the fictitious page to a private shadow of
634 * the real page.
635 */
636 assert(new_m->fictitious);
637 assert(new_m->phys_page == vm_page_fictitious_addr);
638 new_m->fictitious = FALSE;
639 new_m->private = TRUE;
640 new_m->pageout = TRUE;
641 new_m->phys_page = m->phys_page;
642 vm_page_wire(new_m);
643
644 vm_page_insert(new_m, new_object, new_offset);
645 assert(!new_m->wanted);
646 new_m->busy = FALSE;
647 }
648
649 /*
650 * Routine: vm_pageout_initialize_page
651 * Purpose:
652 * Causes the specified page to be initialized in
653 * the appropriate memory object. This routine is used to push
654 * pages into a copy-object when they are modified in the
655 * permanent object.
656 *
657 * The page is moved to a temporary object and paged out.
658 *
659 * In/out conditions:
660 * The page in question must not be on any pageout queues.
661 * The object to which it belongs must be locked.
662 * The page must be busy, but not hold a paging reference.
663 *
664 * Implementation:
665 * Move this page to a completely new object.
666 */
667 void
668 vm_pageout_initialize_page(
669 vm_page_t m)
670 {
671 vm_object_t object;
672 vm_object_offset_t paging_offset;
673 vm_page_t holding_page;
674 memory_object_t pager;
675
676 XPR(XPR_VM_PAGEOUT,
677 "vm_pageout_initialize_page, page 0x%X\n",
678 (integer_t)m, 0, 0, 0, 0);
679 assert(m->busy);
680
681 /*
682 * Verify that we really want to clean this page
683 */
684 assert(!m->absent);
685 assert(!m->error);
686 assert(m->dirty);
687
688 /*
689 * Create a paging reference to let us play with the object.
690 */
691 object = m->object;
692 paging_offset = m->offset + object->paging_offset;
693
694 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
695 VM_PAGE_FREE(m);
696 panic("reservation without pageout?"); /* alan */
697 vm_object_unlock(object);
698
699 return;
700 }
701
702 /*
703 * If there's no pager, then we can't clean the page. This should
704 * never happen since this should be a copy object and therefore not
705 * an external object, so the pager should always be there.
706 */
707
708 pager = object->pager;
709
710 if (pager == MEMORY_OBJECT_NULL) {
711 VM_PAGE_FREE(m);
712 panic("missing pager for copy object");
713 return;
714 }
715
716 /* set the page for future call to vm_fault_list_request */
717 vm_object_paging_begin(object);
718 holding_page = NULL;
719 vm_page_lock_queues();
720 pmap_clear_modify(m->phys_page);
721 m->dirty = TRUE;
722 m->busy = TRUE;
723 m->list_req_pending = TRUE;
724 m->cleaning = TRUE;
725 m->pageout = TRUE;
726 vm_page_wire(m);
727 vm_page_unlock_queues();
728 vm_object_unlock(object);
729
730 /*
731 * Write the data to its pager.
732 * Note that the data is passed by naming the new object,
733 * not a virtual address; the pager interface has been
734 * manipulated to use the "internal memory" data type.
735 * [The object reference from its allocation is donated
736 * to the eventual recipient.]
737 */
738 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
739
740 vm_object_lock(object);
741 vm_object_paging_end(object);
742 }
743
744 #if MACH_CLUSTER_STATS
745 #define MAXCLUSTERPAGES 16
746 struct {
747 unsigned long pages_in_cluster;
748 unsigned long pages_at_higher_offsets;
749 unsigned long pages_at_lower_offsets;
750 } cluster_stats[MAXCLUSTERPAGES];
751 #endif /* MACH_CLUSTER_STATS */
752
753
754 /*
755 * vm_pageout_cluster:
756 *
757 * Given a page, queue it to the appropriate I/O thread,
758 * which will page it out and attempt to clean adjacent pages
759 * in the same operation.
760 *
761 * The page must be busy, and the object and queues locked. We will take a
762 * paging reference to prevent deallocation or collapse when we
763 * release the object lock back at the call site. The I/O thread
764 * is responsible for consuming this reference.
765 *
766 * The page must not be on any pageout queue.
767 */
768
769 void
770 vm_pageout_cluster(vm_page_t m)
771 {
772 vm_object_t object = m->object;
773 struct vm_pageout_queue *q;
774
775
776 XPR(XPR_VM_PAGEOUT,
777 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
778 (integer_t)object, m->offset, (integer_t)m, 0, 0);
779
780 /*
781 * Only a certain kind of page is appreciated here.
782 */
783 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
784 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
785 assert(!m->throttled);
786
787 /*
788 * protect the object from collapse -
789 * locking in the object's paging_offset.
790 */
791 vm_object_paging_begin(object);
792
793 /*
794 * set the page for future call to vm_fault_list_request
795 * page should already be marked busy
796 */
797 vm_page_wire(m);
798 m->list_req_pending = TRUE;
799 m->cleaning = TRUE;
800 m->pageout = TRUE;
801 m->laundry = TRUE;
802
803 if (object->internal == TRUE)
804 q = &vm_pageout_queue_internal;
805 else
806 q = &vm_pageout_queue_external;
807 q->pgo_laundry++;
808
809 m->pageout_queue = TRUE;
810 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
811
812 if (q->pgo_idle == TRUE) {
813 q->pgo_idle = FALSE;
814 thread_wakeup((event_t) &q->pgo_pending);
815 }
816 }
817
818
819 unsigned long vm_pageout_throttle_up_count = 0;
820
821 /*
822 * A page is back from laundry. See if there are some pages waiting to
823 * go to laundry and if we can let some of them go now.
824 *
825 * Object and page queues must be locked.
826 */
827 void
828 vm_pageout_throttle_up(
829 vm_page_t m)
830 {
831 struct vm_pageout_queue *q;
832
833 vm_pageout_throttle_up_count++;
834
835 assert(m->laundry);
836 assert(m->object != VM_OBJECT_NULL);
837 assert(m->object != kernel_object);
838
839 if (m->object->internal == TRUE)
840 q = &vm_pageout_queue_internal;
841 else
842 q = &vm_pageout_queue_external;
843
844 m->laundry = FALSE;
845 q->pgo_laundry--;
846
847 if (q->pgo_throttled == TRUE) {
848 q->pgo_throttled = FALSE;
849 thread_wakeup((event_t) &q->pgo_laundry);
850 }
851 }
852
853
854 /*
855 * vm_pageout_scan does the dirty work for the pageout daemon.
856 * It returns with vm_page_queue_free_lock held and
857 * vm_page_free_wanted == 0.
858 */
859
860 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
861
862 #define FCS_IDLE 0
863 #define FCS_DELAYED 1
864 #define FCS_DEADLOCK_DETECTED 2
865
866 struct flow_control {
867 int state;
868 mach_timespec_t ts;
869 };
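/*
 * Summary of the flow-control state machine driven below (illustrative):
 *
 *	FCS_IDLE              -> default pager queue throttled: arm a
 *	                         vm_pageout_deadlock_wait timeout and
 *	                         move to FCS_DELAYED
 *	FCS_DELAYED           -> timeout expired with the queue still
 *	                         throttled: assume a potential deadlock,
 *	                         set vm_pageout_deadlock_target and move
 *	                         to FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED -> steal pages until the target is met,
 *	                         then re-arm the timer
 */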
870
871 void
872 vm_pageout_scan(void)
873 {
874 unsigned int loop_count = 0;
875 unsigned int inactive_burst_count = 0;
876 unsigned int active_burst_count = 0;
877 unsigned int reactivated_this_call;
878 unsigned int reactivate_limit;
879 vm_page_t local_freeq = NULL;
880 int local_freed = 0;
881 int delayed_unlock;
882 int need_internal_inactive = 0;
883 int refmod_state = 0;
884 int vm_pageout_deadlock_target = 0;
885 struct vm_pageout_queue *iq;
886 struct vm_pageout_queue *eq;
887 struct vm_speculative_age_q *sq;
888 struct flow_control flow_control;
889 boolean_t inactive_throttled = FALSE;
890 boolean_t try_failed;
891 mach_timespec_t ts;
892 unsigned int msecs = 0;
893 vm_object_t object;
894 vm_object_t last_object_tried;
895 int zf_ratio;
896 int zf_run_count;
897 uint32_t catch_up_count = 0;
898 uint32_t inactive_reclaim_run;
899 boolean_t forced_reclaim;
900
901 flow_control.state = FCS_IDLE;
902 iq = &vm_pageout_queue_internal;
903 eq = &vm_pageout_queue_external;
904 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
905
906
907 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
908
909
910 vm_page_lock_queues();
911 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
912
913 /*
914 * Calculate the max number of referenced pages on the inactive
915 * queue that we will reactivate.
916 */
917 reactivated_this_call = 0;
918 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
919 vm_page_inactive_count);
920 inactive_reclaim_run = 0;
921
922
923 /*???*/ /*
924 * We want to gradually dribble pages from the active queue
925 * to the inactive queue. If we let the inactive queue get
926 * very small, and then suddenly dump many pages into it,
927 * those pages won't get a sufficient chance to be referenced
928 * before we start taking them from the inactive queue.
929 *
930 * We must limit the rate at which we send pages to the pagers.
931 * data_write messages consume memory, for message buffers and
932 * for map-copy objects. If we get too far ahead of the pagers,
933 * we can potentially run out of memory.
934 *
935 * We can use the laundry count to limit directly the number
936 * of pages outstanding to the default pager. A similar
937 * strategy for external pagers doesn't work, because
938 * external pagers don't have to deallocate the pages sent them,
939 * and because we might have to send pages to external pagers
940 * even if they aren't processing writes. So we also
941 * use a burst count to limit writes to external pagers.
942 *
943 * When memory is very tight, we can't rely on external pagers to
944 * clean pages. They probably aren't running, because they
945 * aren't vm-privileged. If we kept sending dirty pages to them,
946 * we could exhaust the free list.
947 */
948
949
950 Restart:
951 assert(delayed_unlock!=0);
952
953 /*
954 * A page is "zero-filled" if it was not paged in from somewhere,
955 * and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
956 * Recalculate the zero-filled page ratio. We use this to apportion
957 * victimized pages between the normal and zero-filled inactive
958 * queues according to their relative abundance in memory. Thus if a task
959 * is flooding memory with zf pages, we begin to hunt them down.
960 * It would be better to throttle greedy tasks at a higher level,
961 * but at the moment mach vm cannot do this.
962 */
963 {
964 uint32_t total = vm_page_active_count + vm_page_inactive_count;
965 uint32_t normal = total - vm_zf_count;
966
967 /* zf_ratio is the number of zf pages we victimize per normal page */
968
969 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
970 zf_ratio = 0;
971 else if ((vm_zf_count <= normal) || (normal == 0))
972 zf_ratio = 1;
973 else
974 zf_ratio = vm_zf_count / normal;
975
976 zf_run_count = 0;
977 }
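	/*
	 * Worked example (illustrative, not part of the original source):
	 * with 100000 active+inactive pages of which vm_zf_count == 75000,
	 * normal == 25000 and zf_ratio == 75000 / 25000 == 3, so the scan
	 * below victimizes up to three zero-filled pages for every normal
	 * inactive page it takes.
	 */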
978
979 /*
980 * Recalculate vm_page_inactive_target.
981 */
982 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
983 vm_page_inactive_count +
984 vm_page_speculative_count);
985 /*
986 * don't want to wake the pageout_scan thread up every time we fall below
987 * the targets... set a low water mark at 0.25% below the target
988 */
989 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
990
991 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
992 vm_page_inactive_count);
993 object = NULL;
994 last_object_tried = NULL;
995 try_failed = FALSE;
996
997 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
998 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
999 else
1000 catch_up_count = 0;
1001
1002 for (;;) {
1003 vm_page_t m;
1004
1005 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1006
1007 if (delayed_unlock == 0) {
1008 vm_page_lock_queues();
1009 delayed_unlock = 1;
1010 }
1011
1012 /*
1013 * Don't sweep through active queue more than the throttle
1014 * which should be kept relatively low
1015 */
1016 active_burst_count = vm_pageout_burst_active_throttle;
1017
1018 /*
1019 * Move pages from active to inactive.
1020 */
1021 if (need_internal_inactive == 0 && (vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1022 goto done_moving_active_pages;
1023
1024 while (!queue_empty(&vm_page_queue_active) &&
1025 (need_internal_inactive || active_burst_count)) {
1026
1027 if (active_burst_count)
1028 active_burst_count--;
1029
1030 vm_pageout_active++;
1031
1032 m = (vm_page_t) queue_first(&vm_page_queue_active);
1033
1034 assert(m->active && !m->inactive);
1035 assert(!m->laundry);
1036 assert(m->object != kernel_object);
1037 assert(m->phys_page != vm_page_guard_addr);
1038
1039 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1040
1041 /*
1042 * Try to lock object; since we've already got the
1043 * page queues lock, we can only 'try' for this one.
1044 * if the 'try' fails, we need to do a mutex_pause
1045 * to allow the owner of the object lock a chance to
1046 * run... otherwise, we're likely to trip over this
1047 * object in the same state as we work our way through
1048 * the queue... clumps of pages associated with the same
1049 * object are fairly typical on the inactive and active queues
1050 */
1051 if (m->object != object) {
1052 if (object != NULL) {
1053 vm_object_unlock(object);
1054 object = NULL;
1055 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1056 }
1057 if (!vm_object_lock_try_scan(m->object)) {
1058 /*
1059 * move page to end of active queue and continue
1060 */
1061 queue_remove(&vm_page_queue_active, m,
1062 vm_page_t, pageq);
1063 queue_enter(&vm_page_queue_active, m,
1064 vm_page_t, pageq);
1065
1066 try_failed = TRUE;
1067
1068 m = (vm_page_t) queue_first(&vm_page_queue_active);
1069 /*
1070 * this is the next object we're going to be interested in
1071 * try to make sure it's available after the mutex_yield
1072 * returns control
1073 */
1074 vm_pageout_scan_wants_object = m->object;
1075
1076 goto done_with_activepage;
1077 }
1078 object = m->object;
1079
1080 try_failed = FALSE;
1081 }
1082
1083 /*
1084 * if the page is BUSY, then we pull it
1085 * off the active queue and leave it alone.
1086 * when BUSY is cleared, it will get stuck
1087 * back on the appropriate queue
1088 */
1089 if (m->busy) {
1090 queue_remove(&vm_page_queue_active, m,
1091 vm_page_t, pageq);
1092 m->pageq.next = NULL;
1093 m->pageq.prev = NULL;
1094
1095 if (!m->fictitious)
1096 vm_page_active_count--;
1097 m->active = FALSE;
1098
1099 goto done_with_activepage;
1100 }
1101
1102 /*
1103 * Deactivate the page while holding the object
1104 * locked, so we know the page is still not busy.
1105 * This should prevent races between pmap_enter
1106 * and pmap_clear_reference. The page might be
1107 * absent or fictitious, but vm_page_deactivate
1108 * can handle that.
1109 */
1110 vm_page_deactivate(m);
1111
1112 if (need_internal_inactive) {
1113 vm_pageout_scan_active_throttle_success++;
1114 need_internal_inactive--;
1115 }
1116 done_with_activepage:
1117 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1118
1119 if (object != NULL) {
1120 vm_object_unlock(object);
1121 object = NULL;
1122 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1123 }
1124 if (local_freeq) {
1125 vm_page_free_list(local_freeq);
1126
1127 local_freeq = NULL;
1128 local_freed = 0;
1129 }
1130 mutex_yield(&vm_page_queue_lock);
1131
1132 delayed_unlock = 1;
1133
1134 /*
1135 * continue the while loop processing
1136 * the active queue... need to hold
1137 * the page queues lock
1138 */
1139 }
1140 }
1141
1142
1143
1144 /**********************************************************************
1145 * above this point we're playing with the active queue
1146 * below this point we're playing with the throttling mechanisms
1147 * and the inactive queue
1148 **********************************************************************/
1149
1150 done_moving_active_pages:
1151
1152 /*
1153 * We are done if we have met our target *and*
1154 * nobody is still waiting for a page.
1155 */
1156 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1157 if (object != NULL) {
1158 vm_object_unlock(object);
1159 object = NULL;
1160 }
1161 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1162
1163 if (local_freeq) {
1164 vm_page_free_list(local_freeq);
1165
1166 local_freeq = NULL;
1167 local_freed = 0;
1168 }
1169 /*
1170 * inactive target still not met... keep going
1171 * until we get the queues balanced
1172 */
1173 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1174 !queue_empty(&vm_page_queue_active))
1175 continue;
1176
1177 mutex_lock(&vm_page_queue_free_lock);
1178
1179 if ((vm_page_free_count >= vm_page_free_target) &&
1180 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1181
1182 vm_page_unlock_queues();
1183
1184 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1185
1186 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1187
1188 return;
1189 }
1190 mutex_unlock(&vm_page_queue_free_lock);
1191 }
1192 /*
1193 * Before anything, we check if we have any ripe volatile objects around.
1194 * If so, purge the first and see what it gives us.
1195 */
1196 assert (available_for_purge>=0);
1197 if (available_for_purge)
1198 {
1199 if (object != NULL) {
1200 vm_object_unlock(object);
1201 object = NULL;
1202 }
1203 vm_purgeable_object_purge_one();
1204 continue;
1205 }
1206
1207 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1208 /*
1209 * try to pull pages from the aging bins
1210 * see vm_page.h for an explanation of how
1211 * this mechanism works
1212 */
1213 struct vm_speculative_age_q *aq;
1214 mach_timespec_t ts_fully_aged;
1215 boolean_t can_steal = FALSE;
1216
1217 aq = &vm_page_queue_speculative[speculative_steal_index];
1218
1219 while (queue_empty(&aq->age_q)) {
1220
1221 speculative_steal_index++;
1222
1223 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1224 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1225
1226 aq = &vm_page_queue_speculative[speculative_steal_index];
1227 }
1228 if (vm_page_speculative_count > vm_page_speculative_target)
1229 can_steal = TRUE;
1230 else {
1231 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1232 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1233 * 1000 * NSEC_PER_USEC;
1234
1235 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1236
1237 clock_get_system_nanotime(&ts.tv_sec, (unsigned *)&ts.tv_nsec);
1238
1239 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1240 can_steal = TRUE;
1241 }
1242 if (can_steal == TRUE)
1243 vm_page_speculate_ageit(aq);
1244 }
1245
1246 /*
1247 * Sometimes we have to pause:
1248 * 1) No inactive pages - nothing to do.
1249 * 2) Flow control - default pageout queue is full
1250 * 3) Loop control - no acceptable pages found on the inactive queue
1251 * within the last vm_pageout_burst_inactive_throttle iterations
1252 */
1253 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1254 (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1255 vm_pageout_scan_empty_throttle++;
1256 msecs = vm_pageout_empty_wait;
1257 goto vm_pageout_scan_delay;
1258
1259 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1260 vm_pageout_scan_burst_throttle++;
1261 msecs = vm_pageout_burst_wait;
1262 goto vm_pageout_scan_delay;
1263
1264 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1265
1266 switch (flow_control.state) {
1267
1268 case FCS_IDLE:
1269 reset_deadlock_timer:
1270 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1271 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1272 clock_get_system_nanotime(&flow_control.ts.tv_sec,
1273 (unsigned *)&flow_control.ts.tv_nsec);
1274 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1275
1276 flow_control.state = FCS_DELAYED;
1277 msecs = vm_pageout_deadlock_wait;
1278
1279 break;
1280
1281 case FCS_DELAYED:
1282 clock_get_system_nanotime(&ts.tv_sec,
1283 (unsigned *)&ts.tv_nsec);
1284
1285 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1286 /*
1287 * the pageout thread for the default pager is potentially
1288 * deadlocked since the
1289 * default pager queue has been throttled for more than the
1290 * allowable time... we need to move some clean pages or dirty
1291 * pages belonging to the external pagers if they aren't throttled
1292 * vm_page_free_wanted represents the number of threads currently
1293 * blocked waiting for pages... we'll move one page for each of
1294 * these plus a fixed amount to break the logjam... once we're done
1295 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1296 * with a new timeout target since we have no way of knowing
1297 * whether we've broken the deadlock except through observation
1298 * of the queue associated with the default pager... we need to
1299 * stop moving pages and allow the system to run to see what
1300 * state it settles into.
1301 */
1302 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1303 vm_pageout_scan_deadlock_detected++;
1304 flow_control.state = FCS_DEADLOCK_DETECTED;
1305
1306 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1307 goto consider_inactive;
1308 }
1309 /*
1310 * just resniff instead of trying
1311 * to compute a new delay time... we're going to be
1312 * awakened immediately upon a laundry completion,
1313 * so we won't wait any longer than necessary
1314 */
1315 msecs = vm_pageout_idle_wait;
1316 break;
1317
1318 case FCS_DEADLOCK_DETECTED:
1319 if (vm_pageout_deadlock_target)
1320 goto consider_inactive;
1321 goto reset_deadlock_timer;
1322
1323 }
1324 vm_pageout_scan_throttle++;
1325 iq->pgo_throttled = TRUE;
1326 vm_pageout_scan_delay:
1327 if (object != NULL) {
1328 vm_object_unlock(object);
1329 object = NULL;
1330 }
1331 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1332
1333 if (local_freeq) {
1334 vm_page_free_list(local_freeq);
1335
1336 local_freeq = NULL;
1337 local_freed = 0;
1338 }
1339 #if CONFIG_EMBEDDED
1340 {
1341 int percent_avail;
1342
1343 /*
1344 * Decide if we need to send a memory status notification.
1345 */
1346 percent_avail =
1347 (vm_page_active_count + vm_page_inactive_count +
1348 vm_page_speculative_count + vm_page_free_count +
1349 vm_page_purgeable_count ) * 100 /
1350 atop_64(max_mem);
1351 if (percent_avail >= (kern_memorystatus_level + 5) ||
1352 percent_avail <= (kern_memorystatus_level - 5)) {
1353 kern_memorystatus_level = percent_avail;
1354 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1355 }
1356 }
1357 #endif
1358 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1359
1360 counter(c_vm_pageout_scan_block++);
1361
1362 vm_page_unlock_queues();
1363
1364 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1365
1366 thread_block(THREAD_CONTINUE_NULL);
1367
1368 vm_page_lock_queues();
1369 delayed_unlock = 1;
1370
1371 iq->pgo_throttled = FALSE;
1372
1373 if (loop_count >= vm_page_inactive_count)
1374 loop_count = 0;
1375 inactive_burst_count = 0;
1376
1377 goto Restart;
1378 /*NOTREACHED*/
1379 }
1380
1381
1382 flow_control.state = FCS_IDLE;
1383 consider_inactive:
1384 loop_count++;
1385 inactive_burst_count++;
1386 vm_pageout_inactive++;
1387
1388 /* Choose a victim. */
1389
1390 while (1) {
1391 m = NULL;
1392
1393 /*
1394 * the most eligible pages are ones that were throttled because the
1395 * pager wasn't ready at the time. If a pager is ready now,
1396 * see if one of these is useful.
1397 */
1398 if (!VM_PAGE_Q_THROTTLED(iq) && !queue_empty(&vm_page_queue_throttled)) {
1399 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
1400 break;
1401 }
1402
1403 /*
1404 * The second most eligible pages are ones we paged in speculatively,
1405 * but which have not yet been touched.
1406 */
1407 if ( !queue_empty(&sq->age_q) ) {
1408 m = (vm_page_t) queue_first(&sq->age_q);
1409 break;
1410 }
1411 /*
1412 * Time for a zero-filled inactive page?
1413 */
1414 if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1415 queue_empty(&vm_page_queue_inactive)) {
1416 if ( !queue_empty(&vm_page_queue_zf) ) {
1417 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1418 zf_run_count++;
1419 break;
1420 }
1421 }
1422 /*
1423 * It's either a normal inactive page or nothing.
1424 */
1425 if ( !queue_empty(&vm_page_queue_inactive) ) {
1426 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1427 zf_run_count = 0;
1428 break;
1429 }
1430
1431 panic("vm_pageout: no victim");
1432 }
1433
1434 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1435 assert(!m->laundry);
1436 assert(m->object != kernel_object);
1437 assert(m->phys_page != vm_page_guard_addr);
1438
1439 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1440
1441 /*
1442 * check to see if we currently are working
1443 * with the same object... if so, we've
1444 * already got the lock
1445 */
1446 if (m->object != object) {
1447 /*
1448 * the object associated with candidate page is
1449 * different from the one we were just working
1450 * with... dump the lock if we still own it
1451 */
1452 if (object != NULL) {
1453 vm_object_unlock(object);
1454 object = NULL;
1455 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1456 }
1457 /*
1458 * Try to lock object; since we've already got the
1459 * page queues lock, we can only 'try' for this one.
1460 * if the 'try' fails, we need to do a mutex_pause
1461 * to allow the owner of the object lock a chance to
1462 * run... otherwise, we're likely to trip over this
1463 * object in the same state as we work our way through
1464 * the queue... clumps of pages associated with the same
1465 * object are fairly typical on the inactive and active queues
1466 */
1467 if (!vm_object_lock_try_scan(m->object)) {
1468 /*
1469 * Move page to end and continue.
1470 * Don't re-issue ticket
1471 */
1472 if (m->zero_fill) {
1473 queue_remove(&vm_page_queue_zf, m,
1474 vm_page_t, pageq);
1475 queue_enter(&vm_page_queue_zf, m,
1476 vm_page_t, pageq);
1477 } else if (m->speculative) {
1478 remque(&m->pageq);
1479 m->speculative = FALSE;
1480 vm_page_speculative_count--;
1481
1482 /*
1483 * move to the tail of the inactive queue
1484 * to get it out of the way... the speculative
1485 * queue is generally too small to depend
1486 * on there being enough pages from other
1487 * objects to make cycling it back on the
1488 * same queue a winning proposition
1489 */
1490 queue_enter(&vm_page_queue_inactive, m,
1491 vm_page_t, pageq);
1492 m->inactive = TRUE;
1493 vm_page_inactive_count++;
1494 token_new_pagecount++;
1495 } else if (m->throttled) {
1496 queue_remove(&vm_page_queue_throttled, m,
1497 vm_page_t, pageq);
1498 m->throttled = FALSE;
1499 vm_page_throttled_count--;
1500
1501 /*
1502 * not throttled any more, so can stick
1503 * it on the inactive queue.
1504 */
1505 queue_enter(&vm_page_queue_inactive, m,
1506 vm_page_t, pageq);
1507 m->inactive = TRUE;
1508 vm_page_inactive_count++;
1509 token_new_pagecount++;
1510 } else {
1511 queue_remove(&vm_page_queue_inactive, m,
1512 vm_page_t, pageq);
1513 #if MACH_ASSERT
1514 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1515 #endif
1516 vm_purgeable_q_advance_all(1);
1517
1518 queue_enter(&vm_page_queue_inactive, m,
1519 vm_page_t, pageq);
1520 #if MACH_ASSERT
1521 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1522 #endif
1523 token_new_pagecount++;
1524 }
1525 pmap_clear_reference(m->phys_page);
1526 m->reference = FALSE;
1527
1528 vm_pageout_inactive_nolock++;
1529
1530 if ( !queue_empty(&sq->age_q) )
1531 m = (vm_page_t) queue_first(&sq->age_q);
1532 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1533 queue_empty(&vm_page_queue_inactive)) {
1534 if ( !queue_empty(&vm_page_queue_zf) )
1535 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1536 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1537 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1538 }
1539 /*
1540 * this is the next object we're going to be interested in
1541 * try to make sure it's available after the mutex_yield
1542 * returns control
1543 */
1544 vm_pageout_scan_wants_object = m->object;
1545
1546 /*
1547 * force us to dump any collected free pages
1548 * and to pause before moving on
1549 */
1550 try_failed = TRUE;
1551
1552 goto done_with_inactivepage;
1553 }
1554 object = m->object;
1555 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1556
1557 try_failed = FALSE;
1558 }
1559
1560 /*
1561 * Paging out pages of external objects which
1562 * are currently being created must be avoided.
1563 * The pager may need to allocate memory, which could lead to a
1564 * deadlock between it and the pageout thread if such pages are
1565 * chosen. The assumption is that there will eventually be enough
1566 * available pages in the inactive pool to page out, so that the
1567 * thread which is concurrently creating the pager can satisfy all
1568 * of the memory it claims.
1569 */
1570 if (!object->pager_initialized && object->pager_created) {
1571 /*
1572 * Move page to end and continue, hoping that
1573 * there will be enough other inactive pages to
1574 * page out so that the thread which currently
1575 * initializes the pager will succeed.
1576 * Don't re-grant the ticket; the page should be
1577 * pulled from the queue and paged out whenever
1578 * one of its logically adjacent fellows is
1579 * targeted.
1580 *
1581 * Pages found on the speculative list can never be
1582 * in this state... they always have a pager associated
1583 * with them.
1584 */
1585 assert(!m->speculative);
1586
1587 if (m->zero_fill) {
1588 queue_remove(&vm_page_queue_zf, m,
1589 vm_page_t, pageq);
1590 queue_enter(&vm_page_queue_zf, m,
1591 vm_page_t, pageq);
1592 } else {
1593 queue_remove(&vm_page_queue_inactive, m,
1594 vm_page_t, pageq);
1595 #if MACH_ASSERT
1596 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1597 #endif
1598 vm_purgeable_q_advance_all(1);
1599
1600 queue_enter(&vm_page_queue_inactive, m,
1601 vm_page_t, pageq);
1602 #if MACH_ASSERT
1603 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1604 #endif
1605 token_new_pagecount++;
1606 }
1607 vm_pageout_inactive_avoid++;
1608
1609 goto done_with_inactivepage;
1610 }
1611 /*
1612 * Remove the page from its list.
1613 */
1614 if (m->speculative) {
1615 remque(&m->pageq);
1616 m->speculative = FALSE;
1617 vm_page_speculative_count--;
1618 } else if (m->throttled) {
1619 queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1620 m->throttled = FALSE;
1621 vm_page_throttled_count--;
1622 } else {
1623 if (m->zero_fill) {
1624 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1625 vm_zf_queue_count--;
1626 } else {
1627 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1628 }
1629 m->inactive = FALSE;
1630 if (!m->fictitious)
1631 vm_page_inactive_count--;
1632 vm_purgeable_q_advance_all(1);
1633 }
1634
1635 if (object->copy == VM_OBJECT_NULL &&
1636 (object->purgable == VM_PURGABLE_EMPTY ||
1637 object->purgable == VM_PURGABLE_VOLATILE)) {
1638 assert(m->wire_count == 0); /* if it's wired, we can't put it on our queue */
1639 /* just stick it back on! */
1640 goto reactivate_page;
1641 }
1642 m->pageq.next = NULL;
1643 m->pageq.prev = NULL;
1644
1645 if ( !m->fictitious && catch_up_count)
1646 catch_up_count--;
1647
1648 /*
1649 * ENCRYPTED SWAP:
1650 * if this page has already been picked up as part of a
1651 * page-out cluster, it will be busy because it is being
1652 * encrypted (see vm_object_upl_request()). But we still
1653 * want to demote it from "clean-in-place" (aka "adjacent")
1654 * to "clean-and-free" (aka "target"), so let's ignore its
1655 * "busy" bit here and proceed to check for "cleaning" a
1656 * little bit below...
1657 */
1658 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1659 /*
1660 * Somebody is already playing with this page.
1661 * Leave it off the pageout queues.
1662 *
1663 */
1664 vm_pageout_inactive_busy++;
1665
1666 goto done_with_inactivepage;
1667 }
1668
1669 /*
1670 * If it's absent or in error, we can reclaim the page.
1671 */
1672
1673 if (m->absent || m->error) {
1674 vm_pageout_inactive_absent++;
1675 reclaim_page:
1676 if (vm_pageout_deadlock_target) {
1677 vm_pageout_scan_inactive_throttle_success++;
1678 vm_pageout_deadlock_target--;
1679 }
1680
1681 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1682
1683 if (m->object->internal) {
1684 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1685 } else {
1686 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1687 }
1688
1689 vm_page_free_prepare(m);
1690
1691 assert(m->pageq.next == NULL &&
1692 m->pageq.prev == NULL);
1693 m->pageq.next = (queue_entry_t)local_freeq;
1694 local_freeq = m;
1695 local_freed++;
1696
1697 inactive_burst_count = 0;
1698
1699 goto done_with_inactivepage;
1700 }
1701
1702 assert(!m->private);
1703 assert(!m->fictitious);
1704
1705 /*
1706 * If already cleaning this page in place, convert from
1707 * "adjacent" to "target". We can leave the page mapped,
1708 * and vm_pageout_object_terminate will determine whether
1709 * to free or reactivate.
1710 */
1711
1712 if (m->cleaning) {
1713 m->busy = TRUE;
1714 m->pageout = TRUE;
1715 m->dump_cleaning = TRUE;
1716 vm_page_wire(m);
1717
1718 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1719
1720 inactive_burst_count = 0;
1721
1722 goto done_with_inactivepage;
1723 }
1724
1725 /*
1726 * If it's being used, reactivate.
1727 * (Fictitious pages are either busy or absent.)
1728 * First, update the reference and dirty bits
1729 * to make sure the page is unreferenced.
1730 */
1731 refmod_state = -1;
1732
1733 if (m->reference == FALSE && m->pmapped == TRUE) {
1734 refmod_state = pmap_get_refmod(m->phys_page);
1735
1736 if (refmod_state & VM_MEM_REFERENCED)
1737 m->reference = TRUE;
1738 if (refmod_state & VM_MEM_MODIFIED)
1739 m->dirty = TRUE;
1740 }
1741 if (m->reference && !m->no_cache) {
1742 /*
1743 * The page we pulled off the inactive list has
1744 * been referenced. It is possible for other
1745 * processors to be touching pages faster than we
1746 * can clear the referenced bit and traverse the
1747 * inactive queue, so we limit the number of
1748 * reactivations.
1749 */
1750 if (++reactivated_this_call >= reactivate_limit) {
1751 vm_pageout_reactivation_limit_exceeded++;
1752 } else if (catch_up_count) {
1753 vm_pageout_catch_ups++;
1754 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
1755 vm_pageout_inactive_force_reclaim++;
1756 } else {
1757 /*
1758 * The page was being used, so put back on active list.
1759 */
1760 reactivate_page:
1761 vm_page_activate(m);
1762 VM_STAT_INCR(reactivations);
1763
1764 vm_pageout_inactive_used++;
1765 inactive_burst_count = 0;
1766
1767 goto done_with_inactivepage;
1768 }
1769 /*
1770 * Make sure we call pmap_get_refmod() if it
1771 * wasn't already called just above, to update
1772 * the dirty bit.
1773 */
1774 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
1775 refmod_state = pmap_get_refmod(m->phys_page);
1776 if (refmod_state & VM_MEM_MODIFIED)
1777 m->dirty = TRUE;
1778 }
1779 forced_reclaim = TRUE;
1780 } else {
1781 forced_reclaim = FALSE;
1782 }
1783
1784 XPR(XPR_VM_PAGEOUT,
1785 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1786 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1787
1788 /*
1789 * we've got a candidate page to steal...
1790 *
1791 * m->dirty is up to date courtesy of the
1792 * preceding check for m->reference... if
1793 * we get here, then m->reference had to be
1794 * FALSE (or possibly "reactivate_limit" was
1795 * exceeded), but in either case we called
1796 * pmap_get_refmod() and updated both
1797 * m->reference and m->dirty
1798 *
1799 * if it's dirty or precious we need to
1800 * see if the target queue is throttled
1801 * if it is, we need to skip over it by moving it back
1802 * to the end of the inactive queue
1803 */
1804 inactive_throttled = FALSE;
1805
1806 if (m->dirty || m->precious) {
1807 if (object->internal) {
1808 if (VM_PAGE_Q_THROTTLED(iq))
1809 inactive_throttled = TRUE;
1810 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1811 inactive_throttled = TRUE;
1812 }
1813 }
1814 if (inactive_throttled == TRUE) {
1815 throttle_inactive:
1816 if (!IP_VALID(memory_manager_default) &&
1817 object->internal &&
1818 (object->purgable == VM_PURGABLE_DENY ||
1819 object->purgable == VM_PURGABLE_NONVOLATILE)) {
1820 queue_enter(&vm_page_queue_throttled, m,
1821 vm_page_t, pageq);
1822 m->throttled = TRUE;
1823 vm_page_throttled_count++;
1824 } else {
1825 if (m->zero_fill) {
1826 queue_enter(&vm_page_queue_zf, m,
1827 vm_page_t, pageq);
1828 vm_zf_queue_count++;
1829 } else
1830 queue_enter(&vm_page_queue_inactive, m,
1831 vm_page_t, pageq);
1832 m->inactive = TRUE;
1833 if (!m->fictitious) {
1834 vm_page_inactive_count++;
1835 token_new_pagecount++;
1836 }
1837 }
1838 vm_pageout_scan_inactive_throttled++;
1839 goto done_with_inactivepage;
1840 }
1841
1842 /*
1843 * we've got a page that we can steal...
1844 * eliminate all mappings and make sure
1845 * we have the up-to-date modified state
1846 * first take the page BUSY, so that no new
1847 * mappings can be made
1848 */
1849 m->busy = TRUE;
1850
1851 /*
1852 * if we need to do a pmap_disconnect then we
1853 * need to re-evaluate m->dirty since the pmap_disconnect
1854 * provides the true state atomically... the
1855 * page was still mapped up to the pmap_disconnect
1856 * and may have been dirtied at the last microsecond
1857 *
1858 * we also check for the page being referenced 'late'
1859 * if it was, we first need to do a WAKEUP_DONE on it
1860 * since we already set m->busy = TRUE, before
1861 * going off to reactivate it
1862 *
1863 * Note that if 'pmapped' is FALSE then the page is not
1864 * and has not been in any map, so there is no point calling
1865 * pmap_disconnect(). m->dirty and/or m->reference could
1866 * have been set in anticipation of likely usage of the page.
1867 */
1868 if (m->pmapped == TRUE) {
1869 refmod_state = pmap_disconnect(m->phys_page);
1870
1871 if (refmod_state & VM_MEM_MODIFIED)
1872 m->dirty = TRUE;
1873 if (refmod_state & VM_MEM_REFERENCED) {
1874
1875 /* If m->reference is already set, this page must have
1876 * already failed the reactivate_limit test, so don't
1877 * bump the counts twice.
1878 */
1879 if ( ! m->reference ) {
1880 m->reference = TRUE;
1881 if (forced_reclaim ||
1882 ++reactivated_this_call >= reactivate_limit)
1883 vm_pageout_reactivation_limit_exceeded++;
1884 else {
1885 PAGE_WAKEUP_DONE(m);
1886 goto reactivate_page;
1887 }
1888 }
1889 }
1890 }
1891 /*
1892 * reset our count of pages that have been reclaimed
1893 * since the last page was 'stolen'
1894 */
1895 inactive_reclaim_run = 0;
1896
1897 /*
1898 * If it's clean and not precious, we can free the page.
1899 */
1900 if (!m->dirty && !m->precious) {
1901 vm_pageout_inactive_clean++;
1902 goto reclaim_page;
1903 }
1904
1905 /*
1906 * The page may have been dirtied since the last check
1907 * for a throttled target queue (which may have been skipped
1908 * if the page was clean then). With the dirty page
1909 * disconnected here, we can make one final check.
1910 */
1911 {
1912 boolean_t disconnect_throttled = FALSE;
1913 if (object->internal) {
1914 if (VM_PAGE_Q_THROTTLED(iq))
1915 disconnect_throttled = TRUE;
1916 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1917 disconnect_throttled = TRUE;
1918 }
1919
1920 if (disconnect_throttled == TRUE) {
1921 PAGE_WAKEUP_DONE(m);
1922 goto throttle_inactive;
1923 }
1924 }
1925
1926 vm_pageout_cluster(m);
1927
1928 vm_pageout_inactive_dirty++;
1929
1930 inactive_burst_count = 0;
1931
1932 done_with_inactivepage:
1933 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1934
1935 if (object != NULL) {
1936 vm_object_unlock(object);
1937 object = NULL;
1938 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1939 }
1940 if (local_freeq) {
1941 vm_page_free_list(local_freeq);
1942
1943 local_freeq = NULL;
1944 local_freed = 0;
1945 }
1946 mutex_yield(&vm_page_queue_lock);
1947
1948 delayed_unlock = 1;
1949 }
1950 /*
1951 * back to top of pageout scan loop
1952 */
1953 }
1954 }
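/*
 * Tail of the inactive-scan loop above: each candidate page is either
 * reclaimed on the spot (absent, in error, or clean and not precious),
 * reactivated (recently referenced), requeued because its target
 * pageout queue is throttled, or handed to vm_pageout_cluster() for
 * laundering.  Freed pages are batched on local_freeq, and the page
 * queues lock is yielded via mutex_yield() once delayed_unlock exceeds
 * VM_PAGEOUT_DELAYED_UNLOCK_LIMIT or a lock try failed.
 */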
1955
1956
1957 int vm_page_free_count_init;
1958
1959 void
1960 vm_page_free_reserve(
1961 int pages)
1962 {
1963 int free_after_reserve;
1964
1965 vm_page_free_reserved += pages;
1966
1967 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1968
1969 vm_page_free_min = vm_page_free_reserved +
1970 VM_PAGE_FREE_MIN(free_after_reserve);
1971
1972 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
1973 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
1974
1975 vm_page_free_target = vm_page_free_reserved +
1976 VM_PAGE_FREE_TARGET(free_after_reserve);
1977
1978 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
1979 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
1980
1981 if (vm_page_free_target < vm_page_free_min + 5)
1982 vm_page_free_target = vm_page_free_min + 5;
1983
1984 }
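/*
 * Illustration with hypothetical numbers (not taken from this file):
 * if vm_page_free_count_init were 100000 pages and the reserve grew to
 * 300, free_after_reserve would be 99700; vm_page_free_min and
 * vm_page_free_target are then derived from that figure through the
 * VM_PAGE_FREE_MIN()/VM_PAGE_FREE_TARGET() macros, clipped to their
 * *_LIMIT ceilings, with the target always held at least 5 pages above
 * the minimum.
 */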
1985
1986 /*
1987 * vm_pageout is the high level pageout daemon.
1988 */
1989
1990 void
1991 vm_pageout_continue(void)
1992 {
1993 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
1994 vm_pageout_scan_event_counter++;
1995 vm_pageout_scan();
1996 /* we hold vm_page_queue_free_lock now */
1997 assert(vm_page_free_wanted == 0);
1998 assert(vm_page_free_wanted_privileged == 0);
1999 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2000 mutex_unlock(&vm_page_queue_free_lock);
2001
2002 counter(c_vm_pageout_block++);
2003 thread_block((thread_continue_t)vm_pageout_continue);
2004 /*NOTREACHED*/
2005 }
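/*
 * vm_pageout_continue() never returns: each pass ends by asserting a
 * wait on &vm_page_free_wanted and re-entering itself as the
 * thread_block() continuation, so the daemon loops forever without
 * holding a kernel stack while asleep.  A minimal sketch of that
 * continuation idiom follows; do_one_pass() and my_event are
 * hypothetical stand-ins, not symbols from this file.
 */
#if 0
static void
my_daemon_continue(void)
{
	do_one_pass();					/* e.g. a scan pass */
	assert_wait((event_t) &my_event, THREAD_UNINT);	/* arm the wakeup */
	thread_block((thread_continue_t) my_daemon_continue);
	/*NOTREACHED*/
}
#endif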
2006
2007
2008 /*
2009 * must be called with the
2010 * queues and object locks held
2011 */
2012 static void
2013 vm_pageout_queue_steal(vm_page_t m)
2014 {
2015 struct vm_pageout_queue *q;
2016
2017 if (m->object->internal == TRUE)
2018 q = &vm_pageout_queue_internal;
2019 else
2020 q = &vm_pageout_queue_external;
2021
2022 m->laundry = FALSE;
2023 m->pageout_queue = FALSE;
2024 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
2025
2026 m->pageq.next = NULL;
2027 m->pageq.prev = NULL;
2028
2029 vm_object_paging_end(m->object);
2030
2031 q->pgo_laundry--;
2032 }
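/*
 * vm_pageout_queue_steal() is used when a page already queued for
 * pageout gets picked up by another path (see the pageout_queue check
 * in vm_object_upl_request below): the page is pulled off the internal
 * or external pending queue that matches its object, its laundry and
 * pageout_queue state is cleared, the object's paging reference is
 * dropped, and the queue's laundry count is decremented.
 */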
2033
2034
2035 #ifdef FAKE_DEADLOCK
2036
2037 #define FAKE_COUNT 5000
2038
2039 int internal_count = 0;
2040 int fake_deadlock = 0;
2041
2042 #endif
2043
2044 static void
2045 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2046 {
2047 vm_page_t m = NULL;
2048 vm_object_t object;
2049 boolean_t need_wakeup;
2050 memory_object_t pager;
2051 thread_t self = current_thread();
2052
2053 if ((vm_pageout_internal_iothread != THREAD_NULL)
2054 && (self == vm_pageout_external_iothread )
2055 && (self->options & TH_OPT_VMPRIV))
2056 self->options &= ~TH_OPT_VMPRIV;
2057
2058 vm_page_lockspin_queues();
2059
2060 while ( !queue_empty(&q->pgo_pending) ) {
2061
2062 q->pgo_busy = TRUE;
2063 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2064 m->pageout_queue = FALSE;
2065 vm_page_unlock_queues();
2066
2067 m->pageq.next = NULL;
2068 m->pageq.prev = NULL;
2069 #ifdef FAKE_DEADLOCK
2070 if (q == &vm_pageout_queue_internal) {
2071 vm_offset_t addr;
2072 int pg_count;
2073
2074 internal_count++;
2075
2076 if ((internal_count == FAKE_COUNT)) {
2077
2078 pg_count = vm_page_free_count + vm_page_free_reserved;
2079
2080 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2081 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2082 }
2083 internal_count = 0;
2084 fake_deadlock++;
2085 }
2086 }
2087 #endif
2088 object = m->object;
2089
2090 vm_object_lock(object);
2091
2092 if (!object->pager_initialized) {
2093
2094 /*
2095 * If there is no memory object for the page, create
2096 * one and hand it to the default pager.
2097 */
2098
2099 if (!object->pager_initialized)
2100 vm_object_collapse(object,
2101 (vm_object_offset_t) 0,
2102 TRUE);
2103 if (!object->pager_initialized)
2104 vm_object_pager_create(object);
2105 if (!object->pager_initialized) {
2106 /*
2107 * Still no pager for the object.
2108 * Reactivate the page.
2109 *
2110 * Should only happen if there is no
2111 * default pager.
2112 */
2113 m->list_req_pending = FALSE;
2114 m->cleaning = FALSE;
2115 m->pageout = FALSE;
2116
2117 vm_page_lockspin_queues();
2118 vm_page_unwire(m);
2119 vm_pageout_throttle_up(m);
2120 vm_pageout_dirty_no_pager++;
2121 vm_page_activate(m);
2122 vm_page_unlock_queues();
2123
2124 /*
2125 * And we are done with it.
2126 */
2127 PAGE_WAKEUP_DONE(m);
2128
2129 vm_object_paging_end(object);
2130 vm_object_unlock(object);
2131
2132 vm_page_lockspin_queues();
2133 continue;
2134 }
2135 }
2136 pager = object->pager;
2137 if (pager == MEMORY_OBJECT_NULL) {
2138 /*
2139 * This pager has been destroyed by either
2140 * memory_object_destroy or vm_object_destroy, and
2141 * so there is nowhere for the page to go.
2142 * Just free the page... VM_PAGE_FREE takes
2143 * care of cleaning up all the state...
2144 * including doing the vm_pageout_throttle_up
2145 */
2146
2147 VM_PAGE_FREE(m);
2148
2149 vm_object_paging_end(object);
2150 vm_object_unlock(object);
2151
2152 vm_page_lockspin_queues();
2153 continue;
2154 }
2155 vm_object_unlock(object);
2156 /*
2157 * we expect the paging_in_progress reference to have
2158 * already been taken on the object before it was added
2159 * to the appropriate pageout I/O queue... this will
2160 * keep the object from being terminated and/or the
2161 * paging_offset from changing until the I/O has
2162 * completed... therefore no need to lock the object to
2163 * pull the paging_offset from it.
2164 *
2165 * Send the data to the pager.
2166 * any pageout clustering happens there
2167 */
2168 memory_object_data_return(pager,
2169 m->offset + object->paging_offset,
2170 PAGE_SIZE,
2171 NULL,
2172 NULL,
2173 FALSE,
2174 FALSE,
2175 0);
2176
2177 vm_object_lock(object);
2178 vm_object_paging_end(object);
2179 vm_object_unlock(object);
2180
2181 vm_page_lockspin_queues();
2182 }
2183 assert_wait((event_t) q, THREAD_UNINT);
2184
2185
2186 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2187 q->pgo_throttled = FALSE;
2188 need_wakeup = TRUE;
2189 } else
2190 need_wakeup = FALSE;
2191
2192 q->pgo_busy = FALSE;
2193 q->pgo_idle = TRUE;
2194 vm_page_unlock_queues();
2195
2196 if (need_wakeup == TRUE)
2197 thread_wakeup((event_t) &q->pgo_laundry);
2198
2199 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2200 /*NOTREACHED*/
2201 }
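/*
 * Summary of the I/O thread loop above: it drains q->pgo_pending one
 * page at a time, making sure the page's object has a pager (trying a
 * collapse and then vm_object_pager_create() before giving up and
 * reactivating the page), freeing pages whose pager has already been
 * destroyed, and pushing everything else to the pager with
 * memory_object_data_return().  When the queue is empty it parks on
 * the queue event via thread_block_parameter(), first waking any
 * throttled waiters if the laundry has drained below the throttle
 * limit.
 */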
2202
2203
2204 static void
2205 vm_pageout_iothread_external(void)
2206 {
2207 thread_t self = current_thread();
2208
2209 self->options |= TH_OPT_VMPRIV;
2210
2211 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2212 /*NOTREACHED*/
2213 }
2214
2215
2216 static void
2217 vm_pageout_iothread_internal(void)
2218 {
2219 thread_t self = current_thread();
2220
2221 self->options |= TH_OPT_VMPRIV;
2222
2223 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2224 /*NOTREACHED*/
2225 }
2226
2227 static void
2228 vm_pageout_garbage_collect(int collect)
2229 {
2230 if (collect) {
2231 stack_collect();
2232
2233 /*
2234 * consider_zone_gc should be last, because the other operations
2235 * might return memory to zones.
2236 */
2237 consider_machine_collect();
2238 consider_zone_gc();
2239
2240 consider_machine_adjust();
2241 }
2242
2243 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2244
2245 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2246 /*NOTREACHED*/
2247 }
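/*
 * The garbage-collection thread sleeps on &vm_pageout_garbage_collect
 * and, when woken with a non-zero argument, reclaims kernel resources:
 * unused kernel stacks (stack_collect), machine-dependent caches, and
 * zone memory (consider_zone_gc), followed by consider_machine_adjust().
 * It re-blocks with an argument of 1, so every subsequent wakeup
 * performs a full collection pass.
 */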
2248
2249
2250
2251 void
2252 vm_pageout(void)
2253 {
2254 thread_t self = current_thread();
2255 thread_t thread;
2256 kern_return_t result;
2257 spl_t s;
2258
2259 /*
2260 * Set thread privileges.
2261 */
2262 s = splsched();
2263 thread_lock(self);
2264 self->priority = BASEPRI_PREEMPT - 1;
2265 set_sched_pri(self, self->priority);
2266 thread_unlock(self);
2267
2268 if (!self->reserved_stack)
2269 self->reserved_stack = self->kernel_stack;
2270
2271 splx(s);
2272
2273 /*
2274 * Initialize some paging parameters.
2275 */
2276
2277 if (vm_pageout_idle_wait == 0)
2278 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2279
2280 if (vm_pageout_burst_wait == 0)
2281 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2282
2283 if (vm_pageout_empty_wait == 0)
2284 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2285
2286 if (vm_pageout_deadlock_wait == 0)
2287 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2288
2289 if (vm_pageout_deadlock_relief == 0)
2290 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2291
2292 if (vm_pageout_inactive_relief == 0)
2293 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2294
2295 if (vm_pageout_burst_active_throttle == 0)
2296 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2297
2298 if (vm_pageout_burst_inactive_throttle == 0)
2299 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2300
2301 /*
2302 * Set kernel task to low backing store privileged
2303 * status
2304 */
2305 task_lock(kernel_task);
2306 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2307 task_unlock(kernel_task);
2308
2309 vm_page_free_count_init = vm_page_free_count;
2310
2311 /*
2312 * even if we've already called vm_page_free_reserve
2313 * call it again here to ensure that the targets are
2314 * accurately calculated (it uses vm_page_free_count_init)
2315 * calling it with an arg of 0 will not change the reserve
2316 * but will re-calculate free_min and free_target
2317 */
2318 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2319 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2320 } else
2321 vm_page_free_reserve(0);
2322
2323
2324 queue_init(&vm_pageout_queue_external.pgo_pending);
2325 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2326 vm_pageout_queue_external.pgo_laundry = 0;
2327 vm_pageout_queue_external.pgo_idle = FALSE;
2328 vm_pageout_queue_external.pgo_busy = FALSE;
2329 vm_pageout_queue_external.pgo_throttled = FALSE;
2330
2331 queue_init(&vm_pageout_queue_internal.pgo_pending);
2332 vm_pageout_queue_internal.pgo_maxlaundry = 0;
2333 vm_pageout_queue_internal.pgo_laundry = 0;
2334 vm_pageout_queue_internal.pgo_idle = FALSE;
2335 vm_pageout_queue_internal.pgo_busy = FALSE;
2336 vm_pageout_queue_internal.pgo_throttled = FALSE;
2337
2338
2339 /* internal pageout thread started when default pager registered first time */
2340 /* external pageout and garbage collection threads started here */
2341
2342 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2343 BASEPRI_PREEMPT - 1,
2344 &vm_pageout_external_iothread);
2345 if (result != KERN_SUCCESS)
2346 panic("vm_pageout_iothread_external: create failed");
2347
2348 thread_deallocate(vm_pageout_external_iothread);
2349
2350 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2351 MINPRI_KERNEL,
2352 &thread);
2353 if (result != KERN_SUCCESS)
2354 panic("vm_pageout_garbage_collect: create failed");
2355
2356 thread_deallocate(thread);
2357
2358 vm_object_reaper_init();
2359
2360
2361 vm_pageout_continue();
2362
2363 /*
2364 * Unreached code!
2365 *
2366 * The vm_pageout_continue() call above never returns, so the code below is never
2367 * executed. We take advantage of this to declare several DTrace VM related probe
2368 * points that our kernel doesn't have an analog for. These are probe points that
2369 * exist in Solaris and are in the DTrace documentation, so people may have written
2370 * scripts that use them. Declaring the probe points here means their scripts will
2371 * compile and execute which we want for portability of the scripts, but since this
2372 * section of code is never reached, the probe points will simply never fire. Yes,
2373 * this is basically a hack. The problem is the DTrace probe points were chosen with
2374 * Solaris specific VM events in mind, not portability to different VM implementations.
2375 */
2376
2377 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2378 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2379 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2380 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2381 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2382 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2383 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2384 /*NOTREACHED*/
2385 }
2386
2387 kern_return_t
2388 vm_pageout_internal_start(void)
2389 {
2390 kern_return_t result;
2391
2392 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2393 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2394 if (result == KERN_SUCCESS)
2395 thread_deallocate(vm_pageout_internal_iothread);
2396 return result;
2397 }
2398
2399 #define UPL_DELAYED_UNLOCK_LIMIT (MAX_UPL_TRANSFER / 2)
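/*
 * UPL_DELAYED_UNLOCK_LIMIT bounds how many pages vm_object_upl_request()
 * processes while continuously holding the vm_page_queue_lock; once the
 * limit is crossed the lock is briefly yielded via mutex_yield() (see
 * delay_unlock_queues below) so other users of the page queues are not
 * starved.
 */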
2400
2401 static upl_t
2402 upl_create(int type, int flags, upl_size_t size)
2403 {
2404 upl_t upl;
2405 int page_field_size = 0;
2406 int upl_flags = 0;
2407 int upl_size = sizeof(struct upl);
2408
2409 if (type & UPL_CREATE_LITE) {
2410 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2411 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2412
2413 upl_flags |= UPL_LITE;
2414 }
2415 if (type & UPL_CREATE_INTERNAL) {
2416 upl_size += sizeof(struct upl_page_info) * (size/PAGE_SIZE);
2417
2418 upl_flags |= UPL_INTERNAL;
2419 }
2420 upl = (upl_t)kalloc(upl_size + page_field_size);
2421
2422 if (page_field_size)
2423 bzero((char *)upl + upl_size, page_field_size);
2424
2425 upl->flags = upl_flags | flags;
2426 upl->src_object = NULL;
2427 upl->kaddr = (vm_offset_t)0;
2428 upl->size = 0;
2429 upl->map_object = NULL;
2430 upl->ref_count = 1;
2431 upl->highest_page = 0;
2432 upl_lock_init(upl);
2433 #ifdef UPL_DEBUG
2434 upl->ubc_alias1 = 0;
2435 upl->ubc_alias2 = 0;
2436 #endif /* UPL_DEBUG */
2437 return(upl);
2438 }
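/*
 * Sizing illustration (hypothetical numbers, assuming 4 KB pages): a
 * 1 MB request (256 pages) created with UPL_CREATE_INTERNAL |
 * UPL_CREATE_LITE is satisfied by the single kalloc() above covering
 * sizeof(struct upl), plus 256 * sizeof(struct upl_page_info) for the
 * embedded page list, plus a 32-byte lite bitmap (256 bits rounded up
 * to a 4-byte boundary); only the bitmap portion is bzero'd.
 */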
2439
2440 static void
2441 upl_destroy(upl_t upl)
2442 {
2443 int page_field_size; /* bit field in word size buf */
2444 int size;
2445
2446 #ifdef UPL_DEBUG
2447 {
2448 vm_object_t object;
2449
2450 if (upl->flags & UPL_SHADOWED) {
2451 object = upl->map_object->shadow;
2452 } else {
2453 object = upl->map_object;
2454 }
2455 vm_object_lock(object);
2456 queue_remove(&object->uplq, upl, upl_t, uplq);
2457 vm_object_unlock(object);
2458 }
2459 #endif /* UPL_DEBUG */
2460 /*
2461 * drop a reference on the map_object whether or
2462 * not a pageout object is inserted
2463 */
2464 if (upl->flags & UPL_SHADOWED)
2465 vm_object_deallocate(upl->map_object);
2466
2467 if (upl->flags & UPL_DEVICE_MEMORY)
2468 size = PAGE_SIZE;
2469 else
2470 size = upl->size;
2471 page_field_size = 0;
2472
2473 if (upl->flags & UPL_LITE) {
2474 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2475 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2476 }
2477 if (upl->flags & UPL_INTERNAL) {
2478 kfree(upl,
2479 sizeof(struct upl) +
2480 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2481 + page_field_size);
2482 } else {
2483 kfree(upl, sizeof(struct upl) + page_field_size);
2484 }
2485 }
2486
2487 void uc_upl_dealloc(upl_t upl);
2488 __private_extern__ void
2489 uc_upl_dealloc(upl_t upl)
2490 {
2491 if (--upl->ref_count == 0)
2492 upl_destroy(upl);
2493 }
2494
2495 void
2496 upl_deallocate(upl_t upl)
2497 {
2498 if (--upl->ref_count == 0)
2499 upl_destroy(upl);
2500 }
2501
2502 /*
2503 * Statistics about UPL enforcement of copy-on-write obligations.
2504 */
2505 unsigned long upl_cow = 0;
2506 unsigned long upl_cow_again = 0;
2507 unsigned long upl_cow_contiguous = 0;
2508 unsigned long upl_cow_pages = 0;
2509 unsigned long upl_cow_again_pages = 0;
2510 unsigned long upl_cow_contiguous_pages = 0;
2511
2512 /*
2513 * Routine: vm_object_upl_request
2514 * Purpose:
2515 * Cause the population of a portion of a vm_object.
2516 * Depending on the nature of the request, the pages
2517 * returned may contain valid data or be uninitialized.
2518 * A page list structure, listing the physical pages
2519 * will be returned upon request.
2520 * This function is called by the file system or any other
2521 * supplier of backing store to a pager.
2522 * IMPORTANT NOTE: The caller must still respect the relationship
2523 * between the vm_object and its backing memory object. The
2524 * caller MUST NOT substitute changes in the backing file
2525 * without first doing a memory_object_lock_request on the
2526 * target range unless it is known that the pages are not
2527 * shared with another entity at the pager level.
2528 * Copy_in_to:
2529 * if a page list structure is present
2530 * return the mapped physical pages, where a
2531 * page is not present, return a non-initialized
2532 * one. If the no_sync bit is turned on, don't
2533 * call the pager unlock to synchronize with other
2534 * possible copies of the page. Leave pages busy
2535 * in the original object, if a page list structure
2536 * was specified. When a commit of the page list
2537 * pages is done, the dirty bit will be set for each one.
2538 * Copy_out_from:
2539 * If a page list structure is present, return
2540 * all mapped pages. Where a page does not exist
2541 * map a zero filled one. Leave pages busy in
2542 * the original object. If a page list structure
2543 * is not specified, this call is a no-op.
2544 *
2545 * Note: access of default pager objects has a rather interesting
2546 * twist. The caller of this routine, presumably the file system
2547 * page cache handling code, will never actually make a request
2548 * against a default pager backed object. Only the default
2549 * pager will make requests on backing store related vm_objects.
2550 * In this way the default pager can maintain the relationship
2551 * between backing store files (abstract memory objects) and
2552 * the vm_objects (cache objects) they support.
2553 *
2554 */
2555
2556 __private_extern__ kern_return_t
2557 vm_object_upl_request(
2558 vm_object_t object,
2559 vm_object_offset_t offset,
2560 upl_size_t size,
2561 upl_t *upl_ptr,
2562 upl_page_info_array_t user_page_list,
2563 unsigned int *page_list_count,
2564 int cntrl_flags)
2565 {
2566 vm_page_t dst_page = VM_PAGE_NULL;
2567 vm_object_offset_t dst_offset;
2568 upl_size_t xfer_size;
2569 boolean_t dirty;
2570 boolean_t hw_dirty;
2571 upl_t upl = NULL;
2572 unsigned int entry;
2573 #if MACH_CLUSTER_STATS
2574 boolean_t encountered_lrp = FALSE;
2575 #endif
2576 vm_page_t alias_page = NULL;
2577 int refmod_state = 0;
2578 wpl_array_t lite_list = NULL;
2579 vm_object_t last_copy_object;
2580 int delayed_unlock = 0;
2581
2582 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2583 /*
2584 * For forward compatibility's sake,
2585 * reject any unknown flag.
2586 */
2587 return KERN_INVALID_VALUE;
2588 }
2589 if ( (!object->internal) && (object->paging_offset != 0) )
2590 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2591 if (object->phys_contiguous)
2592 panic("vm_object_upl_request: contiguous object specified\n");
2593
2594
2595 if ((size / PAGE_SIZE) > MAX_UPL_TRANSFER)
2596 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2597
2598 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2599 *page_list_count = MAX_UPL_TRANSFER;
2600
2601 if (cntrl_flags & UPL_SET_INTERNAL) {
2602 if (cntrl_flags & UPL_SET_LITE) {
2603
2604 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2605
2606 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2607 lite_list = (wpl_array_t)
2608 (((uintptr_t)user_page_list) +
2609 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2610 } else {
2611 upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
2612
2613 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2614 }
2615 } else {
2616 if (cntrl_flags & UPL_SET_LITE) {
2617
2618 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
2619
2620 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
2621 } else {
2622 upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
2623 }
2624 }
2625 *upl_ptr = upl;
2626
2627 if (user_page_list)
2628 user_page_list[0].device = FALSE;
2629
2630 if (cntrl_flags & UPL_SET_LITE) {
2631 upl->map_object = object;
2632 } else {
2633 upl->map_object = vm_object_allocate(size);
2634 /*
2635 * No need to lock the new object: nobody else knows
2636 * about it yet, so it's all ours so far.
2637 */
2638 upl->map_object->shadow = object;
2639 upl->map_object->pageout = TRUE;
2640 upl->map_object->can_persist = FALSE;
2641 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2642 upl->map_object->shadow_offset = offset;
2643 upl->map_object->wimg_bits = object->wimg_bits;
2644
2645 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2646
2647 upl->flags |= UPL_SHADOWED;
2648 }
2649 /*
2650 * ENCRYPTED SWAP:
2651 * Just mark the UPL as "encrypted" here.
2652 * We'll actually encrypt the pages later,
2653 * in upl_encrypt(), when the caller has
2654 * selected which pages need to go to swap.
2655 */
2656 if (cntrl_flags & UPL_ENCRYPT)
2657 upl->flags |= UPL_ENCRYPTED;
2658
2659 if (cntrl_flags & UPL_FOR_PAGEOUT)
2660 upl->flags |= UPL_PAGEOUT;
2661
2662 vm_object_lock(object);
2663 vm_object_paging_begin(object);
2664
2665 /*
2666 * we can lock in the paging_offset once paging_in_progress is set
2667 */
2668 upl->size = size;
2669 upl->offset = offset + object->paging_offset;
2670
2671 #ifdef UPL_DEBUG
2672 queue_enter(&object->uplq, upl, upl_t, uplq);
2673 #endif /* UPL_DEBUG */
2674
2675 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
2676 /*
2677 * Honor copy-on-write obligations
2678 *
2679 * The caller is gathering these pages and
2680 * might modify their contents. We need to
2681 * make sure that the copy object has its own
2682 * private copies of these pages before we let
2683 * the caller modify them.
2684 */
2685 vm_object_update(object,
2686 offset,
2687 size,
2688 NULL,
2689 NULL,
2690 FALSE, /* should_return */
2691 MEMORY_OBJECT_COPY_SYNC,
2692 VM_PROT_NO_CHANGE);
2693 upl_cow++;
2694 upl_cow_pages += size >> PAGE_SHIFT;
2695 }
2696 /*
2697 * remember which copy object we synchronized with
2698 */
2699 last_copy_object = object->copy;
2700 entry = 0;
2701
2702 xfer_size = size;
2703 dst_offset = offset;
2704
2705 while (xfer_size) {
2706
2707 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2708 if (delayed_unlock) {
2709 delayed_unlock = 0;
2710 vm_page_unlock_queues();
2711 }
2712 vm_object_unlock(object);
2713 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2714 vm_object_lock(object);
2715 }
2716 if (delayed_unlock == 0)
2717 vm_page_lock_queues();
2718
2719 if (cntrl_flags & UPL_COPYOUT_FROM) {
2720 upl->flags |= UPL_PAGE_SYNC_DONE;
2721
2722 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2723 dst_page->fictitious ||
2724 dst_page->absent ||
2725 dst_page->error ||
2726 (dst_page->wire_count && !dst_page->pageout && !dst_page->list_req_pending)) {
2727
2728 if (user_page_list)
2729 user_page_list[entry].phys_addr = 0;
2730
2731 goto delay_unlock_queues;
2732 }
2733 /*
2734 * grab this up front...
2735 * a high percentage of the time we're going to
2736 * need the hardware modification state a bit later
2737 * anyway... so we can eliminate an extra call into
2738 * the pmap layer by grabbing it here and recording it
2739 */
2740 if (dst_page->pmapped)
2741 refmod_state = pmap_get_refmod(dst_page->phys_page);
2742 else
2743 refmod_state = 0;
2744
2745 if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
2746 /*
2747 * page is on inactive list and referenced...
2748 * reactivate it now... this gets it out of the
2749 * way of vm_pageout_scan which would have to
2750 * reactivate it upon tripping over it
2751 */
2752 vm_page_activate(dst_page);
2753 VM_STAT_INCR(reactivations);
2754 }
2755 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2756 /*
2757 * we're only asking for DIRTY pages to be returned
2758 */
2759 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2760 /*
2761 * if this is the page stolen by vm_pageout_scan to be
2762 * cleaned (as opposed to a buddy being clustered in),
2763 * or this request is not being driven by a PAGEOUT cluster,
2764 * then we only need to check for the page being dirty or
2765 * precious to decide whether to return it
2766 */
2767 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
2768 goto check_busy;
2769 goto dont_return;
2770 }
2771 /*
2772 * this is a request for a PAGEOUT cluster and this page
2773 * is merely along for the ride as a 'buddy'... not only
2774 * does it have to be dirty to be returned, but it also
2775 * can't have been referenced recently... note that we've
2776 * already filtered above based on whether this page is
2777 * currently on the inactive queue or it meets the page
2778 * ticket (generation count) check
2779 */
2780 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2781 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2782 goto check_busy;
2783 }
2784 dont_return:
2785 /*
2786 * if we reach here, we're not to return
2787 * the page... go on to the next one
2788 */
2789 if (user_page_list)
2790 user_page_list[entry].phys_addr = 0;
2791
2792 goto delay_unlock_queues;
2793 }
2794 check_busy:
2795 if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
2796 if (cntrl_flags & UPL_NOBLOCK) {
2797 if (user_page_list)
2798 user_page_list[entry].phys_addr = 0;
2799
2800 goto delay_unlock_queues;
2801 }
2802 /*
2803 * someone else is playing with the
2804 * page. We will have to wait.
2805 */
2806 delayed_unlock = 0;
2807 vm_page_unlock_queues();
2808
2809 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2810
2811 continue;
2812 }
2813 /*
2814 * Someone else already cleaning the page?
2815 */
2816 if ((dst_page->cleaning || dst_page->absent || dst_page->wire_count != 0) && !dst_page->list_req_pending) {
2817 if (user_page_list)
2818 user_page_list[entry].phys_addr = 0;
2819
2820 goto delay_unlock_queues;
2821 }
2822 /*
2823 * ENCRYPTED SWAP:
2824 * The caller is gathering this page and might
2825 * access its contents later on. Decrypt the
2826 * page before adding it to the UPL, so that
2827 * the caller never sees encrypted data.
2828 */
2829 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
2830 int was_busy;
2831
2832 delayed_unlock = 0;
2833 vm_page_unlock_queues();
2834 /*
2835 * save the current state of busy
2836 * mark page as busy while decrypt
2837 * is in progress since it will drop
2838 * the object lock...
2839 */
2840 was_busy = dst_page->busy;
2841 dst_page->busy = TRUE;
2842
2843 vm_page_decrypt(dst_page, 0);
2844 vm_page_decrypt_for_upl_counter++;
2845 /*
2846 * restore to original busy state
2847 */
2848 dst_page->busy = was_busy;
2849
2850 vm_page_lock_queues();
2851 }
2852 if (dst_page->pageout_queue == TRUE)
2853 /*
2854 * we've buddied up a page for a clustered pageout
2855 * that has already been moved to the pageout
2856 * queue by pageout_scan... we need to remove
2857 * it from the queue and drop the laundry count
2858 * on that queue
2859 */
2860 vm_pageout_queue_steal(dst_page);
2861 #if MACH_CLUSTER_STATS
2862 /*
2863 * pageout statistics gathering. count
2864 * all the pages we will page out that
2865 * were not counted in the initial
2866 * vm_pageout_scan work
2867 */
2868 if (dst_page->list_req_pending)
2869 encountered_lrp = TRUE;
2870 if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
2871 if (encountered_lrp)
2872 CLUSTER_STAT(pages_at_higher_offsets++;)
2873 else
2874 CLUSTER_STAT(pages_at_lower_offsets++;)
2875 }
2876 #endif
2877 /*
2878 * Turn off busy indication on pending
2879 * pageout. Note: we can only get here
2880 * in the request pending case.
2881 */
2882 dst_page->list_req_pending = FALSE;
2883 dst_page->busy = FALSE;
2884
2885 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2886 dirty = hw_dirty ? TRUE : dst_page->dirty;
2887
2888 if (dst_page->phys_page > upl->highest_page)
2889 upl->highest_page = dst_page->phys_page;
2890
2891 if (cntrl_flags & UPL_SET_LITE) {
2892 int pg_num;
2893
2894 pg_num = (dst_offset-offset)/PAGE_SIZE;
2895 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
2896
2897 if (hw_dirty)
2898 pmap_clear_modify(dst_page->phys_page);
2899
2900 /*
2901 * Mark original page as cleaning
2902 * in place.
2903 */
2904 dst_page->cleaning = TRUE;
2905 dst_page->precious = FALSE;
2906 } else {
2907 /*
2908 * use pageclean setup, it is more
2909 * convenient even for the pageout
2910 * cases here
2911 */
2912 vm_object_lock(upl->map_object);
2913 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
2914 vm_object_unlock(upl->map_object);
2915
2916 alias_page->absent = FALSE;
2917 alias_page = NULL;
2918 }
2919 #if MACH_PAGEMAP
2920 /*
2921 * Record that this page has been
2922 * written out
2923 */
2924 vm_external_state_set(object->existence_map, dst_page->offset);
2925 #endif /*MACH_PAGEMAP*/
2926 dst_page->dirty = dirty;
2927
2928 if (!dirty)
2929 dst_page->precious = TRUE;
2930
2931 if (dst_page->pageout)
2932 dst_page->busy = TRUE;
2933
2934 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2935 /*
2936 * ENCRYPTED SWAP:
2937 * We want to deny access to the target page
2938 * because its contents are about to be
2939 * encrypted and the user would be very
2940 * confused to see encrypted data instead
2941 * of their data.
2942 * We also set "encrypted_cleaning" to allow
2943 * vm_pageout_scan() to demote that page
2944 * from "adjacent/clean-in-place" to
2945 * "target/clean-and-free" if it bumps into
2946 * this page during its scanning while we're
2947 * still processing this cluster.
2948 */
2949 dst_page->busy = TRUE;
2950 dst_page->encrypted_cleaning = TRUE;
2951 }
2952 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2953 /*
2954 * deny access to the target page
2955 * while it is being worked on
2956 */
2957 if ((!dst_page->pageout) && (dst_page->wire_count == 0)) {
2958 dst_page->busy = TRUE;
2959 dst_page->pageout = TRUE;
2960 vm_page_wire(dst_page);
2961 }
2962 }
2963 } else {
2964 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
2965 /*
2966 * Honor copy-on-write obligations
2967 *
2968 * The copy object has changed since we
2969 * last synchronized for copy-on-write.
2970 * Another copy object might have been
2971 * inserted while we released the object's
2972 * lock. Since someone could have seen the
2973 * original contents of the remaining pages
2974 * through that new object, we have to
2975 * synchronize with it again for the remaining
2976 * pages only. The previous pages are "busy"
2977 * so they can not be seen through the new
2978 * mapping. The new mapping will see our
2979 * upcoming changes for those previous pages,
2980 * but that's OK since they couldn't see what
2981 * was there before. It's just a race anyway
2982 * and there's no guarantee of consistency or
2983 * atomicity. We just don't want new mappings
2984 * to see both the *before* and *after* pages.
2985 */
2986 if (object->copy != VM_OBJECT_NULL) {
2987 delayed_unlock = 0;
2988 vm_page_unlock_queues();
2989
2990 vm_object_update(
2991 object,
2992 dst_offset,/* current offset */
2993 xfer_size, /* remaining size */
2994 NULL,
2995 NULL,
2996 FALSE, /* should_return */
2997 MEMORY_OBJECT_COPY_SYNC,
2998 VM_PROT_NO_CHANGE);
2999
3000 upl_cow_again++;
3001 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3002
3003 vm_page_lock_queues();
3004 }
3005 /*
3006 * remember the copy object we synced with
3007 */
3008 last_copy_object = object->copy;
3009 }
3010 dst_page = vm_page_lookup(object, dst_offset);
3011
3012 if (dst_page != VM_PAGE_NULL) {
3013 if ( !(dst_page->list_req_pending) ) {
3014 if ((cntrl_flags & UPL_RET_ONLY_ABSENT) && !dst_page->absent) {
3015 /*
3016 * skip over pages already present in the cache
3017 */
3018 if (user_page_list)
3019 user_page_list[entry].phys_addr = 0;
3020
3021 goto delay_unlock_queues;
3022 }
3023 if (dst_page->cleaning) {
3024 /*
3025 * someone else is writing to the page... wait...
3026 */
3027 delayed_unlock = 0;
3028 vm_page_unlock_queues();
3029
3030 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3031
3032 continue;
3033 }
3034 } else {
3035 if (dst_page->fictitious &&
3036 dst_page->phys_page == vm_page_fictitious_addr) {
3037 assert( !dst_page->speculative);
3038 /*
3039 * dump the fictitious page
3040 */
3041 dst_page->list_req_pending = FALSE;
3042
3043 vm_page_free(dst_page);
3044
3045 dst_page = NULL;
3046 } else if (dst_page->absent) {
3047 /*
3048 * the default_pager case
3049 */
3050 dst_page->list_req_pending = FALSE;
3051 dst_page->busy = FALSE;
3052 }
3053 }
3054 }
3055 if (dst_page == VM_PAGE_NULL) {
3056 if (object->private) {
3057 /*
3058 * This is a nasty wrinkle for users
3059 * of upl who encounter device or
3060 * private memory; however, it is
3061 * unavoidable: only a fault can
3062 * resolve the actual backing
3063 * physical page by asking the
3064 * backing device.
3065 */
3066 if (user_page_list)
3067 user_page_list[entry].phys_addr = 0;
3068
3069 goto delay_unlock_queues;
3070 }
3071 /*
3072 * need to allocate a page
3073 * vm_page_alloc may grab the
3074 * queues lock for a purgeable object
3075 * so drop it
3076 */
3077 delayed_unlock = 0;
3078 vm_page_unlock_queues();
3079
3080 dst_page = vm_page_alloc(object, dst_offset);
3081
3082 if (dst_page == VM_PAGE_NULL) {
3083 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3084 /*
3085 * we don't want to stall waiting for pages to come onto the free list
3086 * while we're already holding absent pages in this UPL
3087 * the caller will deal with the empty slots
3088 */
3089 if (user_page_list)
3090 user_page_list[entry].phys_addr = 0;
3091
3092 goto try_next_page;
3093 }
3094 /*
3095 * no pages available... wait
3096 * then try again for the same
3097 * offset...
3098 */
3099 vm_object_unlock(object);
3100 VM_PAGE_WAIT();
3101 vm_object_lock(object);
3102
3103 continue;
3104 }
3105 dst_page->busy = FALSE;
3106 dst_page->absent = TRUE;
3107
3108 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3109 /*
3110 * if UPL_RET_ONLY_ABSENT was specified,
3111 * then we're definitely setting up an
3112 * upl for a clustered read/pagein
3113 * operation... mark the pages as clustered
3114 * so upl_commit_range can put them on the
3115 * speculative list
3116 */
3117 dst_page->clustered = TRUE;
3118 }
3119 vm_page_lock_queues();
3120 }
3121 /*
3122 * ENCRYPTED SWAP:
3123 */
3124 if (cntrl_flags & UPL_ENCRYPT) {
3125 /*
3126 * The page is going to be encrypted when we
3127 * get it from the pager, so mark it so.
3128 */
3129 dst_page->encrypted = TRUE;
3130 } else {
3131 /*
3132 * Otherwise, the page will not contain
3133 * encrypted data.
3134 */
3135 dst_page->encrypted = FALSE;
3136 }
3137 dst_page->overwriting = TRUE;
3138
3139 if (dst_page->fictitious) {
3140 panic("need corner case for fictitious page");
3141 }
3142 if (dst_page->busy) {
3143 /*
3144 * someone else is playing with the
3145 * page. We will have to wait.
3146 */
3147 delayed_unlock = 0;
3148 vm_page_unlock_queues();
3149
3150 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3151
3152 continue;
3153 }
3154 if (dst_page->pmapped) {
3155 if ( !(cntrl_flags & UPL_FILE_IO))
3156 /*
3157 * eliminate all mappings from the
3158 * original object and its progeny
3159 */
3160 refmod_state = pmap_disconnect(dst_page->phys_page);
3161 else
3162 refmod_state = pmap_get_refmod(dst_page->phys_page);
3163 } else
3164 refmod_state = 0;
3165
3166 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3167 dirty = hw_dirty ? TRUE : dst_page->dirty;
3168
3169 if (cntrl_flags & UPL_SET_LITE) {
3170 int pg_num;
3171
3172 pg_num = (dst_offset-offset)/PAGE_SIZE;
3173 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3174
3175 if (hw_dirty)
3176 pmap_clear_modify(dst_page->phys_page);
3177
3178 /*
3179 * Mark original page as cleaning
3180 * in place.
3181 */
3182 dst_page->cleaning = TRUE;
3183 dst_page->precious = FALSE;
3184 } else {
3185 /*
3186 * use pageclean setup, it is more
3187 * convenient even for the pageout
3188 * cases here
3189 */
3190 vm_object_lock(upl->map_object);
3191 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3192 vm_object_unlock(upl->map_object);
3193
3194 alias_page->absent = FALSE;
3195 alias_page = NULL;
3196 }
3197
3198 if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3199 /*
3200 * clean in place for read implies
3201 * that a write will be done on all
3202 * the pages that are dirty before
3203 * a upl commit is done. The caller
3204 * is obligated to preserve the
3205 * contents of all pages marked dirty
3206 */
3207 upl->flags |= UPL_CLEAR_DIRTY;
3208 }
3209 dst_page->dirty = dirty;
3210
3211 if (!dirty)
3212 dst_page->precious = TRUE;
3213
3214 if (dst_page->wire_count == 0) {
3215 /*
3216 * deny access to the target page while
3217 * it is being worked on
3218 */
3219 dst_page->busy = TRUE;
3220 } else
3221 vm_page_wire(dst_page);
3222
3223 if (dst_page->clustered) {
3224 /*
3225 * expect the page not to be used
3226 * since it's coming in as part
3227 * of a speculative cluster...
3228 * pages that are 'consumed' will
3229 * get a hardware reference
3230 */
3231 dst_page->reference = FALSE;
3232 } else {
3233 /*
3234 * expect the page to be used
3235 */
3236 dst_page->reference = TRUE;
3237 }
3238 dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3239 }
3240 if (dst_page->phys_page > upl->highest_page)
3241 upl->highest_page = dst_page->phys_page;
3242 if (user_page_list) {
3243 user_page_list[entry].phys_addr = dst_page->phys_page;
3244 user_page_list[entry].dirty = dst_page->dirty;
3245 user_page_list[entry].pageout = dst_page->pageout;
3246 user_page_list[entry].absent = dst_page->absent;
3247 user_page_list[entry].precious = dst_page->precious;
3248
3249 if (dst_page->clustered == TRUE)
3250 user_page_list[entry].speculative = dst_page->speculative;
3251 else
3252 user_page_list[entry].speculative = FALSE;
3253 }
3254 /*
3255 * if UPL_RET_ONLY_ABSENT is set, then
3256 * we are working with a fresh page and we've
3257 * just set the clustered flag on it to
3258 * indicate that it was drug in as part of a
3259 * speculative cluster... so leave it alone
3260 */
3261 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3262 /*
3263 * someone is explicitly grabbing this page...
3264 * update clustered and speculative state
3265 *
3266 */
3267 VM_PAGE_CONSUME_CLUSTERED(dst_page);
3268 }
3269 delay_unlock_queues:
3270 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
3271 mutex_yield(&vm_page_queue_lock);
3272 delayed_unlock = 1;
3273 }
3274 try_next_page:
3275 entry++;
3276 dst_offset += PAGE_SIZE_64;
3277 xfer_size -= PAGE_SIZE;
3278 }
3279 if (alias_page != NULL) {
3280 if (delayed_unlock == 0) {
3281 vm_page_lock_queues();
3282 delayed_unlock++;
3283 }
3284 vm_page_free(alias_page);
3285 }
3286 if (delayed_unlock)
3287 vm_page_unlock_queues();
3288
3289 if (page_list_count != NULL) {
3290 if (upl->flags & UPL_INTERNAL)
3291 *page_list_count = 0;
3292 else if (*page_list_count > entry)
3293 *page_list_count = entry;
3294 }
3295 vm_object_unlock(object);
3296
3297 return KERN_SUCCESS;
3298 }
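/*
 * Illustrative caller sketch for the routine above; 'object', 'off' and
 * 'len' are hypothetical, and error handling plus the eventual commit
 * or abort of the range are elided.  It builds an internal, lite UPL
 * for pageout and picks up the embedded page list with
 * UPL_GET_INTERNAL_PAGE_LIST().
 */
#if 0
{
	upl_t		upl;
	upl_page_info_t	*pl;
	unsigned int	count = MAX_UPL_TRANSFER;
	kern_return_t	kr;

	kr = vm_object_upl_request(object, off, len, &upl, NULL, &count,
				   UPL_SET_INTERNAL | UPL_SET_LITE |
				   UPL_COPYOUT_FROM | UPL_FOR_PAGEOUT);
	if (kr == KERN_SUCCESS) {
		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
		/* hand the pages described by pl[] to the pager,
		 * then commit or abort the range and drop the UPL */
		upl_deallocate(upl);
	}
}
#endif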
3299
3300 /* JMM - Backward compatibility for now */
3301 kern_return_t
3302 vm_fault_list_request( /* forward */
3303 memory_object_control_t control,
3304 vm_object_offset_t offset,
3305 upl_size_t size,
3306 upl_t *upl_ptr,
3307 upl_page_info_t **user_page_list_ptr,
3308 unsigned int page_list_count,
3309 int cntrl_flags);
3310 kern_return_t
3311 vm_fault_list_request(
3312 memory_object_control_t control,
3313 vm_object_offset_t offset,
3314 upl_size_t size,
3315 upl_t *upl_ptr,
3316 upl_page_info_t **user_page_list_ptr,
3317 unsigned int page_list_count,
3318 int cntrl_flags)
3319 {
3320 unsigned int local_list_count;
3321 upl_page_info_t *user_page_list;
3322 kern_return_t kr;
3323
3324 if (user_page_list_ptr != NULL) {
3325 local_list_count = page_list_count;
3326 user_page_list = *user_page_list_ptr;
3327 } else {
3328 local_list_count = 0;
3329 user_page_list = NULL;
3330 }
3331 kr = memory_object_upl_request(control,
3332 offset,
3333 size,
3334 upl_ptr,
3335 user_page_list,
3336 &local_list_count,
3337 cntrl_flags);
3338
3339 if(kr != KERN_SUCCESS)
3340 return kr;
3341
3342 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3343 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3344 }
3345
3346 return KERN_SUCCESS;
3347 }
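/*
 * vm_fault_list_request() is a thin backward-compatibility wrapper: it
 * forwards to memory_object_upl_request() and, when the caller asked
 * for an internal page list (UPL_INTERNAL), hands back a pointer to the
 * list embedded in the returned UPL.
 */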
3348
3349
3350
3351 /*
3352 * Routine: vm_object_super_upl_request
3353 * Purpose:
3354 * Cause the population of a portion of a vm_object
3355 * in much the same way as memory_object_upl_request.
3356 * Depending on the nature of the request, the pages
3357 * returned may contain valid data or be uninitialized.
3358 * However, the region may be expanded up to the super
3359 * cluster size provided.
3360 */
3361
3362 __private_extern__ kern_return_t
3363 vm_object_super_upl_request(
3364 vm_object_t object,
3365 vm_object_offset_t offset,
3366 upl_size_t size,
3367 upl_size_t super_cluster,
3368 upl_t *upl,
3369 upl_page_info_t *user_page_list,
3370 unsigned int *page_list_count,
3371 int cntrl_flags)
3372 {
3373 if (object->paging_offset > offset)
3374 return KERN_FAILURE;
3375
3376 assert(object->paging_in_progress);
3377 offset = offset - object->paging_offset;
3378
3379 if (super_cluster > size) {
3380
3381 vm_object_offset_t base_offset;
3382 upl_size_t super_size;
3383
3384 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3385 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3386 super_size = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3387
3388 if (offset > (base_offset + super_size)) {
3389 panic("vm_object_super_upl_request: Missed target pageout"
3390 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3391 offset, base_offset, super_size, super_cluster,
3392 size, object->paging_offset);
3393 }
3394 /*
3395 * apparently there is a case where the vm requests a
3396 * page to be written out whose offset is beyond the
3397 * object size
3398 */
3399 if ((offset + size) > (base_offset + super_size))
3400 super_size = (offset + size) - base_offset;
3401
3402 offset = base_offset;
3403 size = super_size;
3404 }
3405 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3406 }
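/*
 * The super-cluster expansion above aligns the request down to a
 * super_cluster boundary, doubles the window if the original range
 * straddles the boundary, and clips the result to the object size
 * before delegating to vm_object_upl_request() for the actual
 * population.
 */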
3407
3408
3409 kern_return_t
3410 vm_map_create_upl(
3411 vm_map_t map,
3412 vm_map_address_t offset,
3413 upl_size_t *upl_size,
3414 upl_t *upl,
3415 upl_page_info_array_t page_list,
3416 unsigned int *count,
3417 int *flags)
3418 {
3419 vm_map_entry_t entry;
3420 int caller_flags;
3421 int force_data_sync;
3422 int sync_cow_data;
3423 vm_object_t local_object;
3424 vm_map_offset_t local_offset;
3425 vm_map_offset_t local_start;
3426 kern_return_t ret;
3427
3428 caller_flags = *flags;
3429
3430 if (caller_flags & ~UPL_VALID_FLAGS) {
3431 /*
3432 * For forward compatibility's sake,
3433 * reject any unknown flag.
3434 */
3435 return KERN_INVALID_VALUE;
3436 }
3437 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3438 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3439
3440 if (upl == NULL)
3441 return KERN_INVALID_ARGUMENT;
3442
3443 REDISCOVER_ENTRY:
3444 vm_map_lock(map);
3445
3446 if (vm_map_lookup_entry(map, offset, &entry)) {
3447
3448 if ((entry->vme_end - offset) < *upl_size)
3449 *upl_size = entry->vme_end - offset;
3450
3451 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3452 *flags = 0;
3453
3454 if (entry->object.vm_object != VM_OBJECT_NULL) {
3455 if (entry->object.vm_object->private)
3456 *flags = UPL_DEV_MEMORY;
3457
3458 if (entry->object.vm_object->phys_contiguous)
3459 *flags |= UPL_PHYS_CONTIG;
3460 }
3461 vm_map_unlock(map);
3462
3463 return KERN_SUCCESS;
3464 }
3465 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3466 if ((*upl_size/page_size) > MAX_UPL_TRANSFER)
3467 *upl_size = MAX_UPL_TRANSFER * page_size;
3468 }
3469 /*
3470 * Create an object if necessary.
3471 */
3472 if (entry->object.vm_object == VM_OBJECT_NULL) {
3473 entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3474 entry->offset = 0;
3475 }
3476 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3477 if (!(entry->protection & VM_PROT_WRITE)) {
3478 vm_map_unlock(map);
3479 return KERN_PROTECTION_FAILURE;
3480 }
3481 if (entry->needs_copy) {
3482 vm_map_t local_map;
3483 vm_object_t object;
3484 vm_object_offset_t new_offset;
3485 vm_prot_t prot;
3486 boolean_t wired;
3487 vm_map_version_t version;
3488 vm_map_t real_map;
3489
3490 local_map = map;
3491 vm_map_lock_write_to_read(map);
3492
3493 if (vm_map_lookup_locked(&local_map,
3494 offset, VM_PROT_WRITE,
3495 OBJECT_LOCK_EXCLUSIVE,
3496 &version, &object,
3497 &new_offset, &prot, &wired,
3498 NULL,
3499 &real_map)) {
3500 vm_map_unlock(local_map);
3501 return KERN_FAILURE;
3502 }
3503 if (real_map != map)
3504 vm_map_unlock(real_map);
3505 vm_object_unlock(object);
3506 vm_map_unlock(local_map);
3507
3508 goto REDISCOVER_ENTRY;
3509 }
3510 }
3511 if (entry->is_sub_map) {
3512 vm_map_t submap;
3513
3514 submap = entry->object.sub_map;
3515 local_start = entry->vme_start;
3516 local_offset = entry->offset;
3517
3518 vm_map_reference(submap);
3519 vm_map_unlock(map);
3520
3521 ret = vm_map_create_upl(submap,
3522 local_offset + (offset - local_start),
3523 upl_size, upl, page_list, count, flags);
3524 vm_map_deallocate(submap);
3525
3526 return ret;
3527 }
3528 if (sync_cow_data) {
3529 if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3530 local_object = entry->object.vm_object;
3531 local_start = entry->vme_start;
3532 local_offset = entry->offset;
3533
3534 vm_object_reference(local_object);
3535 vm_map_unlock(map);
3536
3537 if (entry->object.vm_object->shadow && entry->object.vm_object->copy) {
3538 vm_object_lock_request(
3539 local_object->shadow,
3540 (vm_object_offset_t)
3541 ((offset - local_start) +
3542 local_offset) +
3543 local_object->shadow_offset,
3544 *upl_size, FALSE,
3545 MEMORY_OBJECT_DATA_SYNC,
3546 VM_PROT_NO_CHANGE);
3547 }
3548 sync_cow_data = FALSE;
3549 vm_object_deallocate(local_object);
3550
3551 goto REDISCOVER_ENTRY;
3552 }
3553 }
3554 if (force_data_sync) {
3555 local_object = entry->object.vm_object;
3556 local_start = entry->vme_start;
3557 local_offset = entry->offset;
3558
3559 vm_object_reference(local_object);
3560 vm_map_unlock(map);
3561
3562 vm_object_lock_request(
3563 local_object,
3564 (vm_object_offset_t)
3565 ((offset - local_start) + local_offset),
3566 (vm_object_size_t)*upl_size, FALSE,
3567 MEMORY_OBJECT_DATA_SYNC,
3568 VM_PROT_NO_CHANGE);
3569
3570 force_data_sync = FALSE;
3571 vm_object_deallocate(local_object);
3572
3573 goto REDISCOVER_ENTRY;
3574 }
3575 if (entry->object.vm_object->private)
3576 *flags = UPL_DEV_MEMORY;
3577 else
3578 *flags = 0;
3579
3580 if (entry->object.vm_object->phys_contiguous)
3581 *flags |= UPL_PHYS_CONTIG;
3582
3583 local_object = entry->object.vm_object;
3584 local_offset = entry->offset;
3585 local_start = entry->vme_start;
3586
3587 vm_object_reference(local_object);
3588 vm_map_unlock(map);
3589
3590 ret = vm_object_iopl_request(local_object,
3591 (vm_object_offset_t) ((offset - local_start) + local_offset),
3592 *upl_size,
3593 upl,
3594 page_list,
3595 count,
3596 caller_flags);
3597 vm_object_deallocate(local_object);
3598
3599 return(ret);
3600 }
3601 vm_map_unlock(map);
3602
3603 return(KERN_FAILURE);
3604 }
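/*
 * vm_map_create_upl() resolves the map entry covering 'offset'
 * (recursing into submaps, forcing a copy-on-write push for writable
 * requests on needs_copy entries, and optionally syncing shadowed or
 * dirty data first), then builds the UPL against the backing VM object
 * with vm_object_iopl_request().  UPL_QUERY_OBJECT_TYPE short-circuits
 * all of that and just reports whether the backing memory is device
 * and/or physically contiguous.
 */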
3605
3606 /*
3607 * Internal routine to enter a UPL into a VM map.
3608 *
3609 * JMM - This should just be doable through the standard
3610 * vm_map_enter() API.
3611 */
3612 kern_return_t
3613 vm_map_enter_upl(
3614 vm_map_t map,
3615 upl_t upl,
3616 vm_map_offset_t *dst_addr)
3617 {
3618 vm_map_size_t size;
3619 vm_object_offset_t offset;
3620 vm_map_offset_t addr;
3621 vm_page_t m;
3622 kern_return_t kr;
3623
3624 if (upl == UPL_NULL)
3625 return KERN_INVALID_ARGUMENT;
3626
3627 upl_lock(upl);
3628
3629 /*
3630 * check to see if already mapped
3631 */
3632 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
3633 upl_unlock(upl);
3634 return KERN_FAILURE;
3635 }
3636
3637 if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3638 (upl->map_object->phys_contiguous))) {
3639 vm_object_t object;
3640 vm_page_t alias_page;
3641 vm_object_offset_t new_offset;
3642 int pg_num;
3643 wpl_array_t lite_list;
3644
3645 if (upl->flags & UPL_INTERNAL) {
3646 lite_list = (wpl_array_t)
3647 ((((uintptr_t)upl) + sizeof(struct upl))
3648 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3649 } else {
3650 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
3651 }
3652 object = upl->map_object;
3653 upl->map_object = vm_object_allocate(upl->size);
3654
3655 vm_object_lock(upl->map_object);
3656
3657 upl->map_object->shadow = object;
3658 upl->map_object->pageout = TRUE;
3659 upl->map_object->can_persist = FALSE;
3660 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3661 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
3662 upl->map_object->wimg_bits = object->wimg_bits;
3663 offset = upl->map_object->shadow_offset;
3664 new_offset = 0;
3665 size = upl->size;
3666
3667 upl->flags |= UPL_SHADOWED;
3668
3669 while (size) {
3670 pg_num = (new_offset)/PAGE_SIZE;
3671
3672 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3673
3674 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3675
3676 vm_object_lock(object);
3677
3678 m = vm_page_lookup(object, offset);
3679 if (m == VM_PAGE_NULL) {
3680 panic("vm_upl_map: page missing\n");
3681 }
3682
3683 /*
3684 * Convert the fictitious page to a private
3685 * shadow of the real page.
3686 */
3687 assert(alias_page->fictitious);
3688 alias_page->fictitious = FALSE;
3689 alias_page->private = TRUE;
3690 alias_page->pageout = TRUE;
3691 /*
3692 * since m is a page in the upl it must
3693 * already be wired or BUSY, so it's
3694 * safe to assign the underlying physical
3695 * page to the alias
3696 */
3697 alias_page->phys_page = m->phys_page;
3698
3699 vm_object_unlock(object);
3700
3701 vm_page_lockspin_queues();
3702 vm_page_wire(alias_page);
3703 vm_page_unlock_queues();
3704
3705 /*
3706 * ENCRYPTED SWAP:
3707 * The virtual page ("m") has to be wired in some way
3708 * here or its physical page ("m->phys_page") could
3709 * be recycled at any time.
3710 * Assuming this is enforced by the caller, we can't
3711 * get an encrypted page here. Since the encryption
3712 * key depends on the VM page's "pager" object and
3713 * the "paging_offset", we couldn't handle 2 pageable
3714 * VM pages (with different pagers and paging_offsets)
3715 * sharing the same physical page: we could end up
3716 * encrypting with one key (via one VM page) and
3717 * decrypting with another key (via the alias VM page).
3718 */
3719 ASSERT_PAGE_DECRYPTED(m);
3720
3721 vm_page_insert(alias_page, upl->map_object, new_offset);
3722
3723 assert(!alias_page->wanted);
3724 alias_page->busy = FALSE;
3725 alias_page->absent = FALSE;
3726 }
3727 size -= PAGE_SIZE;
3728 offset += PAGE_SIZE_64;
3729 new_offset += PAGE_SIZE_64;
3730 }
3731 vm_object_unlock(upl->map_object);
3732 }
3733 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3734 offset = upl->offset - upl->map_object->paging_offset;
3735 else
3736 offset = 0;
3737 size = upl->size;
3738
3739 vm_object_reference(upl->map_object);
3740
3741 *dst_addr = 0;
3742 /*
3743 * NEED A UPL_MAP ALIAS
3744 */
3745 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3746 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3747 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3748
3749 if (kr != KERN_SUCCESS) {
3750 upl_unlock(upl);
3751 return(kr);
3752 }
3753 vm_object_lock(upl->map_object);
3754
3755 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
3756 m = vm_page_lookup(upl->map_object, offset);
3757
3758 if (m) {
3759 unsigned int cache_attr;
3760 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3761
3762 m->pmapped = TRUE;
3763
3764 PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
3765 }
3766 offset += PAGE_SIZE_64;
3767 }
3768 vm_object_unlock(upl->map_object);
3769
3770 /*
3771 * hold a reference for the mapping
3772 */
3773 upl->ref_count++;
3774 upl->flags |= UPL_PAGE_LIST_MAPPED;
3775 upl->kaddr = *dst_addr;
3776 upl_unlock(upl);
3777
3778 return KERN_SUCCESS;
3779 }
3780
3781 /*
3782 * Internal routine to remove a UPL mapping from a VM map.
3783 *
3784 * XXX - This should just be doable through a standard
3785 * vm_map_remove() operation. Otherwise, implicit clean-up
3786 * of the target map won't be able to correctly remove
3787 * these (and release the reference on the UPL). Having
3788 * to do this means we can't map these into user-space
3789 * maps yet.
3790 */
3791 kern_return_t
3792 vm_map_remove_upl(
3793 vm_map_t map,
3794 upl_t upl)
3795 {
3796 vm_address_t addr;
3797 upl_size_t size;
3798
3799 if (upl == UPL_NULL)
3800 return KERN_INVALID_ARGUMENT;
3801
3802 upl_lock(upl);
3803
3804 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
3805 addr = upl->kaddr;
3806 size = upl->size;
3807
3808 assert(upl->ref_count > 1);
3809 upl->ref_count--; /* removing mapping ref */
3810
3811 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3812 upl->kaddr = (vm_offset_t) 0;
3813 upl_unlock(upl);
3814
3815 vm_map_remove(map,
3816 vm_map_trunc_page(addr),
3817 vm_map_round_page(addr + size),
3818 VM_MAP_NO_FLAGS);
3819
3820 return KERN_SUCCESS;
3821 }
3822 upl_unlock(upl);
3823
3824 return KERN_FAILURE;
3825 }
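/*
 * Usage sketch (illustrative only): pairing the UPL mapping routine
 * above (vm_upl_map, per the panic string it contains) with
 * vm_map_remove_upl().  The vm_upl_map() prototype and the helper name
 * shown here are assumptions, not taken from this listing.
 */
#if 0	/* example only -- never compiled */
static kern_return_t
upl_map_example(upl_t upl)
{
	vm_map_offset_t	kaddr = 0;	/* receives the kernel mapping address */
	kern_return_t	kr;

	kr = vm_upl_map(kernel_map, upl, &kaddr);	/* takes a mapping ref on the UPL */
	if (kr != KERN_SUCCESS)
		return kr;

	/* ... access the UPL's pages through kaddr ... */

	return vm_map_remove_upl(kernel_map, upl);	/* drops the mapping and its ref */
}
#endif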
3826
3827 kern_return_t
3828 upl_commit_range(
3829 upl_t upl,
3830 upl_offset_t offset,
3831 upl_size_t size,
3832 int flags,
3833 upl_page_info_t *page_list,
3834 mach_msg_type_number_t count,
3835 boolean_t *empty)
3836 {
3837 upl_size_t xfer_size;
3838 vm_object_t shadow_object;
3839 vm_object_t object;
3840 vm_object_offset_t target_offset;
3841 int entry;
3842 wpl_array_t lite_list;
3843 int occupied;
3844 int delayed_unlock = 0;
3845 int clear_refmod = 0;
3846 int pgpgout_count = 0;
3847
3848 *empty = FALSE;
3849
3850 if (upl == UPL_NULL)
3851 return KERN_INVALID_ARGUMENT;
3852
3853 if (count == 0)
3854 page_list = NULL;
3855
3856 if (upl->flags & UPL_DEVICE_MEMORY)
3857 xfer_size = 0;
3858 else if ((offset + size) <= upl->size)
3859 xfer_size = size;
3860 else
3861 return KERN_FAILURE;
3862
3863 upl_lock(upl);
3864
3865 if (upl->flags & UPL_ACCESS_BLOCKED) {
3866 /*
3867 * We used this UPL to block access to the pages by marking
3868 * them "busy". Now we need to clear the "busy" bit to allow
3869 * access to these pages again.
3870 */
3871 flags |= UPL_COMMIT_ALLOW_ACCESS;
3872 }
3873 if (upl->flags & UPL_CLEAR_DIRTY)
3874 flags |= UPL_COMMIT_CLEAR_DIRTY;
3875
3876 if (upl->flags & UPL_INTERNAL)
3877 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
3878 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3879 else
3880 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3881
3882 object = upl->map_object;
3883
3884 if (upl->flags & UPL_SHADOWED) {
3885 vm_object_lock(object);
3886 shadow_object = object->shadow;
3887 } else {
3888 shadow_object = object;
3889 }
3890 vm_object_lock(shadow_object);
3891
3892 entry = offset/PAGE_SIZE;
3893 target_offset = (vm_object_offset_t)offset;
3894
3895 while (xfer_size) {
3896 vm_page_t t, m;
3897
3898 if (delayed_unlock == 0)
3899 vm_page_lock_queues();
3900
3901 m = VM_PAGE_NULL;
3902
3903 if (upl->flags & UPL_LITE) {
3904 int pg_num;
3905
3906 pg_num = target_offset/PAGE_SIZE;
3907
3908 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3909 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3910
3911 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
3912 }
3913 }
3914 if (upl->flags & UPL_SHADOWED) {
3915 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
3916
3917 t->pageout = FALSE;
3918
3919 vm_page_free(t);
3920
3921 if (m == VM_PAGE_NULL)
3922 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
3923 }
3924 }
3925 if (m != VM_PAGE_NULL) {
3926
3927 clear_refmod = 0;
3928
3929 if (upl->flags & UPL_IO_WIRE) {
3930
3931 vm_page_unwire(m);
3932
3933 if (page_list)
3934 page_list[entry].phys_addr = 0;
3935
3936 if (flags & UPL_COMMIT_SET_DIRTY)
3937 m->dirty = TRUE;
3938 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3939 m->dirty = FALSE;
3940 clear_refmod |= VM_MEM_MODIFIED;
3941 }
3942 if (flags & UPL_COMMIT_INACTIVATE)
3943 vm_page_deactivate(m);
3944
3945 if (clear_refmod)
3946 pmap_clear_refmod(m->phys_page, clear_refmod);
3947
3948 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3949 /*
3950 * We blocked access to the pages in this UPL.
3951 * Clear the "busy" bit and wake up any waiter
3952 * for this page.
3953 */
3954 PAGE_WAKEUP_DONE(m);
3955 }
3956 goto commit_next_page;
3957 }
3958 /*
3959 * make sure to clear the hardware
3960 * modify or reference bits before
3961 * releasing the BUSY bit on this page;
3962 * otherwise we risk losing a legitimate
3963 * change of state
3964 */
3965 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3966 m->dirty = FALSE;
3967 clear_refmod |= VM_MEM_MODIFIED;
3968 }
3969 if (clear_refmod)
3970 pmap_clear_refmod(m->phys_page, clear_refmod);
3971
3972 if (page_list) {
3973 upl_page_info_t *p;
3974
3975 p = &(page_list[entry]);
3976
3977 if (p->phys_addr && p->pageout && !m->pageout) {
3978 m->busy = TRUE;
3979 m->pageout = TRUE;
3980 vm_page_wire(m);
3981 } else if (p->phys_addr &&
3982 !p->pageout && m->pageout &&
3983 !m->dump_cleaning) {
3984 m->pageout = FALSE;
3985 m->absent = FALSE;
3986 m->overwriting = FALSE;
3987 vm_page_unwire(m);
3988
3989 PAGE_WAKEUP_DONE(m);
3990 }
3991 page_list[entry].phys_addr = 0;
3992 }
3993 m->dump_cleaning = FALSE;
3994
3995 if (m->laundry)
3996 vm_pageout_throttle_up(m);
3997
3998 if (m->pageout) {
3999 m->cleaning = FALSE;
4000 m->encrypted_cleaning = FALSE;
4001 m->pageout = FALSE;
4002 #if MACH_CLUSTER_STATS
4003 if (m->wanted) vm_pageout_target_collisions++;
4004 #endif
4005 m->dirty = FALSE;
4006
4007 if (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED))
4008 m->dirty = TRUE;
4009
4010 if (m->dirty) {
4011 /*
4012 * page was re-dirtied after we started
4013 * the pageout... reactivate it since
4014 * we don't know whether the on-disk
4015 * copy matches what is now in memory
4016 */
4017 vm_page_unwire(m);
4018
4019 if (upl->flags & UPL_PAGEOUT) {
4020 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4021 VM_STAT_INCR(reactivations);
4022 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4023 }
4024 PAGE_WAKEUP_DONE(m);
4025 } else {
4026 /*
4027 * page has been successfully cleaned
4028 * go ahead and free it for other use
4029 */
4030
4031 if (m->object->internal) {
4032 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4033 } else {
4034 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4035 }
4036
4037 vm_page_free(m);
4038
4039 if (upl->flags & UPL_PAGEOUT) {
4040 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4041
4042 if (page_list[entry].dirty) {
4043 VM_STAT_INCR(pageouts);
4044 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4045 pgpgout_count++;
4046 }
4047 }
4048 }
4049 goto commit_next_page;
4050 }
4051 #if MACH_CLUSTER_STATS
4052 if (m->pmapped)
4053 m->dirty = pmap_is_modified(m->phys_page);
4054
4055 if (m->dirty) vm_pageout_cluster_dirtied++;
4056 else vm_pageout_cluster_cleaned++;
4057 if (m->wanted) vm_pageout_cluster_collisions++;
4058 #endif
4059 m->dirty = FALSE;
4060
4061 if ((m->busy) && (m->cleaning)) {
4062 /*
4063 * the request_page_list case
4064 */
4065 m->absent = FALSE;
4066 m->overwriting = FALSE;
4067 m->busy = FALSE;
4068 } else if (m->overwriting) {
4069 /*
4070 * alternate request page list, write to
4071 * page_list case. Occurs when the original
4072 * page was wired at the time of the list
4073 * request
4074 */
4075 assert(m->wire_count != 0);
4076 vm_page_unwire(m);/* reactivates */
4077 m->overwriting = FALSE;
4078 }
4079 m->cleaning = FALSE;
4080 m->encrypted_cleaning = FALSE;
4081
4082 /*
4083 * It is part of the semantics of COPYOUT_FROM
4084 * UPLs that a commit implies a cache sync
4085 * between the vm page and the backing store;
4086 * this can be used to strip the precious bit
4087 * as well as to clean
4088 */
4089 if (upl->flags & UPL_PAGE_SYNC_DONE)
4090 m->precious = FALSE;
4091
4092 if (flags & UPL_COMMIT_SET_DIRTY)
4093 m->dirty = TRUE;
4094
4095 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4096 vm_page_deactivate(m);
4097 } else if (!m->active && !m->inactive && !m->speculative) {
4098
4099 if (m->clustered)
4100 vm_page_speculate(m, TRUE);
4101 else if (m->reference)
4102 vm_page_activate(m);
4103 else
4104 vm_page_deactivate(m);
4105 }
4106 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4107 /*
4108 * We blocked access to the pages in this UPL.
4109 * Clear the "busy" bit on this page before we
4110 * wake up any waiter.
4111 */
4112 m->busy = FALSE;
4113 }
4114 /*
4115 * Wake up any thread waiting for this page to finish being cleaned.
4116 */
4117 PAGE_WAKEUP(m);
4118 }
4119 commit_next_page:
4120 target_offset += PAGE_SIZE_64;
4121 xfer_size -= PAGE_SIZE;
4122 entry++;
4123
4124 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4125 mutex_yield(&vm_page_queue_lock);
4126 delayed_unlock = 1;
4127 }
4128 }
4129 if (delayed_unlock)
4130 vm_page_unlock_queues();
4131
4132 occupied = 1;
4133
4134 if (upl->flags & UPL_DEVICE_MEMORY) {
4135 occupied = 0;
4136 } else if (upl->flags & UPL_LITE) {
4137 int pg_num;
4138 int i;
4139
4140 pg_num = upl->size/PAGE_SIZE;
4141 pg_num = (pg_num + 31) >> 5;
4142 occupied = 0;
4143
4144 for (i = 0; i < pg_num; i++) {
4145 if (lite_list[i] != 0) {
4146 occupied = 1;
4147 break;
4148 }
4149 }
4150 } else {
4151 if (queue_empty(&upl->map_object->memq))
4152 occupied = 0;
4153 }
4154 if (occupied == 0) {
4155 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4156 *empty = TRUE;
4157
4158 if (object == shadow_object) {
4159 /*
4160 * this is not a paging object
4161 * so we need to drop the paging reference
4162 * that was taken when we created the UPL
4163 * against this object
4164 */
4165 vm_object_paging_end(shadow_object);
4166 } else {
4167 /*
4168 * we donated the paging reference to
4169 * the map object... vm_pageout_object_terminate
4170 * will drop this reference
4171 */
4172 }
4173 }
4174 vm_object_unlock(shadow_object);
4175 if (object != shadow_object)
4176 vm_object_unlock(object);
4177 upl_unlock(upl);
4178
4179 if (pgpgout_count) {
4180 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
4181 }
4182
4183 return KERN_SUCCESS;
4184 }
4185
4186 kern_return_t
4187 upl_abort_range(
4188 upl_t upl,
4189 upl_offset_t offset,
4190 upl_size_t size,
4191 int error,
4192 boolean_t *empty)
4193 {
4194 upl_size_t xfer_size;
4195 vm_object_t shadow_object;
4196 vm_object_t object;
4197 vm_object_offset_t target_offset;
4198 int entry;
4199 wpl_array_t lite_list;
4200 int occupied;
4201 int delayed_unlock = 0;
4202
4203 *empty = FALSE;
4204
4205 if (upl == UPL_NULL)
4206 return KERN_INVALID_ARGUMENT;
4207
4208 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
4209 return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
4210
4211 if (upl->flags & UPL_DEVICE_MEMORY)
4212 xfer_size = 0;
4213 else if ((offset + size) <= upl->size)
4214 xfer_size = size;
4215 else
4216 return KERN_FAILURE;
4217
4218 upl_lock(upl);
4219
4220 if (upl->flags & UPL_INTERNAL) {
4221 lite_list = (wpl_array_t)
4222 ((((uintptr_t)upl) + sizeof(struct upl))
4223 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4224 } else {
4225 lite_list = (wpl_array_t)
4226 (((uintptr_t)upl) + sizeof(struct upl));
4227 }
4228 object = upl->map_object;
4229
4230 if (upl->flags & UPL_SHADOWED) {
4231 vm_object_lock(object);
4232 shadow_object = object->shadow;
4233 } else
4234 shadow_object = object;
4235
4236 vm_object_lock(shadow_object);
4237
4238 entry = offset/PAGE_SIZE;
4239 target_offset = (vm_object_offset_t)offset;
4240
4241 while (xfer_size) {
4242 vm_page_t t, m;
4243
4244 if (delayed_unlock == 0)
4245 vm_page_lock_queues();
4246
4247 m = VM_PAGE_NULL;
4248
4249 if (upl->flags & UPL_LITE) {
4250 int pg_num;
4251 pg_num = target_offset/PAGE_SIZE;
4252
4253 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4254 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4255
4256 m = vm_page_lookup(shadow_object, target_offset +
4257 (upl->offset - shadow_object->paging_offset));
4258 }
4259 }
4260 if (upl->flags & UPL_SHADOWED) {
4261 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4262 t->pageout = FALSE;
4263
4264 vm_page_free(t);
4265
4266 if (m == VM_PAGE_NULL)
4267 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4268 }
4269 }
4270 if (m != VM_PAGE_NULL) {
4271
4272 if (m->absent) {
4273 boolean_t must_free = TRUE;
4274
4275 m->clustered = FALSE;
4276 /*
4277 * COPYOUT = FALSE case
4278 * check for error conditions which must
4279 * be passed back to the pages customer
4280 */
4281 if (error & UPL_ABORT_RESTART) {
4282 m->restart = TRUE;
4283 m->absent = FALSE;
4284 m->error = TRUE;
4285 m->unusual = TRUE;
4286 must_free = FALSE;
4287 } else if (error & UPL_ABORT_UNAVAILABLE) {
4288 m->restart = FALSE;
4289 m->unusual = TRUE;
4290 must_free = FALSE;
4291 } else if (error & UPL_ABORT_ERROR) {
4292 m->restart = FALSE;
4293 m->absent = FALSE;
4294 m->error = TRUE;
4295 m->unusual = TRUE;
4296 must_free = FALSE;
4297 }
4298
4299 /*
4300 * ENCRYPTED SWAP:
4301 * If the page was already encrypted,
4302 * we don't really need to decrypt it
4303 * now. It will get decrypted later,
4304 * on demand, as soon as someone needs
4305 * to access its contents.
4306 */
4307
4308 m->cleaning = FALSE;
4309 m->encrypted_cleaning = FALSE;
4310 m->overwriting = FALSE;
4311 PAGE_WAKEUP_DONE(m);
4312
4313 if (must_free == TRUE)
4314 vm_page_free(m);
4315 else
4316 vm_page_activate(m);
4317 } else {
4318 /*
4319 * Handle the trusted pager throttle.
4320 */
4321 if (m->laundry)
4322 vm_pageout_throttle_up(m);
4323
4324 if (m->pageout) {
4325 assert(m->busy);
4326 assert(m->wire_count == 1);
4327 m->pageout = FALSE;
4328 vm_page_unwire(m);
4329 }
4330 m->dump_cleaning = FALSE;
4331 m->cleaning = FALSE;
4332 m->encrypted_cleaning = FALSE;
4333 m->overwriting = FALSE;
4334 #if MACH_PAGEMAP
4335 vm_external_state_clr(m->object->existence_map, m->offset);
4336 #endif /* MACH_PAGEMAP */
4337 if (error & UPL_ABORT_DUMP_PAGES) {
4338 pmap_disconnect(m->phys_page);
4339 vm_page_free(m);
4340 } else {
4341 if (error & UPL_ABORT_REFERENCE) {
4342 /*
4343 * we've been told to explicitly
4344 * reference this page... for
4345 * file I/O, this is done by
4346 * implementing an LRU on the inactive q
4347 */
4348 vm_page_lru(m);
4349 }
4350 PAGE_WAKEUP_DONE(m);
4351 }
4352 }
4353 }
4354 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4355 mutex_yield(&vm_page_queue_lock);
4356 delayed_unlock = 1;
4357 }
4358 target_offset += PAGE_SIZE_64;
4359 xfer_size -= PAGE_SIZE;
4360 entry++;
4361 }
4362 if (delayed_unlock)
4363 vm_page_unlock_queues();
4364
4365 occupied = 1;
4366
4367 if (upl->flags & UPL_DEVICE_MEMORY) {
4368 occupied = 0;
4369 } else if (upl->flags & UPL_LITE) {
4370 int pg_num;
4371 int i;
4372
4373 pg_num = upl->size/PAGE_SIZE;
4374 pg_num = (pg_num + 31) >> 5;
4375 occupied = 0;
4376
4377 for (i = 0; i < pg_num; i++) {
4378 if (lite_list[i] != 0) {
4379 occupied = 1;
4380 break;
4381 }
4382 }
4383 } else {
4384 if (queue_empty(&upl->map_object->memq))
4385 occupied = 0;
4386 }
4387 if (occupied == 0) {
4388 if (upl->flags & UPL_COMMIT_NOTIFY_EMPTY)
4389 *empty = TRUE;
4390
4391 if (object == shadow_object) {
4392 /*
4393 * this is not a paging object
4394 * so we need to drop the paging reference
4395 * that was taken when we created the UPL
4396 * against this object
4397 */
4398 vm_object_paging_end(shadow_object);
4399 } else {
4400 /*
4401 * we donated the paging reference to
4402 * the map object... vm_pageout_object_terminate
4403 * will drop this reference
4404 */
4405 }
4406 }
4407 vm_object_unlock(shadow_object);
4408 if (object != shadow_object)
4409 vm_object_unlock(object);
4410 upl_unlock(upl);
4411
4412 return KERN_SUCCESS;
4413 }
4414
4415
4416 kern_return_t
4417 upl_abort(
4418 upl_t upl,
4419 int error)
4420 {
4421 boolean_t empty;
4422
4423 return upl_abort_range(upl, 0, upl->size, error, &empty);
4424 }
4425
4426
4427 /* an option on commit should be wire */
4428 kern_return_t
4429 upl_commit(
4430 upl_t upl,
4431 upl_page_info_t *page_list,
4432 mach_msg_type_number_t count)
4433 {
4434 boolean_t empty;
4435
4436 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
4437 }
4438
4439
4440 kern_return_t
4441 vm_object_iopl_request(
4442 vm_object_t object,
4443 vm_object_offset_t offset,
4444 upl_size_t size,
4445 upl_t *upl_ptr,
4446 upl_page_info_array_t user_page_list,
4447 unsigned int *page_list_count,
4448 int cntrl_flags)
4449 {
4450 vm_page_t dst_page;
4451 vm_object_offset_t dst_offset;
4452 upl_size_t xfer_size;
4453 upl_t upl = NULL;
4454 unsigned int entry;
4455 wpl_array_t lite_list = NULL;
4456 int delayed_unlock = 0;
4457 int no_zero_fill = FALSE;
4458 u_int32_t psize;
4459 kern_return_t ret;
4460 vm_prot_t prot;
4461 struct vm_object_fault_info fault_info;
4462
4463
4464 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4465 /*
4466 * For forward compatibility's sake,
4467 * reject any unknown flag.
4468 */
4469 return KERN_INVALID_VALUE;
4470 }
4471 if (vm_lopage_poolsize == 0)
4472 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4473
4474 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4475 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4476 return KERN_INVALID_VALUE;
4477
4478 if (object->phys_contiguous) {
4479 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4480 return KERN_INVALID_ADDRESS;
4481
4482 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4483 return KERN_INVALID_ADDRESS;
4484 }
4485 }
4486
4487 if (cntrl_flags & UPL_ENCRYPT) {
4488 /*
4489 * ENCRYPTED SWAP:
4490 * The paging path doesn't use this interface,
4491 * so we don't support the UPL_ENCRYPT flag
4492 * here. We won't encrypt the pages.
4493 */
4494 assert(! (cntrl_flags & UPL_ENCRYPT));
4495 }
4496 if (cntrl_flags & UPL_NOZEROFILL)
4497 no_zero_fill = TRUE;
4498
4499 if (cntrl_flags & UPL_COPYOUT_FROM)
4500 prot = VM_PROT_READ;
4501 else
4502 prot = VM_PROT_READ | VM_PROT_WRITE;
4503
4504 if (((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous)
4505 size = MAX_UPL_TRANSFER * page_size;
4506
4507 if (cntrl_flags & UPL_SET_INTERNAL) {
4508 if (page_list_count != NULL)
4509 *page_list_count = MAX_UPL_TRANSFER;
4510 }
4511 if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4512 ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
4513 return KERN_INVALID_ARGUMENT;
4514
4515 if ((!object->internal) && (object->paging_offset != 0))
4516 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4517
4518
4519 if (object->phys_contiguous)
4520 psize = PAGE_SIZE;
4521 else
4522 psize = size;
4523
4524 if (cntrl_flags & UPL_SET_INTERNAL) {
4525 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4526
4527 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
4528 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
4529 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
4530 } else {
4531 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
4532
4533 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4534 }
4535 if (user_page_list)
4536 user_page_list[0].device = FALSE;
4537 *upl_ptr = upl;
4538
4539 upl->map_object = object;
4540 upl->size = size;
4541
4542 vm_object_lock(object);
4543 vm_object_paging_begin(object);
4544 /*
4545 * paging in progress also protects the paging_offset
4546 */
4547 upl->offset = offset + object->paging_offset;
4548
4549 if (object->phys_contiguous) {
4550 #ifdef UPL_DEBUG
4551 queue_enter(&object->uplq, upl, upl_t, uplq);
4552 #endif /* UPL_DEBUG */
4553
4554 vm_object_unlock(object);
4555
4556 /*
4557 * don't need any shadow mappings for this one
4558 * since it is already I/O memory
4559 */
4560 upl->flags |= UPL_DEVICE_MEMORY;
4561
4562 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4563
4564 if (user_page_list) {
4565 user_page_list[0].phys_addr = (offset + object->shadow_offset)>>PAGE_SHIFT;
4566 user_page_list[0].device = TRUE;
4567 }
4568 if (page_list_count != NULL) {
4569 if (upl->flags & UPL_INTERNAL)
4570 *page_list_count = 0;
4571 else
4572 *page_list_count = 1;
4573 }
4574 return KERN_SUCCESS;
4575 }
4576 /*
4577 * Protect user space from future COW operations
4578 */
4579 object->true_share = TRUE;
4580
4581 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4582 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4583
4584 #ifdef UPL_DEBUG
4585 queue_enter(&object->uplq, upl, upl_t, uplq);
4586 #endif /* UPL_DEBUG */
4587
4588 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4589 /*
4590 * The user requested that access to the pages in this UPL
4591 * be blocked until the UPL is committed or aborted.
4592 */
4593 upl->flags |= UPL_ACCESS_BLOCKED;
4594 }
4595 entry = 0;
4596
4597 xfer_size = size;
4598 dst_offset = offset;
4599
4600 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
4601 fault_info.user_tag = 0;
4602 fault_info.lo_offset = offset;
4603 fault_info.hi_offset = offset + xfer_size;
4604 fault_info.no_cache = FALSE;
4605
4606 while (xfer_size) {
4607 vm_fault_return_t result;
4608 int pg_num;
4609
4610 dst_page = vm_page_lookup(object, dst_offset);
4611
4612 /*
4613 * ENCRYPTED SWAP:
4614 * If the page is encrypted, we need to decrypt it,
4615 * so force a soft page fault.
4616 */
4617 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4618 (dst_page->encrypted) ||
4619 (dst_page->unusual && (dst_page->error ||
4620 dst_page->restart ||
4621 dst_page->absent ||
4622 dst_page->fictitious))) {
4623
4624 do {
4625 vm_page_t top_page;
4626 kern_return_t error_code;
4627 int interruptible;
4628
4629 if (delayed_unlock) {
4630 delayed_unlock = 0;
4631 vm_page_unlock_queues();
4632 }
4633 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
4634 interruptible = THREAD_ABORTSAFE;
4635 else
4636 interruptible = THREAD_UNINT;
4637
4638 fault_info.interruptible = interruptible;
4639 fault_info.cluster_size = xfer_size;
4640
4641 result = vm_fault_page(object, dst_offset,
4642 prot | VM_PROT_WRITE, FALSE,
4643 &prot, &dst_page, &top_page,
4644 (int *)0,
4645 &error_code, no_zero_fill,
4646 FALSE, &fault_info);
4647
4648 switch (result) {
4649
4650 case VM_FAULT_SUCCESS:
4651
4652 PAGE_WAKEUP_DONE(dst_page);
4653 /*
4654 * Release paging references and
4655 * top-level placeholder page, if any.
4656 */
4657 if (top_page != VM_PAGE_NULL) {
4658 vm_object_t local_object;
4659
4660 local_object = top_page->object;
4661
4662 if (top_page->object != dst_page->object) {
4663 vm_object_lock(local_object);
4664 VM_PAGE_FREE(top_page);
4665 vm_object_paging_end(local_object);
4666 vm_object_unlock(local_object);
4667 } else {
4668 VM_PAGE_FREE(top_page);
4669 vm_object_paging_end(local_object);
4670 }
4671 }
4672 break;
4673
4674 case VM_FAULT_RETRY:
4675 vm_object_lock(object);
4676 vm_object_paging_begin(object);
4677 break;
4678
4679 case VM_FAULT_FICTITIOUS_SHORTAGE:
4680 vm_page_more_fictitious();
4681
4682 vm_object_lock(object);
4683 vm_object_paging_begin(object);
4684 break;
4685
4686 case VM_FAULT_MEMORY_SHORTAGE:
4687 if (vm_page_wait(interruptible)) {
4688 vm_object_lock(object);
4689 vm_object_paging_begin(object);
4690 break;
4691 }
4692 /* fall thru */
4693
4694 case VM_FAULT_INTERRUPTED:
4695 error_code = MACH_SEND_INTERRUPTED;
4696 case VM_FAULT_MEMORY_ERROR:
4697 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
4698
4699 vm_object_lock(object);
4700 vm_object_paging_begin(object);
4701 goto return_err;
4702 }
4703 } while (result != VM_FAULT_SUCCESS);
4704 }
4705
4706 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4707 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4708 vm_page_t low_page;
4709 int refmod;
4710
4711 /*
4712 * support devices that can't DMA above 32 bits
4713 * by substituting pages from a pool of low address
4714 * memory for any pages we find above the 4G mark...
4715 * we can't substitute if the page is already wired because
4716 * we don't know whether that physical address has been
4717 * handed out to some other 64 bit capable DMA device to use
4718 */
4719 if (dst_page->wire_count) {
4720 ret = KERN_PROTECTION_FAILURE;
4721 goto return_err;
4722 }
4723 if (delayed_unlock) {
4724 delayed_unlock = 0;
4725 vm_page_unlock_queues();
4726 }
4727 low_page = vm_page_grablo();
4728
4729 if (low_page == VM_PAGE_NULL) {
4730 ret = KERN_RESOURCE_SHORTAGE;
4731 goto return_err;
4732 }
4733 /*
4734 * from here until the vm_page_replace completes
4735 * we mustn't drop the object lock... we don't
4736 * want anyone refaulting this page in and using
4737 * it after we disconnect it... we want the fault
4738 * to find the new page being substituted.
4739 */
4740 if (dst_page->pmapped)
4741 refmod = pmap_disconnect(dst_page->phys_page);
4742 else
4743 refmod = 0;
4744 vm_page_copy(dst_page, low_page);
4745
4746 low_page->reference = dst_page->reference;
4747 low_page->dirty = dst_page->dirty;
4748
4749 if (refmod & VM_MEM_REFERENCED)
4750 low_page->reference = TRUE;
4751 if (refmod & VM_MEM_MODIFIED)
4752 low_page->dirty = TRUE;
4753
4754 vm_page_lock_queues();
4755 vm_page_replace(low_page, object, dst_offset);
4756 /*
4757 * keep the queue lock since we're going to
4758 * need it immediately
4759 */
4760 delayed_unlock = 1;
4761
4762 dst_page = low_page;
4763 /*
4764 * vm_page_grablo returned the page marked
4765 * BUSY... we don't need a PAGE_WAKEUP_DONE
4766 * here, because we've never dropped the object lock
4767 */
4768 dst_page->busy = FALSE;
4769 }
4770 if (delayed_unlock == 0)
4771 vm_page_lock_queues();
4772
4773 vm_page_wire(dst_page);
4774
4775 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4776 /*
4777 * Mark the page "busy" to block any future page fault
4778 * on this page. We'll also remove the mapping
4779 * of all these pages before leaving this routine.
4780 */
4781 assert(!dst_page->fictitious);
4782 dst_page->busy = TRUE;
4783 }
4784 pg_num = (dst_offset-offset)/PAGE_SIZE;
4785 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
4786
4787 /*
4788 * expect the page to be used
4789 * page queues lock must be held to set 'reference'
4790 */
4791 dst_page->reference = TRUE;
4792
4793 if (!(cntrl_flags & UPL_COPYOUT_FROM))
4794 dst_page->dirty = TRUE;
4795
4796 if (dst_page->phys_page > upl->highest_page)
4797 upl->highest_page = dst_page->phys_page;
4798
4799 if (user_page_list) {
4800 user_page_list[entry].phys_addr = dst_page->phys_page;
4801 user_page_list[entry].dirty = dst_page->dirty;
4802 user_page_list[entry].pageout = dst_page->pageout;
4803 user_page_list[entry].absent = dst_page->absent;
4804 user_page_list[entry].precious = dst_page->precious;
4805
4806 if (dst_page->clustered == TRUE)
4807 user_page_list[entry].speculative = dst_page->speculative;
4808 else
4809 user_page_list[entry].speculative = FALSE;
4810 }
4811 /*
4812 * someone is explicitly grabbing this page...
4813 * update clustered and speculative state
4814 *
4815 */
4816 VM_PAGE_CONSUME_CLUSTERED(dst_page);
4817
4818 if (delayed_unlock++ > UPL_DELAYED_UNLOCK_LIMIT) {
4819 mutex_yield(&vm_page_queue_lock);
4820 delayed_unlock = 1;
4821 }
4822 entry++;
4823 dst_offset += PAGE_SIZE_64;
4824 xfer_size -= PAGE_SIZE;
4825 }
4826 if (delayed_unlock)
4827 vm_page_unlock_queues();
4828
4829 if (page_list_count != NULL) {
4830 if (upl->flags & UPL_INTERNAL)
4831 *page_list_count = 0;
4832 else if (*page_list_count > entry)
4833 *page_list_count = entry;
4834 }
4835 vm_object_unlock(object);
4836
4837 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4838 /*
4839 * We've marked all the pages "busy" so that future
4840 * page faults will block.
4841 * Now remove the mapping for these pages, so that they
4842 * can't be accessed without causing a page fault.
4843 */
4844 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
4845 PMAP_NULL, 0, VM_PROT_NONE);
4846 }
4847 return KERN_SUCCESS;
4848
4849 return_err:
4850 if (delayed_unlock)
4851 vm_page_unlock_queues();
4852
4853 for (; offset < dst_offset; offset += PAGE_SIZE) {
4854 dst_page = vm_page_lookup(object, offset);
4855
4856 if (dst_page == VM_PAGE_NULL)
4857 panic("vm_object_iopl_request: Wired pages missing. \n");
4858
4859 vm_page_lockspin_queues();
4860 vm_page_unwire(dst_page);
4861 vm_page_unlock_queues();
4862
4863 VM_STAT_INCR(reactivations);
4864 }
4865 vm_object_paging_end(object);
4866 vm_object_unlock(object);
4867 upl_destroy(upl);
4868
4869 return ret;
4870 }
4871
4872 kern_return_t
4873 upl_transpose(
4874 upl_t upl1,
4875 upl_t upl2)
4876 {
4877 kern_return_t retval;
4878 boolean_t upls_locked;
4879 vm_object_t object1, object2;
4880
4881 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
4882 return KERN_INVALID_ARGUMENT;
4883 }
4884
4885 upls_locked = FALSE;
4886
4887 /*
4888 * Since we need to lock both UPLs at the same time,
4889 * avoid deadlocks by always taking locks in the same order.
4890 */
4891 if (upl1 < upl2) {
4892 upl_lock(upl1);
4893 upl_lock(upl2);
4894 } else {
4895 upl_lock(upl2);
4896 upl_lock(upl1);
4897 }
4898 upls_locked = TRUE; /* the UPLs will need to be unlocked */
4899
4900 object1 = upl1->map_object;
4901 object2 = upl2->map_object;
4902
4903 if (upl1->offset != 0 || upl2->offset != 0 ||
4904 upl1->size != upl2->size) {
4905 /*
4906 * We deal only with full objects, not subsets.
4907 * That's because we exchange the entire backing store info
4908 * for the objects: pager, resident pages, etc... We can't do
4909 * only part of it.
4910 */
4911 retval = KERN_INVALID_VALUE;
4912 goto done;
4913 }
4914
4915 /*
4916 * Transpose the VM objects' backing store.
4917 */
4918 retval = vm_object_transpose(object1, object2,
4919 (vm_object_size_t) upl1->size);
4920
4921 if (retval == KERN_SUCCESS) {
4922 /*
4923 * Make each UPL point to the correct VM object, i.e. the
4924 * object holding the pages that the UPL refers to...
4925 */
4926 #ifdef UPL_DEBUG
4927 queue_remove(&object1->uplq, upl1, upl_t, uplq);
4928 queue_remove(&object2->uplq, upl2, upl_t, uplq);
4929 #endif
4930 upl1->map_object = object2;
4931 upl2->map_object = object1;
4932 #ifdef UPL_DEBUG
4933 queue_enter(&object1->uplq, upl2, upl_t, uplq);
4934 queue_enter(&object2->uplq, upl1, upl_t, uplq);
4935 #endif
4936 }
4937
4938 done:
4939 /*
4940 * Cleanup.
4941 */
4942 if (upls_locked) {
4943 upl_unlock(upl1);
4944 upl_unlock(upl2);
4945 upls_locked = FALSE;
4946 }
4947
4948 return retval;
4949 }
4950
4951 /*
4952 * ENCRYPTED SWAP:
4953 *
4954 * Rationale: the user might have some encrypted data on disk (via
4955 * FileVault or any other mechanism). That data is then decrypted in
4956 * memory, which is safe as long as the machine is secure. But that
4957 * decrypted data in memory could be paged out to disk by the default
4958 * pager. The data would then be stored on disk in clear (not encrypted)
4959 * and it could be accessed by anyone who gets physical access to the
4960 * disk (if the laptop or the disk gets stolen for example). This weakens
4961 * the security offered by FileVault.
4962 *
4963 * Solution: the default pager will optionally request that all the
4964 * pages it gathers for pageout be encrypted, via the UPL interfaces,
4965 * before it sends this UPL to disk via the vnode_pageout() path.
4966 *
4967 * Notes:
4968 *
4969 * To avoid disrupting the VM LRU algorithms, we want to keep the
4970 * clean-in-place mechanisms, which allow us to send some extra pages to
4971 * swap (clustering) without actually removing them from the user's
4972 * address space. We don't want the user to unknowingly access encrypted
4973 * data, so we have to actually remove the encrypted pages from the page
4974 * table. When the user accesses the data, the hardware will fail to
4975 * locate the virtual page in its page table and will trigger a page
4976 * fault. We can then decrypt the page and enter it in the page table
4977 * again. Whenever we allow the user to access the contents of a page,
4978 * we have to make sure it's not encrypted.
4979 *
4980 *
4981 */
4982 /*
4983 * ENCRYPTED SWAP:
4984 * Reserve of virtual addresses in the kernel address space.
4985 * We need to map the physical pages in the kernel, so that we
4986 * can call the encryption/decryption routines with a kernel
4987 * virtual address. We keep this pool of pre-allocated kernel
4988 * virtual addresses so that we don't have to scan the kernel's
4989 * virtual address space each time we need to encrypt or decrypt
4990 * a physical page.
4991 * It would be nice to be able to encrypt and decrypt in physical
4992 * mode but that might not always be more efficient...
4993 */
4994 decl_simple_lock_data(,vm_paging_lock)
4995 #define VM_PAGING_NUM_PAGES 64
4996 vm_map_offset_t vm_paging_base_address = 0;
4997 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
4998 int vm_paging_max_index = 0;
4999 int vm_paging_page_waiter = 0;
5000 int vm_paging_page_waiter_total = 0;
5001 unsigned long vm_paging_no_kernel_page = 0;
5002 unsigned long vm_paging_objects_mapped = 0;
5003 unsigned long vm_paging_pages_mapped = 0;
5004 unsigned long vm_paging_objects_mapped_slow = 0;
5005 unsigned long vm_paging_pages_mapped_slow = 0;
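/*
 * Layout sketch for the pool described above (illustrative only; the
 * helper name is hypothetical): slot "i" of vm_paging_page_inuse[]
 * covers exactly one page of the reserved range, at the same address
 * that vm_paging_map_object() computes below.
 */
#if 0	/* example only -- never compiled */
static vm_map_offset_t
vm_paging_slot_address(int i)
{
	assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
	return vm_paging_base_address + ((vm_map_offset_t)i * PAGE_SIZE);
}
#endif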
5006
5007 void
5008 vm_paging_map_init(void)
5009 {
5010 kern_return_t kr;
5011 vm_map_offset_t page_map_offset;
5012 vm_map_entry_t map_entry;
5013
5014 assert(vm_paging_base_address == 0);
5015
5016 /*
5017 * Initialize our pool of pre-allocated kernel
5018 * virtual addresses.
5019 */
5020 page_map_offset = 0;
5021 kr = vm_map_find_space(kernel_map,
5022 &page_map_offset,
5023 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5024 0,
5025 0,
5026 &map_entry);
5027 if (kr != KERN_SUCCESS) {
5028 panic("vm_paging_map_init: kernel_map full\n");
5029 }
5030 map_entry->object.vm_object = kernel_object;
5031 map_entry->offset =
5032 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5033 vm_object_reference(kernel_object);
5034 vm_map_unlock(kernel_map);
5035
5036 assert(vm_paging_base_address == 0);
5037 vm_paging_base_address = page_map_offset;
5038 }
5039
5040 /*
5041 * ENCRYPTED SWAP:
5042 * vm_paging_map_object:
5043 * Maps part of a VM object's pages in the kernel
5044 * virtual address space, using the pre-allocated
5045 * kernel virtual addresses, if possible.
5046 * Context:
5047 * The VM object is locked. This lock will get
5048 * dropped and re-acquired though, so the caller
5049 * must make sure the VM object is kept alive
5050 * (by holding a VM map that has a reference
5051 * on it, for example, or taking an extra reference).
5052 * The page should also be kept busy to prevent
5053 * it from being reclaimed.
5054 */
5055 kern_return_t
5056 vm_paging_map_object(
5057 vm_map_offset_t *address,
5058 vm_page_t page,
5059 vm_object_t object,
5060 vm_object_offset_t offset,
5061 vm_map_size_t *size,
5062 boolean_t can_unlock_object)
5063 {
5064 kern_return_t kr;
5065 vm_map_offset_t page_map_offset;
5066 vm_map_size_t map_size;
5067 vm_object_offset_t object_offset;
5068 int i;
5069
5070
5071 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5072 assert(page->busy);
5073 /*
5074 * Use one of the pre-allocated kernel virtual addresses
5075 * and just enter the VM page in the kernel address space
5076 * at that virtual address.
5077 */
5078 simple_lock(&vm_paging_lock);
5079
5080 /*
5081 * Try and find an available kernel virtual address
5082 * from our pre-allocated pool.
5083 */
5084 page_map_offset = 0;
5085 for (;;) {
5086 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5087 if (vm_paging_page_inuse[i] == FALSE) {
5088 page_map_offset =
5089 vm_paging_base_address +
5090 (i * PAGE_SIZE);
5091 break;
5092 }
5093 }
5094 if (page_map_offset != 0) {
5095 /* found a space to map our page ! */
5096 break;
5097 }
5098
5099 if (can_unlock_object) {
5100 /*
5101 * If we can afford to unlock the VM object,
5102 * let's take the slow path now...
5103 */
5104 break;
5105 }
5106 /*
5107 * We can't afford to unlock the VM object, so
5108 * let's wait for a space to become available...
5109 */
5110 vm_paging_page_waiter_total++;
5111 vm_paging_page_waiter++;
5112 thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
5113 &vm_paging_lock,
5114 THREAD_UNINT);
5115 vm_paging_page_waiter--;
5116 /* ... and try again */
5117 }
5118
5119 if (page_map_offset != 0) {
5120 /*
5121 * We found a kernel virtual address;
5122 * map the physical page to that virtual address.
5123 */
5124 if (i > vm_paging_max_index) {
5125 vm_paging_max_index = i;
5126 }
5127 vm_paging_page_inuse[i] = TRUE;
5128 simple_unlock(&vm_paging_lock);
5129
5130 if (page->pmapped == FALSE) {
5131 pmap_sync_page_data_phys(page->phys_page);
5132 }
5133 page->pmapped = TRUE;
5134
5135 /*
5136 * Keep the VM object locked over the PMAP_ENTER
5137 * and the actual use of the page by the kernel,
5138 * or this pmap mapping might get undone by a
5139 * vm_object_pmap_protect() call...
5140 */
5141 PMAP_ENTER(kernel_pmap,
5142 page_map_offset,
5143 page,
5144 VM_PROT_DEFAULT,
5145 ((int) page->object->wimg_bits &
5146 VM_WIMG_MASK),
5147 TRUE);
5148 vm_paging_objects_mapped++;
5149 vm_paging_pages_mapped++;
5150 *address = page_map_offset;
5151
5152 /* all done and mapped, ready to use ! */
5153 return KERN_SUCCESS;
5154 }
5155
5156 /*
5157 * We ran out of pre-allocated kernel virtual
5158 * addresses. Just map the page in the kernel
5159 * the slow and regular way.
5160 */
5161 vm_paging_no_kernel_page++;
5162 simple_unlock(&vm_paging_lock);
5163 }
5164
5165 if (! can_unlock_object) {
5166 return KERN_NOT_SUPPORTED;
5167 }
5168
5169 object_offset = vm_object_trunc_page(offset);
5170 map_size = vm_map_round_page(*size);
5171
5172 /*
5173 * Try and map the required range of the object
5174 * in the kernel_map
5175 */
5176
5177 vm_object_reference_locked(object); /* for the map entry */
5178 vm_object_unlock(object);
5179
5180 kr = vm_map_enter(kernel_map,
5181 address,
5182 map_size,
5183 0,
5184 VM_FLAGS_ANYWHERE,
5185 object,
5186 object_offset,
5187 FALSE,
5188 VM_PROT_DEFAULT,
5189 VM_PROT_ALL,
5190 VM_INHERIT_NONE);
5191 if (kr != KERN_SUCCESS) {
5192 *address = 0;
5193 *size = 0;
5194 vm_object_deallocate(object); /* for the map entry */
5195 vm_object_lock(object);
5196 return kr;
5197 }
5198
5199 *size = map_size;
5200
5201 /*
5202 * Enter the mapped pages in the page table now.
5203 */
5204 vm_object_lock(object);
5205 /*
5206 * VM object must be kept locked from before PMAP_ENTER()
5207 * until after the kernel is done accessing the page(s).
5208 * Otherwise, the pmap mappings in the kernel could be
5209 * undone by a call to vm_object_pmap_protect().
5210 */
5211
5212 for (page_map_offset = 0;
5213 map_size != 0;
5214 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5215 unsigned int cache_attr;
5216
5217 page = vm_page_lookup(object, offset + page_map_offset);
5218 if (page == VM_PAGE_NULL) {
5219 printf("vm_paging_map_object: no page !?");
5220 vm_object_unlock(object);
5221 kr = vm_map_remove(kernel_map, *address, *size,
5222 VM_MAP_NO_FLAGS);
5223 assert(kr == KERN_SUCCESS);
5224 *address = 0;
5225 *size = 0;
5226 vm_object_lock(object);
5227 return KERN_MEMORY_ERROR;
5228 }
5229 if (page->pmapped == FALSE) {
5230 pmap_sync_page_data_phys(page->phys_page);
5231 }
5232 page->pmapped = TRUE;
5233 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5234
5235 //assert(pmap_verify_free(page->phys_page));
5236 PMAP_ENTER(kernel_pmap,
5237 *address + page_map_offset,
5238 page,
5239 VM_PROT_DEFAULT,
5240 cache_attr,
5241 TRUE);
5242 }
5243
5244 vm_paging_objects_mapped_slow++;
5245 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5246
5247 return KERN_SUCCESS;
5248 }
5249
5250 /*
5251 * ENCRYPTED SWAP:
5252 * vm_paging_unmap_object:
5253 * Unmaps part of a VM object's pages from the kernel
5254 * virtual address space.
5255 * Context:
5256 * The VM object is locked. This lock will get
5257 * dropped and re-acquired though.
5258 */
5259 void
5260 vm_paging_unmap_object(
5261 vm_object_t object,
5262 vm_map_offset_t start,
5263 vm_map_offset_t end)
5264 {
5265 kern_return_t kr;
5266 int i;
5267
5268 if ((vm_paging_base_address == 0) ||
5269 (start < vm_paging_base_address) ||
5270 (end > (vm_paging_base_address
5271 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5272 /*
5273 * We didn't use our pre-allocated pool of
5274 * kernel virtual addresses. Deallocate the
5275 * virtual memory.
5276 */
5277 if (object != VM_OBJECT_NULL) {
5278 vm_object_unlock(object);
5279 }
5280 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5281 if (object != VM_OBJECT_NULL) {
5282 vm_object_lock(object);
5283 }
5284 assert(kr == KERN_SUCCESS);
5285 } else {
5286 /*
5287 * We used a kernel virtual address from our
5288 * pre-allocated pool. Put it back in the pool
5289 * for next time.
5290 */
5291 assert(end - start == PAGE_SIZE);
5292 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5293
5294 /* undo the pmap mapping */
5295 pmap_remove(kernel_pmap, start, end);
5296
5297 simple_lock(&vm_paging_lock);
5298 vm_paging_page_inuse[i] = FALSE;
5299 if (vm_paging_page_waiter) {
5300 thread_wakeup(&vm_paging_page_waiter);
5301 }
5302 simple_unlock(&vm_paging_lock);
5303 }
5304 }
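/*
 * Usage sketch (illustrative only; the helper name is hypothetical) of
 * the map/use/unmap pairing documented above and used by
 * vm_page_encrypt()/vm_page_decrypt() below.  The caller holds the
 * object lock and keeps the page busy; both routines may drop and
 * retake that lock.
 */
#if 0	/* example only -- never compiled */
static void
vm_paging_map_usage_example(vm_object_t object, vm_page_t page)
{
	vm_map_offset_t	kernel_addr = 0;
	vm_map_size_t	map_size = PAGE_SIZE;
	kern_return_t	kr;

	/* object locked, page busy, paging-in-progress reference held */
	kr = vm_paging_map_object(&kernel_addr, page, object,
				  page->offset, &map_size, FALSE);
	if (kr != KERN_SUCCESS)
		return;

	/* ... touch the page's contents through kernel_addr ... */

	vm_paging_unmap_object(object, kernel_addr, kernel_addr + map_size);
}
#endif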
5305
5306 #if CRYPTO
5307 /*
5308 * Encryption data.
5309 * "iv" is the "initial vector". Ideally, we want to
5310 * have a different one for each page we encrypt, so that
5311 * crackers can't find encryption patterns too easily.
5312 */
5313 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5314 boolean_t swap_crypt_ctx_initialized = FALSE;
5315 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5316 aes_ctx swap_crypt_ctx;
5317 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5318
5319 #if DEBUG
5320 boolean_t swap_crypt_ctx_tested = FALSE;
5321 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5322 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5323 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5324 #endif /* DEBUG */
5325
5326 extern u_long random(void);
5327
5328 /*
5329 * Initialize the encryption context: key and key size.
5330 */
5331 void swap_crypt_ctx_initialize(void); /* forward */
5332 void
5333 swap_crypt_ctx_initialize(void)
5334 {
5335 unsigned int i;
5336
5337 /*
5338 * No need for locking to protect swap_crypt_ctx_initialized
5339 * because the first use of encryption will come from the
5340 * pageout thread (we won't pagein before there's been a pageout)
5341 * and there's only one pageout thread.
5342 */
5343 if (swap_crypt_ctx_initialized == FALSE) {
5344 for (i = 0;
5345 i < (sizeof (swap_crypt_key) /
5346 sizeof (swap_crypt_key[0]));
5347 i++) {
5348 swap_crypt_key[i] = random();
5349 }
5350 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5351 SWAP_CRYPT_AES_KEY_SIZE,
5352 &swap_crypt_ctx.encrypt);
5353 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5354 SWAP_CRYPT_AES_KEY_SIZE,
5355 &swap_crypt_ctx.decrypt);
5356 swap_crypt_ctx_initialized = TRUE;
5357 }
5358
5359 #if DEBUG
5360 /*
5361 * Validate the encryption algorithms.
5362 */
5363 if (swap_crypt_ctx_tested == FALSE) {
5364 /* initialize */
5365 for (i = 0; i < 4096; i++) {
5366 swap_crypt_test_page_ref[i] = (char) i;
5367 }
5368 /* encrypt */
5369 aes_encrypt_cbc(swap_crypt_test_page_ref,
5370 swap_crypt_null_iv,
5371 PAGE_SIZE / AES_BLOCK_SIZE,
5372 swap_crypt_test_page_encrypt,
5373 &swap_crypt_ctx.encrypt);
5374 /* decrypt */
5375 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5376 swap_crypt_null_iv,
5377 PAGE_SIZE / AES_BLOCK_SIZE,
5378 swap_crypt_test_page_decrypt,
5379 &swap_crypt_ctx.decrypt);
5380 /* compare result with original */
5381 for (i = 0; i < 4096; i ++) {
5382 if (swap_crypt_test_page_decrypt[i] !=
5383 swap_crypt_test_page_ref[i]) {
5384 panic("encryption test failed");
5385 }
5386 }
5387
5388 /* encrypt again */
5389 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5390 swap_crypt_null_iv,
5391 PAGE_SIZE / AES_BLOCK_SIZE,
5392 swap_crypt_test_page_decrypt,
5393 &swap_crypt_ctx.encrypt);
5394 /* decrypt in place */
5395 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5396 swap_crypt_null_iv,
5397 PAGE_SIZE / AES_BLOCK_SIZE,
5398 swap_crypt_test_page_decrypt,
5399 &swap_crypt_ctx.decrypt);
5400 for (i = 0; i < 4096; i ++) {
5401 if (swap_crypt_test_page_decrypt[i] !=
5402 swap_crypt_test_page_ref[i]) {
5403 panic("in place encryption test failed");
5404 }
5405 }
5406
5407 swap_crypt_ctx_tested = TRUE;
5408 }
5409 #endif /* DEBUG */
5410 }
5411
5412 /*
5413 * ENCRYPTED SWAP:
5414 * vm_page_encrypt:
5415 * Encrypt the given page, for secure paging.
5416 * The page might already be mapped at kernel virtual
5417 * address "kernel_mapping_offset". Otherwise, we need
5418 * to map it.
5419 *
5420 * Context:
5421 * The page's object is locked, but this lock will be released
5422 * and re-acquired.
5423 * The page is busy and not accessible by users (not entered in any pmap).
5424 */
5425 void
5426 vm_page_encrypt(
5427 vm_page_t page,
5428 vm_map_offset_t kernel_mapping_offset)
5429 {
5430 kern_return_t kr;
5431 vm_map_size_t kernel_mapping_size;
5432 vm_offset_t kernel_vaddr;
5433 union {
5434 unsigned char aes_iv[AES_BLOCK_SIZE];
5435 struct {
5436 memory_object_t pager_object;
5437 vm_object_offset_t paging_offset;
5438 } vm;
5439 } encrypt_iv;
5440
5441 if (! vm_pages_encrypted) {
5442 vm_pages_encrypted = TRUE;
5443 }
5444
5445 assert(page->busy);
5446 assert(page->dirty || page->precious);
5447
5448 if (page->encrypted) {
5449 /*
5450 * Already encrypted: no need to do it again.
5451 */
5452 vm_page_encrypt_already_encrypted_counter++;
5453 return;
5454 }
5455 ASSERT_PAGE_DECRYPTED(page);
5456
5457 /*
5458 * Take a paging-in-progress reference to keep the object
5459 * alive even if we have to unlock it (in vm_paging_map_object()
5460 * for example)...
5461 */
5462 vm_object_paging_begin(page->object);
5463
5464 if (kernel_mapping_offset == 0) {
5465 /*
5466 * The page hasn't already been mapped in kernel space
5467 * by the caller. Map it now, so that we can access
5468 * its contents and encrypt them.
5469 */
5470 kernel_mapping_size = PAGE_SIZE;
5471 kr = vm_paging_map_object(&kernel_mapping_offset,
5472 page,
5473 page->object,
5474 page->offset,
5475 &kernel_mapping_size,
5476 FALSE);
5477 if (kr != KERN_SUCCESS) {
5478 panic("vm_page_encrypt: "
5479 "could not map page in kernel: 0x%x\n",
5480 kr);
5481 }
5482 } else {
5483 kernel_mapping_size = 0;
5484 }
5485 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5486
5487 if (swap_crypt_ctx_initialized == FALSE) {
5488 swap_crypt_ctx_initialize();
5489 }
5490 assert(swap_crypt_ctx_initialized);
5491
5492 /*
5493 * Prepare an "initial vector" for the encryption.
5494 * We use the "pager" and the "paging_offset" for that
5495 * page to obfuscate the encrypted data a bit more and
5496 * prevent crackers from finding patterns that they could
5497 * use to break the key.
5498 */
5499 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5500 encrypt_iv.vm.pager_object = page->object->pager;
5501 encrypt_iv.vm.paging_offset =
5502 page->object->paging_offset + page->offset;
5503
5504 /* encrypt the "initial vector" */
5505 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5506 swap_crypt_null_iv,
5507 1,
5508 &encrypt_iv.aes_iv[0],
5509 &swap_crypt_ctx.encrypt);
5510
5511 /*
5512 * Encrypt the page.
5513 */
5514 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5515 &encrypt_iv.aes_iv[0],
5516 PAGE_SIZE / AES_BLOCK_SIZE,
5517 (unsigned char *) kernel_vaddr,
5518 &swap_crypt_ctx.encrypt);
5519
5520 vm_page_encrypt_counter++;
5521
5522 /*
5523 * Unmap the page from the kernel's address space,
5524 * if we had to map it ourselves. Otherwise, let
5525 * the caller undo the mapping if needed.
5526 */
5527 if (kernel_mapping_size != 0) {
5528 vm_paging_unmap_object(page->object,
5529 kernel_mapping_offset,
5530 kernel_mapping_offset + kernel_mapping_size);
5531 }
5532
5533 /*
5534 * Clear the "reference" and "modified" bits.
5535 * This should clean up any impact the encryption had
5536 * on them.
5537 * The page was kept busy and disconnected from all pmaps,
5538 * so it can't have been referenced or modified from user
5539 * space.
5540 * The software bits will be reset later after the I/O
5541 * has completed (in upl_commit_range()).
5542 */
5543 pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
5544
5545 page->encrypted = TRUE;
5546
5547 vm_object_paging_end(page->object);
5548 }
5549
5550 /*
5551 * ENCRYPTED SWAP:
5552 * vm_page_decrypt:
5553 * Decrypt the given page.
5554 * The page might already be mapped at kernel virtual
5555 * address "kernel_mapping_offset". Otherwise, we need
5556 * to map it.
5557 *
5558 * Context:
5559 * The page's VM object is locked but will be unlocked and relocked.
5560 * The page is busy and not accessible by users (not entered in any pmap).
5561 */
5562 void
5563 vm_page_decrypt(
5564 vm_page_t page,
5565 vm_map_offset_t kernel_mapping_offset)
5566 {
5567 kern_return_t kr;
5568 vm_map_size_t kernel_mapping_size;
5569 vm_offset_t kernel_vaddr;
5570 union {
5571 unsigned char aes_iv[AES_BLOCK_SIZE];
5572 struct {
5573 memory_object_t pager_object;
5574 vm_object_offset_t paging_offset;
5575 } vm;
5576 } decrypt_iv;
5577
5578 assert(page->busy);
5579 assert(page->encrypted);
5580
5581 /*
5582 * Take a paging-in-progress reference to keep the object
5583 * alive even if we have to unlock it (in vm_paging_map_object()
5584 * for example)...
5585 */
5586 vm_object_paging_begin(page->object);
5587
5588 if (kernel_mapping_offset == 0) {
5589 /*
5590 * The page hasn't already been mapped in kernel space
5591 * by the caller. Map it now, so that we can access
5592 * its contents and decrypt them.
5593 */
5594 kernel_mapping_size = PAGE_SIZE;
5595 kr = vm_paging_map_object(&kernel_mapping_offset,
5596 page,
5597 page->object,
5598 page->offset,
5599 &kernel_mapping_size,
5600 FALSE);
5601 if (kr != KERN_SUCCESS) {
5602 panic("vm_page_decrypt: "
5603 "could not map page in kernel: 0x%x\n",
5604 kr);
5605 }
5606 } else {
5607 kernel_mapping_size = 0;
5608 }
5609 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5610
5611 assert(swap_crypt_ctx_initialized);
5612
5613 /*
5614 * Prepare an "initial vector" for the decryption.
5615 * It has to be the same as the "initial vector" we
5616 * used to encrypt that page.
5617 */
5618 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5619 decrypt_iv.vm.pager_object = page->object->pager;
5620 decrypt_iv.vm.paging_offset =
5621 page->object->paging_offset + page->offset;
5622
5623 /* encrypt the "initial vector" */
5624 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5625 swap_crypt_null_iv,
5626 1,
5627 &decrypt_iv.aes_iv[0],
5628 &swap_crypt_ctx.encrypt);
5629
5630 /*
5631 * Decrypt the page.
5632 */
5633 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5634 &decrypt_iv.aes_iv[0],
5635 PAGE_SIZE / AES_BLOCK_SIZE,
5636 (unsigned char *) kernel_vaddr,
5637 &swap_crypt_ctx.decrypt);
5638 vm_page_decrypt_counter++;
5639
5640 /*
5641 * Unmap the page from the kernel's address space,
5642 * if we had to map it ourselves. Otherwise, let
5643 * the caller undo the mapping if needed.
5644 */
5645 if (kernel_mapping_size != 0) {
5646 vm_paging_unmap_object(page->object,
5647 kernel_vaddr,
5648 kernel_vaddr + PAGE_SIZE);
5649 }
5650
5651 /*
5652 * After decryption, the page is actually clean.
5653 * It was encrypted as part of paging, which "cleans"
5654 * the "dirty" pages.
5655 * No one could access it after it was encrypted
5656 * and the decryption doesn't count.
5657 */
5658 page->dirty = FALSE;
5659 pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
5660
5661 page->encrypted = FALSE;
5662
5663 /*
5664 * We've just modified the page's contents via the data cache and part
5665 * of the new contents might still be in the cache and not yet in RAM.
5666 * Since the page is now available and might get gathered in a UPL to
5667 * be part of a DMA transfer from a driver that expects the memory to
5668 * be coherent at this point, we have to flush the data cache.
5669 */
5670 pmap_sync_page_attributes_phys(page->phys_page);
5671 /*
5672 * Since the page is not mapped yet, some code might assume that it
5673 * doesn't need to invalidate the instruction cache when writing to
5674 * that page. That code relies on "pmapped" being FALSE, so that the
5675 * caches get synchronized when the page is first mapped.
5676 */
5677 assert(pmap_verify_free(page->phys_page));
5678 page->pmapped = FALSE;
5679
5680 vm_object_paging_end(page->object);
5681 }
5682
5683 unsigned long upl_encrypt_upls = 0;
5684 unsigned long upl_encrypt_pages = 0;
5685
5686 /*
5687 * ENCRYPTED SWAP:
5688 *
5689 * upl_encrypt:
5690 * Encrypts all the pages in the UPL, within the specified range.
5691 *
5692 */
5693 void
5694 upl_encrypt(
5695 upl_t upl,
5696 upl_offset_t crypt_offset,
5697 upl_size_t crypt_size)
5698 {
5699 upl_size_t upl_size;
5700 upl_offset_t upl_offset;
5701 vm_object_t upl_object;
5702 vm_page_t page;
5703 vm_object_t shadow_object;
5704 vm_object_offset_t shadow_offset;
5705 vm_object_offset_t paging_offset;
5706 vm_object_offset_t base_offset;
5707
5708 upl_encrypt_upls++;
5709 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5710
5711 upl_object = upl->map_object;
5712 upl_offset = upl->offset;
5713 upl_size = upl->size;
5714
5715 vm_object_lock(upl_object);
5716
5717 /*
5718 * Find the VM object that contains the actual pages.
5719 */
5720 if (upl_object->pageout) {
5721 shadow_object = upl_object->shadow;
5722 /*
5723 * The offset in the shadow object is actually also
5724 * accounted for in upl->offset. It possibly shouldn't be
5725 * this way, but for now don't account for it twice.
5726 */
5727 shadow_offset = 0;
5728 assert(upl_object->paging_offset == 0); /* XXX ? */
5729 vm_object_lock(shadow_object);
5730 } else {
5731 shadow_object = upl_object;
5732 shadow_offset = 0;
5733 }
5734
5735 paging_offset = shadow_object->paging_offset;
5736 vm_object_paging_begin(shadow_object);
5737
5738 if (shadow_object != upl_object)
5739 vm_object_unlock(upl_object);
5740
5741
5742 base_offset = shadow_offset;
5743 base_offset += upl_offset;
5744 base_offset += crypt_offset;
5745 base_offset -= paging_offset;
5746
5747 assert(crypt_offset + crypt_size <= upl_size);
5748
5749 for (upl_offset = 0;
5750 upl_offset < crypt_size;
5751 upl_offset += PAGE_SIZE) {
5752 page = vm_page_lookup(shadow_object,
5753 base_offset + upl_offset);
5754 if (page == VM_PAGE_NULL) {
5755 panic("upl_encrypt: "
5756 "no page for (obj=%p,off=%lld+%d)!\n",
5757 shadow_object,
5758 base_offset,
5759 upl_offset);
5760 }
5761 /*
5762 * Disconnect the page from all pmaps, so that nobody can
5763 * access it while it's encrypted. After that point, all
5764 * accesses to this page will cause a page fault and block
5765 * while the page is busy being encrypted. After the
5766 * encryption completes, any access will cause a
5767 * page fault and the page gets decrypted at that time.
5768 */
5769 pmap_disconnect(page->phys_page);
5770 vm_page_encrypt(page, 0);
5771
5772 if (shadow_object == vm_pageout_scan_wants_object) {
5773 /*
5774 * Give vm_pageout_scan() a chance to convert more
5775 * pages from "clean-in-place" to "clean-and-free",
5776 * if it's interested in the same pages we selected
5777 * in this cluster.
5778 */
5779 vm_object_unlock(shadow_object);
5780 vm_object_lock(shadow_object);
5781 }
5782 }
5783
5784 vm_object_paging_end(shadow_object);
5785 vm_object_unlock(shadow_object);
5786 }
5787
5788 #else /* CRYPTO */
5789 void
5790 upl_encrypt(
5791 __unused upl_t upl,
5792 __unused upl_offset_t crypt_offset,
5793 __unused upl_size_t crypt_size)
5794 {
5795 }
5796
5797 void
5798 vm_page_encrypt(
5799 __unused vm_page_t page,
5800 __unused vm_map_offset_t kernel_mapping_offset)
5801 {
5802 }
5803
5804 void
5805 vm_page_decrypt(
5806 __unused vm_page_t page,
5807 __unused vm_map_offset_t kernel_mapping_offset)
5808 {
5809 }
5810
5811 #endif /* CRYPTO */
5812
5813 vm_size_t
5814 upl_get_internal_pagelist_offset(void)
5815 {
5816 return sizeof(struct upl);
5817 }
5818
5819 void
5820 upl_clear_dirty(
5821 upl_t upl,
5822 boolean_t value)
5823 {
5824 if (value) {
5825 upl->flags |= UPL_CLEAR_DIRTY;
5826 } else {
5827 upl->flags &= ~UPL_CLEAR_DIRTY;
5828 }
5829 }
5830
5831
5832 #ifdef MACH_BSD
5833
5834 boolean_t upl_device_page(upl_page_info_t *upl)
5835 {
5836 return(UPL_DEVICE_PAGE(upl));
5837 }
5838 boolean_t upl_page_present(upl_page_info_t *upl, int index)
5839 {
5840 return(UPL_PAGE_PRESENT(upl, index));
5841 }
5842 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
5843 {
5844 return(UPL_SPECULATIVE_PAGE(upl, index));
5845 }
5846 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
5847 {
5848 return(UPL_DIRTY_PAGE(upl, index));
5849 }
5850 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
5851 {
5852 return(UPL_VALID_PAGE(upl, index));
5853 }
5854 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
5855 {
5856 return(UPL_PHYS_PAGE(upl, index));
5857 }
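
/*
 * Illustrative sketch only (never compiled):  the accessors above let
 * BSD-side consumers inspect a upl_page_info_t array without knowing
 * its layout.  The function name and the "pl"/"page_count" parameters
 * are hypothetical.
 */
#if 0
static void
example_scan_page_list(upl_page_info_t *pl, int page_count)
{
	int i;

	for (i = 0; i < page_count; i++) {
		if (!upl_page_present(pl, i))
			continue;		/* hole in the UPL */
		if (upl_dirty_page(pl, i)) {
			/* physical page number backing this slot */
			ppnum_t pnum = upl_phys_page(pl, i);

			(void) pnum;
		}
	}
}
#endif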
5858
5859
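/*
 * vm_countdirtypages:  debugging aid.  Walks the inactive, throttled and
 * zero-fill queues (reported as "IN Q") and the active queue (reported
 * as "AC Q") and prints how many pages on them are dirty, marked for
 * pageout, or precious.
 */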
5860 void
5861 vm_countdirtypages(void)
5862 {
5863 vm_page_t m;
5864 int dpages;
5865 int pgopages;
5866 int precpages;
5867
5868
5869 dpages = 0;
5870 pgopages = 0;
5871 precpages = 0;
5872
5873 vm_page_lock_queues();
5874 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
5875 do {
5876 if (m == (vm_page_t) 0) break;
5877
5878 if (m->dirty) dpages++;
5879 if (m->pageout) pgopages++;
5880 if (m->precious) precpages++;
5881
5882 assert(m->object != kernel_object);
5883 m = (vm_page_t) queue_next(&m->pageq);
5884 if (m == (vm_page_t) 0) break;
5885
5886 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
5887 vm_page_unlock_queues();
5888
5889 vm_page_lock_queues();
5890 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
5891 do {
5892 if (m == (vm_page_t) 0) break;
5893
5894 dpages++;
5895 assert(m->dirty);
5896 assert(!m->pageout);
5897 assert(m->object != kernel_object);
5898 m = (vm_page_t) queue_next(&m->pageq);
5899 if (m == (vm_page_t) 0) break;
5900
5901 } while (!queue_end(&vm_page_queue_throttled, (queue_entry_t) m));
5902 vm_page_unlock_queues();
5903
5904 vm_page_lock_queues();
5905 m = (vm_page_t) queue_first(&vm_page_queue_zf);
5906 do {
5907 if (m == (vm_page_t) 0) break;
5908
5909 if (m->dirty) dpages++;
5910 if (m->pageout) pgopages++;
5911 if (m->precious) precpages++;
5912
5913 assert(m->object != kernel_object);
5914 m = (vm_page_t) queue_next(&m->pageq);
5915 if (m == (vm_page_t) 0) break;
5916
5917 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
5918 vm_page_unlock_queues();
5919
5920 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
5921
5922 dpages = 0;
5923 pgopages = 0;
5924 precpages = 0;
5925
5926 vm_page_lock_queues();
5927 m = (vm_page_t) queue_first(&vm_page_queue_active);
5928
5929 do {
5930 if (m == (vm_page_t) 0) break;
5931 if (m->dirty) dpages++;
5932 if (m->pageout) pgopages++;
5933 if (m->precious) precpages++;
5934
5935 assert(m->object != kernel_object);
5936 m = (vm_page_t) queue_next(&m->pageq);
5937 if (m == (vm_page_t) 0) break;
5938
5939 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
5940 vm_page_unlock_queues();
5941
5942 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
5943
5944 }
5945 #endif /* MACH_BSD */
5946
5947 ppnum_t upl_get_highest_page(
5948 upl_t upl)
5949 {
5950 return upl->highest_page;
5951 }
5952
5953 #ifdef UPL_DEBUG
5954 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
5955 {
5956 upl->ubc_alias1 = alias1;
5957 upl->ubc_alias2 = alias2;
5958 return KERN_SUCCESS;
5959 }
5960 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
5961 {
5962 if (al)
5963 *al = upl->ubc_alias1;
5964 if (al2)
5965 *al2 = upl->ubc_alias2;
5966 return KERN_SUCCESS;
5967 }
5968 #endif /* UPL_DEBUG */
5969
5970
5971
5972 #if MACH_KDB
5973 #include <ddb/db_output.h>
5974 #include <ddb/db_print.h>
5975 #include <vm/vm_print.h>
5976
5977 #define printf kdbprintf
5978 void db_pageout(void);
5979
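/*
 * db_vm / db_pageout:  display routines for the built-in kernel
 * debugger (ddb).  They dump the global page counts and targets and
 * the pageout daemon's statistics counters.
 */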
5980 void
5981 db_vm(void)
5982 {
5983
5984 iprintf("VM Statistics:\n");
5985 db_indent += 2;
5986 iprintf("pages:\n");
5987 db_indent += 2;
5988 iprintf("activ %5d inact %5d free %5d",
5989 vm_page_active_count, vm_page_inactive_count,
5990 vm_page_free_count);
5991 printf(" wire %5d gobbl %5d\n",
5992 vm_page_wire_count, vm_page_gobble_count);
5993 db_indent -= 2;
5994 iprintf("target:\n");
5995 db_indent += 2;
5996 iprintf("min %5d inact %5d free %5d",
5997 vm_page_free_min, vm_page_inactive_target,
5998 vm_page_free_target);
5999 printf(" resrv %5d\n", vm_page_free_reserved);
6000 db_indent -= 2;
6001 iprintf("pause:\n");
6002 db_pageout();
6003 db_indent -= 2;
6004 }
6005
6006 #if MACH_COUNTERS
6007 extern int c_laundry_pages_freed;
6008 #endif /* MACH_COUNTERS */
6009
6010 void
6011 db_pageout(void)
6012 {
6013 iprintf("Pageout Statistics:\n");
6014 db_indent += 2;
6015 iprintf("active %5d inactv %5d\n",
6016 vm_pageout_active, vm_pageout_inactive);
6017 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6018 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6019 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6020 iprintf("used %5d clean %5d dirty %5d\n",
6021 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6022 vm_pageout_inactive_dirty);
6023 #if MACH_COUNTERS
6024 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6025 #endif /* MACH_COUNTERS */
6026 #if MACH_CLUSTER_STATS
6027 iprintf("Cluster Statistics:\n");
6028 db_indent += 2;
6029 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6030 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6031 vm_pageout_cluster_collisions);
6032 iprintf("clusters %5d conversions %5d\n",
6033 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6034 db_indent -= 2;
6035 iprintf("Target Statistics:\n");
6036 db_indent += 2;
6037 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6038 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6039 vm_pageout_target_page_freed);
6040 db_indent -= 2;
6041 #endif /* MACH_CLUSTER_STATS */
6042 db_indent -= 2;
6043 }
6044
6045 #endif /* MACH_KDB */