1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83 #include <mach/sdt.h>
84
85 #include <kern/kern_types.h>
86 #include <kern/counters.h>
87 #include <kern/host_statistics.h>
88 #include <kern/machine.h>
89 #include <kern/misc_protos.h>
90 #include <kern/sched.h>
91 #include <kern/thread.h>
92 #include <kern/xpr.h>
93 #include <kern/kalloc.h>
94
95 #include <machine/vm_tuning.h>
96 #include <machine/commpage.h>
97
98 #if CONFIG_EMBEDDED
99 #include <sys/kern_memorystatus.h>
100 #endif
101
102 #include <vm/pmap.h>
103 #include <vm/vm_fault.h>
104 #include <vm/vm_map.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_page.h>
107 #include <vm/vm_pageout.h>
108 #include <vm/vm_protos.h> /* must be last */
109 #include <vm/memory_object.h>
110 #include <vm/vm_purgeable_internal.h>
111
112 /*
113 * ENCRYPTED SWAP:
114 */
115 #include <../bsd/crypto/aes/aes.h>
116 extern u_int32_t random(void); /* from <libkern/libkern.h> */
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
122 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
123 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
124 #endif
125
126 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
127 #ifdef CONFIG_EMBEDDED
128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
129 #else
130 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
131 #endif
132 #endif
133
134 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
135 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
136 #endif
137
138 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
139 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
140 #endif
141
142 #ifndef VM_PAGE_LAUNDRY_MAX
143 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
144 #endif /* VM_PAGE_LAUNDRY_MAX */
145
146 #ifndef VM_PAGEOUT_BURST_WAIT
147 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
148 #endif /* VM_PAGEOUT_BURST_WAIT */
149
150 #ifndef VM_PAGEOUT_EMPTY_WAIT
151 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
152 #endif /* VM_PAGEOUT_EMPTY_WAIT */
153
154 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
155 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
156 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
157
158 #ifndef VM_PAGEOUT_IDLE_WAIT
159 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
160 #endif /* VM_PAGEOUT_IDLE_WAIT */
161
162 #ifndef VM_PAGE_SPECULATIVE_TARGET
163 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
164 #endif /* VM_PAGE_SPECULATIVE_TARGET */
165
166 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
167 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
168 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
169
170
171 /*
172 * To obtain a reasonable LRU approximation, the inactive queue
173 * needs to be large enough to give pages on it a chance to be
174 * referenced a second time. This macro defines the fraction
175 * of active+inactive pages that should be inactive.
176 * The pageout daemon uses it to update vm_page_inactive_target.
177 *
178 * If vm_page_free_count falls below vm_page_free_target and
179 * vm_page_inactive_count is below vm_page_inactive_target,
180 * then the pageout daemon starts running.
181 */
182
183 #ifndef VM_PAGE_INACTIVE_TARGET
184 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
185 #endif /* VM_PAGE_INACTIVE_TARGET */
186
187 /*
188 * Once the pageout daemon starts running, it keeps going
189 * until vm_page_free_count meets or exceeds vm_page_free_target.
190 */
191
192 #ifndef VM_PAGE_FREE_TARGET
193 #ifdef CONFIG_EMBEDDED
194 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
195 #else
196 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
197 #endif
198 #endif /* VM_PAGE_FREE_TARGET */
199
200 /*
201 * The pageout daemon always starts running once vm_page_free_count
202 * falls below vm_page_free_min.
203 */
204
205 #ifndef VM_PAGE_FREE_MIN
206 #ifdef CONFIG_EMBEDDED
207 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
208 #else
209 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
210 #endif
211 #endif /* VM_PAGE_FREE_MIN */
212
213 #define VM_PAGE_FREE_MIN_LIMIT 1500
214 #define VM_PAGE_FREE_TARGET_LIMIT 2000
215
216
217 /*
218 * When vm_page_free_count falls below vm_page_free_reserved,
219 * only vm-privileged threads can allocate pages. vm-privilege
220 * allows the pageout daemon and default pager (and any other
221 * associated threads needed for default pageout) to continue
222 * operation by dipping into the reserved pool of pages.
223 */
224
225 #ifndef VM_PAGE_FREE_RESERVED
226 #define VM_PAGE_FREE_RESERVED(n) \
227 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
228 #endif /* VM_PAGE_FREE_RESERVED */
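/*
 * Worked example (added for illustration, not from the original source):
 * assuming 4 KB pages and roughly 1 GB of pageable memory, i.e. about
 * 262144 pages fed to the macros above on a non-embedded configuration:
 *
 *	VM_PAGE_INACTIVE_TARGET(262144)    = 262144 / 3      = 87381 pages
 *	VM_PAGE_FREE_TARGET(262144)        = 15 + 262144/80  = 3291 pages
 *	VM_PAGE_FREE_MIN(262144)           = 10 + 262144/100 = 2631 pages
 *	VM_PAGE_FREE_RESERVED(n)           = 6 * 16 + n      = 96 + n pages
 *	VM_PAGE_SPECULATIVE_TARGET(262144) = 262144 / 20     = 13107 pages
 *
 * The actual arguments are computed at runtime from the live page counts;
 * the figures above are only meant to show the relative ordering
 * free_min < free_target << inactive_target.
 */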
229
230 /*
231 * When we dequeue pages from the inactive list, they are
232 * reactivated (i.e., put back on the active queue) if referenced.
233 * However, it is possible to starve the free list if other
234 * processors are referencing pages faster than we can turn off
235 * the referenced bit. So we limit the number of reactivations
236 * we will make per call of vm_pageout_scan().
237 */
238 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
239 #ifndef VM_PAGE_REACTIVATE_LIMIT
240 #ifdef CONFIG_EMBEDDED
241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
242 #else
243 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
244 #endif
245 #endif /* VM_PAGE_REACTIVATE_LIMIT */
246 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 100
247
248
249 /*
250 * must hold the page queues lock to
251 * manipulate this structure
252 */
253 struct vm_pageout_queue {
254 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
255 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
256 unsigned int pgo_maxlaundry;
257
258 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
259 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
260 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
261 :0;
262 };
263
264 #define VM_PAGE_Q_THROTTLED(q) \
265 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
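
/*
 * Illustrative sketch (added, not part of the original file): the throttle
 * handshake implied by VM_PAGE_Q_THROTTLED.  vm_pageout_scan() checks this
 * predicate on the internal queue and, when it is true, marks the queue
 * pgo_throttled and sleeps on &q->pgo_laundry; the pageout iothread later
 * drops pgo_laundry via vm_pageout_throttle_up() and issues the matching
 * thread_wakeup().  The hypothetical helper below merely restates the
 * macro as a function for readability.
 */
static inline boolean_t
vm_pageout_queue_throttled(struct vm_pageout_queue *q)
{
	return (q->pgo_laundry >= q->pgo_maxlaundry) ? TRUE : FALSE;
}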
266
267
268 /*
269 * Exported variable used to broadcast the activation of the pageout scan.
270 * Working Set uses this to throttle its use of pmap removes. In this
271 * way, code which runs within memory in an uncontested context does
272 * not keep encountering soft faults.
273 */
274
275 unsigned int vm_pageout_scan_event_counter = 0;
276
277 /*
278 * Forward declarations for internal routines.
279 */
280
281 static void vm_pageout_garbage_collect(int);
282 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
283 static void vm_pageout_iothread_external(void);
284 static void vm_pageout_iothread_internal(void);
285
286 extern void vm_pageout_continue(void);
287 extern void vm_pageout_scan(void);
288
289 static thread_t vm_pageout_external_iothread = THREAD_NULL;
290 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
291
292 unsigned int vm_pageout_reserved_internal = 0;
293 unsigned int vm_pageout_reserved_really = 0;
294
295 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
296 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
297 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
298 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
299 unsigned int vm_pageout_deadlock_relief = 0;
300 unsigned int vm_pageout_inactive_relief = 0;
301 unsigned int vm_pageout_burst_active_throttle = 0;
302 unsigned int vm_pageout_burst_inactive_throttle = 0;
303
304 /*
305 * Protection against zero fill flushing live working sets derived
306 * from existing backing store and files
307 */
308 unsigned int vm_accellerate_zf_pageout_trigger = 400;
309 unsigned int zf_queue_min_count = 100;
310 unsigned int vm_zf_queue_count = 0;
311
312 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
313 unsigned int vm_zf_count = 0;
314 #else
315 uint64_t vm_zf_count __attribute__((aligned(8))) = 0;
316 #endif
317
318 /*
319 * These variables record the pageout daemon's actions:
320 * how many pages it looks at and what happens to those pages.
321 * No locking needed because only one thread modifies the variables.
322 */
323
324 unsigned int vm_pageout_active = 0; /* debugging */
325 unsigned int vm_pageout_inactive = 0; /* debugging */
326 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
327 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
328 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
329 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
330 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
331 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
332 unsigned int vm_pageout_inactive_used = 0; /* debugging */
333 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
334 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
335 unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
336 unsigned int vm_pageout_inactive_zf = 0; /* debugging */
337 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
338 unsigned int vm_pageout_purged_objects = 0; /* debugging */
339 unsigned int vm_stat_discard = 0; /* debugging */
340 unsigned int vm_stat_discard_sent = 0; /* debugging */
341 unsigned int vm_stat_discard_failure = 0; /* debugging */
342 unsigned int vm_stat_discard_throttle = 0; /* debugging */
343 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
344 unsigned int vm_pageout_catch_ups = 0; /* debugging */
345 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
346
347 unsigned int vm_pageout_scan_active_throttled = 0;
348 unsigned int vm_pageout_scan_inactive_throttled = 0;
349 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
350 unsigned int vm_pageout_scan_throttle_aborted = 0; /* debugging */
351 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
352 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
353 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
354 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
355 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
356
357 unsigned int vm_page_speculative_count_drifts = 0;
358 unsigned int vm_page_speculative_count_drift_max = 0;
359
360 /*
361 * Backing store throttle when BS is exhausted
362 */
363 unsigned int vm_backing_store_low = 0;
364
365 unsigned int vm_pageout_out_of_line = 0;
366 unsigned int vm_pageout_in_place = 0;
367
368 unsigned int vm_page_steal_pageout_page = 0;
369
370 /*
371 * ENCRYPTED SWAP:
372 * counters and statistics...
373 */
374 unsigned long vm_page_decrypt_counter = 0;
375 unsigned long vm_page_decrypt_for_upl_counter = 0;
376 unsigned long vm_page_encrypt_counter = 0;
377 unsigned long vm_page_encrypt_abort_counter = 0;
378 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
379 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
380
381 struct vm_pageout_queue vm_pageout_queue_internal;
382 struct vm_pageout_queue vm_pageout_queue_external;
383
384 unsigned int vm_page_speculative_target = 0;
385
386 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
387
388 static boolean_t (* volatile consider_buffer_cache_collect)(void) = NULL;
389
390 #if DEVELOPMENT || DEBUG
391 unsigned long vm_cs_validated_resets = 0;
392 #endif
393
394 /*
395 * Routine: vm_backing_store_disable
396 * Purpose:
397 * Suspend non-privileged threads wishing to extend
398 * backing store when we are low on backing store
399 * (Synchronized by caller)
400 */
401 void
402 vm_backing_store_disable(
403 boolean_t disable)
404 {
405 if(disable) {
406 vm_backing_store_low = 1;
407 } else {
408 if(vm_backing_store_low) {
409 vm_backing_store_low = 0;
410 thread_wakeup((event_t) &vm_backing_store_low);
411 }
412 }
413 }
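
/*
 * Note (added, not in the original file): the matching waiters block on
 * the &vm_backing_store_low event (assert_wait()/thread_block()) from the
 * paths that try to extend backing store, and are released by the
 * thread_wakeup() above once backing store becomes available again.
 */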
414
415
416 #if MACH_CLUSTER_STATS
417 unsigned long vm_pageout_cluster_dirtied = 0;
418 unsigned long vm_pageout_cluster_cleaned = 0;
419 unsigned long vm_pageout_cluster_collisions = 0;
420 unsigned long vm_pageout_cluster_clusters = 0;
421 unsigned long vm_pageout_cluster_conversions = 0;
422 unsigned long vm_pageout_target_collisions = 0;
423 unsigned long vm_pageout_target_page_dirtied = 0;
424 unsigned long vm_pageout_target_page_freed = 0;
425 #define CLUSTER_STAT(clause) clause
426 #else /* MACH_CLUSTER_STATS */
427 #define CLUSTER_STAT(clause)
428 #endif /* MACH_CLUSTER_STATS */
429
430 /*
431 * Routine: vm_pageout_object_terminate
432 * Purpose:
433 * Destroy the pageout_object, and perform all of the
434 * required cleanup actions.
435 *
436 * In/Out conditions:
437 * The object must be locked, and will be returned locked.
438 */
439 void
440 vm_pageout_object_terminate(
441 vm_object_t object)
442 {
443 vm_object_t shadow_object;
444
445 /*
446 * Deal with the deallocation (last reference) of a pageout object
447 * (used for cleaning-in-place) by dropping the paging references/
448 * freeing pages in the original object.
449 */
450
451 assert(object->pageout);
452 shadow_object = object->shadow;
453 vm_object_lock(shadow_object);
454
455 while (!queue_empty(&object->memq)) {
456 vm_page_t p, m;
457 vm_object_offset_t offset;
458
459 p = (vm_page_t) queue_first(&object->memq);
460
461 assert(p->private);
462 assert(p->pageout);
463 p->pageout = FALSE;
464 assert(!p->cleaning);
465
466 offset = p->offset;
467 VM_PAGE_FREE(p);
468 p = VM_PAGE_NULL;
469
470 m = vm_page_lookup(shadow_object,
471 offset + object->shadow_offset);
472
473 if(m == VM_PAGE_NULL)
474 continue;
475 assert(m->cleaning);
476 /* used as a trigger on upl_commit etc to recognize the */
477 /* pageout daemon's subsequent desire to pageout a cleaning */
478 /* page. When the bit is on the upl commit code will */
479 /* respect the pageout bit in the target page over the */
480 /* caller's page list indication */
481 m->dump_cleaning = FALSE;
482
483 assert((m->dirty) || (m->precious) ||
484 (m->busy && m->cleaning));
485
486 /*
487 * Handle the trusted pager throttle.
488 * Also decrement the burst throttle (if external).
489 */
490 vm_page_lock_queues();
491 if (m->laundry) {
492 vm_pageout_throttle_up(m);
493 }
494
495 /*
496 * Handle the "target" page(s). These pages are to be freed if
497 * successfully cleaned. Target pages are always busy, and are
498 * wired exactly once. The initial target pages are not mapped
499 * (so they cannot be referenced or modified), but converted target
500 * pages may have been modified between their selection as
501 * adjacent pages and their conversion to targets.
502 */
503 if (m->pageout) {
504 assert(m->busy);
505 assert(m->wire_count == 1);
506 m->cleaning = FALSE;
507 m->encrypted_cleaning = FALSE;
508 m->pageout = FALSE;
509 #if MACH_CLUSTER_STATS
510 if (m->wanted) vm_pageout_target_collisions++;
511 #endif
512 /*
513 * Revoke all access to the page. Since the object is
514 * locked, and the page is busy, this prevents the page
515 * from being dirtied after the pmap_disconnect() call
516 * returns.
517 *
518 * Since the page is left "dirty" but "not modified", we
519 * can detect whether the page was redirtied during
520 * pageout by checking the modify state.
521 */
522 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
523 m->dirty = TRUE;
524 else
525 m->dirty = FALSE;
526
527 if (m->dirty) {
528 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
529 vm_page_unwire(m);/* reactivates */
530 VM_STAT_INCR(reactivations);
531 PAGE_WAKEUP_DONE(m);
532 } else {
533 CLUSTER_STAT(vm_pageout_target_page_freed++;)
534 vm_page_free(m);/* clears busy, etc. */
535 }
536 vm_page_unlock_queues();
537 continue;
538 }
539 /*
540 * Handle the "adjacent" pages. These pages were cleaned in
541 * place, and should be left alone.
542 * If prep_pin_count is nonzero, then someone is using the
543 * page, so make it active.
544 */
545 if (!m->active && !m->inactive && !m->throttled && !m->private) {
546 if (m->reference)
547 vm_page_activate(m);
548 else
549 vm_page_deactivate(m);
550 }
551 if((m->busy) && (m->cleaning)) {
552
553 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
554 m->busy = FALSE;
555
556 /* We do not re-set m->dirty ! */
557 /* The page was busy so no extraneous activity */
558 /* could have occurred. COPY_INTO is a read into the */
559 /* new pages. CLEAN_IN_PLACE does actually write */
560 /* out the pages but handling outside of this code */
561 /* will take care of resetting dirty. We clear the */
562 /* modify however for the Programmed I/O case. */
563 pmap_clear_modify(m->phys_page);
564
565 m->absent = FALSE;
566 m->overwriting = FALSE;
567 } else if (m->overwriting) {
568 /* alternate request page list, write to page_list */
569 /* case. Occurs when the original page was wired */
570 /* at the time of the list request */
571 assert(VM_PAGE_WIRED(m));
572 vm_page_unwire(m);/* reactivates */
573 m->overwriting = FALSE;
574 } else {
575 /*
576 * Set the dirty state according to whether or not the page was
577 * modified during the pageout. Note that we purposefully do
578 * NOT call pmap_clear_modify since the page is still mapped.
579 * If the page were to be dirtied between the two calls,
580 * this fact would be lost. This code is only necessary to
581 * maintain statistics, since the pmap module is always
582 * consulted if m->dirty is false.
583 */
584 #if MACH_CLUSTER_STATS
585 m->dirty = pmap_is_modified(m->phys_page);
586
587 if (m->dirty) vm_pageout_cluster_dirtied++;
588 else vm_pageout_cluster_cleaned++;
589 if (m->wanted) vm_pageout_cluster_collisions++;
590 #else
591 m->dirty = 0;
592 #endif
593 }
594 m->cleaning = FALSE;
595 m->encrypted_cleaning = FALSE;
596
597 /*
598 * Wakeup any thread waiting for the page to be un-cleaning.
599 */
600 PAGE_WAKEUP(m);
601 vm_page_unlock_queues();
602 }
603 /*
604 * Account for the paging reference taken in vm_paging_object_allocate.
605 */
606 vm_object_activity_end(shadow_object);
607 vm_object_unlock(shadow_object);
608
609 assert(object->ref_count == 0);
610 assert(object->paging_in_progress == 0);
611 assert(object->activity_in_progress == 0);
612 assert(object->resident_page_count == 0);
613 return;
614 }
615
616 /*
617 * Routine: vm_pageclean_setup
618 *
619 * Purpose: setup a page to be cleaned (made non-dirty), but not
620 * necessarily flushed from the VM page cache.
621 * This is accomplished by cleaning in place.
622 *
623 * The page must not be busy, and new_object
624 * must be locked.
625 *
626 */
627 void
628 vm_pageclean_setup(
629 vm_page_t m,
630 vm_page_t new_m,
631 vm_object_t new_object,
632 vm_object_offset_t new_offset)
633 {
634 assert(!m->busy);
635 #if 0
636 assert(!m->cleaning);
637 #endif
638
639 XPR(XPR_VM_PAGEOUT,
640 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
641 m->object, m->offset, m,
642 new_m, new_offset);
643
644 pmap_clear_modify(m->phys_page);
645
646 /*
647 * Mark original page as cleaning in place.
648 */
649 m->cleaning = TRUE;
650 m->dirty = TRUE;
651 m->precious = FALSE;
652
653 /*
654 * Convert the fictitious page to a private shadow of
655 * the real page.
656 */
657 assert(new_m->fictitious);
658 assert(new_m->phys_page == vm_page_fictitious_addr);
659 new_m->fictitious = FALSE;
660 new_m->private = TRUE;
661 new_m->pageout = TRUE;
662 new_m->phys_page = m->phys_page;
663
664 vm_page_lockspin_queues();
665 vm_page_wire(new_m);
666 vm_page_unlock_queues();
667
668 vm_page_insert(new_m, new_object, new_offset);
669 assert(!new_m->wanted);
670 new_m->busy = FALSE;
671 }
672
673 /*
674 * Routine: vm_pageout_initialize_page
675 * Purpose:
676 * Causes the specified page to be initialized in
677 * the appropriate memory object. This routine is used to push
678 * pages into a copy-object when they are modified in the
679 * permanent object.
680 *
681 * The page is moved to a temporary object and paged out.
682 *
683 * In/out conditions:
684 * The page in question must not be on any pageout queues.
685 * The object to which it belongs must be locked.
686 * The page must be busy, but not hold a paging reference.
687 *
688 * Implementation:
689 * Move this page to a completely new object.
690 */
691 void
692 vm_pageout_initialize_page(
693 vm_page_t m)
694 {
695 vm_object_t object;
696 vm_object_offset_t paging_offset;
697 vm_page_t holding_page;
698 memory_object_t pager;
699
700 XPR(XPR_VM_PAGEOUT,
701 "vm_pageout_initialize_page, page 0x%X\n",
702 m, 0, 0, 0, 0);
703 assert(m->busy);
704
705 /*
706 * Verify that we really want to clean this page
707 */
708 assert(!m->absent);
709 assert(!m->error);
710 assert(m->dirty);
711
712 /*
713 * Create a paging reference to let us play with the object.
714 */
715 object = m->object;
716 paging_offset = m->offset + object->paging_offset;
717
718 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
719 VM_PAGE_FREE(m);
720 panic("reservation without pageout?"); /* alan */
721 vm_object_unlock(object);
722
723 return;
724 }
725
726 /*
727 * If there's no pager, then we can't clean the page. This should
728 * never happen since this should be a copy object and therefore not
729 * an external object, so the pager should always be there.
730 */
731
732 pager = object->pager;
733
734 if (pager == MEMORY_OBJECT_NULL) {
735 VM_PAGE_FREE(m);
736 panic("missing pager for copy object");
737 return;
738 }
739
740 /* set the page for future call to vm_fault_list_request */
741 vm_object_paging_begin(object);
742 holding_page = NULL;
743
744 pmap_clear_modify(m->phys_page);
745 m->dirty = TRUE;
746 m->busy = TRUE;
747 m->list_req_pending = TRUE;
748 m->cleaning = TRUE;
749 m->pageout = TRUE;
750
751 vm_page_lockspin_queues();
752 vm_page_wire(m);
753 vm_page_unlock_queues();
754
755 vm_object_unlock(object);
756
757 /*
758 * Write the data to its pager.
759 * Note that the data is passed by naming the new object,
760 * not a virtual address; the pager interface has been
761 * manipulated to use the "internal memory" data type.
762 * [The object reference from its allocation is donated
763 * to the eventual recipient.]
764 */
765 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
766
767 vm_object_lock(object);
768 vm_object_paging_end(object);
769 }
770
771 #if MACH_CLUSTER_STATS
772 #define MAXCLUSTERPAGES 16
773 struct {
774 unsigned long pages_in_cluster;
775 unsigned long pages_at_higher_offsets;
776 unsigned long pages_at_lower_offsets;
777 } cluster_stats[MAXCLUSTERPAGES];
778 #endif /* MACH_CLUSTER_STATS */
779
780
781 /*
782 * vm_pageout_cluster:
783 *
784 * Given a page, queue it to the appropriate I/O thread,
785 * which will page it out and attempt to clean adjacent pages
786 * in the same operation.
787 *
788 * The page must be busy, and the object and queues locked. We will take a
789 * paging reference to prevent deallocation or collapse when we
790 * release the object lock back at the call site. The I/O thread
791 * is responsible for consuming this reference
792 *
793 * The page must not be on any pageout queue.
794 */
795
796 void
797 vm_pageout_cluster(vm_page_t m)
798 {
799 vm_object_t object = m->object;
800 struct vm_pageout_queue *q;
801
802
803 XPR(XPR_VM_PAGEOUT,
804 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
805 object, m->offset, m, 0, 0);
806
807 VM_PAGE_CHECK(m);
808
809 /*
810 * Only a certain kind of page is appreciated here.
811 */
812 assert(m->busy && (m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
813 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
814 assert(!m->throttled);
815
816 /*
817 * protect the object from collapse -
818 * locking in the object's paging_offset.
819 */
820 vm_object_paging_begin(object);
821
822 /*
823 * set the page for future call to vm_fault_list_request
824 * page should already be marked busy
825 */
826 vm_page_wire(m);
827 m->list_req_pending = TRUE;
828 m->cleaning = TRUE;
829 m->pageout = TRUE;
830
831 if (object->internal == TRUE)
832 q = &vm_pageout_queue_internal;
833 else
834 q = &vm_pageout_queue_external;
835
836 /*
837 * pgo_laundry count is tied to the laundry bit
838 */
839 m->laundry = TRUE;
840 q->pgo_laundry++;
841
842 m->pageout_queue = TRUE;
843 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
844
845 if (q->pgo_idle == TRUE) {
846 q->pgo_idle = FALSE;
847 thread_wakeup((event_t) &q->pgo_pending);
848 }
849
850 VM_PAGE_CHECK(m);
851 }
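
/*
 * Note (added, not in the original file): once queued on pgo_pending, the
 * page is picked up by the matching pageout iothread
 * (vm_pageout_iothread_internal/external via vm_pageout_iothread_continue),
 * which pages it out and eventually drops pgo_laundry through
 * vm_pageout_throttle_up() below.
 */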
852
853
854 unsigned long vm_pageout_throttle_up_count = 0;
855
856 /*
857 * A page is back from laundry or we are stealing it back from
858 * the laundering state. See if there are some pages waiting to
859 * go to laundry and if we can let some of them go now.
860 *
861 * Object and page queues must be locked.
862 */
863 void
864 vm_pageout_throttle_up(
865 vm_page_t m)
866 {
867 struct vm_pageout_queue *q;
868
869 assert(m->object != VM_OBJECT_NULL);
870 assert(m->object != kernel_object);
871
872 vm_pageout_throttle_up_count++;
873
874 if (m->object->internal == TRUE)
875 q = &vm_pageout_queue_internal;
876 else
877 q = &vm_pageout_queue_external;
878
879 if (m->pageout_queue == TRUE) {
880
881 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
882 m->pageout_queue = FALSE;
883
884 m->pageq.next = NULL;
885 m->pageq.prev = NULL;
886
887 vm_object_paging_end(m->object);
888 }
889
890 if ( m->laundry == TRUE ) {
891
892 m->laundry = FALSE;
893 q->pgo_laundry--;
894 if (q->pgo_throttled == TRUE) {
895 q->pgo_throttled = FALSE;
896 thread_wakeup((event_t) &q->pgo_laundry);
897 }
898 }
899 }
900
901
902 /*
903 * vm_pageout_scan does the dirty work for the pageout daemon.
904 * It returns with vm_page_queue_free_lock held and
905 * vm_page_free_wanted == 0.
906 */
907
908 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
909
910 #define FCS_IDLE 0
911 #define FCS_DELAYED 1
912 #define FCS_DEADLOCK_DETECTED 2
913
914 struct flow_control {
915 int state;
916 mach_timespec_t ts;
917 };
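
/*
 * Note (added, not in the original file): flow_control drives the
 * throttling state machine in vm_pageout_scan() below.  It starts in
 * FCS_IDLE; when the default-pager queue stays throttled it moves to
 * FCS_DELAYED and arms a vm_pageout_deadlock_wait timeout; if that timeout
 * expires while still throttled, the state becomes FCS_DEADLOCK_DETECTED
 * and a fixed number of pages (vm_pageout_deadlock_relief plus the current
 * waiters) is pushed out to break the suspected deadlock, after which the
 * timer is reset.
 */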
918
919
920 /*
921 * VM memory pressure monitoring.
922 *
923 * vm_pageout_scan() keeps track of the number of pages it considers and
924 * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
925 *
926 * compute_memory_pressure() is called every second from compute_averages()
927 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
928 * of reclaimed pages in a new vm_pageout_stats[] bucket.
929 *
930 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
931 * The caller provides the number of seconds ("nsecs") worth of statistics
932 * it wants, up to 30 seconds.
933 * It computes the number of pages reclaimed in the past "nsecs" seconds and
934 * also returns the number of pages the system still needs to reclaim at this
935 * moment in time.
936 */
937 #define VM_PAGEOUT_STAT_SIZE 31
938 struct vm_pageout_stat {
939 unsigned int considered;
940 unsigned int reclaimed;
941 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
942 unsigned int vm_pageout_stat_now = 0;
943 unsigned int vm_memory_pressure = 0;
944
945 #define VM_PAGEOUT_STAT_BEFORE(i) \
946 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
947 #define VM_PAGEOUT_STAT_AFTER(i) \
948 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
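
/*
 * Worked example (added for clarity): with VM_PAGEOUT_STAT_SIZE == 31 the
 * indices wrap around a one-entry-per-second ring buffer, e.g.
 * VM_PAGEOUT_STAT_BEFORE(0) == 30 and VM_PAGEOUT_STAT_AFTER(30) == 0, so
 * the 31 slots cover the "up to 30 seconds" of history that
 * mach_vm_pressure_monitor() can report (the remaining slot is the one
 * currently being filled).
 */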
949
950 /*
951 * Called from compute_averages().
952 */
953 void
954 compute_memory_pressure(
955 __unused void *arg)
956 {
957 unsigned int vm_pageout_next;
958
959 vm_memory_pressure =
960 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
961
962 commpage_set_memory_pressure( vm_memory_pressure );
963
964 /* move "now" forward */
965 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
966 vm_pageout_stats[vm_pageout_next].considered = 0;
967 vm_pageout_stats[vm_pageout_next].reclaimed = 0;
968 vm_pageout_stat_now = vm_pageout_next;
969 }
970
971 unsigned int
972 mach_vm_ctl_page_free_wanted(void)
973 {
974 unsigned int page_free_target, page_free_count, page_free_wanted;
975
976 page_free_target = vm_page_free_target;
977 page_free_count = vm_page_free_count;
978 if (page_free_target > page_free_count) {
979 page_free_wanted = page_free_target - page_free_count;
980 } else {
981 page_free_wanted = 0;
982 }
983
984 return page_free_wanted;
985 }
986
987 kern_return_t
988 mach_vm_pressure_monitor(
989 boolean_t wait_for_pressure,
990 unsigned int nsecs_monitored,
991 unsigned int *pages_reclaimed_p,
992 unsigned int *pages_wanted_p)
993 {
994 wait_result_t wr;
995 unsigned int vm_pageout_then, vm_pageout_now;
996 unsigned int pages_reclaimed;
997
998 /*
999 * We don't take the vm_page_queue_lock here because we don't want
1000 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1001 * thread when it's trying to reclaim memory. We don't need fully
1002 * accurate monitoring anyway...
1003 */
1004
1005 if (wait_for_pressure) {
1006 /* wait until there's memory pressure */
1007 while (vm_page_free_count >= vm_page_free_target) {
1008 wr = assert_wait((event_t) &vm_page_free_wanted,
1009 THREAD_INTERRUPTIBLE);
1010 if (wr == THREAD_WAITING) {
1011 wr = thread_block(THREAD_CONTINUE_NULL);
1012 }
1013 if (wr == THREAD_INTERRUPTED) {
1014 return KERN_ABORTED;
1015 }
1016 if (wr == THREAD_AWAKENED) {
1017 /*
1018 * The memory pressure might have already
1019 * been relieved but let's not block again
1020 * and let's report that there was memory
1021 * pressure at some point.
1022 */
1023 break;
1024 }
1025 }
1026 }
1027
1028 /* provide the number of pages the system wants to reclaim */
1029 if (pages_wanted_p != NULL) {
1030 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1031 }
1032
1033 if (pages_reclaimed_p == NULL) {
1034 return KERN_SUCCESS;
1035 }
1036
1037 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1038 do {
1039 vm_pageout_now = vm_pageout_stat_now;
1040 pages_reclaimed = 0;
1041 for (vm_pageout_then =
1042 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1043 vm_pageout_then != vm_pageout_now &&
1044 nsecs_monitored-- != 0;
1045 vm_pageout_then =
1046 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1047 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1048 }
1049 } while (vm_pageout_now != vm_pageout_stat_now);
1050 *pages_reclaimed_p = pages_reclaimed;
1051
1052 return KERN_SUCCESS;
1053 }
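
/*
 * Illustrative sketch (added, not part of the original file): a
 * hypothetical in-kernel caller polling the last 10 seconds of reclaim
 * activity without blocking for pressure.  The helper name is made up for
 * exposition only.
 */
static void __attribute__((unused))
vm_pressure_monitor_example(void)
{
	unsigned int pages_reclaimed = 0;
	unsigned int pages_wanted = 0;

	if (mach_vm_pressure_monitor(FALSE,	/* don't wait for pressure */
				     10,	/* seconds of history */
				     &pages_reclaimed,
				     &pages_wanted) == KERN_SUCCESS) {
		printf("vm_pressure: reclaimed %u pages in 10s, %u wanted\n",
		       pages_reclaimed, pages_wanted);
	}
}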
1054
1055 /* Page States: Used below to maintain the page state
1056 before it is removed from its queue. This saved state
1057 helps us do the right accounting in certain cases.
1058 */
1059
1060 #define PAGE_STATE_SPECULATIVE 1
1061 #define PAGE_STATE_THROTTLED 2
1062 #define PAGE_STATE_ZEROFILL 3
1063 #define PAGE_STATE_INACTIVE 4
1064
1065 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m) \
1066 MACRO_BEGIN \
1067 /* \
1068 * If a "reusable" page somehow made it back into \
1069 * the active queue, it's been re-used and is not \
1070 * quite re-usable. \
1071 * If the VM object was "all_reusable", consider it \
1072 * as "all re-used" instead of converting it to \
1073 * "partially re-used", which could be expensive. \
1074 */ \
1075 if ((m)->reusable || \
1076 (m)->object->all_reusable) { \
1077 vm_object_reuse_pages((m)->object, \
1078 (m)->offset, \
1079 (m)->offset + PAGE_SIZE_64, \
1080 FALSE); \
1081 } \
1082 MACRO_END
1083
1084 void
1085 vm_pageout_scan(void)
1086 {
1087 unsigned int loop_count = 0;
1088 unsigned int inactive_burst_count = 0;
1089 unsigned int active_burst_count = 0;
1090 unsigned int reactivated_this_call;
1091 unsigned int reactivate_limit;
1092 vm_page_t local_freeq = NULL;
1093 int local_freed = 0;
1094 int delayed_unlock;
1095 int refmod_state = 0;
1096 int vm_pageout_deadlock_target = 0;
1097 struct vm_pageout_queue *iq;
1098 struct vm_pageout_queue *eq;
1099 struct vm_speculative_age_q *sq;
1100 struct flow_control flow_control = { 0, { 0, 0 } };
1101 boolean_t inactive_throttled = FALSE;
1102 boolean_t try_failed;
1103 mach_timespec_t ts;
1104 unsigned int msecs = 0;
1105 vm_object_t object;
1106 vm_object_t last_object_tried;
1107 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
1108 unsigned int zf_ratio;
1109 unsigned int zf_run_count;
1110 #else
1111 uint64_t zf_ratio;
1112 uint64_t zf_run_count;
1113 #endif
1114 uint32_t catch_up_count = 0;
1115 uint32_t inactive_reclaim_run;
1116 boolean_t forced_reclaim;
1117 int page_prev_state = 0;
1118
1119 flow_control.state = FCS_IDLE;
1120 iq = &vm_pageout_queue_internal;
1121 eq = &vm_pageout_queue_external;
1122 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1123
1124
1125 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1126
1127
1128 vm_page_lock_queues();
1129 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
1130
1131 /*
1132 * Calculate the max number of referenced pages on the inactive
1133 * queue that we will reactivate.
1134 */
1135 reactivated_this_call = 0;
1136 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1137 vm_page_inactive_count);
1138 inactive_reclaim_run = 0;
1139
1140
1141 /*???*/ /*
1142 * We want to gradually dribble pages from the active queue
1143 * to the inactive queue. If we let the inactive queue get
1144 * very small, and then suddenly dump many pages into it,
1145 * those pages won't get a sufficient chance to be referenced
1146 * before we start taking them from the inactive queue.
1147 *
1148 * We must limit the rate at which we send pages to the pagers.
1149 * data_write messages consume memory, for message buffers and
1150 * for map-copy objects. If we get too far ahead of the pagers,
1151 * we can potentially run out of memory.
1152 *
1153 * We can use the laundry count to limit directly the number
1154 * of pages outstanding to the default pager. A similar
1155 * strategy for external pagers doesn't work, because
1156 * external pagers don't have to deallocate the pages sent them,
1157 * and because we might have to send pages to external pagers
1158 * even if they aren't processing writes. So we also
1159 * use a burst count to limit writes to external pagers.
1160 *
1161 * When memory is very tight, we can't rely on external pagers to
1162 * clean pages. They probably aren't running, because they
1163 * aren't vm-privileged. If we kept sending dirty pages to them,
1164 * we could exhaust the free list.
1165 */
1166
1167
1168 Restart:
1169 assert(delayed_unlock!=0);
1170
1171 /*
1172 * A page is "zero-filled" if it was not paged in from somewhere,
1173 * and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
1174 * Recalculate the zero-filled page ratio. We use this to apportion
1175 * victimized pages between the normal and zero-filled inactive
1176 * queues according to their relative abundance in memory. Thus if a task
1177 * is flooding memory with zf pages, we begin to hunt them down.
1178 * It would be better to throttle greedy tasks at a higher level,
1179 * but at the moment mach vm cannot do this.
1180 */
1181 {
1182 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
1183 uint32_t total = vm_page_active_count + vm_page_inactive_count;
1184 uint32_t normal = total - vm_zf_count;
1185 #else
1186 uint64_t total = vm_page_active_count + vm_page_inactive_count;
1187 uint64_t normal = total - vm_zf_count;
1188 #endif
1189
1190 /* zf_ratio is the number of zf pages we victimize per normal page */
1191
1192 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
1193 zf_ratio = 0;
1194 else if ((vm_zf_count <= normal) || (normal == 0))
1195 zf_ratio = 1;
1196 else
1197 zf_ratio = vm_zf_count / normal;
1198
1199 zf_run_count = 0;
1200 }
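	/*
	 * Worked example (added for clarity): with the default
	 * vm_accellerate_zf_pageout_trigger of 400, a system holding
	 * vm_zf_count == 3000 zero-fill pages and normal == 1000 other
	 * pages gets zf_ratio == 3, i.e. roughly three zero-fill victims
	 * are taken for every normal inactive page until the ratio is
	 * recomputed on the next pass through Restart.
	 */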
1201
1202 /*
1203 * Recalculate vm_page_inactive_target.
1204 */
1205 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1206 vm_page_inactive_count +
1207 vm_page_speculative_count);
1208 /*
1209 * don't want to wake the pageout_scan thread up every time we fall below
1210 * the targets... set a low water mark at 0.25% below the target
1211 */
1212 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1213
1214 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1215 vm_page_inactive_count);
1216 object = NULL;
1217 last_object_tried = NULL;
1218 try_failed = FALSE;
1219
1220 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1221 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1222 else
1223 catch_up_count = 0;
1224
1225 for (;;) {
1226 vm_page_t m;
1227
1228 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1229
1230 if (delayed_unlock == 0) {
1231 vm_page_lock_queues();
1232 delayed_unlock = 1;
1233 }
1234
1235 /*
1236 * Don't sweep through active queue more than the throttle
1237 * which should be kept relatively low
1238 */
1239 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1240 vm_page_active_count);
1241
1242 /*
1243 * Move pages from active to inactive.
1244 */
1245 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1246 goto done_moving_active_pages;
1247
1248 while (!queue_empty(&vm_page_queue_active) && active_burst_count) {
1249
1250 if (active_burst_count)
1251 active_burst_count--;
1252
1253 vm_pageout_active++;
1254
1255 m = (vm_page_t) queue_first(&vm_page_queue_active);
1256
1257 assert(m->active && !m->inactive);
1258 assert(!m->laundry);
1259 assert(m->object != kernel_object);
1260 assert(m->phys_page != vm_page_guard_addr);
1261
1262 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1263
1264 /*
1265 * Try to lock object; since we've already got the
1266 * page queues lock, we can only 'try' for this one.
1267 * if the 'try' fails, we need to do a mutex_pause
1268 * to allow the owner of the object lock a chance to
1269 * run... otherwise, we're likely to trip over this
1270 * object in the same state as we work our way through
1271 * the queue... clumps of pages associated with the same
1272 * object are fairly typical on the inactive and active queues
1273 */
1274 if (m->object != object) {
1275 if (object != NULL) {
1276 vm_object_unlock(object);
1277 object = NULL;
1278 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1279 }
1280 if (!vm_object_lock_try_scan(m->object)) {
1281 /*
1282 * move page to end of active queue and continue
1283 */
1284 queue_remove(&vm_page_queue_active, m,
1285 vm_page_t, pageq);
1286 queue_enter(&vm_page_queue_active, m,
1287 vm_page_t, pageq);
1288
1289 try_failed = TRUE;
1290
1291 m = (vm_page_t) queue_first(&vm_page_queue_active);
1292 /*
1293 * this is the next object we're going to be interested in
1294 * try to make sure it's available after the mutex_yield
1295 * returns control
1296 */
1297 vm_pageout_scan_wants_object = m->object;
1298
1299 goto done_with_activepage;
1300 }
1301 object = m->object;
1302
1303 try_failed = FALSE;
1304 }
1305
1306 /*
1307 * if the page is BUSY, then we pull it
1308 * off the active queue and leave it alone.
1309 * when BUSY is cleared, it will get stuck
1310 * back on the appropriate queue
1311 */
1312 if (m->busy) {
1313 queue_remove(&vm_page_queue_active, m,
1314 vm_page_t, pageq);
1315 m->pageq.next = NULL;
1316 m->pageq.prev = NULL;
1317
1318 if (!m->fictitious)
1319 vm_page_active_count--;
1320 m->active = FALSE;
1321
1322 goto done_with_activepage;
1323 }
1324
1325 /* deal with a rogue "reusable" page */
1326 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
1327
1328 /*
1329 * Deactivate the page while holding the object
1330 * locked, so we know the page is still not busy.
1331 * This should prevent races between pmap_enter
1332 * and pmap_clear_reference. The page might be
1333 * absent or fictitious, but vm_page_deactivate
1334 * can handle that.
1335 */
1336 vm_page_deactivate(m);
1337
1338 done_with_activepage:
1339 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1340
1341 if (object != NULL) {
1342 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1343 vm_object_unlock(object);
1344 object = NULL;
1345 }
1346 if (local_freeq) {
1347 vm_page_unlock_queues();
1348 vm_page_free_list(local_freeq, TRUE);
1349
1350 local_freeq = NULL;
1351 local_freed = 0;
1352 vm_page_lock_queues();
1353 } else
1354 lck_mtx_yield(&vm_page_queue_lock);
1355
1356 delayed_unlock = 1;
1357
1358 /*
1359 * continue the while loop processing
1360 * the active queue... need to hold
1361 * the page queues lock
1362 */
1363 }
1364 }
1365
1366
1367
1368 /**********************************************************************
1369 * above this point we're playing with the active queue
1370 * below this point we're playing with the throttling mechanisms
1371 * and the inactive queue
1372 **********************************************************************/
1373
1374 done_moving_active_pages:
1375
1376 /*
1377 * We are done if we have met our target *and*
1378 * nobody is still waiting for a page.
1379 */
1380 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1381 if (object != NULL) {
1382 vm_object_unlock(object);
1383 object = NULL;
1384 }
1385 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1386
1387 if (local_freeq) {
1388 vm_page_unlock_queues();
1389 vm_page_free_list(local_freeq, TRUE);
1390
1391 local_freeq = NULL;
1392 local_freed = 0;
1393 vm_page_lock_queues();
1394 }
1395 /*
1396 * inactive target still not met... keep going
1397 * until we get the queues balanced
1398 */
1399
1400 /*
1401 * Recalculate vm_page_inactive_target.
1402 */
1403 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1404 vm_page_inactive_count +
1405 vm_page_speculative_count);
1406
1407 #ifndef CONFIG_EMBEDDED
1408 /*
1409 * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying
1410 * to balance the queues
1411 */
1412 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1413 !queue_empty(&vm_page_queue_active))
1414 continue;
1415 #endif
1416
1417 lck_mtx_lock(&vm_page_queue_free_lock);
1418
1419 if ((vm_page_free_count >= vm_page_free_target) &&
1420 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1421
1422 vm_page_unlock_queues();
1423
1424 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1425
1426 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1427
1428 return;
1429 }
1430 lck_mtx_unlock(&vm_page_queue_free_lock);
1431 }
1432
1433 /*
1434 * Before anything, we check if we have any ripe volatile
1435 * objects around. If so, try to purge the first object.
1436 * If the purge fails, fall through to reclaim a page instead.
1437 * If the purge succeeds, go back to the top and reevaluate
1438 * the new memory situation.
1439 */
1440 assert (available_for_purge>=0);
1441 if (available_for_purge)
1442 {
1443 if (object != NULL) {
1444 vm_object_unlock(object);
1445 object = NULL;
1446 }
1447 if(TRUE == vm_purgeable_object_purge_one()) {
1448 continue;
1449 }
1450 }
1451
1452 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1453 /*
1454 * try to pull pages from the aging bins
1455 * see vm_page.h for an explanation of how
1456 * this mechanism works
1457 */
1458 struct vm_speculative_age_q *aq;
1459 mach_timespec_t ts_fully_aged;
1460 boolean_t can_steal = FALSE;
1461 int num_scanned_queues;
1462
1463 aq = &vm_page_queue_speculative[speculative_steal_index];
1464
1465 num_scanned_queues = 0;
1466 while (queue_empty(&aq->age_q) &&
1467 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1468
1469 speculative_steal_index++;
1470
1471 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1472 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1473
1474 aq = &vm_page_queue_speculative[speculative_steal_index];
1475 }
1476
1477 if (num_scanned_queues ==
1478 VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1479 /*
1480 * XXX We've scanned all the speculative
1481 * queues but still haven't found one
1482 * that is not empty, even though
1483 * vm_page_speculative_count is not 0.
1484 */
1485 /* report the anomaly... */
1486 printf("vm_pageout_scan: "
1487 "all speculative queues empty "
1488 "but count=%d. Re-adjusting.\n",
1489 vm_page_speculative_count);
1490 if (vm_page_speculative_count >
1491 vm_page_speculative_count_drift_max)
1492 vm_page_speculative_count_drift_max = vm_page_speculative_count;
1493 vm_page_speculative_count_drifts++;
1494 #if 6553678
1495 Debugger("vm_pageout_scan: no speculative pages");
1496 #endif
1497 /* readjust... */
1498 vm_page_speculative_count = 0;
1499 /* ... and continue */
1500 continue;
1501 }
1502
1503 if (vm_page_speculative_count > vm_page_speculative_target)
1504 can_steal = TRUE;
1505 else {
1506 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1507 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1508 * 1000 * NSEC_PER_USEC;
1509
1510 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1511
1512 clock_sec_t sec;
1513 clock_nsec_t nsec;
1514 clock_get_system_nanotime(&sec, &nsec);
1515 ts.tv_sec = (unsigned int) sec;
1516 ts.tv_nsec = nsec;
1517
1518 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1519 can_steal = TRUE;
1520 }
1521 if (can_steal == TRUE)
1522 vm_page_speculate_ageit(aq);
1523 }
1524
1525 /*
1526 * Sometimes we have to pause:
1527 * 1) No inactive pages - nothing to do.
1528 * 2) Flow control - default pageout queue is full
1529 * 3) Loop control - no acceptable pages found on the inactive queue
1530 * within the last vm_pageout_burst_inactive_throttle iterations
1531 */
1532 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1533 (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1534 vm_pageout_scan_empty_throttle++;
1535 msecs = vm_pageout_empty_wait;
1536 goto vm_pageout_scan_delay;
1537
1538 } else if (inactive_burst_count >=
1539 MIN(vm_pageout_burst_inactive_throttle,
1540 (vm_page_inactive_count +
1541 vm_page_speculative_count))) {
1542 vm_pageout_scan_burst_throttle++;
1543 msecs = vm_pageout_burst_wait;
1544 goto vm_pageout_scan_delay;
1545
1546 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1547 clock_sec_t sec;
1548 clock_nsec_t nsec;
1549
1550 switch (flow_control.state) {
1551
1552 case FCS_IDLE:
1553 reset_deadlock_timer:
1554 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1555 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1556 clock_get_system_nanotime(&sec, &nsec);
1557 flow_control.ts.tv_sec = (unsigned int) sec;
1558 flow_control.ts.tv_nsec = nsec;
1559 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1560
1561 flow_control.state = FCS_DELAYED;
1562 msecs = vm_pageout_deadlock_wait;
1563
1564 break;
1565
1566 case FCS_DELAYED:
1567 clock_get_system_nanotime(&sec, &nsec);
1568 ts.tv_sec = (unsigned int) sec;
1569 ts.tv_nsec = nsec;
1570
1571 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1572 /*
1573 * the pageout thread for the default pager is potentially
1574 * deadlocked since the
1575 * default pager queue has been throttled for more than the
1576 * allowable time... we need to move some clean pages or dirty
1577 * pages belonging to the external pagers if they aren't throttled
1578 * vm_page_free_wanted represents the number of threads currently
1579 * blocked waiting for pages... we'll move one page for each of
1580 * these plus a fixed amount to break the logjam... once we're done
1581 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1582 * with a new timeout target since we have no way of knowing
1583 * whether we've broken the deadlock except through observation
1584 * of the queue associated with the default pager... we need to
1585 * stop moving pages and allow the system to run to see what
1586 * state it settles into.
1587 */
1588 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1589 vm_pageout_scan_deadlock_detected++;
1590 flow_control.state = FCS_DEADLOCK_DETECTED;
1591
1592 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1593 goto consider_inactive;
1594 }
1595 /*
1596 * just resniff instead of trying
1597 * to compute a new delay time... we're going to be
1598 * awakened immediately upon a laundry completion,
1599 * so we won't wait any longer than necessary
1600 */
1601 msecs = vm_pageout_idle_wait;
1602 break;
1603
1604 case FCS_DEADLOCK_DETECTED:
1605 if (vm_pageout_deadlock_target)
1606 goto consider_inactive;
1607 goto reset_deadlock_timer;
1608
1609 }
1610 vm_pageout_scan_throttle++;
1611 iq->pgo_throttled = TRUE;
1612 vm_pageout_scan_delay:
1613 if (object != NULL) {
1614 vm_object_unlock(object);
1615 object = NULL;
1616 }
1617 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1618
1619 if (local_freeq) {
1620 vm_page_unlock_queues();
1621 vm_page_free_list(local_freeq, TRUE);
1622
1623 local_freeq = NULL;
1624 local_freed = 0;
1625 vm_page_lock_queues();
1626
1627 if (flow_control.state == FCS_DELAYED &&
1628 !VM_PAGE_Q_THROTTLED(iq)) {
1629 flow_control.state = FCS_IDLE;
1630 vm_pageout_scan_throttle_aborted++;
1631 goto consider_inactive;
1632 }
1633 }
1634 #if CONFIG_EMBEDDED
1635 {
1636 int percent_avail;
1637
1638 /*
1639 * Decide if we need to send a memory status notification.
1640 */
1641 percent_avail =
1642 (vm_page_active_count + vm_page_inactive_count +
1643 vm_page_speculative_count + vm_page_free_count +
1644 (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1645 atop_64(max_mem);
1646 if (percent_avail >= (kern_memorystatus_level + 5) ||
1647 percent_avail <= (kern_memorystatus_level - 5)) {
1648 kern_memorystatus_level = percent_avail;
1649 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1650 }
1651 }
1652 #endif
1653 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1654 counter(c_vm_pageout_scan_block++);
1655
1656 vm_page_unlock_queues();
1657
1658 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1659
1660 thread_block(THREAD_CONTINUE_NULL);
1661
1662 vm_page_lock_queues();
1663 delayed_unlock = 1;
1664
1665 iq->pgo_throttled = FALSE;
1666
1667 if (loop_count >= vm_page_inactive_count)
1668 loop_count = 0;
1669 inactive_burst_count = 0;
1670
1671 goto Restart;
1672 /*NOTREACHED*/
1673 }
1674
1675
1676 flow_control.state = FCS_IDLE;
1677 consider_inactive:
1678 loop_count++;
1679 inactive_burst_count++;
1680 vm_pageout_inactive++;
1681
1682 /* Choose a victim. */
1683
1684 while (1) {
1685 m = NULL;
1686
1687 if (IP_VALID(memory_manager_default)) {
1688 assert(vm_page_throttled_count == 0);
1689 assert(queue_empty(&vm_page_queue_throttled));
1690 }
1691
1692 /*
1693 * The most eligible pages are ones we paged in speculatively,
1694 * but which have not yet been touched.
1695 */
1696 if ( !queue_empty(&sq->age_q) ) {
1697 m = (vm_page_t) queue_first(&sq->age_q);
1698 break;
1699 }
1700 /*
1701 * Time for a zero-filled inactive page?
1702 */
1703 if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1704 queue_empty(&vm_page_queue_inactive)) {
1705 if ( !queue_empty(&vm_page_queue_zf) ) {
1706 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1707 zf_run_count++;
1708 break;
1709 }
1710 }
1711 /*
1712 * It's either a normal inactive page or nothing.
1713 */
1714 if ( !queue_empty(&vm_page_queue_inactive) ) {
1715 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1716 zf_run_count = 0;
1717 break;
1718 }
1719
1720 panic("vm_pageout: no victim");
1721 }
1722
1723 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1724 assert(!m->laundry);
1725 assert(m->object != kernel_object);
1726 assert(m->phys_page != vm_page_guard_addr);
1727
1728 if (!m->speculative) {
1729 vm_pageout_stats[vm_pageout_stat_now].considered++;
1730 }
1731
1732 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1733
1734 /*
1735 * check to see if we currently are working
1736 * with the same object... if so, we've
1737 * already got the lock
1738 */
1739 if (m->object != object) {
1740 /*
1741 * the object associated with candidate page is
1742 * different from the one we were just working
1743 * with... dump the lock if we still own it
1744 */
1745 if (object != NULL) {
1746 vm_object_unlock(object);
1747 object = NULL;
1748 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1749 }
1750 /*
1751 * Try to lock object; since we've already got the
1752 * page queues lock, we can only 'try' for this one.
1753 * if the 'try' fails, we need to do a mutex_pause
1754 * to allow the owner of the object lock a chance to
1755 * run... otherwise, we're likely to trip over this
1756 * object in the same state as we work our way through
1757 * the queue... clumps of pages associated with the same
1758 * object are fairly typical on the inactive and active queues
1759 */
1760 if (!vm_object_lock_try_scan(m->object)) {
1761 vm_pageout_inactive_nolock++;
1762
1763 requeue_page:
1764 /*
1765 * Move page to end and continue.
1766 * Don't re-issue ticket
1767 */
1768 if (m->zero_fill) {
1769 if (m->speculative) {
1770 panic("vm_pageout_scan(): page %p speculative and zero-fill !?\n", m);
1771 }
1772 assert(!m->speculative);
1773 queue_remove(&vm_page_queue_zf, m,
1774 vm_page_t, pageq);
1775 queue_enter(&vm_page_queue_zf, m,
1776 vm_page_t, pageq);
1777 } else if (m->speculative) {
1778 remque(&m->pageq);
1779 m->speculative = FALSE;
1780 vm_page_speculative_count--;
1781
1782 /*
1783 * move to the head of the inactive queue
1784 * to get it out of the way... the speculative
1785 * queue is generally too small to depend
1786 * on there being enough pages from other
1787 * objects to make cycling it back on the
1788 * same queue a winning proposition
1789 */
1790 queue_enter_first(&vm_page_queue_inactive, m,
1791 vm_page_t, pageq);
1792 m->inactive = TRUE;
1793 vm_page_inactive_count++;
1794 token_new_pagecount++;
1795 } else if (m->throttled) {
1796 queue_remove(&vm_page_queue_throttled, m,
1797 vm_page_t, pageq);
1798 m->throttled = FALSE;
1799 vm_page_throttled_count--;
1800
1801 /*
1802 * not throttled any more, so can stick
1803 * it on the inactive queue.
1804 */
1805 queue_enter(&vm_page_queue_inactive, m,
1806 vm_page_t, pageq);
1807 m->inactive = TRUE;
1808 vm_page_inactive_count++;
1809 token_new_pagecount++;
1810 } else {
1811 queue_remove(&vm_page_queue_inactive, m,
1812 vm_page_t, pageq);
1813 #if MACH_ASSERT
1814 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1815 #endif
1816 vm_purgeable_q_advance_all();
1817
1818 queue_enter(&vm_page_queue_inactive, m,
1819 vm_page_t, pageq);
1820 #if MACH_ASSERT
1821 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1822 #endif
1823 token_new_pagecount++;
1824 }
1825 pmap_clear_reference(m->phys_page);
1826 m->reference = FALSE;
1827
1828 if ( !queue_empty(&sq->age_q) )
1829 m = (vm_page_t) queue_first(&sq->age_q);
1830 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1831 queue_empty(&vm_page_queue_inactive)) {
1832 if ( !queue_empty(&vm_page_queue_zf) )
1833 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1834 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1835 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1836 }
1837 /*
1838 * this is the next object we're going to be interested in
1839 * try to make sure it's available after the mutex_yield
1840 * returns control
1841 */
1842 vm_pageout_scan_wants_object = m->object;
1843
1844 /*
1845 * force us to dump any collected free pages
1846 * and to pause before moving on
1847 */
1848 try_failed = TRUE;
1849
1850 goto done_with_inactivepage;
1851 }
1852 object = m->object;
1853 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1854
1855 try_failed = FALSE;
1856 }
1857
1858 /*
1859 * Paging out pages of external objects which
1860 * are currently being created must be avoided.
1861 * The pager may itself need to allocate memory, leading to a
1862 * possible deadlock between it and the pageout thread,
1863 * if such pages are chosen. The remaining assumption
1864 * is that there will eventually be enough available pages in the
1865 * inactive pool to page out in order to satisfy all memory
1866 * claimed by the thread which concurrently creates the pager.
1867 */
1868 if (!object->pager_initialized && object->pager_created) {
1869 /*
1870 * Move page to end and continue, hoping that
1871 * there will be enough other inactive pages to
1872 * page out so that the thread which currently
1873 * initializes the pager will succeed.
1874 * Don't re-grant the ticket, the page should
1875 * be pulled from the queue and paged out whenever
1876 * one of its logically adjacent fellows is
1877 * targeted.
1878 */
1879 vm_pageout_inactive_avoid++;
1880 goto requeue_page;
1881 }
1882 /*
1883 * Remove the page from its list.
1884 */
1885 if (m->speculative) {
1886 remque(&m->pageq);
1887 page_prev_state = PAGE_STATE_SPECULATIVE;
1888 m->speculative = FALSE;
1889 vm_page_speculative_count--;
1890 } else if (m->throttled) {
1891 queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1892 page_prev_state = PAGE_STATE_THROTTLED;
1893 m->throttled = FALSE;
1894 vm_page_throttled_count--;
1895 } else {
1896 if (m->zero_fill) {
1897 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1898 page_prev_state = PAGE_STATE_ZEROFILL;
1899 vm_zf_queue_count--;
1900 } else {
1901 page_prev_state = PAGE_STATE_INACTIVE;
1902 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1903 }
1904 m->inactive = FALSE;
1905 if (!m->fictitious)
1906 vm_page_inactive_count--;
1907 vm_purgeable_q_advance_all();
1908 }
1909
1910 m->pageq.next = NULL;
1911 m->pageq.prev = NULL;
1912
1913 if ( !m->fictitious && catch_up_count)
1914 catch_up_count--;
1915
1916 /*
1917 * ENCRYPTED SWAP:
1918 * if this page has already been picked up as part of a
1919 * page-out cluster, it will be busy because it is being
1920 * encrypted (see vm_object_upl_request()). But we still
1921 * want to demote it from "clean-in-place" (aka "adjacent")
1922 * to "clean-and-free" (aka "target"), so let's ignore its
1923 * "busy" bit here and proceed to check for "cleaning" a
1924 * little bit below...
1925 */
1926 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1927 /*
1928 * Somebody is already playing with this page.
1929 * Leave it off the pageout queues.
1930 *
1931 */
1932 vm_pageout_inactive_busy++;
1933
1934 goto done_with_inactivepage;
1935 }
1936
1937 /*
1938 * If it's absent or in error, we can reclaim the page.
1939 */
1940
1941 if (m->absent || m->error) {
1942 vm_pageout_inactive_absent++;
1943 reclaim_page:
1944 if (vm_pageout_deadlock_target) {
1945 vm_pageout_scan_inactive_throttle_success++;
1946 vm_pageout_deadlock_target--;
1947 }
1948
1949 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1950
1951 if (object->internal) {
1952 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1953 } else {
1954 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1955 }
1956 vm_page_free_prepare_queues(m);
1957
1958 /*
1959 * remove page from object here since we're already
1960 * behind the object lock... defer the rest of the work
1961 * we'd normally do in vm_page_free_prepare_object
1962 * until 'vm_page_free_list' is called
1963 */
1964 if (m->tabled)
1965 vm_page_remove(m, TRUE);
1966
1967 assert(m->pageq.next == NULL &&
1968 m->pageq.prev == NULL);
1969 m->pageq.next = (queue_entry_t)local_freeq;
1970 local_freeq = m;
1971 local_freed++;
1972
1973 inactive_burst_count = 0;
1974
1975 if(page_prev_state != PAGE_STATE_SPECULATIVE) {
1976 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
1977 page_prev_state = 0;
1978 }
1979
1980 goto done_with_inactivepage;
1981 }
1982
1983 assert(!m->private);
1984 assert(!m->fictitious);
1985
1986 /*
1987 * If already cleaning this page in place, convert from
1988 * "adjacent" to "target". We can leave the page mapped,
1989 * and vm_pageout_object_terminate will determine whether
1990 * to free or reactivate.
1991 */
1992
1993 if (m->cleaning) {
1994 m->busy = TRUE;
1995 m->pageout = TRUE;
1996 m->dump_cleaning = TRUE;
1997 vm_page_wire(m);
1998
1999 CLUSTER_STAT(vm_pageout_cluster_conversions++);
2000
2001 inactive_burst_count = 0;
2002
2003 goto done_with_inactivepage;
2004 }
2005
2006 /*
2007 * If the object is empty, the page must be reclaimed even
2008 * if dirty or used.
2009 * If the page belongs to a volatile object, we stick it back
2010 * on (see reactivate_page).
2011 */
2012 if (object->copy == VM_OBJECT_NULL) {
2013 if (object->purgable == VM_PURGABLE_EMPTY) {
2014 m->busy = TRUE;
2015 if (m->pmapped == TRUE) {
2016 /* unmap the page */
2017 refmod_state = pmap_disconnect(m->phys_page);
2018 if (refmod_state & VM_MEM_MODIFIED) {
2019 m->dirty = TRUE;
2020 }
2021 }
2022 if (m->dirty || m->precious) {
2023 /* we saved the cost of cleaning this page ! */
2024 vm_page_purged_count++;
2025 }
2026 goto reclaim_page;
2027 }
2028 if (object->purgable == VM_PURGABLE_VOLATILE) {
2029 /* if it's wired, we can't put it on our queue */
2030 assert(!VM_PAGE_WIRED(m));
2031 /* just stick it back on! */
2032 goto reactivate_page;
2033 }
2034 }
2035
2036 /*
2037 * If it's being used, reactivate.
2038 * (Fictitious pages are either busy or absent.)
2039 * First, update the reference and dirty bits
2040 * to make sure the page is unreferenced.
2041 */
2042 refmod_state = -1;
2043
2044 if (m->reference == FALSE && m->pmapped == TRUE) {
2045 refmod_state = pmap_get_refmod(m->phys_page);
2046
2047 if (refmod_state & VM_MEM_REFERENCED)
2048 m->reference = TRUE;
2049 if (refmod_state & VM_MEM_MODIFIED)
2050 m->dirty = TRUE;
2051 }
2052
2053 if (m->reference || m->dirty) {
2054 /* deal with a rogue "reusable" page */
2055 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
2056 }
2057
2058 if (m->reference && !m->no_cache) {
2059 /*
2060 * The page we pulled off the inactive list has
2061 * been referenced. It is possible for other
2062 * processors to be touching pages faster than we
2063 * can clear the referenced bit and traverse the
2064 * inactive queue, so we limit the number of
2065 * reactivations.
2066 */
2067 if (++reactivated_this_call >= reactivate_limit) {
2068 vm_pageout_reactivation_limit_exceeded++;
2069 } else if (catch_up_count) {
2070 vm_pageout_catch_ups++;
2071 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
2072 vm_pageout_inactive_force_reclaim++;
2073 } else {
2074 uint32_t isinuse;
2075 reactivate_page:
2076 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
2077 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
2078 /*
2079 * no explict mappings of this object exist
2080 * and it's not open via the filesystem
2081 */
2082 vm_page_deactivate(m);
2083 vm_pageout_inactive_deactivated++;
2084 } else {
2085 /*
2086 * The page was/is being used, so put back on active list.
2087 */
2088 vm_page_activate(m);
2089 VM_STAT_INCR(reactivations);
2090 }
2091 vm_pageout_inactive_used++;
2092 inactive_burst_count = 0;
2093
2094 goto done_with_inactivepage;
2095 }
2096 /*
2097 * Make sure we call pmap_get_refmod() if it
2098 * wasn't already called just above, to update
2099 * the dirty bit.
2100 */
2101 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
2102 refmod_state = pmap_get_refmod(m->phys_page);
2103 if (refmod_state & VM_MEM_MODIFIED)
2104 m->dirty = TRUE;
2105 }
2106 forced_reclaim = TRUE;
2107 } else {
2108 forced_reclaim = FALSE;
2109 }
2110
2111 XPR(XPR_VM_PAGEOUT,
2112 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
2113 object, m->offset, m, 0,0);
2114
2115 /*
2116 * we've got a candidate page to steal...
2117 *
2118 * m->dirty is up to date courtesy of the
2119 * preceding check for m->reference... if
2120 * we get here, then m->reference had to be
2121 * FALSE (or possibly "reactivate_limit" was
2122 * exceeded), but in either case we called
2123 * pmap_get_refmod() and updated both
2124 * m->reference and m->dirty
2125 *
2126 * if it's dirty or precious we need to
2127 * see if the target queue is throttled
2128 * if it is, we need to skip over it by moving it back
2129 * to the end of the inactive queue
2130 */
2131
2132 inactive_throttled = FALSE;
2133
2134 if (m->dirty || m->precious) {
2135 if (object->internal) {
2136 if (VM_PAGE_Q_THROTTLED(iq))
2137 inactive_throttled = TRUE;
2138 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2139 inactive_throttled = TRUE;
2140 }
2141 }
2142 if (inactive_throttled == TRUE) {
2143 throttle_inactive:
2144 if (!IP_VALID(memory_manager_default) &&
2145 object->internal && m->dirty &&
2146 (object->purgable == VM_PURGABLE_DENY ||
2147 object->purgable == VM_PURGABLE_NONVOLATILE ||
2148 object->purgable == VM_PURGABLE_VOLATILE)) {
2149 queue_enter(&vm_page_queue_throttled, m,
2150 vm_page_t, pageq);
2151 m->throttled = TRUE;
2152 vm_page_throttled_count++;
2153 } else {
2154 if (m->zero_fill) {
2155 queue_enter(&vm_page_queue_zf, m,
2156 vm_page_t, pageq);
2157 vm_zf_queue_count++;
2158 } else
2159 queue_enter(&vm_page_queue_inactive, m,
2160 vm_page_t, pageq);
2161 m->inactive = TRUE;
2162 if (!m->fictitious) {
2163 vm_page_inactive_count++;
2164 token_new_pagecount++;
2165 }
2166 }
2167 vm_pageout_scan_inactive_throttled++;
2168 goto done_with_inactivepage;
2169 }
2170
2171 /*
2172 * we've got a page that we can steal...
2173 * eliminate all mappings and make sure
2174 * we have the up-to-date modified state
2175 * first take the page BUSY, so that no new
2176 * mappings can be made
2177 */
2178 m->busy = TRUE;
2179
2180 /*
2181 * if we need to do a pmap_disconnect then we
2182 * need to re-evaluate m->dirty since the pmap_disconnect
2183 * provides the true state atomically... the
2184 * page was still mapped up to the pmap_disconnect
2185 * and may have been dirtied at the last microsecond
2186 *
2187 * we also check for the page being referenced 'late'
2188 * if it was, we first need to do a WAKEUP_DONE on it
2189 * since we already set m->busy = TRUE, before
2190 * going off to reactivate it
2191 *
2192 * Note that if 'pmapped' is FALSE then the page is not
2193 * and has not been in any map, so there is no point calling
2194 * pmap_disconnect(). m->dirty and/or m->reference could
2195 * have been set in anticipation of likely usage of the page.
2196 */
2197 if (m->pmapped == TRUE) {
2198 refmod_state = pmap_disconnect(m->phys_page);
2199
2200 if (refmod_state & VM_MEM_MODIFIED)
2201 m->dirty = TRUE;
2202 if (refmod_state & VM_MEM_REFERENCED) {
2203
2204 /* If m->reference is already set, this page must have
2205 * already failed the reactivate_limit test, so don't
2206 * bump the counts twice.
2207 */
2208 if ( ! m->reference ) {
2209 m->reference = TRUE;
2210 if (forced_reclaim ||
2211 ++reactivated_this_call >= reactivate_limit)
2212 vm_pageout_reactivation_limit_exceeded++;
2213 else {
2214 PAGE_WAKEUP_DONE(m);
2215 goto reactivate_page;
2216 }
2217 }
2218 }
2219 }
2220 /*
2221 * reset our count of pages that have been reclaimed
2222 * since the last page was 'stolen'
2223 */
2224 inactive_reclaim_run = 0;
2225
2226 /*
2227 * If it's clean and not precious, we can free the page.
2228 */
2229 if (!m->dirty && !m->precious) {
2230 if (m->zero_fill)
2231 vm_pageout_inactive_zf++;
2232 vm_pageout_inactive_clean++;
2233
2234 goto reclaim_page;
2235 }
2236
2237 /*
2238 * The page may have been dirtied since the last check
2239 * for a throttled target queue (which may have been skipped
2240 * if the page was clean then). With the dirty page
2241 * disconnected here, we can make one final check.
2242 */
2243 {
2244 boolean_t disconnect_throttled = FALSE;
2245 if (object->internal) {
2246 if (VM_PAGE_Q_THROTTLED(iq))
2247 disconnect_throttled = TRUE;
2248 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2249 disconnect_throttled = TRUE;
2250 }
2251
2252 if (disconnect_throttled == TRUE) {
2253 PAGE_WAKEUP_DONE(m);
2254 goto throttle_inactive;
2255 }
2256 }
2257
2258 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
2259
2260 vm_pageout_cluster(m);
2261
2262 if (m->zero_fill)
2263 vm_pageout_inactive_zf++;
2264 vm_pageout_inactive_dirty++;
2265
2266 inactive_burst_count = 0;
2267
2268 done_with_inactivepage:
2269 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
2270
2271 if (object != NULL) {
2272 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2273 vm_object_unlock(object);
2274 object = NULL;
2275 }
2276 if (local_freeq) {
2277 vm_page_unlock_queues();
2278 vm_page_free_list(local_freeq, TRUE);
2279
2280 local_freeq = NULL;
2281 local_freed = 0;
2282 vm_page_lock_queues();
2283 } else
2284 lck_mtx_yield(&vm_page_queue_lock);
2285
2286 delayed_unlock = 1;
2287 }
2288 /*
2289 * back to top of pageout scan loop
2290 */
2291 }
2292 }
2293
2294
2295 int vm_page_free_count_init;
2296
2297 void
2298 vm_page_free_reserve(
2299 int pages)
2300 {
2301 int free_after_reserve;
2302
2303 vm_page_free_reserved += pages;
2304
2305 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2306
2307 vm_page_free_min = vm_page_free_reserved +
2308 VM_PAGE_FREE_MIN(free_after_reserve);
2309
2310 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2311 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2312
2313 vm_page_free_target = vm_page_free_reserved +
2314 VM_PAGE_FREE_TARGET(free_after_reserve);
2315
2316 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2317 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2318
2319 if (vm_page_free_target < vm_page_free_min + 5)
2320 vm_page_free_target = vm_page_free_min + 5;
2321
2322 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
2323 vm_page_creation_throttle = vm_page_free_target / 2;
2324 }
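
/*
 * Illustrative only (not compiled): the ordering the computation above is
 * intended to establish, assuming VM_PAGE_FREE_MIN()/VM_PAGE_FREE_TARGET()
 * are non-negative and the *_LIMIT caps are not hit.
 */
#if 0
	assert(vm_page_free_reserved <= vm_page_free_min);
	assert(vm_page_free_min + 5 <= vm_page_free_target);
	assert(vm_page_throttle_limit < vm_page_free_target);
	assert(vm_page_creation_throttle < vm_page_free_target);
#endif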
2325
2326 /*
2327 * vm_pageout is the high level pageout daemon.
2328 */
2329
2330 void
2331 vm_pageout_continue(void)
2332 {
2333 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2334 vm_pageout_scan_event_counter++;
2335 vm_pageout_scan();
2336 /* we hold vm_page_queue_free_lock now */
2337 assert(vm_page_free_wanted == 0);
2338 assert(vm_page_free_wanted_privileged == 0);
2339 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2340 lck_mtx_unlock(&vm_page_queue_free_lock);
2341
2342 counter(c_vm_pageout_block++);
2343 thread_block((thread_continue_t)vm_pageout_continue);
2344 /*NOTREACHED*/
2345 }
2346
2347
2348 #ifdef FAKE_DEADLOCK
2349
2350 #define FAKE_COUNT 5000
2351
2352 int internal_count = 0;
2353 int fake_deadlock = 0;
2354
2355 #endif
2356
2357 static void
2358 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2359 {
2360 vm_page_t m = NULL;
2361 vm_object_t object;
2362 boolean_t need_wakeup;
2363 memory_object_t pager;
2364 thread_t self = current_thread();
2365
2366 if ((vm_pageout_internal_iothread != THREAD_NULL)
2367 && (self == vm_pageout_external_iothread )
2368 && (self->options & TH_OPT_VMPRIV))
2369 self->options &= ~TH_OPT_VMPRIV;
2370
2371 vm_page_lockspin_queues();
2372
2373 while ( !queue_empty(&q->pgo_pending) ) {
2374
2375 q->pgo_busy = TRUE;
2376 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2377 VM_PAGE_CHECK(m);
2378 m->pageout_queue = FALSE;
2379 m->pageq.next = NULL;
2380 m->pageq.prev = NULL;
2381 vm_page_unlock_queues();
2382
2383 #ifdef FAKE_DEADLOCK
2384 if (q == &vm_pageout_queue_internal) {
2385 vm_offset_t addr;
2386 int pg_count;
2387
2388 internal_count++;
2389
2390 if ((internal_count == FAKE_COUNT)) {
2391
2392 pg_count = vm_page_free_count + vm_page_free_reserved;
2393
2394 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2395 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2396 }
2397 internal_count = 0;
2398 fake_deadlock++;
2399 }
2400 }
2401 #endif
2402 object = m->object;
2403
2404 vm_object_lock(object);
2405
2406 if (!object->pager_initialized) {
2407
2408 /*
2409 * If there is no memory object for the page, create
2410 * one and hand it to the default pager.
2411 */
2412
2413 if (!object->pager_initialized)
2414 vm_object_collapse(object,
2415 (vm_object_offset_t) 0,
2416 TRUE);
2417 if (!object->pager_initialized)
2418 vm_object_pager_create(object);
2419 if (!object->pager_initialized) {
2420 /*
2421 * Still no pager for the object.
2422 * Reactivate the page.
2423 *
2424 * Should only happen if there is no
2425 * default pager.
2426 */
2427 vm_page_lockspin_queues();
2428
2429 vm_pageout_queue_steal(m, TRUE);
2430 vm_pageout_dirty_no_pager++;
2431 vm_page_activate(m);
2432
2433 vm_page_unlock_queues();
2434
2435 /*
2436 * And we are done with it.
2437 */
2438 PAGE_WAKEUP_DONE(m);
2439
2440 vm_object_paging_end(object);
2441 vm_object_unlock(object);
2442
2443 vm_page_lockspin_queues();
2444 continue;
2445 }
2446 }
2447 pager = object->pager;
2448 if (pager == MEMORY_OBJECT_NULL) {
2449 /*
2450 * This pager has been destroyed by either
2451 * memory_object_destroy or vm_object_destroy, and
2452 * so there is nowhere for the page to go.
2453 * Just free the page... VM_PAGE_FREE takes
2454 * care of cleaning up all the state...
2455 * including doing the vm_pageout_throttle_up
2456 */
2457
2458 VM_PAGE_FREE(m);
2459
2460 vm_object_paging_end(object);
2461 vm_object_unlock(object);
2462
2463 vm_page_lockspin_queues();
2464 continue;
2465 }
2466 VM_PAGE_CHECK(m);
2467 vm_object_unlock(object);
2468 /*
2469 * we expect the paging_in_progress reference to have
2470 * already been taken on the object before it was added
2471 * to the appropriate pageout I/O queue... this will
2472 * keep the object from being terminated and/or the
2473 * paging_offset from changing until the I/O has
2474 * completed... therefore no need to lock the object to
2475 * pull the paging_offset from it.
2476 *
2477 * Send the data to the pager.
2478 * any pageout clustering happens there
2479 */
2480 memory_object_data_return(pager,
2481 m->offset + object->paging_offset,
2482 PAGE_SIZE,
2483 NULL,
2484 NULL,
2485 FALSE,
2486 FALSE,
2487 0);
2488
2489 vm_object_lock(object);
2490 vm_object_paging_end(object);
2491 vm_object_unlock(object);
2492
2493 vm_page_lockspin_queues();
2494 }
2495 assert_wait((event_t) q, THREAD_UNINT);
2496
2497
2498 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2499 q->pgo_throttled = FALSE;
2500 need_wakeup = TRUE;
2501 } else
2502 need_wakeup = FALSE;
2503
2504 q->pgo_busy = FALSE;
2505 q->pgo_idle = TRUE;
2506 vm_page_unlock_queues();
2507
2508 if (need_wakeup == TRUE)
2509 thread_wakeup((event_t) &q->pgo_laundry);
2510
2511 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2512 /*NOTREACHED*/
2513 }
2514
2515
2516 static void
2517 vm_pageout_iothread_external(void)
2518 {
2519 thread_t self = current_thread();
2520
2521 self->options |= TH_OPT_VMPRIV;
2522
2523 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2524 /*NOTREACHED*/
2525 }
2526
2527
2528 static void
2529 vm_pageout_iothread_internal(void)
2530 {
2531 thread_t self = current_thread();
2532
2533 self->options |= TH_OPT_VMPRIV;
2534
2535 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2536 /*NOTREACHED*/
2537 }
2538
2539 kern_return_t
2540 vm_set_buffer_cleanup_callout(boolean_t (*func)(void))
2541 {
2542 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
2543 return KERN_SUCCESS;
2544 } else {
2545 return KERN_FAILURE; /* Already set */
2546 }
2547 }
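
/*
 * Illustrative sketch only (not compiled): how a subsystem could register
 * the buffer-cache cleanup callout above.  my_buffer_cache_collect() and
 * my_register_cleanup_callout() are hypothetical; the real registration is
 * performed by the buffer cache.
 */
#if 0
static boolean_t
my_buffer_cache_collect(void)
{
	/* reclaim large buffer-cache allocations here... */
	return TRUE;	/* TRUE => something was freed, passed on to consider_zone_gc() */
}

static void
my_register_cleanup_callout(void)
{
	if (vm_set_buffer_cleanup_callout(my_buffer_cache_collect) != KERN_SUCCESS)
		printf("buffer cleanup callout already registered\n");
}
#endif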
2548
2549 static void
2550 vm_pageout_garbage_collect(int collect)
2551 {
2552 if (collect) {
2553 boolean_t buf_large_zfree = FALSE;
2554 stack_collect();
2555
2556 /*
2557 * consider_zone_gc should be last, because the other operations
2558 * might return memory to zones.
2559 */
2560 consider_machine_collect();
2561 if (consider_buffer_cache_collect != NULL) {
2562 buf_large_zfree = (*consider_buffer_cache_collect)();
2563 }
2564 consider_zone_gc(buf_large_zfree);
2565
2566 consider_machine_adjust();
2567 }
2568
2569 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2570
2571 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2572 /*NOTREACHED*/
2573 }
2574
2575
2576
2577 void
2578 vm_pageout(void)
2579 {
2580 thread_t self = current_thread();
2581 thread_t thread;
2582 kern_return_t result;
2583 spl_t s;
2584
2585 /*
2586 * Set thread privileges.
2587 */
2588 s = splsched();
2589 thread_lock(self);
2590 self->priority = BASEPRI_PREEMPT - 1;
2591 set_sched_pri(self, self->priority);
2592 thread_unlock(self);
2593
2594 if (!self->reserved_stack)
2595 self->reserved_stack = self->kernel_stack;
2596
2597 splx(s);
2598
2599 /*
2600 * Initialize some paging parameters.
2601 */
2602
2603 if (vm_pageout_idle_wait == 0)
2604 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2605
2606 if (vm_pageout_burst_wait == 0)
2607 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2608
2609 if (vm_pageout_empty_wait == 0)
2610 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2611
2612 if (vm_pageout_deadlock_wait == 0)
2613 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2614
2615 if (vm_pageout_deadlock_relief == 0)
2616 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2617
2618 if (vm_pageout_inactive_relief == 0)
2619 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2620
2621 if (vm_pageout_burst_active_throttle == 0)
2622 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2623
2624 if (vm_pageout_burst_inactive_throttle == 0)
2625 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2626
2627 /*
2628 * Set kernel task to low backing store privileged
2629 * status
2630 */
2631 task_lock(kernel_task);
2632 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2633 task_unlock(kernel_task);
2634
2635 vm_page_free_count_init = vm_page_free_count;
2636
2637 /*
2638 * even if we've already called vm_page_free_reserve,
2639 * call it again here to ensure that the targets are
2640 * accurately calculated (it uses vm_page_free_count_init);
2641 * calling it with an arg of 0 will not change the reserve
2642 * but will re-calculate free_min and free_target
2643 */
2644 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2645 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2646 } else
2647 vm_page_free_reserve(0);
2648
2649
2650 queue_init(&vm_pageout_queue_external.pgo_pending);
2651 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2652 vm_pageout_queue_external.pgo_laundry = 0;
2653 vm_pageout_queue_external.pgo_idle = FALSE;
2654 vm_pageout_queue_external.pgo_busy = FALSE;
2655 vm_pageout_queue_external.pgo_throttled = FALSE;
2656
2657 queue_init(&vm_pageout_queue_internal.pgo_pending);
2658 vm_pageout_queue_internal.pgo_maxlaundry = 0;
2659 vm_pageout_queue_internal.pgo_laundry = 0;
2660 vm_pageout_queue_internal.pgo_idle = FALSE;
2661 vm_pageout_queue_internal.pgo_busy = FALSE;
2662 vm_pageout_queue_internal.pgo_throttled = FALSE;
2663
2664
2665 /* the internal pageout thread is started when the default pager is registered for the first time */
2666 /* the external pageout and garbage collection threads are started here */
2667
2668 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2669 BASEPRI_PREEMPT - 1,
2670 &vm_pageout_external_iothread);
2671 if (result != KERN_SUCCESS)
2672 panic("vm_pageout_iothread_external: create failed");
2673
2674 thread_deallocate(vm_pageout_external_iothread);
2675
2676 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2677 MINPRI_KERNEL,
2678 &thread);
2679 if (result != KERN_SUCCESS)
2680 panic("vm_pageout_garbage_collect: create failed");
2681
2682 thread_deallocate(thread);
2683
2684 vm_object_reaper_init();
2685
2686
2687 vm_pageout_continue();
2688
2689 /*
2690 * Unreached code!
2691 *
2692 * The vm_pageout_continue() call above never returns, so the code below is never
2693 * executed. We take advantage of this to declare several DTrace VM related probe
2694 * points that our kernel doesn't have an analog for. These are probe points that
2695 * exist in Solaris and are in the DTrace documentation, so people may have written
2696 * scripts that use them. Declaring the probe points here means their scripts will
2697 * compile and execute which we want for portability of the scripts, but since this
2698 * section of code is never reached, the probe points will simply never fire. Yes,
2699 * this is basically a hack. The problem is the DTrace probe points were chosen with
2700 * Solaris specific VM events in mind, not portability to different VM implementations.
2701 */
2702
2703 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2704 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2705 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2706 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2707 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2708 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2709 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2710 /*NOTREACHED*/
2711 }
2712
2713 kern_return_t
2714 vm_pageout_internal_start(void)
2715 {
2716 kern_return_t result;
2717
2718 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2719 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2720 if (result == KERN_SUCCESS)
2721 thread_deallocate(vm_pageout_internal_iothread);
2722 return result;
2723 }
2724
2725
2726 /*
2727 * when marshalling pages into a UPL and subsequently committing
2728 * or aborting them, it is necessary to hold
2729 * the vm_page_queue_lock (a hot global lock) for certain operations
2730 * on the page... however, the majority of the work can be done
2731 * while merely holding the object lock... in fact there are certain
2732 * collections of pages that don't require any work brokered by the
2733 * vm_page_queue_lock... to mitigate the time spent behind the global
2734 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
2735 * while doing all of the work that doesn't require the vm_page_queue_lock...
2736 * then call dw_do_work to acquire the vm_page_queue_lock and do the
2737 * necessary work for each page... we will grab the busy bit on the page
2738 * if it's not already held so that dw_do_work can drop the object lock
2739 * if it can't immediately take the vm_page_queue_lock in order to compete
2740 * for the locks in the same order that vm_pageout_scan takes them.
2741 * the operation names are modeled after the names of the routines that
2742 * need to be called in order to make the changes very obvious in the
2743 * original loop
2744 */
2745
2746 #define DELAYED_WORK_LIMIT 32
2747
2748 #define DW_vm_page_unwire 0x01
2749 #define DW_vm_page_wire 0x02
2750 #define DW_vm_page_free 0x04
2751 #define DW_vm_page_activate 0x08
2752 #define DW_vm_page_deactivate_internal 0x10
2753 #define DW_vm_page_speculate 0x20
2754 #define DW_vm_page_lru 0x40
2755 #define DW_vm_pageout_throttle_up 0x80
2756 #define DW_PAGE_WAKEUP 0x100
2757 #define DW_clear_busy 0x200
2758 #define DW_clear_reference 0x400
2759 #define DW_set_reference 0x800
2760
2761 struct dw {
2762 vm_page_t dw_m;
2763 int dw_mask;
2764 };
2765
2766
2767 static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count);
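
/*
 * Illustrative sketch only (not compiled): the 2-pass batching pattern
 * described above, as used later in vm_object_upl_request() and friends.
 * collect_candidate_page() is a hypothetical stand-in for whatever loop
 * body is doing the per-page work under the object lock.
 */
#if 0
static void
dw_batching_example(vm_object_t object)
{
	struct dw	dw_array[DELAYED_WORK_LIMIT];
	struct dw	*dwp = &dw_array[0];
	int		dw_count = 0;
	vm_page_t	m;

	while ((m = collect_candidate_page(object)) != VM_PAGE_NULL) {
		dwp->dw_mask = DW_vm_page_activate;

		if (m->busy == FALSE) {
			/*
			 * dw_do_work may need to drop the object lock, so
			 * hold the page stable via the busy bit until then
			 */
			m->busy = TRUE;
			dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
		}
		dwp->dw_m = m;
		dwp++;

		if (++dw_count >= DELAYED_WORK_LIMIT) {
			/* second pass: take the vm_page_queue_lock once per batch */
			dw_do_work(object, &dw_array[0], dw_count);
			dwp = &dw_array[0];
			dw_count = 0;
		}
	}
	if (dw_count)
		dw_do_work(object, &dw_array[0], dw_count);
}
#endif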
2768
2769
2770
2771 static upl_t
2772 upl_create(int type, int flags, upl_size_t size)
2773 {
2774 upl_t upl;
2775 int page_field_size = 0;
2776 int upl_flags = 0;
2777 int upl_size = sizeof(struct upl);
2778
2779 size = round_page_32(size);
2780
2781 if (type & UPL_CREATE_LITE) {
2782 page_field_size = (atop(size) + 7) >> 3;
2783 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2784
2785 upl_flags |= UPL_LITE;
2786 }
2787 if (type & UPL_CREATE_INTERNAL) {
2788 upl_size += (int) sizeof(struct upl_page_info) * atop(size);
2789
2790 upl_flags |= UPL_INTERNAL;
2791 }
2792 upl = (upl_t)kalloc(upl_size + page_field_size);
2793
2794 if (page_field_size)
2795 bzero((char *)upl + upl_size, page_field_size);
2796
2797 upl->flags = upl_flags | flags;
2798 upl->src_object = NULL;
2799 upl->kaddr = (vm_offset_t)0;
2800 upl->size = 0;
2801 upl->map_object = NULL;
2802 upl->ref_count = 1;
2803 upl->highest_page = 0;
2804 upl_lock_init(upl);
2805 upl->vector_upl = NULL;
2806 #if UPL_DEBUG
2807 upl->ubc_alias1 = 0;
2808 upl->ubc_alias2 = 0;
2809
2810 upl->upl_creator = current_thread();
2811 upl->upl_state = 0;
2812 upl->upl_commit_index = 0;
2813 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
2814
2815 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
2816 #endif /* UPL_DEBUG */
2817
2818 return(upl);
2819 }
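
/*
 * For reference, the layout produced above for an
 * UPL_CREATE_INTERNAL | UPL_CREATE_LITE request of, say, 64K (16 pages
 * of 4K) is:
 *
 *	sizeof(struct upl)			the upl itself
 *	16 * sizeof(struct upl_page_info)	the internal page list
 *	4 bytes					the lite bitmap:
 *						(16 + 7) >> 3 = 2 bytes,
 *						rounded up to a 4-byte multiple
 *
 * only the trailing bitmap is zeroed here; the page list is filled in
 * by the UPL request routines below.
 */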
2820
2821 static void
2822 upl_destroy(upl_t upl)
2823 {
2824 int page_field_size; /* bit field in word size buf */
2825 int size;
2826
2827 #if UPL_DEBUG
2828 {
2829 vm_object_t object;
2830
2831 if (upl->flags & UPL_SHADOWED) {
2832 object = upl->map_object->shadow;
2833 } else {
2834 object = upl->map_object;
2835 }
2836 vm_object_lock(object);
2837 queue_remove(&object->uplq, upl, upl_t, uplq);
2838 vm_object_unlock(object);
2839 }
2840 #endif /* UPL_DEBUG */
2841 /*
2842 * drop a reference on the map_object whether or
2843 * not a pageout object is inserted
2844 */
2845 if (upl->flags & UPL_SHADOWED)
2846 vm_object_deallocate(upl->map_object);
2847
2848 if (upl->flags & UPL_DEVICE_MEMORY)
2849 size = PAGE_SIZE;
2850 else
2851 size = upl->size;
2852 page_field_size = 0;
2853
2854 if (upl->flags & UPL_LITE) {
2855 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2856 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2857 }
2858 upl_lock_destroy(upl);
2859 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
2860 if (upl->flags & UPL_INTERNAL) {
2861 kfree(upl,
2862 sizeof(struct upl) +
2863 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2864 + page_field_size);
2865 } else {
2866 kfree(upl, sizeof(struct upl) + page_field_size);
2867 }
2868 }
2869
2870 void uc_upl_dealloc(upl_t upl);
2871 __private_extern__ void
2872 uc_upl_dealloc(upl_t upl)
2873 {
2874 if (--upl->ref_count == 0)
2875 upl_destroy(upl);
2876 }
2877
2878 void
2879 upl_deallocate(upl_t upl)
2880 {
2881 if (--upl->ref_count == 0) {
2882 if(vector_upl_is_valid(upl))
2883 vector_upl_deallocate(upl);
2884 upl_destroy(upl);
2885 }
2886 }
2887
2888 #if DEVELOPMENT || DEBUG
2889 /*
2890 * Statistics about UPL enforcement of copy-on-write obligations.
2891 */
2892 unsigned long upl_cow = 0;
2893 unsigned long upl_cow_again = 0;
2894 unsigned long upl_cow_pages = 0;
2895 unsigned long upl_cow_again_pages = 0;
2896
2897 unsigned long iopl_cow = 0;
2898 unsigned long iopl_cow_pages = 0;
2899 #endif
2900
2901 /*
2902 * Routine: vm_object_upl_request
2903 * Purpose:
2904 * Cause the population of a portion of a vm_object.
2905 * Depending on the nature of the request, the pages
2906 * returned may contain valid data or be uninitialized.
2907 * A page list structure, listing the physical pages
2908 * will be returned upon request.
2909 * This function is called by the file system or any other
2910 * supplier of backing store to a pager.
2911 * IMPORTANT NOTE: The caller must still respect the relationship
2912 * between the vm_object and its backing memory object. The
2913 * caller MUST NOT substitute changes in the backing file
2914 * without first doing a memory_object_lock_request on the
2915 * target range unless it is known that the pages are not
2916 * shared with another entity at the pager level.
2917 * Copy_in_to:
2918 * if a page list structure is present
2919 * return the mapped physical pages; where a
2920 * page is not present, return a non-initialized
2921 * one. If the no_sync bit is turned on, don't
2922 * call the pager unlock to synchronize with other
2923 * possible copies of the page. Leave pages busy
2924 * in the original object, if a page list structure
2925 * was specified. When a commit of the page list
2926 * pages is done, the dirty bit will be set for each one.
2927 * Copy_out_from:
2928 * If a page list structure is present, return
2929 * all mapped pages. Where a page does not exist
2930 * map a zero filled one. Leave pages busy in
2931 * the original object. If a page list structure
2932 * is not specified, this call is a no-op.
2933 *
2934 * Note: access of default pager objects has a rather interesting
2935 * twist. The caller of this routine, presumably the file system
2936 * page cache handling code, will never actually make a request
2937 * against a default pager backed object. Only the default
2938 * pager will make requests on backing store related vm_objects.
2939 * In this way the default pager can maintain the relationship
2940 * between backing store files (abstract memory objects) and
2941 * the vm_objects (cache objects) they support.
2942 *
2943 */
2944
2945 __private_extern__ kern_return_t
2946 vm_object_upl_request(
2947 vm_object_t object,
2948 vm_object_offset_t offset,
2949 upl_size_t size,
2950 upl_t *upl_ptr,
2951 upl_page_info_array_t user_page_list,
2952 unsigned int *page_list_count,
2953 int cntrl_flags)
2954 {
2955 vm_page_t dst_page = VM_PAGE_NULL;
2956 vm_object_offset_t dst_offset;
2957 upl_size_t xfer_size;
2958 boolean_t dirty;
2959 boolean_t hw_dirty;
2960 upl_t upl = NULL;
2961 unsigned int entry;
2962 #if MACH_CLUSTER_STATS
2963 boolean_t encountered_lrp = FALSE;
2964 #endif
2965 vm_page_t alias_page = NULL;
2966 int refmod_state = 0;
2967 wpl_array_t lite_list = NULL;
2968 vm_object_t last_copy_object;
2969 struct dw dw_array[DELAYED_WORK_LIMIT];
2970 struct dw *dwp;
2971 int dw_count;
2972
2973 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2974 /*
2975 * For forward compatibility's sake,
2976 * reject any unknown flag.
2977 */
2978 return KERN_INVALID_VALUE;
2979 }
2980 if ( (!object->internal) && (object->paging_offset != 0) )
2981 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2982 if (object->phys_contiguous)
2983 panic("vm_object_upl_request: contiguous object specified\n");
2984
2985
2986 if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2987 size = MAX_UPL_SIZE * PAGE_SIZE;
2988
2989 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2990 *page_list_count = MAX_UPL_SIZE;
2991
2992 if (cntrl_flags & UPL_SET_INTERNAL) {
2993 if (cntrl_flags & UPL_SET_LITE) {
2994
2995 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2996
2997 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2998 lite_list = (wpl_array_t)
2999 (((uintptr_t)user_page_list) +
3000 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3001 if (size == 0) {
3002 user_page_list = NULL;
3003 lite_list = NULL;
3004 }
3005 } else {
3006 upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
3007
3008 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
3009 if (size == 0) {
3010 user_page_list = NULL;
3011 }
3012 }
3013 } else {
3014 if (cntrl_flags & UPL_SET_LITE) {
3015
3016 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
3017
3018 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3019 if (size == 0) {
3020 lite_list = NULL;
3021 }
3022 } else {
3023 upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
3024 }
3025 }
3026 *upl_ptr = upl;
3027
3028 if (user_page_list)
3029 user_page_list[0].device = FALSE;
3030
3031 if (cntrl_flags & UPL_SET_LITE) {
3032 upl->map_object = object;
3033 } else {
3034 upl->map_object = vm_object_allocate(size);
3035 /*
3036 * No need to lock the new object: nobody else knows
3037 * about it yet, so it's all ours so far.
3038 */
3039 upl->map_object->shadow = object;
3040 upl->map_object->pageout = TRUE;
3041 upl->map_object->can_persist = FALSE;
3042 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3043 upl->map_object->shadow_offset = offset;
3044 upl->map_object->wimg_bits = object->wimg_bits;
3045
3046 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3047
3048 upl->flags |= UPL_SHADOWED;
3049 }
3050 /*
3051 * ENCRYPTED SWAP:
3052 * Just mark the UPL as "encrypted" here.
3053 * We'll actually encrypt the pages later,
3054 * in upl_encrypt(), when the caller has
3055 * selected which pages need to go to swap.
3056 */
3057 if (cntrl_flags & UPL_ENCRYPT)
3058 upl->flags |= UPL_ENCRYPTED;
3059
3060 if (cntrl_flags & UPL_FOR_PAGEOUT)
3061 upl->flags |= UPL_PAGEOUT;
3062
3063 vm_object_lock(object);
3064 vm_object_activity_begin(object);
3065
3066 /*
3067 * we can lock in the paging_offset once paging_in_progress is set
3068 */
3069 upl->size = size;
3070 upl->offset = offset + object->paging_offset;
3071
3072 #if UPL_DEBUG
3073 queue_enter(&object->uplq, upl, upl_t, uplq);
3074 #endif /* UPL_DEBUG */
3075
3076 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
3077 /*
3078 * Honor copy-on-write obligations
3079 *
3080 * The caller is gathering these pages and
3081 * might modify their contents. We need to
3082 * make sure that the copy object has its own
3083 * private copies of these pages before we let
3084 * the caller modify them.
3085 */
3086 vm_object_update(object,
3087 offset,
3088 size,
3089 NULL,
3090 NULL,
3091 FALSE, /* should_return */
3092 MEMORY_OBJECT_COPY_SYNC,
3093 VM_PROT_NO_CHANGE);
3094 #if DEVELOPMENT || DEBUG
3095 upl_cow++;
3096 upl_cow_pages += size >> PAGE_SHIFT;
3097 #endif
3098 }
3099 /*
3100 * remember which copy object we synchronized with
3101 */
3102 last_copy_object = object->copy;
3103 entry = 0;
3104
3105 xfer_size = size;
3106 dst_offset = offset;
3107
3108 dwp = &dw_array[0];
3109 dw_count = 0;
3110
3111 while (xfer_size) {
3112
3113 dwp->dw_mask = 0;
3114
3115 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
3116 vm_object_unlock(object);
3117 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3118 vm_object_lock(object);
3119 }
3120 if (cntrl_flags & UPL_COPYOUT_FROM) {
3121 upl->flags |= UPL_PAGE_SYNC_DONE;
3122
3123 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
3124 dst_page->fictitious ||
3125 dst_page->absent ||
3126 dst_page->error ||
3127 (VM_PAGE_WIRED(dst_page) && !dst_page->pageout && !dst_page->list_req_pending)) {
3128
3129 if (user_page_list)
3130 user_page_list[entry].phys_addr = 0;
3131
3132 goto try_next_page;
3133 }
3134 /*
3135 * grab this up front...
3136 * a high percentage of the time we're going to
3137 * need the hardware modification state a bit later
3138 * anyway... so we can eliminate an extra call into
3139 * the pmap layer by grabbing it here and recording it
3140 */
3141 if (dst_page->pmapped)
3142 refmod_state = pmap_get_refmod(dst_page->phys_page);
3143 else
3144 refmod_state = 0;
3145
3146 if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
3147 /*
3148 * page is on inactive list and referenced...
3149 * reactivate it now... this gets it out of the
3150 * way of vm_pageout_scan which would have to
3151 * reactivate it upon tripping over it
3152 */
3153 dwp->dw_mask |= DW_vm_page_activate;
3154 }
3155 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
3156 /*
3157 * we're only asking for DIRTY pages to be returned
3158 */
3159 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
3160 /*
3161 * if this is the page stolen by vm_pageout_scan to be
3162 * cleaned (as opposed to a buddy being clustered in),
3163 * or this request is not being driven by a PAGEOUT cluster,
3164 * then we only need to check for the page being dirty or
3165 * precious to decide whether to return it
3166 */
3167 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
3168 goto check_busy;
3169 goto dont_return;
3170 }
3171 /*
3172 * this is a request for a PAGEOUT cluster and this page
3173 * is merely along for the ride as a 'buddy'... not only
3174 * does it have to be dirty to be returned, but it also
3175 * can't have been referenced recently... note that we've
3176 * already filtered above based on whether this page is
3177 * currently on the inactive queue or it meets the page
3178 * ticket (generation count) check
3179 */
3180 if ( !(refmod_state & VM_MEM_REFERENCED) &&
3181 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
3182 goto check_busy;
3183 }
3184 dont_return:
3185 /*
3186 * if we reach here, we're not to return
3187 * the page... go on to the next one
3188 */
3189 if (user_page_list)
3190 user_page_list[entry].phys_addr = 0;
3191
3192 goto try_next_page;
3193 }
3194 check_busy:
3195 if (dst_page->busy && (!(dst_page->list_req_pending && dst_page->pageout))) {
3196 if (cntrl_flags & UPL_NOBLOCK) {
3197 if (user_page_list)
3198 user_page_list[entry].phys_addr = 0;
3199
3200 goto try_next_page;
3201 }
3202 /*
3203 * someone else is playing with the
3204 * page. We will have to wait.
3205 */
3206 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3207
3208 continue;
3209 }
3210 /*
3211 * Someone else already cleaning the page?
3212 */
3213 if ((dst_page->cleaning || dst_page->absent || VM_PAGE_WIRED(dst_page)) && !dst_page->list_req_pending) {
3214 if (user_page_list)
3215 user_page_list[entry].phys_addr = 0;
3216
3217 goto try_next_page;
3218 }
3219 /*
3220 * ENCRYPTED SWAP:
3221 * The caller is gathering this page and might
3222 * access its contents later on. Decrypt the
3223 * page before adding it to the UPL, so that
3224 * the caller never sees encrypted data.
3225 */
3226 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
3227 int was_busy;
3228
3229 /*
3230 * save the current state of busy
3231 * mark page as busy while decrypt
3232 * is in progress since it will drop
3233 * the object lock...
3234 */
3235 was_busy = dst_page->busy;
3236 dst_page->busy = TRUE;
3237
3238 vm_page_decrypt(dst_page, 0);
3239 vm_page_decrypt_for_upl_counter++;
3240 /*
3241 * restore to original busy state
3242 */
3243 dst_page->busy = was_busy;
3244 }
3245 if (dst_page->pageout_queue == TRUE) {
3246
3247 vm_page_lockspin_queues();
3248
3249 #if CONFIG_EMBEDDED
3250 if (dst_page->laundry)
3251 #else
3252 if (dst_page->pageout_queue == TRUE)
3253 #endif
3254 {
3255 /*
3256 * we've buddied up a page for a clustered pageout
3257 * that has already been moved to the pageout
3258 * queue by pageout_scan... we need to remove
3259 * it from the queue and drop the laundry count
3260 * on that queue
3261 */
3262 vm_pageout_throttle_up(dst_page);
3263 }
3264 vm_page_unlock_queues();
3265 }
3266 #if MACH_CLUSTER_STATS
3267 /*
3268 * pageout statistics gathering. count
3269 * all the pages we will page out that
3270 * were not counted in the initial
3271 * vm_pageout_scan work
3272 */
3273 if (dst_page->list_req_pending)
3274 encountered_lrp = TRUE;
3275 if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
3276 if (encountered_lrp)
3277 CLUSTER_STAT(pages_at_higher_offsets++;)
3278 else
3279 CLUSTER_STAT(pages_at_lower_offsets++;)
3280 }
3281 #endif
3282 /*
3283 * Turn off busy indication on pending
3284 * pageout. Note: we can only get here
3285 * in the request pending case.
3286 */
3287 dst_page->list_req_pending = FALSE;
3288 dst_page->busy = FALSE;
3289
3290 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3291 dirty = hw_dirty ? TRUE : dst_page->dirty;
3292
3293 if (dst_page->phys_page > upl->highest_page)
3294 upl->highest_page = dst_page->phys_page;
3295
3296 if (cntrl_flags & UPL_SET_LITE) {
3297 unsigned int pg_num;
3298
3299 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3300 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
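/* mark this page in the lite bitmap: word pg_num / 32, bit pg_num % 32 */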
3301 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3302
3303 if (hw_dirty)
3304 pmap_clear_modify(dst_page->phys_page);
3305
3306 /*
3307 * Mark original page as cleaning
3308 * in place.
3309 */
3310 dst_page->cleaning = TRUE;
3311 dst_page->precious = FALSE;
3312 } else {
3313 /*
3314 * use pageclean setup, it is more
3315 * convenient even for the pageout
3316 * cases here
3317 */
3318 vm_object_lock(upl->map_object);
3319 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3320 vm_object_unlock(upl->map_object);
3321
3322 alias_page->absent = FALSE;
3323 alias_page = NULL;
3324 }
3325 #if MACH_PAGEMAP
3326 /*
3327 * Record that this page has been
3328 * written out
3329 */
3330 vm_external_state_set(object->existence_map, dst_page->offset);
3331 #endif /*MACH_PAGEMAP*/
3332 dst_page->dirty = dirty;
3333
3334 if (!dirty)
3335 dst_page->precious = TRUE;
3336
3337 if (dst_page->pageout)
3338 dst_page->busy = TRUE;
3339
3340 if ( (cntrl_flags & UPL_ENCRYPT) ) {
3341 /*
3342 * ENCRYPTED SWAP:
3343 * We want to deny access to the target page
3344 * because its contents are about to be
3345 * encrypted and the user would be very
3346 * confused to see encrypted data instead
3347 * of their data.
3348 * We also set "encrypted_cleaning" to allow
3349 * vm_pageout_scan() to demote that page
3350 * from "adjacent/clean-in-place" to
3351 * "target/clean-and-free" if it bumps into
3352 * this page during its scanning while we're
3353 * still processing this cluster.
3354 */
3355 dst_page->busy = TRUE;
3356 dst_page->encrypted_cleaning = TRUE;
3357 }
3358 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3359 /*
3360 * deny access to the target page
3361 * while it is being worked on
3362 */
3363 if ((!dst_page->pageout) && ( !VM_PAGE_WIRED(dst_page))) {
3364 dst_page->busy = TRUE;
3365 dst_page->pageout = TRUE;
3366
3367 dwp->dw_mask |= DW_vm_page_wire;
3368 }
3369 }
3370 } else {
3371 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3372 /*
3373 * Honor copy-on-write obligations
3374 *
3375 * The copy object has changed since we
3376 * last synchronized for copy-on-write.
3377 * Another copy object might have been
3378 * inserted while we released the object's
3379 * lock. Since someone could have seen the
3380 * original contents of the remaining pages
3381 * through that new object, we have to
3382 * synchronize with it again for the remaining
3383 * pages only. The previous pages are "busy"
3384 * so they can not be seen through the new
3385 * mapping. The new mapping will see our
3386 * upcoming changes for those previous pages,
3387 * but that's OK since they couldn't see what
3388 * was there before. It's just a race anyway
3389 * and there's no guarantee of consistency or
3390 * atomicity. We just don't want new mappings
3391 * to see both the *before* and *after* pages.
3392 */
3393 if (object->copy != VM_OBJECT_NULL) {
3394 vm_object_update(
3395 object,
3396 dst_offset,/* current offset */
3397 xfer_size, /* remaining size */
3398 NULL,
3399 NULL,
3400 FALSE, /* should_return */
3401 MEMORY_OBJECT_COPY_SYNC,
3402 VM_PROT_NO_CHANGE);
3403
3404 #if DEVELOPMENT || DEBUG
3405 upl_cow_again++;
3406 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3407 #endif
3408 }
3409 /*
3410 * remember the copy object we synced with
3411 */
3412 last_copy_object = object->copy;
3413 }
3414 dst_page = vm_page_lookup(object, dst_offset);
3415
3416 if (dst_page != VM_PAGE_NULL) {
3417
3418 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3419
3420 if ( !(dst_page->absent && dst_page->list_req_pending) ) {
3421 /*
3422 * skip over pages already present in the cache
3423 */
3424 if (user_page_list)
3425 user_page_list[entry].phys_addr = 0;
3426
3427 goto try_next_page;
3428 }
3429 }
3430 if ( !(dst_page->list_req_pending) ) {
3431
3432 if (dst_page->cleaning) {
3433 /*
3434 * someone else is writing to the page... wait...
3435 */
3436 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3437
3438 continue;
3439 }
3440 } else {
3441 if (dst_page->fictitious &&
3442 dst_page->phys_page == vm_page_fictitious_addr) {
3443 assert( !dst_page->speculative);
3444 /*
3445 * dump the fictitious page
3446 */
3447 dst_page->list_req_pending = FALSE;
3448
3449 VM_PAGE_FREE(dst_page);
3450
3451 dst_page = NULL;
3452
3453 } else if (dst_page->absent) {
3454 /*
3455 * the default_pager case
3456 */
3457 dst_page->list_req_pending = FALSE;
3458 dst_page->busy = FALSE;
3459
3460 } else if (dst_page->pageout) {
3461 /*
3462 * page was earmarked by vm_pageout_scan
3463 * to be cleaned and stolen... we're going
3464 * to take it back since we are not attempting
3465 * to read that page and we don't want to stall
3466 * waiting for it to be cleaned for 2 reasons...
3467 * 1 - no use paging it out and back in
3468 * 2 - if we stall, we may cause a deadlock in
3469 * the FS trying to acquire its locks
3470 * on the VNOP_PAGEOUT path presuming that
3471 * those locks are already held on the read
3472 * path before trying to create this UPL
3473 *
3474 * so undo all of the state that vm_pageout_scan
3475 * hung on this page
3476 */
3477 dst_page->busy = FALSE;
3478
3479 vm_pageout_queue_steal(dst_page, FALSE);
3480 }
3481 }
3482 }
3483 if (dst_page == VM_PAGE_NULL) {
3484 if (object->private) {
3485 /*
3486 * This is a nasty wrinkle for users
3487 * of upl who encounter device or
3488 * private memory however, it is
3489 * unavoidable, only a fault can
3490 * resolve the actual backing
3491 * physical page by asking the
3492 * backing device.
3493 */
3494 if (user_page_list)
3495 user_page_list[entry].phys_addr = 0;
3496
3497 goto try_next_page;
3498 }
3499 /*
3500 * need to allocate a page
3501 */
3502 dst_page = vm_page_grab();
3503
3504 if (dst_page == VM_PAGE_NULL) {
3505 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3506 /*
3507 * we don't want to stall waiting for pages to come onto the free list
3508 * while we're already holding absent pages in this UPL;
3509 * the caller will deal with the empty slots
3510 */
3511 if (user_page_list)
3512 user_page_list[entry].phys_addr = 0;
3513
3514 goto try_next_page;
3515 }
3516 /*
3517 * no pages available... wait
3518 * then try again for the same
3519 * offset...
3520 */
3521 vm_object_unlock(object);
3522 VM_PAGE_WAIT();
3523 vm_object_lock(object);
3524
3525 continue;
3526 }
3527 vm_page_insert(dst_page, object, dst_offset);
3528
3529 dst_page->absent = TRUE;
3530 dst_page->busy = FALSE;
3531
3532 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3533 /*
3534 * if UPL_RET_ONLY_ABSENT was specified,
3535 * then we're definitely setting up a
3536 * upl for a clustered read/pagein
3537 * operation... mark the pages as clustered
3538 * so upl_commit_range can put them on the
3539 * speculative list
3540 */
3541 dst_page->clustered = TRUE;
3542 }
3543 }
3544 if (dst_page->fictitious) {
3545 panic("need corner case for fictitious page");
3546 }
3547 if (dst_page->busy) {
3548 /*
3549 * someone else is playing with the
3550 * page. We will have to wait.
3551 */
3552 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3553
3554 continue;
3555 }
3556 /*
3557 * ENCRYPTED SWAP:
3558 */
3559 if (cntrl_flags & UPL_ENCRYPT) {
3560 /*
3561 * The page is going to be encrypted when we
3562 * get it from the pager, so mark it so.
3563 */
3564 dst_page->encrypted = TRUE;
3565 } else {
3566 /*
3567 * Otherwise, the page will not contain
3568 * encrypted data.
3569 */
3570 dst_page->encrypted = FALSE;
3571 }
3572 dst_page->overwriting = TRUE;
3573
3574 if (dst_page->pmapped) {
3575 if ( !(cntrl_flags & UPL_FILE_IO))
3576 /*
3577 * eliminate all mappings from the
3578 * original object and its prodigy
3579 */
3580 refmod_state = pmap_disconnect(dst_page->phys_page);
3581 else
3582 refmod_state = pmap_get_refmod(dst_page->phys_page);
3583 } else
3584 refmod_state = 0;
3585
3586 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3587 dirty = hw_dirty ? TRUE : dst_page->dirty;
3588
3589 if (cntrl_flags & UPL_SET_LITE) {
3590 unsigned int pg_num;
3591
3592 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3593 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
3594 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3595
3596 if (hw_dirty)
3597 pmap_clear_modify(dst_page->phys_page);
3598
3599 /*
3600 * Mark original page as cleaning
3601 * in place.
3602 */
3603 dst_page->cleaning = TRUE;
3604 dst_page->precious = FALSE;
3605 } else {
3606 /*
3607 * use pageclean setup, it is more
3608 * convenient even for the pageout
3609 * cases here
3610 */
3611 vm_object_lock(upl->map_object);
3612 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3613 vm_object_unlock(upl->map_object);
3614
3615 alias_page->absent = FALSE;
3616 alias_page = NULL;
3617 }
3618
3619 if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3620 /*
3621 * clean in place for read implies
3622 * that a write will be done on all
3623 * the pages that are dirty before
3624 * a upl commit is done. The caller
3625 * is obligated to preserve the
3626 * contents of all pages marked dirty
3627 */
3628 upl->flags |= UPL_CLEAR_DIRTY;
3629 }
3630 dst_page->dirty = dirty;
3631
3632 if (!dirty)
3633 dst_page->precious = TRUE;
3634
3635 if ( !VM_PAGE_WIRED(dst_page)) {
3636 /*
3637 * deny access to the target page while
3638 * it is being worked on
3639 */
3640 dst_page->busy = TRUE;
3641 } else
3642 dwp->dw_mask |= DW_vm_page_wire;
3643
3644 /*
3645 * We might be about to satisfy a fault which has been
3646 * requested. So no need for the "restart" bit.
3647 */
3648 dst_page->restart = FALSE;
3649 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
3650 /*
3651 * expect the page to be used
3652 */
3653 dwp->dw_mask |= DW_set_reference;
3654 }
3655 dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3656 }
3657 if (dst_page->phys_page > upl->highest_page)
3658 upl->highest_page = dst_page->phys_page;
3659 if (user_page_list) {
3660 user_page_list[entry].phys_addr = dst_page->phys_page;
3661 user_page_list[entry].pageout = dst_page->pageout;
3662 user_page_list[entry].absent = dst_page->absent;
3663 user_page_list[entry].dirty = dst_page->dirty;
3664 user_page_list[entry].precious = dst_page->precious;
3665 user_page_list[entry].device = FALSE;
3666 if (dst_page->clustered == TRUE)
3667 user_page_list[entry].speculative = dst_page->speculative;
3668 else
3669 user_page_list[entry].speculative = FALSE;
3670 user_page_list[entry].cs_validated = dst_page->cs_validated;
3671 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
3672 }
3673 /*
3674 * if UPL_RET_ONLY_ABSENT is set, then
3675 * we are working with a fresh page and we've
3676 * just set the clustered flag on it to
3677 * indicate that it was dragged in as part of a
3678 * speculative cluster... so leave it alone
3679 */
3680 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3681 /*
3682 * someone is explicitly grabbing this page...
3683 * update clustered and speculative state
3684 *
3685 */
3686 VM_PAGE_CONSUME_CLUSTERED(dst_page);
3687 }
3688 try_next_page:
3689 if (dwp->dw_mask) {
3690 if (dwp->dw_mask & DW_vm_page_activate)
3691 VM_STAT_INCR(reactivations);
3692
3693 if (dst_page->busy == FALSE) {
3694 /*
3695 * dw_do_work may need to drop the object lock
3696 * if it does, we need the pages it's looking at to
3697 * be held stable via the busy bit.
3698 */
3699 dst_page->busy = TRUE;
3700 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
3701 }
3702 dwp->dw_m = dst_page;
3703 dwp++;
3704 dw_count++;
3705
3706 if (dw_count >= DELAYED_WORK_LIMIT) {
3707 dw_do_work(object, &dw_array[0], dw_count);
3708
3709 dwp = &dw_array[0];
3710 dw_count = 0;
3711 }
3712 }
3713 entry++;
3714 dst_offset += PAGE_SIZE_64;
3715 xfer_size -= PAGE_SIZE;
3716 }
3717 if (dw_count)
3718 dw_do_work(object, &dw_array[0], dw_count);
3719
3720 if (alias_page != NULL) {
3721 VM_PAGE_FREE(alias_page);
3722 }
3723
3724 if (page_list_count != NULL) {
3725 if (upl->flags & UPL_INTERNAL)
3726 *page_list_count = 0;
3727 else if (*page_list_count > entry)
3728 *page_list_count = entry;
3729 }
3730 #if UPL_DEBUG
3731 upl->upl_state = 1;
3732 #endif
3733 vm_object_unlock(object);
3734
3735 return KERN_SUCCESS;
3736 }
3737
3738 /* JMM - Backward compatibility for now */
3739 kern_return_t
3740 vm_fault_list_request( /* forward */
3741 memory_object_control_t control,
3742 vm_object_offset_t offset,
3743 upl_size_t size,
3744 upl_t *upl_ptr,
3745 upl_page_info_t **user_page_list_ptr,
3746 unsigned int page_list_count,
3747 int cntrl_flags);
3748 kern_return_t
3749 vm_fault_list_request(
3750 memory_object_control_t control,
3751 vm_object_offset_t offset,
3752 upl_size_t size,
3753 upl_t *upl_ptr,
3754 upl_page_info_t **user_page_list_ptr,
3755 unsigned int page_list_count,
3756 int cntrl_flags)
3757 {
3758 unsigned int local_list_count;
3759 upl_page_info_t *user_page_list;
3760 kern_return_t kr;
3761
3762 if((cntrl_flags & UPL_VECTOR)==UPL_VECTOR)
3763 return KERN_INVALID_ARGUMENT;
3764
3765 if (user_page_list_ptr != NULL) {
3766 local_list_count = page_list_count;
3767 user_page_list = *user_page_list_ptr;
3768 } else {
3769 local_list_count = 0;
3770 user_page_list = NULL;
3771 }
3772 kr = memory_object_upl_request(control,
3773 offset,
3774 size,
3775 upl_ptr,
3776 user_page_list,
3777 &local_list_count,
3778 cntrl_flags);
3779
3780 if(kr != KERN_SUCCESS)
3781 return kr;
3782
3783 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3784 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3785 }
3786
3787 return KERN_SUCCESS;
3788 }
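/*
 * Illustrative sketch (not part of the original source): one way a
 * caller might use vm_fault_list_request() to populate a UPL over the
 * start of a memory object and then commit it.  The 'control' handle,
 * the chosen flags and the lack of error handling are assumptions made
 * purely for illustration.
 */
#if 0
static void
upl_populate_example(memory_object_control_t control)
{
	upl_t		upl;
	boolean_t	empty;

	/* build a UPL covering the first 16 pages of the object */
	if (vm_fault_list_request(control,
				  (vm_object_offset_t) 0,
				  (upl_size_t) (16 * PAGE_SIZE),
				  &upl,
				  NULL,		/* no page list needed here */
				  0,
				  UPL_SET_INTERNAL | UPL_SET_LITE) != KERN_SUCCESS)
		return;

	/* ... the object's pages for that range are now described by 'upl' ... */

	/* release the pages and the UPL when done */
	upl_commit_range(upl, 0, upl->size, 0, NULL, 0, &empty);
	upl_deallocate(upl);
}
#endif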
3789
3790
3791
3792 /*
3793 * Routine: vm_object_super_upl_request
3794 * Purpose:
3795 * Cause the population of a portion of a vm_object
3796 * in much the same way as memory_object_upl_request.
3797 * Depending on the nature of the request, the pages
3798 * returned may contain valid data or be uninitialized.
3799 * However, the region may be expanded up to the super
3800 * cluster size provided.
3801 */
3802
3803 __private_extern__ kern_return_t
3804 vm_object_super_upl_request(
3805 vm_object_t object,
3806 vm_object_offset_t offset,
3807 upl_size_t size,
3808 upl_size_t super_cluster,
3809 upl_t *upl,
3810 upl_page_info_t *user_page_list,
3811 unsigned int *page_list_count,
3812 int cntrl_flags)
3813 {
3814 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
3815 return KERN_FAILURE;
3816
3817 assert(object->paging_in_progress);
3818 offset = offset - object->paging_offset;
3819
3820 if (super_cluster > size) {
3821
3822 vm_object_offset_t base_offset;
3823 upl_size_t super_size;
3824 vm_object_size_t super_size_64;
3825
3826 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3827 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3828 super_size_64 = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3829 super_size = (upl_size_t) super_size_64;
3830 assert(super_size == super_size_64);
3831
3832 if (offset > (base_offset + super_size)) {
3833 panic("vm_object_super_upl_request: Missed target pageout"
3834 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3835 offset, base_offset, super_size, super_cluster,
3836 size, object->paging_offset);
3837 }
3838 /*
3839 * apparently there is a case where the VM requests a
3840 * page to be written out whose offset is beyond the
3841 * object size
3842 */
3843 if ((offset + size) > (base_offset + super_size)) {
3844 super_size_64 = (offset + size) - base_offset;
3845 super_size = (upl_size_t) super_size_64;
3846 assert(super_size == super_size_64);
3847 }
3848
3849 offset = base_offset;
3850 size = super_size;
3851 }
3852 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3853 }
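/*
 * Worked example (illustrative, assuming a 64KB super_cluster and a
 * sufficiently large object): a request for offset 0x13000, size 0x2000
 * is aligned down to base_offset 0x10000; since 0x15000 does not extend
 * past 0x20000, super_size stays 0x10000 and the request becomes
 * offset 0x10000, size 0x10000.  A request for offset 0x1F000, size
 * 0x2000 straddles the cluster boundary, so super_size doubles to
 * 0x20000 before being clipped against the object size.
 */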
3854
3855
3856 kern_return_t
3857 vm_map_create_upl(
3858 vm_map_t map,
3859 vm_map_address_t offset,
3860 upl_size_t *upl_size,
3861 upl_t *upl,
3862 upl_page_info_array_t page_list,
3863 unsigned int *count,
3864 int *flags)
3865 {
3866 vm_map_entry_t entry;
3867 int caller_flags;
3868 int force_data_sync;
3869 int sync_cow_data;
3870 vm_object_t local_object;
3871 vm_map_offset_t local_offset;
3872 vm_map_offset_t local_start;
3873 kern_return_t ret;
3874
3875 caller_flags = *flags;
3876
3877 if (caller_flags & ~UPL_VALID_FLAGS) {
3878 /*
3879 * For forward compatibility's sake,
3880 * reject any unknown flag.
3881 */
3882 return KERN_INVALID_VALUE;
3883 }
3884 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3885 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3886
3887 if (upl == NULL)
3888 return KERN_INVALID_ARGUMENT;
3889
3890 REDISCOVER_ENTRY:
3891 vm_map_lock_read(map);
3892
3893 if (vm_map_lookup_entry(map, offset, &entry)) {
3894
3895 if ((entry->vme_end - offset) < *upl_size) {
3896 *upl_size = (upl_size_t) (entry->vme_end - offset);
3897 assert(*upl_size == entry->vme_end - offset);
3898 }
3899
3900 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3901 *flags = 0;
3902
3903 if ( !entry->is_sub_map && entry->object.vm_object != VM_OBJECT_NULL) {
3904 if (entry->object.vm_object->private)
3905 *flags = UPL_DEV_MEMORY;
3906
3907 if (entry->object.vm_object->phys_contiguous)
3908 *flags |= UPL_PHYS_CONTIG;
3909 }
3910 vm_map_unlock_read(map);
3911
3912 return KERN_SUCCESS;
3913 }
3914 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3915 if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
3916 *upl_size = MAX_UPL_SIZE * PAGE_SIZE;
3917 }
3918 /*
3919 * Create an object if necessary.
3920 */
3921 if (entry->object.vm_object == VM_OBJECT_NULL) {
3922
3923 if (vm_map_lock_read_to_write(map))
3924 goto REDISCOVER_ENTRY;
3925
3926 entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3927 entry->offset = 0;
3928
3929 vm_map_lock_write_to_read(map);
3930 }
3931 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3932 if (!(entry->protection & VM_PROT_WRITE)) {
3933 vm_map_unlock_read(map);
3934 return KERN_PROTECTION_FAILURE;
3935 }
3936 if (entry->needs_copy) {
3937 /*
3938 * Honor copy-on-write for COPY_SYMMETRIC
3939 * strategy.
3940 */
3941 vm_map_t local_map;
3942 vm_object_t object;
3943 vm_object_offset_t new_offset;
3944 vm_prot_t prot;
3945 boolean_t wired;
3946 vm_map_version_t version;
3947 vm_map_t real_map;
3948
3949 local_map = map;
3950
3951 if (vm_map_lookup_locked(&local_map,
3952 offset, VM_PROT_WRITE,
3953 OBJECT_LOCK_EXCLUSIVE,
3954 &version, &object,
3955 &new_offset, &prot, &wired,
3956 NULL,
3957 &real_map) != KERN_SUCCESS) {
3958 vm_map_unlock_read(local_map);
3959 return KERN_FAILURE;
3960 }
3961 if (real_map != map)
3962 vm_map_unlock(real_map);
3963 vm_map_unlock_read(local_map);
3964
3965 vm_object_unlock(object);
3966
3967 goto REDISCOVER_ENTRY;
3968 }
3969 }
3970 if (entry->is_sub_map) {
3971 vm_map_t submap;
3972
3973 submap = entry->object.sub_map;
3974 local_start = entry->vme_start;
3975 local_offset = entry->offset;
3976
3977 vm_map_reference(submap);
3978 vm_map_unlock_read(map);
3979
3980 ret = vm_map_create_upl(submap,
3981 local_offset + (offset - local_start),
3982 upl_size, upl, page_list, count, flags);
3983 vm_map_deallocate(submap);
3984
3985 return ret;
3986 }
3987 if (sync_cow_data) {
3988 if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3989 local_object = entry->object.vm_object;
3990 local_start = entry->vme_start;
3991 local_offset = entry->offset;
3992
3993 vm_object_reference(local_object);
3994 vm_map_unlock_read(map);
3995
3996 if (local_object->shadow && local_object->copy) {
3997 vm_object_lock_request(
3998 local_object->shadow,
3999 (vm_object_offset_t)
4000 ((offset - local_start) +
4001 local_offset) +
4002 local_object->shadow_offset,
4003 *upl_size, FALSE,
4004 MEMORY_OBJECT_DATA_SYNC,
4005 VM_PROT_NO_CHANGE);
4006 }
4007 sync_cow_data = FALSE;
4008 vm_object_deallocate(local_object);
4009
4010 goto REDISCOVER_ENTRY;
4011 }
4012 }
4013 if (force_data_sync) {
4014 local_object = entry->object.vm_object;
4015 local_start = entry->vme_start;
4016 local_offset = entry->offset;
4017
4018 vm_object_reference(local_object);
4019 vm_map_unlock_read(map);
4020
4021 vm_object_lock_request(
4022 local_object,
4023 (vm_object_offset_t)
4024 ((offset - local_start) + local_offset),
4025 (vm_object_size_t)*upl_size, FALSE,
4026 MEMORY_OBJECT_DATA_SYNC,
4027 VM_PROT_NO_CHANGE);
4028
4029 force_data_sync = FALSE;
4030 vm_object_deallocate(local_object);
4031
4032 goto REDISCOVER_ENTRY;
4033 }
4034 if (entry->object.vm_object->private)
4035 *flags = UPL_DEV_MEMORY;
4036 else
4037 *flags = 0;
4038
4039 if (entry->object.vm_object->phys_contiguous)
4040 *flags |= UPL_PHYS_CONTIG;
4041
4042 local_object = entry->object.vm_object;
4043 local_offset = entry->offset;
4044 local_start = entry->vme_start;
4045
4046 vm_object_reference(local_object);
4047 vm_map_unlock_read(map);
4048
4049 ret = vm_object_iopl_request(local_object,
4050 (vm_object_offset_t) ((offset - local_start) + local_offset),
4051 *upl_size,
4052 upl,
4053 page_list,
4054 count,
4055 caller_flags);
4056 vm_object_deallocate(local_object);
4057
4058 return(ret);
4059 }
4060 vm_map_unlock_read(map);
4061
4062 return(KERN_FAILURE);
4063 }
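/*
 * Illustrative sketch (not from the original source): a minimal caller
 * of vm_map_create_upl() that builds a UPL over part of a map for a
 * read-style (COPYOUT_FROM) transfer and then commits it.  The flag
 * choice and the omission of error handling are assumptions made for
 * illustration only.
 */
#if 0
static void
map_upl_example(vm_map_t map, vm_map_address_t addr)
{
	upl_t		upl;
	upl_size_t	upl_size = 16 * PAGE_SIZE;
	unsigned int	count = 0;
	int		flags = UPL_COPYOUT_FROM;
	boolean_t	empty;

	if (vm_map_create_upl(map, addr, &upl_size, &upl,
			      NULL, &count, &flags) != KERN_SUCCESS)
		return;

	/* on return, *flags may carry UPL_DEV_MEMORY / UPL_PHYS_CONTIG hints */

	/* ... hand the wired pages to a driver or pager ... */

	upl_commit_range(upl, 0, upl_size, 0, NULL, 0, &empty);
	upl_deallocate(upl);
}
#endif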
4064
4065 /*
4066 * Internal routine to enter a UPL into a VM map.
4067 *
4068 * JMM - This should just be doable through the standard
4069 * vm_map_enter() API.
4070 */
4071 kern_return_t
4072 vm_map_enter_upl(
4073 vm_map_t map,
4074 upl_t upl,
4075 vm_map_offset_t *dst_addr)
4076 {
4077 vm_map_size_t size;
4078 vm_object_offset_t offset;
4079 vm_map_offset_t addr;
4080 vm_page_t m;
4081 kern_return_t kr;
4082 int isVectorUPL = 0, curr_upl=0;
4083 upl_t vector_upl = NULL;
4084 vm_offset_t vector_upl_dst_addr = 0;
4085 vm_map_t vector_upl_submap = NULL;
4086 upl_offset_t subupl_offset = 0;
4087 upl_size_t subupl_size = 0;
4088
4089 if (upl == UPL_NULL)
4090 return KERN_INVALID_ARGUMENT;
4091
4092 if((isVectorUPL = vector_upl_is_valid(upl))) {
4093 int mapped=0,valid_upls=0;
4094 vector_upl = upl;
4095
4096 upl_lock(vector_upl);
4097 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4098 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4099 if(upl == NULL)
4100 continue;
4101 valid_upls++;
4102 if (UPL_PAGE_LIST_MAPPED & upl->flags)
4103 mapped++;
4104 }
4105
4106 if(mapped) {
4107 if(mapped != valid_upls)
4108 panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
4109 else {
4110 upl_unlock(vector_upl);
4111 return KERN_FAILURE;
4112 }
4113 }
4114
4115 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
4116 if( kr != KERN_SUCCESS )
4117 panic("Vector UPL submap allocation failed\n");
4118 map = vector_upl_submap;
4119 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
4120 curr_upl=0;
4121 }
4122 else
4123 upl_lock(upl);
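/*
 * For a vector UPL, each sub-UPL below is entered at a fixed offset
 * (its recorded I/O-state offset) inside the submap allocated above,
 * so the whole vector ends up contiguous at vector_upl_dst_addr.
 */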
4124
4125 process_upl_to_enter:
4126 if(isVectorUPL){
4127 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4128 *dst_addr = vector_upl_dst_addr;
4129 upl_unlock(vector_upl);
4130 return KERN_SUCCESS;
4131 }
4132 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4133 if(upl == NULL)
4134 goto process_upl_to_enter;
4135 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
4136 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
4137 }
4138
4139 /*
4140 * check to see if already mapped
4141 */
4142 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
4143 upl_unlock(upl);
4144 return KERN_FAILURE;
4145 }
4146
4147 if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
4148 (upl->map_object->phys_contiguous))) {
4149 vm_object_t object;
4150 vm_page_t alias_page;
4151 vm_object_offset_t new_offset;
4152 unsigned int pg_num;
4153 wpl_array_t lite_list;
4154
4155 if (upl->flags & UPL_INTERNAL) {
4156 lite_list = (wpl_array_t)
4157 ((((uintptr_t)upl) + sizeof(struct upl))
4158 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4159 } else {
4160 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
4161 }
4162 object = upl->map_object;
4163 upl->map_object = vm_object_allocate(upl->size);
4164
4165 vm_object_lock(upl->map_object);
4166
4167 upl->map_object->shadow = object;
4168 upl->map_object->pageout = TRUE;
4169 upl->map_object->can_persist = FALSE;
4170 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
4171 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
4172 upl->map_object->wimg_bits = object->wimg_bits;
4173 offset = upl->map_object->shadow_offset;
4174 new_offset = 0;
4175 size = upl->size;
4176
4177 upl->flags |= UPL_SHADOWED;
4178
4179 while (size) {
4180 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
4181 assert(pg_num == new_offset / PAGE_SIZE);
4182
4183 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4184
4185 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4186
4187 vm_object_lock(object);
4188
4189 m = vm_page_lookup(object, offset);
4190 if (m == VM_PAGE_NULL) {
4191 panic("vm_upl_map: page missing\n");
4192 }
4193
4194 /*
4195 * Convert the fictitious page to a private
4196 * shadow of the real page.
4197 */
4198 assert(alias_page->fictitious);
4199 alias_page->fictitious = FALSE;
4200 alias_page->private = TRUE;
4201 alias_page->pageout = TRUE;
4202 /*
4203 * since m is a page in the upl it must
4204 * already be wired or BUSY, so it's
4205 * safe to assign the underlying physical
4206 * page to the alias
4207 */
4208 alias_page->phys_page = m->phys_page;
4209
4210 vm_object_unlock(object);
4211
4212 vm_page_lockspin_queues();
4213 vm_page_wire(alias_page);
4214 vm_page_unlock_queues();
4215
4216 /*
4217 * ENCRYPTED SWAP:
4218 * The virtual page ("m") has to be wired in some way
4219 * here or its physical page ("m->phys_page") could
4220 * be recycled at any time.
4221 * Assuming this is enforced by the caller, we can't
4222 * get an encrypted page here. Since the encryption
4223 * key depends on the VM page's "pager" object and
4224 * the "paging_offset", we couldn't handle 2 pageable
4225 * VM pages (with different pagers and paging_offsets)
4226 * sharing the same physical page: we could end up
4227 * encrypting with one key (via one VM page) and
4228 * decrypting with another key (via the alias VM page).
4229 */
4230 ASSERT_PAGE_DECRYPTED(m);
4231
4232 vm_page_insert(alias_page, upl->map_object, new_offset);
4233
4234 assert(!alias_page->wanted);
4235 alias_page->busy = FALSE;
4236 alias_page->absent = FALSE;
4237 }
4238 size -= PAGE_SIZE;
4239 offset += PAGE_SIZE_64;
4240 new_offset += PAGE_SIZE_64;
4241 }
4242 vm_object_unlock(upl->map_object);
4243 }
4244 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
4245 offset = upl->offset - upl->map_object->paging_offset;
4246 else
4247 offset = 0;
4248 size = upl->size;
4249
4250 vm_object_reference(upl->map_object);
4251
4252 if(!isVectorUPL) {
4253 *dst_addr = 0;
4254 /*
4255 * NEED A UPL_MAP ALIAS
4256 */
4257 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4258 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
4259 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4260 }
4261 else {
4262 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4263 VM_FLAGS_FIXED, upl->map_object, offset, FALSE,
4264 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4265 if(kr)
4266 panic("vm_map_enter failed for a Vector UPL\n");
4267 }
4268
4269 if (kr != KERN_SUCCESS) {
4270 upl_unlock(upl);
4271 return(kr);
4272 }
4273 vm_object_lock(upl->map_object);
4274
4275 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
4276 m = vm_page_lookup(upl->map_object, offset);
4277
4278 if (m) {
4279 unsigned int cache_attr;
4280 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
4281
4282 m->pmapped = TRUE;
4283
4284 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
4285 * but only in kernel space. If this was on a user map,
4286 * we'd have to set the wpmapped bit. */
4287 /* m->wpmapped = TRUE; */
4288 assert(map==kernel_map);
4289
4290 PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
4291 }
4292 offset += PAGE_SIZE_64;
4293 }
4294 vm_object_unlock(upl->map_object);
4295
4296 /*
4297 * hold a reference for the mapping
4298 */
4299 upl->ref_count++;
4300 upl->flags |= UPL_PAGE_LIST_MAPPED;
4301 upl->kaddr = (vm_offset_t) *dst_addr;
4302 assert(upl->kaddr == *dst_addr);
4303
4304 if(!isVectorUPL)
4305 upl_unlock(upl);
4306 else
4307 goto process_upl_to_enter;
4308
4309 return KERN_SUCCESS;
4310 }
4311
4312 /*
4313 * Internal routine to remove a UPL mapping from a VM map.
4314 *
4315 * XXX - This should just be doable through a standard
4316 * vm_map_remove() operation. Otherwise, implicit clean-up
4317 * of the target map won't be able to correctly remove
4318 * these (and release the reference on the UPL). Having
4319 * to do this means we can't map these into user-space
4320 * maps yet.
4321 */
4322 kern_return_t
4323 vm_map_remove_upl(
4324 vm_map_t map,
4325 upl_t upl)
4326 {
4327 vm_address_t addr;
4328 upl_size_t size;
4329 int isVectorUPL = 0, curr_upl = 0;
4330 upl_t vector_upl = NULL;
4331
4332 if (upl == UPL_NULL)
4333 return KERN_INVALID_ARGUMENT;
4334
4335 if((isVectorUPL = vector_upl_is_valid(upl))) {
4336 int unmapped=0, valid_upls=0;
4337 vector_upl = upl;
4338 upl_lock(vector_upl);
4339 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4340 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4341 if(upl == NULL)
4342 continue;
4343 valid_upls++;
4344 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
4345 unmapped++;
4346 }
4347
4348 if(unmapped) {
4349 if(unmapped != valid_upls)
4350 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
4351 else {
4352 upl_unlock(vector_upl);
4353 return KERN_FAILURE;
4354 }
4355 }
4356 curr_upl=0;
4357 }
4358 else
4359 upl_lock(upl);
4360
4361 process_upl_to_remove:
4362 if(isVectorUPL) {
4363 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4364 vm_map_t v_upl_submap;
4365 vm_offset_t v_upl_submap_dst_addr;
4366 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
4367
4368 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
4369 vm_map_deallocate(v_upl_submap);
4370 upl_unlock(vector_upl);
4371 return KERN_SUCCESS;
4372 }
4373
4374 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4375 if(upl == NULL)
4376 goto process_upl_to_remove;
4377 }
4378
4379 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
4380 addr = upl->kaddr;
4381 size = upl->size;
4382
4383 assert(upl->ref_count > 1);
4384 upl->ref_count--; /* removing mapping ref */
4385
4386 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
4387 upl->kaddr = (vm_offset_t) 0;
4388
4389 if(!isVectorUPL) {
4390 upl_unlock(upl);
4391
4392 vm_map_remove(map,
4393 vm_map_trunc_page(addr),
4394 vm_map_round_page(addr + size),
4395 VM_MAP_NO_FLAGS);
4396
4397 return KERN_SUCCESS;
4398 }
4399 else {
4400 /*
4401 * If it's a Vectored UPL, we'll be removing the entire
4402 * submap anyway, so no need to remove individual UPL
4403 * element mappings from within the submap
4404 */
4405 goto process_upl_to_remove;
4406 }
4407 }
4408 upl_unlock(upl);
4409
4410 return KERN_FAILURE;
4411 }
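/*
 * Illustrative sketch (not from the original source): mapping a UPL's
 * pages into the kernel map for CPU access and tearing the mapping
 * down again.  The UPL is assumed to already hold its pages; error
 * handling is omitted.
 */
#if 0
static void
upl_map_example(upl_t upl)
{
	vm_map_offset_t	kaddr;

	if (vm_map_enter_upl(kernel_map, upl, &kaddr) != KERN_SUCCESS)
		return;

	/* ... access the pages through 'kaddr' ... */

	(void) vm_map_remove_upl(kernel_map, upl);
}
#endif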
4412
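/*
 * dw_do_work() applies a batch of deferred per-page operations with the
 * page queues lock held, taking the queues lock before the object lock
 * to match pageout_scan's lock ordering.  Callers accumulate up to
 * DELAYED_WORK_LIMIT (dw_m, dw_mask) pairs per batch, marking each
 * queued page busy whenever dropping the object lock here must not let
 * the page change underneath them, and flush any remainder after their
 * scan loop.
 */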
4413 static void
4414 dw_do_work(
4415 vm_object_t object,
4416 struct dw *dwp,
4417 int dw_count)
4418 {
4419 int j;
4420 boolean_t held_as_spin = TRUE;
4421
4422 /*
4423 * pageout_scan takes the vm_page_lock_queues first
4424 * then tries for the object lock... to avoid what
4425 * is effectively a lock inversion, we'll go to the
4426 * trouble of taking them in that same order... otherwise
4427 * if this object contains the majority of the pages resident
4428 * in the UBC (or a small set of large objects actively being
4429 * worked on contain the majority of the pages), we could
4430 * cause the pageout_scan thread to 'starve' in its attempt
4431 * to find pages to move to the free queue, since it has to
4432 * successfully acquire the object lock of any candidate page
4433 * before it can steal/clean it.
4434 */
4435 if (!vm_page_trylockspin_queues()) {
4436 vm_object_unlock(object);
4437
4438 vm_page_lockspin_queues();
4439
4440 for (j = 0; ; j++) {
4441 if (!vm_object_lock_avoid(object) &&
4442 _vm_object_lock_try(object))
4443 break;
4444 vm_page_unlock_queues();
4445 mutex_pause(j);
4446 vm_page_lockspin_queues();
4447 }
4448 }
4449 for (j = 0; j < dw_count; j++, dwp++) {
4450
4451 if (dwp->dw_mask & DW_vm_pageout_throttle_up)
4452 vm_pageout_throttle_up(dwp->dw_m);
4453
4454 if (dwp->dw_mask & DW_vm_page_wire)
4455 vm_page_wire(dwp->dw_m);
4456 else if (dwp->dw_mask & DW_vm_page_unwire)
4457 vm_page_unwire(dwp->dw_m);
4458
4459 if (dwp->dw_mask & DW_vm_page_free) {
4460 if (held_as_spin == TRUE) {
4461 vm_page_lockconvert_queues();
4462 held_as_spin = FALSE;
4463 }
4464 vm_page_free(dwp->dw_m);
4465 } else {
4466 if (dwp->dw_mask & DW_vm_page_deactivate_internal)
4467 vm_page_deactivate_internal(dwp->dw_m, FALSE);
4468 else if (dwp->dw_mask & DW_vm_page_activate)
4469 vm_page_activate(dwp->dw_m);
4470 else if (dwp->dw_mask & DW_vm_page_speculate)
4471 vm_page_speculate(dwp->dw_m, TRUE);
4472 else if (dwp->dw_mask & DW_vm_page_lru)
4473 vm_page_lru(dwp->dw_m);
4474
4475 if (dwp->dw_mask & DW_set_reference)
4476 dwp->dw_m->reference = TRUE;
4477 else if (dwp->dw_mask & DW_clear_reference)
4478 dwp->dw_m->reference = FALSE;
4479
4480 if (dwp->dw_mask & DW_clear_busy)
4481 dwp->dw_m->busy = FALSE;
4482
4483 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4484 PAGE_WAKEUP(dwp->dw_m);
4485 }
4486 }
4487 vm_page_unlock_queues();
4488 }
4489
4490
4491
4492 kern_return_t
4493 upl_commit_range(
4494 upl_t upl,
4495 upl_offset_t offset,
4496 upl_size_t size,
4497 int flags,
4498 upl_page_info_t *page_list,
4499 mach_msg_type_number_t count,
4500 boolean_t *empty)
4501 {
4502 upl_size_t xfer_size, subupl_size = size;
4503 vm_object_t shadow_object;
4504 vm_object_t object;
4505 vm_object_offset_t target_offset;
4506 upl_offset_t subupl_offset = offset;
4507 int entry;
4508 wpl_array_t lite_list;
4509 int occupied;
4510 int clear_refmod = 0;
4511 int pgpgout_count = 0;
4512 struct dw dw_array[DELAYED_WORK_LIMIT];
4513 struct dw *dwp;
4514 int dw_count, isVectorUPL = 0;
4515 upl_t vector_upl = NULL;
4516
4517 *empty = FALSE;
4518
4519 if (upl == UPL_NULL)
4520 return KERN_INVALID_ARGUMENT;
4521
4522 if (count == 0)
4523 page_list = NULL;
4524
4525 if((isVectorUPL = vector_upl_is_valid(upl))) {
4526 vector_upl = upl;
4527 upl_lock(vector_upl);
4528 }
4529 else
4530 upl_lock(upl);
4531
4532 process_upl_to_commit:
4533
4534 if(isVectorUPL) {
4535 size = subupl_size;
4536 offset = subupl_offset;
4537 if(size == 0) {
4538 upl_unlock(vector_upl);
4539 return KERN_SUCCESS;
4540 }
4541 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
4542 if(upl == NULL) {
4543 upl_unlock(vector_upl);
4544 return KERN_FAILURE;
4545 }
4546 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
4547 subupl_size -= size;
4548 subupl_offset += size;
4549 }
4550
4551 #if UPL_DEBUG
4552 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
4553 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4554
4555 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
4556 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
4557
4558 upl->upl_commit_index++;
4559 }
4560 #endif
4561 if (upl->flags & UPL_DEVICE_MEMORY)
4562 xfer_size = 0;
4563 else if ((offset + size) <= upl->size)
4564 xfer_size = size;
4565 else {
4566 if(!isVectorUPL)
4567 upl_unlock(upl);
4568 else {
4569 upl_unlock(vector_upl);
4570 }
4571 return KERN_FAILURE;
4572 }
4573 if (upl->flags & UPL_CLEAR_DIRTY)
4574 flags |= UPL_COMMIT_CLEAR_DIRTY;
4575
4576 if (upl->flags & UPL_INTERNAL)
4577 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
4578 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4579 else
4580 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4581
4582 object = upl->map_object;
4583
4584 if (upl->flags & UPL_SHADOWED) {
4585 vm_object_lock(object);
4586 shadow_object = object->shadow;
4587 } else {
4588 shadow_object = object;
4589 }
4590 entry = offset/PAGE_SIZE;
4591 target_offset = (vm_object_offset_t)offset;
4592
4593 if (upl->flags & UPL_KERNEL_OBJECT)
4594 vm_object_lock_shared(shadow_object);
4595 else
4596 vm_object_lock(shadow_object);
4597
4598 if (upl->flags & UPL_ACCESS_BLOCKED) {
4599 assert(shadow_object->blocked_access);
4600 shadow_object->blocked_access = FALSE;
4601 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4602 }
4603
4604 if (shadow_object->code_signed) {
4605 /*
4606 * CODE SIGNING:
4607 * If the object is code-signed, do not let this UPL tell
4608 * us if the pages are valid or not. Let the pages be
4609 * validated by VM the normal way (when they get mapped or
4610 * copied).
4611 */
4612 flags &= ~UPL_COMMIT_CS_VALIDATED;
4613 }
4614 if (! page_list) {
4615 /*
4616 * No page list to get the code-signing info from !?
4617 */
4618 flags &= ~UPL_COMMIT_CS_VALIDATED;
4619 }
4620
4621 dwp = &dw_array[0];
4622 dw_count = 0;
4623
4624 while (xfer_size) {
4625 vm_page_t t, m;
4626
4627 dwp->dw_mask = 0;
4628 clear_refmod = 0;
4629
4630 m = VM_PAGE_NULL;
4631
4632 if (upl->flags & UPL_LITE) {
4633 unsigned int pg_num;
4634
4635 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
4636 assert(pg_num == target_offset/PAGE_SIZE);
4637
4638 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4639 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4640
4641 if (!(upl->flags & UPL_KERNEL_OBJECT))
4642 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4643 }
4644 }
4645 if (upl->flags & UPL_SHADOWED) {
4646 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4647
4648 t->pageout = FALSE;
4649
4650 VM_PAGE_FREE(t);
4651
4652 if (m == VM_PAGE_NULL)
4653 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4654 }
4655 }
4656 if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL)
4657 goto commit_next_page;
4658
4659 if (flags & UPL_COMMIT_CS_VALIDATED) {
4660 /*
4661 * CODE SIGNING:
4662 * Set the code signing bits according to
4663 * what the UPL says they should be.
4664 */
4665 m->cs_validated = page_list[entry].cs_validated;
4666 m->cs_tainted = page_list[entry].cs_tainted;
4667 }
4668 if (upl->flags & UPL_IO_WIRE) {
4669
4670 dwp->dw_mask |= DW_vm_page_unwire;
4671
4672 if (page_list)
4673 page_list[entry].phys_addr = 0;
4674
4675 if (flags & UPL_COMMIT_SET_DIRTY)
4676 m->dirty = TRUE;
4677 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4678 m->dirty = FALSE;
4679
4680 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4681 m->cs_validated && !m->cs_tainted) {
4682 /*
4683 * CODE SIGNING:
4684 * This page is no longer dirty
4685 * but could have been modified,
4686 * so it will need to be
4687 * re-validated.
4688 */
4689 m->cs_validated = FALSE;
4690 #if DEVELOPMENT || DEBUG
4691 vm_cs_validated_resets++;
4692 #endif
4693 pmap_disconnect(m->phys_page);
4694 }
4695 clear_refmod |= VM_MEM_MODIFIED;
4696 }
4697 if (flags & UPL_COMMIT_INACTIVATE) {
4698 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4699 clear_refmod |= VM_MEM_REFERENCED;
4700 }
4701 if (upl->flags & UPL_ACCESS_BLOCKED) {
4702 /*
4703 * We blocked access to the pages in this UPL.
4704 * Clear the "busy" bit and wake up any waiter
4705 * for this page.
4706 */
4707 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4708 }
4709 goto commit_next_page;
4710 }
4711 /*
4712 * make sure to clear the hardware
4713 * modify or reference bits before
4714 * releasing the BUSY bit on this page
4715 * otherwise we risk losing a legitimate
4716 * change of state
4717 */
4718 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4719 m->dirty = FALSE;
4720
4721 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4722 m->cs_validated && !m->cs_tainted) {
4723 /*
4724 * CODE SIGNING:
4725 * This page is no longer dirty
4726 * but could have been modified,
4727 * so it will need to be
4728 * re-validated.
4729 */
4730 m->cs_validated = FALSE;
4731 #if DEVELOPMENT || DEBUG
4732 vm_cs_validated_resets++;
4733 #endif
4734 pmap_disconnect(m->phys_page);
4735 }
4736 clear_refmod |= VM_MEM_MODIFIED;
4737 }
4738 if (page_list) {
4739 upl_page_info_t *p;
4740
4741 p = &(page_list[entry]);
4742
4743 if (p->phys_addr && p->pageout && !m->pageout) {
4744 m->busy = TRUE;
4745 m->pageout = TRUE;
4746
4747 dwp->dw_mask |= DW_vm_page_wire;
4748
4749 } else if (p->phys_addr &&
4750 !p->pageout && m->pageout &&
4751 !m->dump_cleaning) {
4752 m->pageout = FALSE;
4753 m->absent = FALSE;
4754 m->overwriting = FALSE;
4755
4756 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4757 }
4758 page_list[entry].phys_addr = 0;
4759 }
4760 m->dump_cleaning = FALSE;
4761
4762 if (m->laundry)
4763 dwp->dw_mask |= DW_vm_pageout_throttle_up;
4764
4765 if (m->pageout) {
4766 m->cleaning = FALSE;
4767 m->encrypted_cleaning = FALSE;
4768 m->pageout = FALSE;
4769 #if MACH_CLUSTER_STATS
4770 if (m->wanted) vm_pageout_target_collisions++;
4771 #endif
4772 m->dirty = FALSE;
4773
4774 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4775 m->cs_validated && !m->cs_tainted) {
4776 /*
4777 * CODE SIGNING:
4778 * This page is no longer dirty
4779 * but could have been modified,
4780 * so it will need to be
4781 * re-validated.
4782 */
4783 m->cs_validated = FALSE;
4784 #if DEVELOPMENT || DEBUG
4785 vm_cs_validated_resets++;
4786 #endif
4787 pmap_disconnect(m->phys_page);
4788 }
4789
4790 if ((flags & UPL_COMMIT_SET_DIRTY) ||
4791 (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)))
4792 m->dirty = TRUE;
4793
4794 if (m->dirty) {
4795 /*
4796 * page was re-dirtied after we started
4797 * the pageout... reactivate it since
4798 * we don't know whether the on-disk
4799 * copy matches what is now in memory
4800 */
4801 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4802
4803 if (upl->flags & UPL_PAGEOUT) {
4804 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4805 VM_STAT_INCR(reactivations);
4806 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4807 }
4808 } else {
4809 /*
4810 * page has been successfully cleaned
4811 * go ahead and free it for other use
4812 */
4813
4814 if (m->object->internal) {
4815 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4816 } else {
4817 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4818 }
4819 dwp->dw_mask |= DW_vm_page_free;
4820
4821 if (upl->flags & UPL_PAGEOUT) {
4822 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4823
4824 if (page_list[entry].dirty) {
4825 VM_STAT_INCR(pageouts);
4826 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4827 pgpgout_count++;
4828 }
4829 }
4830 }
4831 goto commit_next_page;
4832 }
4833 #if MACH_CLUSTER_STATS
4834 if (m->wpmapped)
4835 m->dirty = pmap_is_modified(m->phys_page);
4836
4837 if (m->dirty) vm_pageout_cluster_dirtied++;
4838 else vm_pageout_cluster_cleaned++;
4839 if (m->wanted) vm_pageout_cluster_collisions++;
4840 #endif
4841 m->dirty = FALSE;
4842
4843 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4844 m->cs_validated && !m->cs_tainted) {
4845 /*
4846 * CODE SIGNING:
4847 * This page is no longer dirty
4848 * but could have been modified,
4849 * so it will need to be
4850 * re-validated.
4851 */
4852 m->cs_validated = FALSE;
4853 #if DEVELOPMENT || DEBUG
4854 vm_cs_validated_resets++;
4855 #endif
4856 pmap_disconnect(m->phys_page);
4857 }
4858
4859 if ((m->busy) && (m->cleaning)) {
4860 /*
4861 * the request_page_list case
4862 */
4863 m->absent = FALSE;
4864 m->overwriting = FALSE;
4865
4866 dwp->dw_mask |= DW_clear_busy;
4867
4868 } else if (m->overwriting) {
4869 /*
4870 * alternate request page list, write to
4871 * page_list case. Occurs when the original
4872 * page was wired at the time of the list
4873 * request
4874 */
4875 assert(VM_PAGE_WIRED(m));
4876 m->overwriting = FALSE;
4877
4878 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
4879 }
4880 m->cleaning = FALSE;
4881 m->encrypted_cleaning = FALSE;
4882
4883 /*
4884 * It is part of the semantics of COPYOUT_FROM
4885 * UPLs that a commit implies a cache sync
4886 * between the vm page and the backing store;
4887 * this can be used to strip the precious bit
4888 * as well as clean
4889 */
4890 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
4891 m->precious = FALSE;
4892
4893 if (flags & UPL_COMMIT_SET_DIRTY)
4894 m->dirty = TRUE;
4895
4896 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4897 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4898 clear_refmod |= VM_MEM_REFERENCED;
4899
4900 } else if (!m->active && !m->inactive && !m->speculative) {
4901
4902 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
4903 dwp->dw_mask |= DW_vm_page_speculate;
4904 else if (m->reference)
4905 dwp->dw_mask |= DW_vm_page_activate;
4906 else {
4907 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4908 clear_refmod |= VM_MEM_REFERENCED;
4909 }
4910 }
4911 if (upl->flags & UPL_ACCESS_BLOCKED) {
4912 /*
4913 * We blocked access to the pages in this UPL.
4914 * Clear the "busy" bit on this page before we
4915 * wake up any waiter.
4916 */
4917 dwp->dw_mask |= DW_clear_busy;
4918 }
4919 /*
4920 * Wakeup any thread waiting for the page to be un-cleaning.
4921 */
4922 dwp->dw_mask |= DW_PAGE_WAKEUP;
4923
4924 commit_next_page:
4925 if (clear_refmod)
4926 pmap_clear_refmod(m->phys_page, clear_refmod);
4927
4928 target_offset += PAGE_SIZE_64;
4929 xfer_size -= PAGE_SIZE;
4930 entry++;
4931
4932 if (dwp->dw_mask) {
4933 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
4934 if (m->busy == FALSE) {
4935 /*
4936 * dw_do_work may need to drop the object lock
4937 * if it does, we need the pages it's looking at to
4938 * be held stable via the busy bit.
4939 */
4940 m->busy = TRUE;
4941 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4942 }
4943 dwp->dw_m = m;
4944 dwp++;
4945 dw_count++;
4946
4947 if (dw_count >= DELAYED_WORK_LIMIT) {
4948 dw_do_work(shadow_object, &dw_array[0], dw_count);
4949
4950 dwp = &dw_array[0];
4951 dw_count = 0;
4952 }
4953 } else {
4954 if (dwp->dw_mask & DW_clear_busy)
4955 m->busy = FALSE;
4956
4957 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4958 PAGE_WAKEUP(m);
4959 }
4960 }
4961 }
4962 if (dw_count)
4963 dw_do_work(shadow_object, &dw_array[0], dw_count);
4964
4965 occupied = 1;
4966
4967 if (upl->flags & UPL_DEVICE_MEMORY) {
4968 occupied = 0;
4969 } else if (upl->flags & UPL_LITE) {
4970 int pg_num;
4971 int i;
4972
4973 pg_num = upl->size/PAGE_SIZE;
4974 pg_num = (pg_num + 31) >> 5;
4975 occupied = 0;
4976
4977 for (i = 0; i < pg_num; i++) {
4978 if (lite_list[i] != 0) {
4979 occupied = 1;
4980 break;
4981 }
4982 }
4983 } else {
4984 if (queue_empty(&upl->map_object->memq))
4985 occupied = 0;
4986 }
4987 if (occupied == 0) {
4988 /*
4989 * If this UPL element belongs to a Vector UPL and is
4990 * empty, then this is the right function to deallocate
4991 * it. So go ahead set the *empty variable. The flag
4992 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
4993 * should be considered relevant for the Vector UPL and not
4994 * the internal UPLs.
4995 */
4996 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
4997 *empty = TRUE;
4998
4999 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5000 /*
5001 * this is not a paging object
5002 * so we need to drop the paging reference
5003 * that was taken when we created the UPL
5004 * against this object
5005 */
5006 vm_object_activity_end(shadow_object);
5007 } else {
5008 /*
5009 * we donated the paging reference to
5010 * the map object... vm_pageout_object_terminate
5011 * will drop this reference
5012 */
5013 }
5014 }
5015 vm_object_unlock(shadow_object);
5016 if (object != shadow_object)
5017 vm_object_unlock(object);
5018
5019 if(!isVectorUPL)
5020 upl_unlock(upl);
5021 else {
5022 /*
5023 * If we completed our operations on an UPL that is
5024 * part of a Vectored UPL and if empty is TRUE, then
5025 * we should go ahead and deallocate this UPL element.
5026 * Then we check if this was the last of the UPL elements
5027 * within that Vectored UPL. If so, set empty to TRUE
5028 * so that in ubc_upl_commit_range or ubc_upl_commit, we
5029 * can go ahead and deallocate the Vector UPL too.
5030 */
5031 if(*empty==TRUE) {
5032 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
5033 upl_deallocate(upl);
5034 }
5035 goto process_upl_to_commit;
5036 }
5037
5038 if (pgpgout_count) {
5039 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
5040 }
5041
5042 return KERN_SUCCESS;
5043 }
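/*
 * Illustrative sketch (not from the original source): committing a UPL
 * in fixed-size chunks and stopping once it reports itself empty.  The
 * chunk size is an arbitrary choice for the example; 'pl' and 'count'
 * describe the whole UPL, since upl_commit_range() indexes the page
 * list by the page's offset within the UPL.
 */
#if 0
static void
upl_commit_in_chunks(upl_t upl, upl_page_info_t *pl, mach_msg_type_number_t count)
{
	upl_size_t	chunk = 32 * PAGE_SIZE;		/* arbitrary */
	upl_offset_t	off = 0;
	boolean_t	empty = FALSE;

	while (off < upl->size && !empty) {
		upl_size_t len = (upl->size - off) < chunk ? (upl->size - off) : chunk;

		(void) upl_commit_range(upl, off, len, 0, pl, count, &empty);
		off += len;
	}
	/* once 'empty' is TRUE the caller would typically upl_deallocate(upl) */
}
#endif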
5044
5045 kern_return_t
5046 upl_abort_range(
5047 upl_t upl,
5048 upl_offset_t offset,
5049 upl_size_t size,
5050 int error,
5051 boolean_t *empty)
5052 {
5053 upl_size_t xfer_size, subupl_size = size;
5054 vm_object_t shadow_object;
5055 vm_object_t object;
5056 vm_object_offset_t target_offset;
5057 upl_offset_t subupl_offset = offset;
5058 int entry;
5059 wpl_array_t lite_list;
5060 int occupied;
5061 struct dw dw_array[DELAYED_WORK_LIMIT];
5062 struct dw *dwp;
5063 int dw_count, isVectorUPL = 0;
5064 upl_t vector_upl = NULL;
5065
5066 *empty = FALSE;
5067
5068 if (upl == UPL_NULL)
5069 return KERN_INVALID_ARGUMENT;
5070
5071 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
5072 return upl_commit_range(upl, offset, size, 0, NULL, 0, empty);
5073
5074 if((isVectorUPL = vector_upl_is_valid(upl))) {
5075 vector_upl = upl;
5076 upl_lock(vector_upl);
5077 }
5078 else
5079 upl_lock(upl);
5080
5081 process_upl_to_abort:
5082 if(isVectorUPL) {
5083 size = subupl_size;
5084 offset = subupl_offset;
5085 if(size == 0) {
5086 upl_unlock(vector_upl);
5087 return KERN_SUCCESS;
5088 }
5089 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
5090 if(upl == NULL) {
5091 upl_unlock(vector_upl);
5092 return KERN_FAILURE;
5093 }
5094 subupl_size -= size;
5095 subupl_offset += size;
5096 }
5097
5098 *empty = FALSE;
5099
5100 #if UPL_DEBUG
5101 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
5102 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5103
5104 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
5105 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
5106 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
5107
5108 upl->upl_commit_index++;
5109 }
5110 #endif
5111 if (upl->flags & UPL_DEVICE_MEMORY)
5112 xfer_size = 0;
5113 else if ((offset + size) <= upl->size)
5114 xfer_size = size;
5115 else {
5116 if(!isVectorUPL)
5117 upl_unlock(upl);
5118 else {
5119 upl_unlock(vector_upl);
5120 }
5121
5122 return KERN_FAILURE;
5123 }
5124 if (upl->flags & UPL_INTERNAL) {
5125 lite_list = (wpl_array_t)
5126 ((((uintptr_t)upl) + sizeof(struct upl))
5127 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5128 } else {
5129 lite_list = (wpl_array_t)
5130 (((uintptr_t)upl) + sizeof(struct upl));
5131 }
5132 object = upl->map_object;
5133
5134 if (upl->flags & UPL_SHADOWED) {
5135 vm_object_lock(object);
5136 shadow_object = object->shadow;
5137 } else
5138 shadow_object = object;
5139
5140 entry = offset/PAGE_SIZE;
5141 target_offset = (vm_object_offset_t)offset;
5142
5143 if (upl->flags & UPL_KERNEL_OBJECT)
5144 vm_object_lock_shared(shadow_object);
5145 else
5146 vm_object_lock(shadow_object);
5147
5148 if (upl->flags & UPL_ACCESS_BLOCKED) {
5149 assert(shadow_object->blocked_access);
5150 shadow_object->blocked_access = FALSE;
5151 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
5152 }
5153
5154 dwp = &dw_array[0];
5155 dw_count = 0;
5156
5157 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
5158 panic("upl_abort_range: kernel_object being DUMPED");
5159
5160 while (xfer_size) {
5161 vm_page_t t, m;
5162
5163 dwp->dw_mask = 0;
5164
5165 m = VM_PAGE_NULL;
5166
5167 if (upl->flags & UPL_LITE) {
5168 unsigned int pg_num;
5169
5170 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5171 assert(pg_num == target_offset/PAGE_SIZE);
5172
5173
5174 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5175 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
5176
5177 if ( !(upl->flags & UPL_KERNEL_OBJECT))
5178 m = vm_page_lookup(shadow_object, target_offset +
5179 (upl->offset - shadow_object->paging_offset));
5180 }
5181 }
5182 if (upl->flags & UPL_SHADOWED) {
5183 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
5184 t->pageout = FALSE;
5185
5186 VM_PAGE_FREE(t);
5187
5188 if (m == VM_PAGE_NULL)
5189 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
5190 }
5191 }
5192 if ((upl->flags & UPL_KERNEL_OBJECT))
5193 goto abort_next_page;
5194
5195 if (m != VM_PAGE_NULL) {
5196
5197 if (m->absent) {
5198 boolean_t must_free = TRUE;
5199
5200 m->clustered = FALSE;
5201 /*
5202 * COPYOUT = FALSE case
5203 * check for error conditions which must
5204 * be passed back to the page's customer
5205 */
5206 if (error & UPL_ABORT_RESTART) {
5207 m->restart = TRUE;
5208 m->absent = FALSE;
5209 m->unusual = TRUE;
5210 must_free = FALSE;
5211 } else if (error & UPL_ABORT_UNAVAILABLE) {
5212 m->restart = FALSE;
5213 m->unusual = TRUE;
5214 must_free = FALSE;
5215 } else if (error & UPL_ABORT_ERROR) {
5216 m->restart = FALSE;
5217 m->absent = FALSE;
5218 m->error = TRUE;
5219 m->unusual = TRUE;
5220 must_free = FALSE;
5221 }
5222
5223 /*
5224 * ENCRYPTED SWAP:
5225 * If the page was already encrypted,
5226 * we don't really need to decrypt it
5227 * now. It will get decrypted later,
5228 * on demand, as soon as someone needs
5229 * to access its contents.
5230 */
5231
5232 m->cleaning = FALSE;
5233 m->encrypted_cleaning = FALSE;
5234 m->overwriting = FALSE;
5235
5236 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5237
5238 if (must_free == TRUE)
5239 dwp->dw_mask |= DW_vm_page_free;
5240 else
5241 dwp->dw_mask |= DW_vm_page_activate;
5242 } else {
5243 /*
5244 * Handle the trusted pager throttle.
5245 */
5246 if (m->laundry)
5247 dwp->dw_mask |= DW_vm_pageout_throttle_up;
5248
5249 if (m->pageout) {
5250 assert(m->busy);
5251 assert(m->wire_count == 1);
5252 m->pageout = FALSE;
5253
5254 dwp->dw_mask |= DW_vm_page_unwire;
5255 }
5256 m->dump_cleaning = FALSE;
5257 m->cleaning = FALSE;
5258 m->encrypted_cleaning = FALSE;
5259 m->overwriting = FALSE;
5260 #if MACH_PAGEMAP
5261 vm_external_state_clr(m->object->existence_map, m->offset);
5262 #endif /* MACH_PAGEMAP */
5263 if (error & UPL_ABORT_DUMP_PAGES) {
5264 pmap_disconnect(m->phys_page);
5265
5266 dwp->dw_mask |= DW_vm_page_free;
5267 } else {
5268 if (error & UPL_ABORT_REFERENCE) {
5269 /*
5270 * we've been told to explicitly
5271 * reference this page... for
5272 * file I/O, this is done by
5273 * implementing an LRU on the inactive q
5274 */
5275 dwp->dw_mask |= DW_vm_page_lru;
5276 }
5277 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5278 }
5279 }
5280 }
5281 abort_next_page:
5282 target_offset += PAGE_SIZE_64;
5283 xfer_size -= PAGE_SIZE;
5284 entry++;
5285
5286 if (dwp->dw_mask) {
5287 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
5288 if (m->busy == FALSE) {
5289 /*
5290 * dw_do_work may need to drop the object lock
5291 * if it does, we need the pages it's looking at to
5292 * be held stable via the busy bit.
5293 */
5294 m->busy = TRUE;
5295 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5296 }
5297 dwp->dw_m = m;
5298 dwp++;
5299 dw_count++;
5300
5301 if (dw_count >= DELAYED_WORK_LIMIT) {
5302 dw_do_work(shadow_object, &dw_array[0], dw_count);
5303
5304 dwp = &dw_array[0];
5305 dw_count = 0;
5306 }
5307 } else {
5308 if (dwp->dw_mask & DW_clear_busy)
5309 m->busy = FALSE;
5310
5311 if (dwp->dw_mask & DW_PAGE_WAKEUP)
5312 PAGE_WAKEUP(m);
5313 }
5314 }
5315 }
5316 if (dw_count)
5317 dw_do_work(shadow_object, &dw_array[0], dw_count);
5318
5319 occupied = 1;
5320
5321 if (upl->flags & UPL_DEVICE_MEMORY) {
5322 occupied = 0;
5323 } else if (upl->flags & UPL_LITE) {
5324 int pg_num;
5325 int i;
5326
5327 pg_num = upl->size/PAGE_SIZE;
5328 pg_num = (pg_num + 31) >> 5;
5329 occupied = 0;
5330
5331 for (i = 0; i < pg_num; i++) {
5332 if (lite_list[i] != 0) {
5333 occupied = 1;
5334 break;
5335 }
5336 }
5337 } else {
5338 if (queue_empty(&upl->map_object->memq))
5339 occupied = 0;
5340 }
5341 if (occupied == 0) {
5342 /*
5343 * If this UPL element belongs to a Vector UPL and is
5344 * empty, then this is the right function to deallocate
5345 * it. So go ahead and set the *empty variable. The flag
5346 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
5347 * should be considered relevant for the Vector UPL and
5348 * not the internal UPLs.
5349 */
5350 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5351 *empty = TRUE;
5352
5353 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5354 /*
5355 * this is not a paging object
5356 * so we need to drop the paging reference
5357 * that was taken when we created the UPL
5358 * against this object
5359 */
5360 vm_object_activity_end(shadow_object);
5361 } else {
5362 /*
5363 * we donated the paging reference to
5364 * the map object... vm_pageout_object_terminate
5365 * will drop this reference
5366 */
5367 }
5368 }
5369 vm_object_unlock(shadow_object);
5370 if (object != shadow_object)
5371 vm_object_unlock(object);
5372
5373 if(!isVectorUPL)
5374 upl_unlock(upl);
5375 else {
5376 /*
5377 * If we completed our operations on an UPL that is
5378 * part of a Vectored UPL and if empty is TRUE, then
5379 * we should go ahead and deallocate this UPL element.
5380 * Then we check if this was the last of the UPL elements
5381 * within that Vectored UPL. If so, set empty to TRUE
5382 * so that in ubc_upl_abort_range or ubc_upl_abort, we
5383 * can go ahead and deallocate the Vector UPL too.
5384 */
5385 if(*empty == TRUE) {
5386 *empty = vector_upl_set_subupl(vector_upl, upl,0);
5387 upl_deallocate(upl);
5388 }
5389 goto process_upl_to_abort;
5390 }
5391
5392 return KERN_SUCCESS;
5393 }
5394
5395
5396 kern_return_t
5397 upl_abort(
5398 upl_t upl,
5399 int error)
5400 {
5401 boolean_t empty;
5402
5403 return upl_abort_range(upl, 0, upl->size, error, &empty);
5404 }
5405
5406
5407 /* an option on commit should be wire */
5408 kern_return_t
5409 upl_commit(
5410 upl_t upl,
5411 upl_page_info_t *page_list,
5412 mach_msg_type_number_t count)
5413 {
5414 boolean_t empty;
5415
5416 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
5417 }
5418
5419
5420 unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
5421
5422 kern_return_t
5423 vm_object_iopl_request(
5424 vm_object_t object,
5425 vm_object_offset_t offset,
5426 upl_size_t size,
5427 upl_t *upl_ptr,
5428 upl_page_info_array_t user_page_list,
5429 unsigned int *page_list_count,
5430 int cntrl_flags)
5431 {
5432 vm_page_t dst_page;
5433 vm_object_offset_t dst_offset;
5434 upl_size_t xfer_size;
5435 upl_t upl = NULL;
5436 unsigned int entry;
5437 wpl_array_t lite_list = NULL;
5438 int no_zero_fill = FALSE;
5439 u_int32_t psize;
5440 kern_return_t ret;
5441 vm_prot_t prot;
5442 struct vm_object_fault_info fault_info;
5443 struct dw dw_array[DELAYED_WORK_LIMIT];
5444 struct dw *dwp;
5445 int dw_count;
5446 int dw_index;
5447
5448 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5449 /*
5450 * For forward compatibility's sake,
5451 * reject any unknown flag.
5452 */
5453 return KERN_INVALID_VALUE;
5454 }
5455 if (vm_lopage_poolsize == 0)
5456 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
5457
5458 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
5459 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
5460 return KERN_INVALID_VALUE;
5461
5462 if (object->phys_contiguous) {
5463 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
5464 return KERN_INVALID_ADDRESS;
5465
5466 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
5467 return KERN_INVALID_ADDRESS;
5468 }
5469 }
5470
5471 if (cntrl_flags & UPL_ENCRYPT) {
5472 /*
5473 * ENCRYPTED SWAP:
5474 * The paging path doesn't use this interface,
5475 * so we don't support the UPL_ENCRYPT flag
5476 * here. We won't encrypt the pages.
5477 */
5478 assert(! (cntrl_flags & UPL_ENCRYPT));
5479 }
5480 if (cntrl_flags & UPL_NOZEROFILL)
5481 no_zero_fill = TRUE;
5482
5483 if (cntrl_flags & UPL_COPYOUT_FROM)
5484 prot = VM_PROT_READ;
5485 else
5486 prot = VM_PROT_READ | VM_PROT_WRITE;
5487
5488 if (((size/PAGE_SIZE) > MAX_UPL_SIZE) && !object->phys_contiguous)
5489 size = MAX_UPL_SIZE * PAGE_SIZE;
5490
5491 if (cntrl_flags & UPL_SET_INTERNAL) {
5492 if (page_list_count != NULL)
5493 *page_list_count = MAX_UPL_SIZE;
5494 }
5495 if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
5496 ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
5497 return KERN_INVALID_ARGUMENT;
5498
5499 if ((!object->internal) && (object->paging_offset != 0))
5500 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
5501
5502
5503 if (object->phys_contiguous)
5504 psize = PAGE_SIZE;
5505 else
5506 psize = size;
5507
5508 if (cntrl_flags & UPL_SET_INTERNAL) {
5509 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5510
5511 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5512 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
5513 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
5514 if (size == 0) {
5515 user_page_list = NULL;
5516 lite_list = NULL;
5517 }
5518 } else {
5519 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5520
5521 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5522 if (size == 0) {
5523 lite_list = NULL;
5524 }
5525 }
5526 if (user_page_list)
5527 user_page_list[0].device = FALSE;
5528 *upl_ptr = upl;
5529
5530 upl->map_object = object;
5531 upl->size = size;
5532
5533 if (object == kernel_object &&
5534 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
5535 upl->flags |= UPL_KERNEL_OBJECT;
5536 #if UPL_DEBUG
5537 vm_object_lock(object);
5538 #else
5539 vm_object_lock_shared(object);
5540 #endif
5541 } else {
5542 vm_object_lock(object);
5543 vm_object_activity_begin(object);
5544 }
5545 /*
5546 * paging in progress also protects the paging_offset
5547 */
5548 upl->offset = offset + object->paging_offset;
5549
5550 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5551 /*
5552 * The user requested that access to the pages in this UPL
5553 * be blocked until the UPL is committed or aborted.
5554 */
5555 upl->flags |= UPL_ACCESS_BLOCKED;
5556 }
5557
5558 if (object->phys_contiguous) {
5559 #if UPL_DEBUG
5560 queue_enter(&object->uplq, upl, upl_t, uplq);
5561 #endif /* UPL_DEBUG */
5562
5563 if (upl->flags & UPL_ACCESS_BLOCKED) {
5564 assert(!object->blocked_access);
5565 object->blocked_access = TRUE;
5566 }
5567
5568 vm_object_unlock(object);
5569
5570 /*
5571 * don't need any shadow mappings for this one
5572 * since it is already I/O memory
5573 */
5574 upl->flags |= UPL_DEVICE_MEMORY;
5575
5576 upl->highest_page = (ppnum_t) ((offset + object->shadow_offset + size - 1)>>PAGE_SHIFT);
5577
5578 if (user_page_list) {
5579 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->shadow_offset)>>PAGE_SHIFT);
5580 user_page_list[0].device = TRUE;
5581 }
5582 if (page_list_count != NULL) {
5583 if (upl->flags & UPL_INTERNAL)
5584 *page_list_count = 0;
5585 else
5586 *page_list_count = 1;
5587 }
5588 return KERN_SUCCESS;
5589 }
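/*
 * Not physically contiguous: the remainder of the routine wires the
 * range page by page, faulting each page in (and substituting a low
 * page when UPL_NEED_32BIT_ADDR is set) as it goes.
 */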
5590 if (object != kernel_object) {
5591 /*
5592 * Protect user space from future COW operations
5593 */
5594 object->true_share = TRUE;
5595
5596 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
5597 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
5598 }
5599
5600 #if UPL_DEBUG
5601 queue_enter(&object->uplq, upl, upl_t, uplq);
5602 #endif /* UPL_DEBUG */
5603
5604 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
5605 object->copy != VM_OBJECT_NULL) {
5606 /*
5607 * Honor copy-on-write obligations
5608 *
5609 * The caller is gathering these pages and
5610 * might modify their contents. We need to
5611 * make sure that the copy object has its own
5612 * private copies of these pages before we let
5613 * the caller modify them.
5614 *
5615 * NOTE: someone else could map the original object
5616 * after we've done this copy-on-write here, and they
5617 * could then see an inconsistent picture of the memory
5618 * while it's being modified via the UPL. To prevent this,
5619 * we would have to block access to these pages until the
5620 * UPL is released. We could use the UPL_BLOCK_ACCESS
5621 * code path for that...
5622 */
5623 vm_object_update(object,
5624 offset,
5625 size,
5626 NULL,
5627 NULL,
5628 FALSE, /* should_return */
5629 MEMORY_OBJECT_COPY_SYNC,
5630 VM_PROT_NO_CHANGE);
5631 #if DEVELOPMENT || DEBUG
5632 iopl_cow++;
5633 iopl_cow_pages += size >> PAGE_SHIFT;
5634 #endif
5635 }
5636
5637
5638 entry = 0;
5639
5640 xfer_size = size;
5641 dst_offset = offset;
5642
5643 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
5644 fault_info.user_tag = 0;
5645 fault_info.lo_offset = offset;
5646 fault_info.hi_offset = offset + xfer_size;
5647 fault_info.no_cache = FALSE;
5648 fault_info.stealth = FALSE;
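/*
 * fault_info advertises a sequential access pattern and, via
 * lo_offset/hi_offset, is meant to keep any clustering done by
 * vm_fault_page() within the requested range; cluster_size and
 * interruptible are filled in per page inside the loop below.
 */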
5649
5650 dwp = &dw_array[0];
5651 dw_count = 0;
5652
5653 while (xfer_size) {
5654 vm_fault_return_t result;
5655 unsigned int pg_num;
5656
5657 dwp->dw_mask = 0;
5658
5659 dst_page = vm_page_lookup(object, dst_offset);
5660
5661 /*
5662 * ENCRYPTED SWAP:
5663 * If the page is encrypted, we need to decrypt it,
5664 * so force a soft page fault.
5665 */
5666 if (dst_page == VM_PAGE_NULL ||
5667 dst_page->busy ||
5668 dst_page->encrypted ||
5669 dst_page->error ||
5670 dst_page->restart ||
5671 dst_page->absent ||
5672 dst_page->fictitious) {
5673
5674 if (object == kernel_object)
5675 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
5676
5677 do {
5678 vm_page_t top_page;
5679 kern_return_t error_code;
5680 int interruptible;
5681
5682 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
5683 interruptible = THREAD_ABORTSAFE;
5684 else
5685 interruptible = THREAD_UNINT;
5686
5687 fault_info.interruptible = interruptible;
5688 fault_info.cluster_size = xfer_size;
5689
5690 vm_object_paging_begin(object);
5691
5692 result = vm_fault_page(object, dst_offset,
5693 prot | VM_PROT_WRITE, FALSE,
5694 &prot, &dst_page, &top_page,
5695 (int *)0,
5696 &error_code, no_zero_fill,
5697 FALSE, &fault_info);
5698
5699 switch (result) {
5700
5701 case VM_FAULT_SUCCESS:
5702
5703 PAGE_WAKEUP_DONE(dst_page);
5704 /*
5705 * Release paging references and
5706 * top-level placeholder page, if any.
5707 */
5708 if (top_page != VM_PAGE_NULL) {
5709 vm_object_t local_object;
5710
5711 local_object = top_page->object;
5712
5713 if (top_page->object != dst_page->object) {
5714 vm_object_lock(local_object);
5715 VM_PAGE_FREE(top_page);
5716 vm_object_paging_end(local_object);
5717 vm_object_unlock(local_object);
5718 } else {
5719 VM_PAGE_FREE(top_page);
5720 vm_object_paging_end(local_object);
5721 }
5722 }
5723 vm_object_paging_end(object);
5724 break;
5725
5726 case VM_FAULT_RETRY:
5727 vm_object_lock(object);
5728 break;
5729
5730 case VM_FAULT_FICTITIOUS_SHORTAGE:
5731 vm_page_more_fictitious();
5732
5733 vm_object_lock(object);
5734 break;
5735
5736 case VM_FAULT_MEMORY_SHORTAGE:
5737 if (vm_page_wait(interruptible)) {
5738 vm_object_lock(object);
5739 break;
5740 }
5741 /* fall thru */
5742
5743 case VM_FAULT_INTERRUPTED:
5744 error_code = MACH_SEND_INTERRUPTED;
5745 case VM_FAULT_MEMORY_ERROR:
5746 memory_error:
5747 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
5748
5749 vm_object_lock(object);
5750 goto return_err;
5751
5752 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5753 /* success but no page: fail */
5754 vm_object_paging_end(object);
5755 vm_object_unlock(object);
5756 goto memory_error;
5757
5758 default:
5759 panic("vm_object_iopl_request: unexpected error"
5760 " 0x%x from vm_fault_page()\n", result);
5761 }
5762 } while (result != VM_FAULT_SUCCESS);
5763
5764 }
5765
5766 if (upl->flags & UPL_KERNEL_OBJECT)
5767 goto record_phys_addr;
5768
5769 if (dst_page->cleaning) {
5770 /*
5771 * Someone else is cleaning this page in place.
5772 * In theory, we should be able to proceed and use this
5773 * page, but they will probably clear its "busy" bit in
5774 * upl_commit_range() even though they did not set it,
5775 * which would clear our "busy" bit and open us up to
5776 * race conditions.
5777 * We'd better wait for the cleaning to complete and
5778 * then try again.
5779 */
5780 vm_object_iopl_request_sleep_for_cleaning++;
5781 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5782 continue;
5783 }
5784 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5785 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5786 vm_page_t low_page;
5787 int refmod;
5788
5789 /*
5790 * Support devices that can't DMA above 32 bits by
5791 * substituting pages from a pool of low-address memory
5792 * for any pages we find above the 4GB mark.
5793 * We can't substitute if the page is already wired,
5794 * because we don't know whether that physical address
5795 * has been handed out to some other 64-bit-capable
5796 * DMA device to use.
5797 if (VM_PAGE_WIRED(dst_page)) {
5798 ret = KERN_PROTECTION_FAILURE;
5799 goto return_err;
5800 }
5801 low_page = vm_page_grablo();
5802
5803 if (low_page == VM_PAGE_NULL) {
5804 ret = KERN_RESOURCE_SHORTAGE;
5805 goto return_err;
5806 }
5807 /*
5808 * from here until the vm_page_replace completes
5809 * we mustn't drop the object lock... we don't
5810 * want anyone refaulting this page in and using
5811 * it after we disconnect it... we want the fault
5812 * to find the new page being substituted.
5813 */
5814 if (dst_page->pmapped)
5815 refmod = pmap_disconnect(dst_page->phys_page);
5816 else
5817 refmod = 0;
5818 vm_page_copy(dst_page, low_page);
5819
5820 low_page->reference = dst_page->reference;
5821 low_page->dirty = dst_page->dirty;
5822
5823 if (refmod & VM_MEM_REFERENCED)
5824 low_page->reference = TRUE;
5825 if (refmod & VM_MEM_MODIFIED)
5826 low_page->dirty = TRUE;
5827
5828 vm_page_replace(low_page, object, dst_offset);
5829
5830 dst_page = low_page;
5831 /*
5832 * vm_page_grablo returned the page marked
5833 * BUSY... we don't need a PAGE_WAKEUP_DONE
5834 * here, because we've never dropped the object lock
5835 */
5836 dst_page->busy = FALSE;
5837 }
5838 dwp->dw_mask |= DW_vm_page_wire;
5839
5840 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5841 /*
5842 * Mark the page "busy" to block any future page fault
5843 * on this page. We'll also remove the mapping
5844 * of all these pages before leaving this routine.
5845 */
5846 assert(!dst_page->fictitious);
5847 dst_page->busy = TRUE;
5848 }
5849 /*
5850 * expect the page to be used
5851 * page queues lock must be held to set 'reference'
5852 */
5853 dwp->dw_mask |= DW_set_reference;
5854
5855 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5856 dst_page->dirty = TRUE;
5857 record_phys_addr:
5858 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5859 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
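/*
 * lite_list is a bitmap with one bit per page in the UPL:
 * pg_num >> 5 selects the 32-bit word and pg_num & 31 the
 * bit within that word.
 */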
5860 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5861
5862 if (dst_page->phys_page > upl->highest_page)
5863 upl->highest_page = dst_page->phys_page;
5864
5865 if (user_page_list) {
5866 user_page_list[entry].phys_addr = dst_page->phys_page;
5867 user_page_list[entry].pageout = dst_page->pageout;
5868 user_page_list[entry].absent = dst_page->absent;
5869 user_page_list[entry].dirty = dst_page->dirty;
5870 user_page_list[entry].precious = dst_page->precious;
5871 user_page_list[entry].device = FALSE;
5872 if (dst_page->clustered == TRUE)
5873 user_page_list[entry].speculative = dst_page->speculative;
5874 else
5875 user_page_list[entry].speculative = FALSE;
5876 user_page_list[entry].cs_validated = dst_page->cs_validated;
5877 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
5878 }
5879 if (object != kernel_object) {
5880 /*
5881 * someone is explicitly grabbing this page...
5882 * update clustered and speculative state
5883 *
5884 */
5885 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5886 }
5887 entry++;
5888 dst_offset += PAGE_SIZE_64;
5889 xfer_size -= PAGE_SIZE;
5890
5891 if (dwp->dw_mask) {
5892 if (dst_page->busy == FALSE) {
5893 /*
5894 * dw_do_work may need to drop the object lock
5895 * if it does, we need the pages it's looking at to
5896 * be held stable via the busy bit.
5897 */
5898 dst_page->busy = TRUE;
5899 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5900 }
5901 dwp->dw_m = dst_page;
5902 dwp++;
5903 dw_count++;
5904
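/*
 * Once the batch reaches DELAYED_WORK_LIMIT entries, apply
 * the queued page operations (wire, set reference, clear
 * busy/wakeup) in one pass and start a new batch.
 */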
5905 if (dw_count >= DELAYED_WORK_LIMIT) {
5906 dw_do_work(object, &dw_array[0], dw_count);
5907
5908 dwp = &dw_array[0];
5909 dw_count = 0;
5910 }
5911 }
5912 }
5913 if (dw_count)
5914 dw_do_work(object, &dw_array[0], dw_count);
5915
5916 if (page_list_count != NULL) {
5917 if (upl->flags & UPL_INTERNAL)
5918 *page_list_count = 0;
5919 else if (*page_list_count > entry)
5920 *page_list_count = entry;
5921 }
5922 vm_object_unlock(object);
5923
5924 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5925 /*
5926 * We've marked all the pages "busy" so that future
5927 * page faults will block.
5928 * Now remove the mapping for these pages, so that they
5929 * can't be accessed without causing a page fault.
5930 */
5931 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5932 PMAP_NULL, 0, VM_PROT_NONE);
5933 assert(!object->blocked_access);
5934 object->blocked_access = TRUE;
5935 }
5936 return KERN_SUCCESS;
5937
5938 return_err:
5939 dw_index = 0;
5940
5941 for (; offset < dst_offset; offset += PAGE_SIZE) {
5942 dst_page = vm_page_lookup(object, offset);
5943
5944 if (dst_page == VM_PAGE_NULL)
5945 panic("vm_object_iopl_request: Wired pages missing. \n");
5946
5947 if (dw_count) {
5948 if (dw_array[dw_index].dw_m == dst_page) {
5949 dw_index++;
5950 dw_count--;
5951 continue;
5952 }
5953 }
5954 vm_page_lockspin_queues();
5955 vm_page_unwire(dst_page);
5956 vm_page_unlock_queues();
5957
5958 VM_STAT_INCR(reactivations);
5959 }
5960 #if UPL_DEBUG
5961 upl->upl_state = 2;
5962 #endif
5963 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
5964 vm_object_activity_end(object);
5965 }
5966 vm_object_unlock(object);
5967 upl_destroy(upl);
5968
5969 return ret;
5970 }
5971
5972 kern_return_t
5973 upl_transpose(
5974 upl_t upl1,
5975 upl_t upl2)
5976 {
5977 kern_return_t retval;
5978 boolean_t upls_locked;
5979 vm_object_t object1, object2;
5980
5981 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
5982 return KERN_INVALID_ARGUMENT;
5983 }
5984
5985 upls_locked = FALSE;
5986
5987 /*
5988 * Since we need to lock both UPLs at the same time,
5989 * avoid deadlocks by always taking locks in the same order.
5990 */
5991 if (upl1 < upl2) {
5992 upl_lock(upl1);
5993 upl_lock(upl2);
5994 } else {
5995 upl_lock(upl2);
5996 upl_lock(upl1);
5997 }
5998 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5999
6000 object1 = upl1->map_object;
6001 object2 = upl2->map_object;
6002
6003 if (upl1->offset != 0 || upl2->offset != 0 ||
6004 upl1->size != upl2->size) {
6005 /*
6006 * We deal only with full objects, not subsets.
6007 * That's because we exchange the entire backing store info
6008 * for the objects: pager, resident pages, etc... We can't do
6009 * only part of it.
6010 */
6011 retval = KERN_INVALID_VALUE;
6012 goto done;
6013 }
6014
6015 /*
6016 * Transpose the VM objects' backing store.
6017 */
6018 retval = vm_object_transpose(object1, object2,
6019 (vm_object_size_t) upl1->size);
6020
6021 if (retval == KERN_SUCCESS) {
6022 /*
6023 * Make each UPL point to the correct VM object, i.e. the
6024 * object holding the pages that the UPL refers to...
6025 */
6026 #if UPL_DEBUG
6027 queue_remove(&object1->uplq, upl1, upl_t, uplq);
6028 queue_remove(&object2->uplq, upl2, upl_t, uplq);
6029 #endif
6030 upl1->map_object = object2;
6031 upl2->map_object = object1;
6032 #if UPL_DEBUG
6033 queue_enter(&object1->uplq, upl2, upl_t, uplq);
6034 queue_enter(&object2->uplq, upl1, upl_t, uplq);
6035 #endif
6036 }
6037
6038 done:
6039 /*
6040 * Cleanup.
6041 */
6042 if (upls_locked) {
6043 upl_unlock(upl1);
6044 upl_unlock(upl2);
6045 upls_locked = FALSE;
6046 }
6047
6048 return retval;
6049 }
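
/*
 * Illustrative usage sketch (assumed caller, not taken from this
 * file): a client holding two UPLs that each cover an entire object
 * of the same size could exchange their backing store like this:
 *
 *	kern_return_t kr;
 *
 *	kr = upl_transpose(upl1, upl2);
 *	if (kr == KERN_SUCCESS) {
 *		// the two objects have exchanged their backing store
 *		// (pager, resident pages, ...) and each UPL has been
 *		// re-pointed at the object that now holds its pages
 *	}
 */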
6050
6051 /*
6052 * ENCRYPTED SWAP:
6053 *
6054 * Rationale: the user might have some encrypted data on disk (via
6055 * FileVault or any other mechanism). That data is then decrypted in
6056 * memory, which is safe as long as the machine is secure. But that
6057 * decrypted data in memory could be paged out to disk by the default
6058 * pager. The data would then be stored on disk in clear (not encrypted)
6059 * and it could be accessed by anyone who gets physical access to the
6060 * disk (if the laptop or the disk gets stolen for example). This weakens
6061 * the security offered by FileVault.
6062 *
6063 * Solution: the default pager will optionally request that all the
6064 * pages it gathers for pageout be encrypted, via the UPL interfaces,
6065 * before it sends this UPL to disk via the vnode_pageout() path.
6066 *
6067 * Notes:
6068 *
6069 * To avoid disrupting the VM LRU algorithms, we want to keep the
6070 * clean-in-place mechanisms, which allow us to send some extra pages to
6071 * swap (clustering) without actually removing them from the user's
6072 * address space. We don't want the user to unknowingly access encrypted
6073 * data, so we have to actually remove the encrypted pages from the page
6074 * table. When the user accesses the data, the hardware will fail to
6075 * locate the virtual page in its page table and will trigger a page
6076 * fault. We can then decrypt the page and enter it in the page table
6077 * again. Whenever we allow the user to access the contents of a page,
6078 * we have to make sure it's not encrypted.
6079 *
6080 *
6081 */
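/*
 * Illustrative sketch (an assumption about the fault path, not code
 * from this file): when a user later touches one of those pages, the
 * resulting page fault is what triggers the decryption before the
 * page is entered in the user's pmap again.  Conceptually:
 *
 *	if (m->encrypted) {
 *		// page is busy and not mapped in any pmap at this point
 *		vm_page_decrypt(m, 0);	// map it in the kernel and
 *					// AES-decrypt it in place
 *	}
 *
 * This file only provides vm_page_encrypt() and vm_page_decrypt();
 * the check itself lives in the VM fault path.
 */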
6082 /*
6083 * ENCRYPTED SWAP:
6084 * Reserve of virtual addresses in the kernel address space.
6085 * We need to map the physical pages in the kernel, so that we
6086 * can call the encryption/decryption routines with a kernel
6087 * virtual address. We keep this pool of pre-allocated kernel
6088 * virtual addresses so that we don't have to scan the kernel's
6089 * virtual address space each time we need to encrypt or decrypt
6090 * a physical page.
6091 * It would be nice to be able to encrypt and decrypt in physical
6092 * mode but that might not always be more efficient...
6093 */
6094 decl_simple_lock_data(,vm_paging_lock)
6095 #define VM_PAGING_NUM_PAGES 64
6096 vm_map_offset_t vm_paging_base_address = 0;
6097 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
6098 int vm_paging_max_index = 0;
6099 int vm_paging_page_waiter = 0;
6100 int vm_paging_page_waiter_total = 0;
6101 unsigned long vm_paging_no_kernel_page = 0;
6102 unsigned long vm_paging_objects_mapped = 0;
6103 unsigned long vm_paging_pages_mapped = 0;
6104 unsigned long vm_paging_objects_mapped_slow = 0;
6105 unsigned long vm_paging_pages_mapped_slow = 0;
6106
6107 void
6108 vm_paging_map_init(void)
6109 {
6110 kern_return_t kr;
6111 vm_map_offset_t page_map_offset;
6112 vm_map_entry_t map_entry;
6113
6114 assert(vm_paging_base_address == 0);
6115
6116 /*
6117 * Initialize our pool of pre-allocated kernel
6118 * virtual addresses.
6119 */
6120 page_map_offset = 0;
6121 kr = vm_map_find_space(kernel_map,
6122 &page_map_offset,
6123 VM_PAGING_NUM_PAGES * PAGE_SIZE,
6124 0,
6125 0,
6126 &map_entry);
6127 if (kr != KERN_SUCCESS) {
6128 panic("vm_paging_map_init: kernel_map full\n");
6129 }
6130 map_entry->object.vm_object = kernel_object;
6131 map_entry->offset = page_map_offset;
6132 vm_object_reference(kernel_object);
6133 vm_map_unlock(kernel_map);
6134
6135 assert(vm_paging_base_address == 0);
6136 vm_paging_base_address = page_map_offset;
6137 }
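
/*
 * A short worked example of the pool arithmetic used below (the 4KB
 * page size is an assumption for illustration): with
 * VM_PAGING_NUM_PAGES = 64 slots, the reserve covers 64 * PAGE_SIZE =
 * 256KB of kernel virtual space.  Slot i maps to the address
 *
 *	page_map_offset = vm_paging_base_address + (i * PAGE_SIZE);
 *
 * and an address handed back to vm_paging_unmap_object() is turned
 * back into a slot index with
 *
 *	i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
 */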
6138
6139 /*
6140 * ENCRYPTED SWAP:
6141 * vm_paging_map_object:
6142 * Maps part of a VM object's pages in the kernel
6143 * virtual address space, using the pre-allocated
6144 * kernel virtual addresses, if possible.
6145 * Context:
6146 * The VM object is locked. This lock will get
6147 * dropped and re-acquired though, so the caller
6148 * must make sure the VM object is kept alive
6149 * (by holding a VM map that has a reference
6150 * on it, for example, or taking an extra reference).
6151 * The page should also be kept busy to prevent
6152 * it from being reclaimed.
6153 */
6154 kern_return_t
6155 vm_paging_map_object(
6156 vm_map_offset_t *address,
6157 vm_page_t page,
6158 vm_object_t object,
6159 vm_object_offset_t offset,
6160 vm_map_size_t *size,
6161 vm_prot_t protection,
6162 boolean_t can_unlock_object)
6163 {
6164 kern_return_t kr;
6165 vm_map_offset_t page_map_offset;
6166 vm_map_size_t map_size;
6167 vm_object_offset_t object_offset;
6168 int i;
6169
6170
6171 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
6172 assert(page->busy);
6173 /*
6174 * Use one of the pre-allocated kernel virtual addresses
6175 * and just enter the VM page in the kernel address space
6176 * at that virtual address.
6177 */
6178 simple_lock(&vm_paging_lock);
6179
6180 /*
6181 * Try and find an available kernel virtual address
6182 * from our pre-allocated pool.
6183 */
6184 page_map_offset = 0;
6185 for (;;) {
6186 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
6187 if (vm_paging_page_inuse[i] == FALSE) {
6188 page_map_offset =
6189 vm_paging_base_address +
6190 (i * PAGE_SIZE);
6191 break;
6192 }
6193 }
6194 if (page_map_offset != 0) {
6195 /* found a space to map our page ! */
6196 break;
6197 }
6198
6199 if (can_unlock_object) {
6200 /*
6201 * If we can afford to unlock the VM object,
6202 * let's take the slow path now...
6203 */
6204 break;
6205 }
6206 /*
6207 * We can't afford to unlock the VM object, so
6208 * let's wait for a space to become available...
6209 */
6210 vm_paging_page_waiter_total++;
6211 vm_paging_page_waiter++;
6212 thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
6213 &vm_paging_lock,
6214 THREAD_UNINT);
6215 vm_paging_page_waiter--;
6216 /* ... and try again */
6217 }
6218
6219 if (page_map_offset != 0) {
6220 /*
6221 * We found a kernel virtual address;
6222 * map the physical page to that virtual address.
6223 */
6224 if (i > vm_paging_max_index) {
6225 vm_paging_max_index = i;
6226 }
6227 vm_paging_page_inuse[i] = TRUE;
6228 simple_unlock(&vm_paging_lock);
6229
6230 if (page->pmapped == FALSE) {
6231 pmap_sync_page_data_phys(page->phys_page);
6232 }
6233 page->pmapped = TRUE;
6234
6235 /*
6236 * Keep the VM object locked over the PMAP_ENTER
6237 * and the actual use of the page by the kernel,
6238 * or this pmap mapping might get undone by a
6239 * vm_object_pmap_protect() call...
6240 */
6241 PMAP_ENTER(kernel_pmap,
6242 page_map_offset,
6243 page,
6244 protection,
6245 ((int) page->object->wimg_bits &
6246 VM_WIMG_MASK),
6247 TRUE);
6248 vm_paging_objects_mapped++;
6249 vm_paging_pages_mapped++;
6250 *address = page_map_offset;
6251
6252 /* all done and mapped, ready to use ! */
6253 return KERN_SUCCESS;
6254 }
6255
6256 /*
6257 * We ran out of pre-allocated kernel virtual
6258 * addresses. Just map the page in the kernel
6259 * the slow and regular way.
6260 */
6261 vm_paging_no_kernel_page++;
6262 simple_unlock(&vm_paging_lock);
6263 }
6264
6265 if (! can_unlock_object) {
6266 return KERN_NOT_SUPPORTED;
6267 }
6268
6269 object_offset = vm_object_trunc_page(offset);
6270 map_size = vm_map_round_page(*size);
6271
6272 /*
6273 * Try and map the required range of the object
6274 * in the kernel_map
6275 */
6276
6277 vm_object_reference_locked(object); /* for the map entry */
6278 vm_object_unlock(object);
6279
6280 kr = vm_map_enter(kernel_map,
6281 address,
6282 map_size,
6283 0,
6284 VM_FLAGS_ANYWHERE,
6285 object,
6286 object_offset,
6287 FALSE,
6288 protection,
6289 VM_PROT_ALL,
6290 VM_INHERIT_NONE);
6291 if (kr != KERN_SUCCESS) {
6292 *address = 0;
6293 *size = 0;
6294 vm_object_deallocate(object); /* for the map entry */
6295 vm_object_lock(object);
6296 return kr;
6297 }
6298
6299 *size = map_size;
6300
6301 /*
6302 * Enter the mapped pages in the page table now.
6303 */
6304 vm_object_lock(object);
6305 /*
6306 * VM object must be kept locked from before PMAP_ENTER()
6307 * until after the kernel is done accessing the page(s).
6308 * Otherwise, the pmap mappings in the kernel could be
6309 * undone by a call to vm_object_pmap_protect().
6310 */
6311
6312 for (page_map_offset = 0;
6313 map_size != 0;
6314 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
6315 unsigned int cache_attr;
6316
6317 page = vm_page_lookup(object, offset + page_map_offset);
6318 if (page == VM_PAGE_NULL) {
6319 printf("vm_paging_map_object: no page !?");
6320 vm_object_unlock(object);
6321 kr = vm_map_remove(kernel_map, *address, *size,
6322 VM_MAP_NO_FLAGS);
6323 assert(kr == KERN_SUCCESS);
6324 *address = 0;
6325 *size = 0;
6326 vm_object_lock(object);
6327 return KERN_MEMORY_ERROR;
6328 }
6329 if (page->pmapped == FALSE) {
6330 pmap_sync_page_data_phys(page->phys_page);
6331 }
6332 page->pmapped = TRUE;
6333 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
6334
6335 //assert(pmap_verify_free(page->phys_page));
6336 PMAP_ENTER(kernel_pmap,
6337 *address + page_map_offset,
6338 page,
6339 protection,
6340 cache_attr,
6341 TRUE);
6342 }
6343
6344 vm_paging_objects_mapped_slow++;
6345 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
6346
6347 return KERN_SUCCESS;
6348 }
6349
6350 /*
6351 * ENCRYPTED SWAP:
6352 * vm_paging_unmap_object:
6353 * Unmaps part of a VM object's pages from the kernel
6354 * virtual address space.
6355 * Context:
6356 * The VM object is locked. This lock will get
6357 * dropped and re-acquired though.
6358 */
6359 void
6360 vm_paging_unmap_object(
6361 vm_object_t object,
6362 vm_map_offset_t start,
6363 vm_map_offset_t end)
6364 {
6365 kern_return_t kr;
6366 int i;
6367
6368 if ((vm_paging_base_address == 0) ||
6369 (start < vm_paging_base_address) ||
6370 (end > (vm_paging_base_address
6371 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
6372 /*
6373 * We didn't use our pre-allocated pool of
6374 * kernel virtual address. Deallocate the
6375 * virtual memory.
6376 */
6377 if (object != VM_OBJECT_NULL) {
6378 vm_object_unlock(object);
6379 }
6380 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
6381 if (object != VM_OBJECT_NULL) {
6382 vm_object_lock(object);
6383 }
6384 assert(kr == KERN_SUCCESS);
6385 } else {
6386 /*
6387 * We used a kernel virtual address from our
6388 * pre-allocated pool. Put it back in the pool
6389 * for next time.
6390 */
6391 assert(end - start == PAGE_SIZE);
6392 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
6393 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
6394
6395 /* undo the pmap mapping */
6396 pmap_remove(kernel_pmap, start, end);
6397
6398 simple_lock(&vm_paging_lock);
6399 vm_paging_page_inuse[i] = FALSE;
6400 if (vm_paging_page_waiter) {
6401 thread_wakeup(&vm_paging_page_waiter);
6402 }
6403 simple_unlock(&vm_paging_lock);
6404 }
6405 }
6406
6407 #if CRYPTO
6408 /*
6409 * Encryption data.
6410 * "iv" is the "initial vector". Ideally, we want to
6411 * have a different one for each page we encrypt, so that
6412 * crackers can't find encryption patterns too easily.
6413 */
6414 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
6415 boolean_t swap_crypt_ctx_initialized = FALSE;
6416 aes_32t swap_crypt_key[8]; /* big enough for a 256-bit key */
6417 aes_ctx swap_crypt_ctx;
6418 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
6419
6420 #if DEBUG
6421 boolean_t swap_crypt_ctx_tested = FALSE;
6422 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
6423 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
6424 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
6425 #endif /* DEBUG */
6426
6427 /*
6428 * Initialize the encryption context: key and key size.
6429 */
6430 void swap_crypt_ctx_initialize(void); /* forward */
6431 void
6432 swap_crypt_ctx_initialize(void)
6433 {
6434 unsigned int i;
6435
6436 /*
6437 * No need for locking to protect swap_crypt_ctx_initialized
6438 * because the first use of encryption will come from the
6439 * pageout thread (we won't pagein before there's been a pageout)
6440 * and there's only one pageout thread.
6441 */
6442 if (swap_crypt_ctx_initialized == FALSE) {
6443 for (i = 0;
6444 i < (sizeof (swap_crypt_key) /
6445 sizeof (swap_crypt_key[0]));
6446 i++) {
6447 swap_crypt_key[i] = random();
6448 }
6449 aes_encrypt_key((const unsigned char *) swap_crypt_key,
6450 SWAP_CRYPT_AES_KEY_SIZE,
6451 &swap_crypt_ctx.encrypt);
6452 aes_decrypt_key((const unsigned char *) swap_crypt_key,
6453 SWAP_CRYPT_AES_KEY_SIZE,
6454 &swap_crypt_ctx.decrypt);
6455 swap_crypt_ctx_initialized = TRUE;
6456 }
6457
6458 #if DEBUG
6459 /*
6460 * Validate the encryption algorithms.
6461 */
6462 if (swap_crypt_ctx_tested == FALSE) {
6463 /* initialize */
6464 for (i = 0; i < 4096; i++) {
6465 swap_crypt_test_page_ref[i] = (char) i;
6466 }
6467 /* encrypt */
6468 aes_encrypt_cbc(swap_crypt_test_page_ref,
6469 swap_crypt_null_iv,
6470 PAGE_SIZE / AES_BLOCK_SIZE,
6471 swap_crypt_test_page_encrypt,
6472 &swap_crypt_ctx.encrypt);
6473 /* decrypt */
6474 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
6475 swap_crypt_null_iv,
6476 PAGE_SIZE / AES_BLOCK_SIZE,
6477 swap_crypt_test_page_decrypt,
6478 &swap_crypt_ctx.decrypt);
6479 /* compare result with original */
6480 for (i = 0; i < 4096; i ++) {
6481 if (swap_crypt_test_page_decrypt[i] !=
6482 swap_crypt_test_page_ref[i]) {
6483 panic("encryption test failed");
6484 }
6485 }
6486
6487 /* encrypt again */
6488 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
6489 swap_crypt_null_iv,
6490 PAGE_SIZE / AES_BLOCK_SIZE,
6491 swap_crypt_test_page_decrypt,
6492 &swap_crypt_ctx.encrypt);
6493 /* decrypt in place */
6494 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
6495 swap_crypt_null_iv,
6496 PAGE_SIZE / AES_BLOCK_SIZE,
6497 swap_crypt_test_page_decrypt,
6498 &swap_crypt_ctx.decrypt);
6499 for (i = 0; i < 4096; i ++) {
6500 if (swap_crypt_test_page_decrypt[i] !=
6501 swap_crypt_test_page_ref[i]) {
6502 panic("in place encryption test failed");
6503 }
6504 }
6505
6506 swap_crypt_ctx_tested = TRUE;
6507 }
6508 #endif /* DEBUG */
6509 }
6510
6511 /*
6512 * ENCRYPTED SWAP:
6513 * vm_page_encrypt:
6514 * Encrypt the given page, for secure paging.
6515 * The page might already be mapped at kernel virtual
6516 * address "kernel_mapping_offset". Otherwise, we need
6517 * to map it.
6518 *
6519 * Context:
6520 * The page's object is locked, but this lock will be released
6521 * and re-acquired.
6522 * The page is busy and not accessible by users (not entered in any pmap).
6523 */
6524 void
6525 vm_page_encrypt(
6526 vm_page_t page,
6527 vm_map_offset_t kernel_mapping_offset)
6528 {
6529 kern_return_t kr;
6530 vm_map_size_t kernel_mapping_size;
6531 vm_offset_t kernel_vaddr;
6532 union {
6533 unsigned char aes_iv[AES_BLOCK_SIZE];
6534 struct {
6535 memory_object_t pager_object;
6536 vm_object_offset_t paging_offset;
6537 } vm;
6538 } encrypt_iv;
6539
6540 if (! vm_pages_encrypted) {
6541 vm_pages_encrypted = TRUE;
6542 }
6543
6544 assert(page->busy);
6545 assert(page->dirty || page->precious);
6546
6547 if (page->encrypted) {
6548 /*
6549 * Already encrypted: no need to do it again.
6550 */
6551 vm_page_encrypt_already_encrypted_counter++;
6552 return;
6553 }
6554 ASSERT_PAGE_DECRYPTED(page);
6555
6556 /*
6557 * Take a paging-in-progress reference to keep the object
6558 * alive even if we have to unlock it (in vm_paging_map_object()
6559 * for example)...
6560 */
6561 vm_object_paging_begin(page->object);
6562
6563 if (kernel_mapping_offset == 0) {
6564 /*
6565 * The page hasn't already been mapped in kernel space
6566 * by the caller. Map it now, so that we can access
6567 * its contents and encrypt them.
6568 */
6569 kernel_mapping_size = PAGE_SIZE;
6570 kr = vm_paging_map_object(&kernel_mapping_offset,
6571 page,
6572 page->object,
6573 page->offset,
6574 &kernel_mapping_size,
6575 VM_PROT_READ | VM_PROT_WRITE,
6576 FALSE);
6577 if (kr != KERN_SUCCESS) {
6578 panic("vm_page_encrypt: "
6579 "could not map page in kernel: 0x%x\n",
6580 kr);
6581 }
6582 } else {
6583 kernel_mapping_size = 0;
6584 }
6585 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6586
6587 if (swap_crypt_ctx_initialized == FALSE) {
6588 swap_crypt_ctx_initialize();
6589 }
6590 assert(swap_crypt_ctx_initialized);
6591
6592 /*
6593 * Prepare an "initial vector" for the encryption.
6594 * We use the "pager" and the "paging_offset" for that
6595 * page to obfuscate the encrypted data a bit more and
6596 * prevent crackers from finding patterns that they could
6597 * use to break the key.
6598 */
6599 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
6600 encrypt_iv.vm.pager_object = page->object->pager;
6601 encrypt_iv.vm.paging_offset =
6602 page->object->paging_offset + page->offset;
6603
6604 /* encrypt the "initial vector" */
6605 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
6606 swap_crypt_null_iv,
6607 1,
6608 &encrypt_iv.aes_iv[0],
6609 &swap_crypt_ctx.encrypt);
6610
6611 /*
6612 * Encrypt the page.
6613 */
6614 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
6615 &encrypt_iv.aes_iv[0],
6616 PAGE_SIZE / AES_BLOCK_SIZE,
6617 (unsigned char *) kernel_vaddr,
6618 &swap_crypt_ctx.encrypt);
6619
6620 vm_page_encrypt_counter++;
6621
6622 /*
6623 * Unmap the page from the kernel's address space,
6624 * if we had to map it ourselves. Otherwise, let
6625 * the caller undo the mapping if needed.
6626 */
6627 if (kernel_mapping_size != 0) {
6628 vm_paging_unmap_object(page->object,
6629 kernel_mapping_offset,
6630 kernel_mapping_offset + kernel_mapping_size);
6631 }
6632
6633 /*
6634 * Clear the "reference" and "modified" bits.
6635 * This should clean up any impact the encryption had
6636 * on them.
6637 * The page was kept busy and disconnected from all pmaps,
6638 * so it can't have been referenced or modified from user
6639 * space.
6640 * The software bits will be reset later after the I/O
6641 * has completed (in upl_commit_range()).
6642 */
6643 pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
6644
6645 page->encrypted = TRUE;
6646
6647 vm_object_paging_end(page->object);
6648 }
6649
6650 /*
6651 * ENCRYPTED SWAP:
6652 * vm_page_decrypt:
6653 * Decrypt the given page.
6654 * The page might already be mapped at kernel virtual
6655 * address "kernel_mapping_offset". Otherwise, we need
6656 * to map it.
6657 *
6658 * Context:
6659 * The page's VM object is locked but will be unlocked and relocked.
6660 * The page is busy and not accessible by users (not entered in any pmap).
6661 */
6662 void
6663 vm_page_decrypt(
6664 vm_page_t page,
6665 vm_map_offset_t kernel_mapping_offset)
6666 {
6667 kern_return_t kr;
6668 vm_map_size_t kernel_mapping_size;
6669 vm_offset_t kernel_vaddr;
6670 union {
6671 unsigned char aes_iv[AES_BLOCK_SIZE];
6672 struct {
6673 memory_object_t pager_object;
6674 vm_object_offset_t paging_offset;
6675 } vm;
6676 } decrypt_iv;
6677
6678 assert(page->busy);
6679 assert(page->encrypted);
6680
6681 /*
6682 * Take a paging-in-progress reference to keep the object
6683 * alive even if we have to unlock it (in vm_paging_map_object()
6684 * for example)...
6685 */
6686 vm_object_paging_begin(page->object);
6687
6688 if (kernel_mapping_offset == 0) {
6689 /*
6690 * The page hasn't already been mapped in kernel space
6691 * by the caller. Map it now, so that we can access
6692 * its contents and decrypt them.
6693 */
6694 kernel_mapping_size = PAGE_SIZE;
6695 kr = vm_paging_map_object(&kernel_mapping_offset,
6696 page,
6697 page->object,
6698 page->offset,
6699 &kernel_mapping_size,
6700 VM_PROT_READ | VM_PROT_WRITE,
6701 FALSE);
6702 if (kr != KERN_SUCCESS) {
6703 panic("vm_page_decrypt: "
6704 "could not map page in kernel: 0x%x\n",
6705 kr);
6706 }
6707 } else {
6708 kernel_mapping_size = 0;
6709 }
6710 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6711
6712 assert(swap_crypt_ctx_initialized);
6713
6714 /*
6715 * Prepare an "initial vector" for the decryption.
6716 * It has to be the same as the "initial vector" we
6717 * used to encrypt that page.
6718 */
6719 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
6720 decrypt_iv.vm.pager_object = page->object->pager;
6721 decrypt_iv.vm.paging_offset =
6722 page->object->paging_offset + page->offset;
6723
6724 /* encrypt the "initial vector" */
6725 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
6726 swap_crypt_null_iv,
6727 1,
6728 &decrypt_iv.aes_iv[0],
6729 &swap_crypt_ctx.encrypt);
6730
6731 /*
6732 * Decrypt the page.
6733 */
6734 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
6735 &decrypt_iv.aes_iv[0],
6736 PAGE_SIZE / AES_BLOCK_SIZE,
6737 (unsigned char *) kernel_vaddr,
6738 &swap_crypt_ctx.decrypt);
6739 vm_page_decrypt_counter++;
6740
6741 /*
6742 * Unmap the page from the kernel's address space,
6743 * if we had to map it ourselves. Otherwise, let
6744 * the caller undo the mapping if needed.
6745 */
6746 if (kernel_mapping_size != 0) {
6747 vm_paging_unmap_object(page->object,
6748 kernel_vaddr,
6749 kernel_vaddr + PAGE_SIZE);
6750 }
6751
6752 /*
6753 * After decryption, the page is actually clean.
6754 * It was encrypted as part of paging, which "cleans"
6755 * the "dirty" pages.
6756 * No one could have accessed it after it was encrypted,
6757 * and the decryption itself doesn't count as a modification.
6758 */
6759 page->dirty = FALSE;
6760 assert (page->cs_validated == FALSE);
6761 pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
6762 page->encrypted = FALSE;
6763
6764 /*
6765 * We've just modified the page's contents via the data cache and part
6766 * of the new contents might still be in the cache and not yet in RAM.
6767 * Since the page is now available and might get gathered in a UPL to
6768 * be part of a DMA transfer from a driver that expects the memory to
6769 * be coherent at this point, we have to flush the data cache.
6770 */
6771 pmap_sync_page_attributes_phys(page->phys_page);
6772 /*
6773 * Since the page is not mapped yet, some code might assume that it
6774 * doesn't need to invalidate the instruction cache when writing to
6775 * that page. That code relies on "pmapped" being FALSE, so that the
6776 * caches get synchronized when the page is first mapped.
6777 */
6778 assert(pmap_verify_free(page->phys_page));
6779 page->pmapped = FALSE;
6780 page->wpmapped = FALSE;
6781
6782 vm_object_paging_end(page->object);
6783 }
6784
6785 #if DEVELOPMENT || DEBUG
6786 unsigned long upl_encrypt_upls = 0;
6787 unsigned long upl_encrypt_pages = 0;
6788 #endif
6789
6790 /*
6791 * ENCRYPTED SWAP:
6792 *
6793 * upl_encrypt:
6794 * Encrypts all the pages in the UPL, within the specified range.
6795 *
6796 */
6797 void
6798 upl_encrypt(
6799 upl_t upl,
6800 upl_offset_t crypt_offset,
6801 upl_size_t crypt_size)
6802 {
6803 upl_size_t upl_size, subupl_size=crypt_size;
6804 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
6805 vm_object_t upl_object;
6806 vm_object_offset_t upl_offset;
6807 vm_page_t page;
6808 vm_object_t shadow_object;
6809 vm_object_offset_t shadow_offset;
6810 vm_object_offset_t paging_offset;
6811 vm_object_offset_t base_offset;
6812 int isVectorUPL = 0;
6813 upl_t vector_upl = NULL;
6814
6815 if((isVectorUPL = vector_upl_is_valid(upl)))
6816 vector_upl = upl;
6817
6818 process_upl_to_encrypt:
6819 if(isVectorUPL) {
6820 crypt_size = subupl_size;
6821 crypt_offset = subupl_offset;
6822 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
6823 if(upl == NULL)
6824 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
6825 subupl_size -= crypt_size;
6826 subupl_offset += crypt_size;
6827 }
6828
6829 #if DEVELOPMENT || DEBUG
6830 upl_encrypt_upls++;
6831 upl_encrypt_pages += crypt_size / PAGE_SIZE;
6832 #endif
6833 upl_object = upl->map_object;
6834 upl_offset = upl->offset;
6835 upl_size = upl->size;
6836
6837 vm_object_lock(upl_object);
6838
6839 /*
6840 * Find the VM object that contains the actual pages.
6841 */
6842 if (upl_object->pageout) {
6843 shadow_object = upl_object->shadow;
6844 /*
6845 * The offset in the shadow object is actually also
6846 * accounted for in upl->offset. It possibly shouldn't be
6847 * this way, but for now don't account for it twice.
6848 */
6849 shadow_offset = 0;
6850 assert(upl_object->paging_offset == 0); /* XXX ? */
6851 vm_object_lock(shadow_object);
6852 } else {
6853 shadow_object = upl_object;
6854 shadow_offset = 0;
6855 }
6856
6857 paging_offset = shadow_object->paging_offset;
6858 vm_object_paging_begin(shadow_object);
6859
6860 if (shadow_object != upl_object)
6861 vm_object_unlock(upl_object);
6862
6863
6864 base_offset = shadow_offset;
6865 base_offset += upl_offset;
6866 base_offset += crypt_offset;
6867 base_offset -= paging_offset;
6868
6869 assert(crypt_offset + crypt_size <= upl_size);
6870
6871 for (offset_in_upl = 0;
6872 offset_in_upl < crypt_size;
6873 offset_in_upl += PAGE_SIZE) {
6874 page = vm_page_lookup(shadow_object,
6875 base_offset + offset_in_upl);
6876 if (page == VM_PAGE_NULL) {
6877 panic("upl_encrypt: "
6878 "no page for (obj=%p,off=%lld+%d)!\n",
6879 shadow_object,
6880 base_offset,
6881 offset_in_upl);
6882 }
6883 /*
6884 * Disconnect the page from all pmaps, so that nobody can
6885 * access it while it's encrypted. After that point, all
6886 * accesses to this page will cause a page fault and block
6887 * while the page is busy being encrypted. After the
6888 * encryption completes, any access will cause a
6889 * page fault and the page gets decrypted at that time.
6890 */
6891 pmap_disconnect(page->phys_page);
6892 vm_page_encrypt(page, 0);
6893
6894 if (vm_object_lock_avoid(shadow_object)) {
6895 /*
6896 * Give vm_pageout_scan() a chance to convert more
6897 * pages from "clean-in-place" to "clean-and-free",
6898 * if it's interested in the same pages we selected
6899 * in this cluster.
6900 */
6901 vm_object_unlock(shadow_object);
6902 mutex_pause(2);
6903 vm_object_lock(shadow_object);
6904 }
6905 }
6906
6907 vm_object_paging_end(shadow_object);
6908 vm_object_unlock(shadow_object);
6909
6910 if(isVectorUPL && subupl_size)
6911 goto process_upl_to_encrypt;
6912 }
6913
6914 #else /* CRYPTO */
6915 void
6916 upl_encrypt(
6917 __unused upl_t upl,
6918 __unused upl_offset_t crypt_offset,
6919 __unused upl_size_t crypt_size)
6920 {
6921 }
6922
6923 void
6924 vm_page_encrypt(
6925 __unused vm_page_t page,
6926 __unused vm_map_offset_t kernel_mapping_offset)
6927 {
6928 }
6929
6930 void
6931 vm_page_decrypt(
6932 __unused vm_page_t page,
6933 __unused vm_map_offset_t kernel_mapping_offset)
6934 {
6935 }
6936
6937 #endif /* CRYPTO */
6938
6939 void
6940 vm_pageout_queue_steal(vm_page_t page, boolean_t queues_locked)
6941 {
6942 page->list_req_pending = FALSE;
6943 page->cleaning = FALSE;
6944 page->pageout = FALSE;
6945
6946 if (!queues_locked) {
6947 vm_page_lockspin_queues();
6948 }
6949
6950 /*
6951 * need to drop the laundry count...
6952 * we may also need to remove it
6953 * from the I/O paging queue...
6954 * vm_pageout_throttle_up handles both cases
6955 *
6956 * the laundry and pageout_queue flags are cleared...
6957 */
6958 #if CONFIG_EMBEDDED
6959 if (page->laundry)
6960 vm_pageout_throttle_up(page);
6961 #else
6962 vm_pageout_throttle_up(page);
6963 #endif
6964
6965 /*
6966 * toss the wire count we picked up
6967 * when we initially set this page up
6968 * to be cleaned...
6969 */
6970 vm_page_unwire(page);
6971
6972 vm_page_steal_pageout_page++;
6973
6974 if (!queues_locked) {
6975 vm_page_unlock_queues();
6976 }
6977 }
6978
6979 upl_t
6980 vector_upl_create(vm_offset_t upl_offset)
6981 {
6982 int vector_upl_size = sizeof(struct _vector_upl);
6983 int i=0;
6984 upl_t upl;
6985 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
6986
6987 upl = upl_create(0,UPL_VECTOR,0);
6988 upl->vector_upl = vector_upl;
6989 upl->offset = upl_offset;
6990 vector_upl->size = 0;
6991 vector_upl->offset = upl_offset;
6992 vector_upl->invalid_upls=0;
6993 vector_upl->num_upls=0;
6994 vector_upl->pagelist = NULL;
6995
6996 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
6997 vector_upl->upl_iostates[i].size = 0;
6998 vector_upl->upl_iostates[i].offset = 0;
6999
7000 }
7001 return upl;
7002 }
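
/*
 * Illustrative sketch of how the vector-UPL helpers below fit
 * together (the caller, sub-UPLs and I/O sizes are assumptions, not
 * taken from a real user of this interface):
 *
 *	upl_t vupl = vector_upl_create(upl_offset);
 *
 *	// attach each sub-UPL along with the amount of I/O it covers
 *	vector_upl_set_subupl(vupl, subupl0, io_size0);
 *	vector_upl_set_subupl(vupl, subupl1, io_size1);
 *
 *	// once all sub-UPLs are attached, build the combined page list
 *	vector_upl_set_pagelist(vupl);
 *
 * A sub-UPL is later removed by calling vector_upl_set_subupl() with
 * an io_size of 0; once every sub-UPL has been removed that way,
 * vector_upl_deallocate() can tear the vector UPL down (it panics if
 * any sub-UPLs are still attached).
 */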
7003
7004 void
7005 vector_upl_deallocate(upl_t upl)
7006 {
7007 if(upl) {
7008 vector_upl_t vector_upl = upl->vector_upl;
7009 if(vector_upl) {
7010 if(vector_upl->invalid_upls != vector_upl->num_upls)
7011 panic("Deallocating non-empty Vectored UPL\n");
7012 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
7013 vector_upl->invalid_upls=0;
7014 vector_upl->num_upls = 0;
7015 vector_upl->pagelist = NULL;
7016 vector_upl->size = 0;
7017 vector_upl->offset = 0;
7018 kfree(vector_upl, sizeof(struct _vector_upl));
7019 vector_upl = (vector_upl_t)0xdeadbeef;
7020 }
7021 else
7022 panic("vector_upl_deallocate was passed a non-vectored upl\n");
7023 }
7024 else
7025 panic("vector_upl_deallocate was passed a NULL upl\n");
7026 }
7027
7028 boolean_t
7029 vector_upl_is_valid(upl_t upl)
7030 {
7031 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
7032 vector_upl_t vector_upl = upl->vector_upl;
7033 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xdeadbeef || vector_upl == (vector_upl_t)0xfeedbeef)
7034 return FALSE;
7035 else
7036 return TRUE;
7037 }
7038 return FALSE;
7039 }
7040
7041 boolean_t
7042 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
7043 {
7044 if(vector_upl_is_valid(upl)) {
7045 vector_upl_t vector_upl = upl->vector_upl;
7046
7047 if(vector_upl) {
7048 if(subupl) {
7049 if(io_size) {
7050 if(io_size < PAGE_SIZE)
7051 io_size = PAGE_SIZE;
7052 subupl->vector_upl = (void*)vector_upl;
7053 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
7054 vector_upl->size += io_size;
7055 upl->size += io_size;
7056 }
7057 else {
7058 uint32_t i=0,invalid_upls=0;
7059 for(i = 0; i < vector_upl->num_upls; i++) {
7060 if(vector_upl->upl_elems[i] == subupl)
7061 break;
7062 }
7063 if(i == vector_upl->num_upls)
7064 panic("Trying to remove sub-upl when none exists");
7065
7066 vector_upl->upl_elems[i] = NULL;
7067 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
7068 if(invalid_upls == vector_upl->num_upls)
7069 return TRUE;
7070 else
7071 return FALSE;
7072 }
7073 }
7074 else
7075 panic("vector_upl_set_subupl was passed a NULL upl element\n");
7076 }
7077 else
7078 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
7079 }
7080 else
7081 panic("vector_upl_set_subupl was passed a NULL upl\n");
7082
7083 return FALSE;
7084 }
7085
7086 void
7087 vector_upl_set_pagelist(upl_t upl)
7088 {
7089 if(vector_upl_is_valid(upl)) {
7090 uint32_t i=0;
7091 vector_upl_t vector_upl = upl->vector_upl;
7092
7093 if(vector_upl) {
7094 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
7095
7096 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
7097
7098 for(i=0; i < vector_upl->num_upls; i++) {
7099 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
7100 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
7101 pagelist_size += cur_upl_pagelist_size;
7102 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
7103 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
7104 }
7105 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
7106 }
7107 else
7108 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
7109 }
7110 else
7111 panic("vector_upl_set_pagelist was passed a NULL upl\n");
7112
7113 }
7114
7115 upl_t
7116 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
7117 {
7118 if(vector_upl_is_valid(upl)) {
7119 vector_upl_t vector_upl = upl->vector_upl;
7120 if(vector_upl) {
7121 if(index < vector_upl->num_upls)
7122 return vector_upl->upl_elems[index];
7123 }
7124 else
7125 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
7126 }
7127 return NULL;
7128 }
7129
7130 upl_t
7131 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
7132 {
7133 if(vector_upl_is_valid(upl)) {
7134 uint32_t i=0;
7135 vector_upl_t vector_upl = upl->vector_upl;
7136
7137 if(vector_upl) {
7138 upl_t subupl = NULL;
7139 vector_upl_iostates_t subupl_state;
7140
7141 for(i=0; i < vector_upl->num_upls; i++) {
7142 subupl = vector_upl->upl_elems[i];
7143 subupl_state = vector_upl->upl_iostates[i];
7144 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
7145 /* We could have been passed an offset/size pair that belongs
7146 * to a UPL element that has already been committed/aborted.
7147 * If so, return NULL.
7148 */
7149 if(subupl == NULL)
7150 return NULL;
7151 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
7152 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
7153 if(*upl_size > subupl_state.size)
7154 *upl_size = subupl_state.size;
7155 }
7156 if(*upl_offset >= subupl_state.offset)
7157 *upl_offset -= subupl_state.offset;
7158 else if(i)
7159 panic("Vector UPL offset miscalculation\n");
7160 return subupl;
7161 }
7162 }
7163 }
7164 else
7165 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
7166 }
7167 return NULL;
7168 }
7169
7170 void
7171 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
7172 {
7173 *v_upl_submap = NULL;
7174
7175 if(vector_upl_is_valid(upl)) {
7176 vector_upl_t vector_upl = upl->vector_upl;
7177 if(vector_upl) {
7178 *v_upl_submap = vector_upl->submap;
7179 *submap_dst_addr = vector_upl->submap_dst_addr;
7180 }
7181 else
7182 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7183 }
7184 else
7185 panic("vector_upl_get_submap was passed a null UPL\n");
7186 }
7187
7188 void
7189 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
7190 {
7191 if(vector_upl_is_valid(upl)) {
7192 vector_upl_t vector_upl = upl->vector_upl;
7193 if(vector_upl) {
7194 vector_upl->submap = submap;
7195 vector_upl->submap_dst_addr = submap_dst_addr;
7196 }
7197 else
7198 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7199 }
7200 else
7201 panic("vector_upl_get_submap was passed a NULL UPL\n");
7202 }
7203
7204 void
7205 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
7206 {
7207 if(vector_upl_is_valid(upl)) {
7208 uint32_t i = 0;
7209 vector_upl_t vector_upl = upl->vector_upl;
7210
7211 if(vector_upl) {
7212 for(i = 0; i < vector_upl->num_upls; i++) {
7213 if(vector_upl->upl_elems[i] == subupl)
7214 break;
7215 }
7216
7217 if(i == vector_upl->num_upls)
7218 panic("setting sub-upl iostate when none exists");
7219
7220 vector_upl->upl_iostates[i].offset = offset;
7221 if(size < PAGE_SIZE)
7222 size = PAGE_SIZE;
7223 vector_upl->upl_iostates[i].size = size;
7224 }
7225 else
7226 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
7227 }
7228 else
7229 panic("vector_upl_set_iostate was passed a NULL UPL\n");
7230 }
7231
7232 void
7233 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
7234 {
7235 if(vector_upl_is_valid(upl)) {
7236 uint32_t i = 0;
7237 vector_upl_t vector_upl = upl->vector_upl;
7238
7239 if(vector_upl) {
7240 for(i = 0; i < vector_upl->num_upls; i++) {
7241 if(vector_upl->upl_elems[i] == subupl)
7242 break;
7243 }
7244
7245 if(i == vector_upl->num_upls)
7246 panic("getting sub-upl iostate when none exists");
7247
7248 *offset = vector_upl->upl_iostates[i].offset;
7249 *size = vector_upl->upl_iostates[i].size;
7250 }
7251 else
7252 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
7253 }
7254 else
7255 panic("vector_upl_get_iostate was passed a NULL UPL\n");
7256 }
7257
7258 void
7259 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
7260 {
7261 if(vector_upl_is_valid(upl)) {
7262 vector_upl_t vector_upl = upl->vector_upl;
7263 if(vector_upl) {
7264 if(index < vector_upl->num_upls) {
7265 *offset = vector_upl->upl_iostates[index].offset;
7266 *size = vector_upl->upl_iostates[index].size;
7267 }
7268 else
7269 *offset = *size = 0;
7270 }
7271 else
7272 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
7273 }
7274 else
7275 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
7276 }
7277
7278 upl_page_info_t *
7279 upl_get_internal_vectorupl_pagelist(upl_t upl)
7280 {
7281 return ((vector_upl_t)(upl->vector_upl))->pagelist;
7282 }
7283
7284 void *
7285 upl_get_internal_vectorupl(upl_t upl)
7286 {
7287 return upl->vector_upl;
7288 }
7289
7290 vm_size_t
7291 upl_get_internal_pagelist_offset(void)
7292 {
7293 return sizeof(struct upl);
7294 }
7295
7296 void
7297 upl_clear_dirty(
7298 upl_t upl,
7299 boolean_t value)
7300 {
7301 if (value) {
7302 upl->flags |= UPL_CLEAR_DIRTY;
7303 } else {
7304 upl->flags &= ~UPL_CLEAR_DIRTY;
7305 }
7306 }
7307
7308
7309 #ifdef MACH_BSD
7310
7311 boolean_t upl_device_page(upl_page_info_t *upl)
7312 {
7313 return(UPL_DEVICE_PAGE(upl));
7314 }
7315 boolean_t upl_page_present(upl_page_info_t *upl, int index)
7316 {
7317 return(UPL_PAGE_PRESENT(upl, index));
7318 }
7319 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
7320 {
7321 return(UPL_SPECULATIVE_PAGE(upl, index));
7322 }
7323 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
7324 {
7325 return(UPL_DIRTY_PAGE(upl, index));
7326 }
7327 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
7328 {
7329 return(UPL_VALID_PAGE(upl, index));
7330 }
7331 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
7332 {
7333 return(UPL_PHYS_PAGE(upl, index));
7334 }
7335
7336
7337 void
7338 vm_countdirtypages(void)
7339 {
7340 vm_page_t m;
7341 int dpages;
7342 int pgopages;
7343 int precpages;
7344
7345
7346 dpages=0;
7347 pgopages=0;
7348 precpages=0;
7349
7350 vm_page_lock_queues();
7351 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
7352 do {
7353 if (m ==(vm_page_t )0) break;
7354
7355 if(m->dirty) dpages++;
7356 if(m->pageout) pgopages++;
7357 if(m->precious) precpages++;
7358
7359 assert(m->object != kernel_object);
7360 m = (vm_page_t) queue_next(&m->pageq);
7361 if (m ==(vm_page_t )0) break;
7362
7363 } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
7364 vm_page_unlock_queues();
7365
7366 vm_page_lock_queues();
7367 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
7368 do {
7369 if (m ==(vm_page_t )0) break;
7370
7371 dpages++;
7372 assert(m->dirty);
7373 assert(!m->pageout);
7374 assert(m->object != kernel_object);
7375 m = (vm_page_t) queue_next(&m->pageq);
7376 if (m ==(vm_page_t )0) break;
7377
7378 } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
7379 vm_page_unlock_queues();
7380
7381 vm_page_lock_queues();
7382 m = (vm_page_t) queue_first(&vm_page_queue_zf);
7383 do {
7384 if (m ==(vm_page_t )0) break;
7385
7386 if(m->dirty) dpages++;
7387 if(m->pageout) pgopages++;
7388 if(m->precious) precpages++;
7389
7390 assert(m->object != kernel_object);
7391 m = (vm_page_t) queue_next(&m->pageq);
7392 if (m ==(vm_page_t )0) break;
7393
7394 } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
7395 vm_page_unlock_queues();
7396
7397 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
7398
7399 dpages=0;
7400 pgopages=0;
7401 precpages=0;
7402
7403 vm_page_lock_queues();
7404 m = (vm_page_t) queue_first(&vm_page_queue_active);
7405
7406 do {
7407 if(m == (vm_page_t )0) break;
7408 if(m->dirty) dpages++;
7409 if(m->pageout) pgopages++;
7410 if(m->precious) precpages++;
7411
7412 assert(m->object != kernel_object);
7413 m = (vm_page_t) queue_next(&m->pageq);
7414 if(m == (vm_page_t )0) break;
7415
7416 } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
7417 vm_page_unlock_queues();
7418
7419 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
7420
7421 }
7422 #endif /* MACH_BSD */
7423
7424 ppnum_t upl_get_highest_page(
7425 upl_t upl)
7426 {
7427 return upl->highest_page;
7428 }
7429
7430 upl_size_t upl_get_size(
7431 upl_t upl)
7432 {
7433 return upl->size;
7434 }
7435
7436 #if UPL_DEBUG
7437 kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
7438 {
7439 upl->ubc_alias1 = alias1;
7440 upl->ubc_alias2 = alias2;
7441 return KERN_SUCCESS;
7442 }
7443 int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
7444 {
7445 if(al)
7446 *al = upl->ubc_alias1;
7447 if(al2)
7448 *al2 = upl->ubc_alias2;
7449 return KERN_SUCCESS;
7450 }
7451 #endif /* UPL_DEBUG */
7452
7453
7454
7455 #if MACH_KDB
7456 #include <ddb/db_output.h>
7457 #include <ddb/db_print.h>
7458 #include <vm/vm_print.h>
7459
7460 #define printf kdbprintf
7461 void db_pageout(void);
7462
7463 void
7464 db_vm(void)
7465 {
7466
7467 iprintf("VM Statistics:\n");
7468 db_indent += 2;
7469 iprintf("pages:\n");
7470 db_indent += 2;
7471 iprintf("activ %5d inact %5d free %5d",
7472 vm_page_active_count, vm_page_inactive_count,
7473 vm_page_free_count);
7474 printf(" wire %5d gobbl %5d\n",
7475 vm_page_wire_count, vm_page_gobble_count);
7476 db_indent -= 2;
7477 iprintf("target:\n");
7478 db_indent += 2;
7479 iprintf("min %5d inact %5d free %5d",
7480 vm_page_free_min, vm_page_inactive_target,
7481 vm_page_free_target);
7482 printf(" resrv %5d\n", vm_page_free_reserved);
7483 db_indent -= 2;
7484 iprintf("pause:\n");
7485 db_pageout();
7486 db_indent -= 2;
7487 }
7488
7489 #if MACH_COUNTERS
7490 extern int c_laundry_pages_freed;
7491 #endif /* MACH_COUNTERS */
7492
7493 void
7494 db_pageout(void)
7495 {
7496 iprintf("Pageout Statistics:\n");
7497 db_indent += 2;
7498 iprintf("active %5d inactv %5d\n",
7499 vm_pageout_active, vm_pageout_inactive);
7500 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
7501 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
7502 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
7503 iprintf("used %5d clean %5d dirty %5d\n",
7504 vm_pageout_inactive_used, vm_pageout_inactive_clean,
7505 vm_pageout_inactive_dirty);
7506 #if MACH_COUNTERS
7507 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
7508 #endif /* MACH_COUNTERS */
7509 #if MACH_CLUSTER_STATS
7510 iprintf("Cluster Statistics:\n");
7511 db_indent += 2;
7512 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
7513 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
7514 vm_pageout_cluster_collisions);
7515 iprintf("clusters %5d conversions %5d\n",
7516 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
7517 db_indent -= 2;
7518 iprintf("Target Statistics:\n");
7519 db_indent += 2;
7520 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
7521 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
7522 vm_pageout_target_page_freed);
7523 db_indent -= 2;
7524 #endif /* MACH_CLUSTER_STATS */
7525 db_indent -= 2;
7526 }
7527
7528 #endif /* MACH_KDB */