1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83 #include <mach/sdt.h>
84
85 #include <kern/kern_types.h>
86 #include <kern/counters.h>
87 #include <kern/host_statistics.h>
88 #include <kern/machine.h>
89 #include <kern/misc_protos.h>
90 #include <kern/sched.h>
91 #include <kern/thread.h>
92 #include <kern/xpr.h>
93 #include <kern/kalloc.h>
94
95 #include <machine/vm_tuning.h>
96 #include <machine/commpage.h>
97
98 #if CONFIG_EMBEDDED
99 #include <sys/kern_memorystatus.h>
100 #endif
101
102 #include <vm/pmap.h>
103 #include <vm/vm_fault.h>
104 #include <vm/vm_map.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_page.h>
107 #include <vm/vm_pageout.h>
108 #include <vm/vm_protos.h> /* must be last */
109 #include <vm/memory_object.h>
110 #include <vm/vm_purgeable_internal.h>
111
112 /*
113 * ENCRYPTED SWAP:
114 */
115 #include <../bsd/crypto/aes/aes.h>
116 extern u_int32_t random(void); /* from <libkern/libkern.h> */
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
122 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
123 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
124 #endif
125
126 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
127 #ifdef CONFIG_EMBEDDED
128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
129 #else
130 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
131 #endif
132 #endif
133
134 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
135 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
136 #endif
137
138 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
139 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
140 #endif
141
142 #ifndef VM_PAGE_LAUNDRY_MAX
143 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
144 #endif /* VM_PAGE_LAUNDRY_MAX */
145
146 #ifndef VM_PAGEOUT_BURST_WAIT
147 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
148 #endif /* VM_PAGEOUT_BURST_WAIT */
149
150 #ifndef VM_PAGEOUT_EMPTY_WAIT
151 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
152 #endif /* VM_PAGEOUT_EMPTY_WAIT */
153
154 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
155 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
156 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
157
158 #ifndef VM_PAGEOUT_IDLE_WAIT
159 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
160 #endif /* VM_PAGEOUT_IDLE_WAIT */
161
162 #ifndef VM_PAGE_SPECULATIVE_TARGET
163 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
164 #endif /* VM_PAGE_SPECULATIVE_TARGET */
165
166 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
167 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
168 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
169
170
171 /*
172 * To obtain a reasonable LRU approximation, the inactive queue
173 * needs to be large enough to give pages on it a chance to be
174 * referenced a second time. This macro defines the fraction
175 * of active+inactive pages that should be inactive.
176 * The pageout daemon uses it to update vm_page_inactive_target.
177 *
178 * If vm_page_free_count falls below vm_page_free_target and
179 * vm_page_inactive_count is below vm_page_inactive_target,
180 * then the pageout daemon starts running.
181 */
182
183 #ifndef VM_PAGE_INACTIVE_TARGET
184 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
185 #endif /* VM_PAGE_INACTIVE_TARGET */
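/*
 * Illustrative arithmetic (hypothetical numbers, not from this file):
 * with roughly 900,000 active+inactive pages,
 * VM_PAGE_INACTIVE_TARGET(900000) evaluates to 300,000, i.e. the
 * daemon tries to keep about one third of that pool on the inactive
 * queue so pages get a second chance to be referenced before they
 * are reclaimed.
 */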
186
187 /*
188 * Once the pageout daemon starts running, it keeps going
189 * until vm_page_free_count meets or exceeds vm_page_free_target.
190 */
191
192 #ifndef VM_PAGE_FREE_TARGET
193 #ifdef CONFIG_EMBEDDED
194 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
195 #else
196 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
197 #endif
198 #endif /* VM_PAGE_FREE_TARGET */
199
200 /*
201 * The pageout daemon always starts running once vm_page_free_count
202 * falls below vm_page_free_min.
203 */
204
205 #ifndef VM_PAGE_FREE_MIN
206 #ifdef CONFIG_EMBEDDED
207 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
208 #else
209 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
210 #endif
211 #endif /* VM_PAGE_FREE_MIN */
212
213 #define VM_PAGE_FREE_MIN_LIMIT 1500
214 #define VM_PAGE_FREE_TARGET_LIMIT 2000
215
216
217 /*
218 * When vm_page_free_count falls below vm_page_free_reserved,
219 * only vm-privileged threads can allocate pages. vm-privilege
220 * allows the pageout daemon and default pager (and any other
221 * associated threads needed for default pageout) to continue
222 * operation by dipping into the reserved pool of pages.
223 */
224
225 #ifndef VM_PAGE_FREE_RESERVED
226 #define VM_PAGE_FREE_RESERVED(n) \
227 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
228 #endif /* VM_PAGE_FREE_RESERVED */
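/*
 * Illustrative arithmetic (hypothetical argument, not from this file):
 * with VM_PAGE_LAUNDRY_MAX == 16, VM_PAGE_FREE_RESERVED(n) evaluates
 * to 96 + n, so VM_PAGE_FREE_RESERVED(4) reserves 100 pages that only
 * vm-privileged threads (the pageout daemon and default pager) may
 * allocate from.
 */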
229
230 /*
231 * When we dequeue pages from the inactive list, they are
232 * reactivated (ie, put back on the active queue) if referenced.
233 * However, it is possible to starve the free list if other
234 * processors are referencing pages faster than we can turn off
235 * the referenced bit. So we limit the number of reactivations
236 * we will make per call of vm_pageout_scan().
237 */
238 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
239 #ifndef VM_PAGE_REACTIVATE_LIMIT
240 #ifdef CONFIG_EMBEDDED
241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
242 #else
243 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
244 #endif
245 #endif /* VM_PAGE_REACTIVATE_LIMIT */
246 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 100
247
248
249 /*
250 * Exported variable used to broadcast the activation of the pageout scan.
251 * Working Set uses this to throttle its use of pmap removes. In this
252 * way, code which runs within memory in an uncontested context does
253 * not keep encountering soft faults.
254 */
255
256 unsigned int vm_pageout_scan_event_counter = 0;
257
258 /*
259 * Forward declarations for internal routines.
260 */
261
262 static void vm_pageout_garbage_collect(int);
263 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
264 static void vm_pageout_iothread_external(void);
265 static void vm_pageout_iothread_internal(void);
266
267 extern void vm_pageout_continue(void);
268 extern void vm_pageout_scan(void);
269
270 static thread_t vm_pageout_external_iothread = THREAD_NULL;
271 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
272
273 unsigned int vm_pageout_reserved_internal = 0;
274 unsigned int vm_pageout_reserved_really = 0;
275
276 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
277 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
278 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
279 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
280 unsigned int vm_pageout_deadlock_relief = 0;
281 unsigned int vm_pageout_inactive_relief = 0;
282 unsigned int vm_pageout_burst_active_throttle = 0;
283 unsigned int vm_pageout_burst_inactive_throttle = 0;
284
285 /*
286 * Protection against zero fill flushing live working sets derived
287 * from existing backing store and files
288 */
289 unsigned int vm_accellerate_zf_pageout_trigger = 400;
290 unsigned int zf_queue_min_count = 100;
291 unsigned int vm_zf_queue_count = 0;
292
293 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
294 unsigned int vm_zf_count = 0;
295 #else
296 uint64_t vm_zf_count __attribute__((aligned(8))) = 0;
297 #endif
298
299 /*
300 * These variables record the pageout daemon's actions:
301 * how many pages it looks at and what happens to those pages.
302 * No locking needed because only one thread modifies the variables.
303 */
304
305 unsigned int vm_pageout_active = 0; /* debugging */
306 unsigned int vm_pageout_inactive = 0; /* debugging */
307 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
308 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
309 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
310 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
311 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
312 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
313 unsigned int vm_pageout_inactive_used = 0; /* debugging */
314 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
315 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
316 unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
317 unsigned int vm_pageout_inactive_zf = 0; /* debugging */
318 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
319 unsigned int vm_pageout_purged_objects = 0; /* debugging */
320 unsigned int vm_stat_discard = 0; /* debugging */
321 unsigned int vm_stat_discard_sent = 0; /* debugging */
322 unsigned int vm_stat_discard_failure = 0; /* debugging */
323 unsigned int vm_stat_discard_throttle = 0; /* debugging */
324 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
325 unsigned int vm_pageout_catch_ups = 0; /* debugging */
326 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
327
328 unsigned int vm_pageout_scan_active_throttled = 0;
329 unsigned int vm_pageout_scan_inactive_throttled = 0;
330 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
331 unsigned int vm_pageout_scan_throttle_aborted = 0; /* debugging */
332 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
333 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
334 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
335 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
336 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
337
338 unsigned int vm_page_speculative_count_drifts = 0;
339 unsigned int vm_page_speculative_count_drift_max = 0;
340
341 /*
342 * Backing store throttle when BS is exhausted
343 */
344 unsigned int vm_backing_store_low = 0;
345
346 unsigned int vm_pageout_out_of_line = 0;
347 unsigned int vm_pageout_in_place = 0;
348
349 unsigned int vm_page_steal_pageout_page = 0;
350
351 /*
352 * ENCRYPTED SWAP:
353 * counters and statistics...
354 */
355 unsigned long vm_page_decrypt_counter = 0;
356 unsigned long vm_page_decrypt_for_upl_counter = 0;
357 unsigned long vm_page_encrypt_counter = 0;
358 unsigned long vm_page_encrypt_abort_counter = 0;
359 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
360 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
361
362 struct vm_pageout_queue vm_pageout_queue_internal;
363 struct vm_pageout_queue vm_pageout_queue_external;
364
365 unsigned int vm_page_speculative_target = 0;
366
367 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
368
369 boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
370
371 #if DEVELOPMENT || DEBUG
372 unsigned long vm_cs_validated_resets = 0;
373 #endif
374
375 /*
376 * Routine: vm_backing_store_disable
377 * Purpose:
378 * Suspend non-privileged threads wishing to extend
379 * backing store when we are low on backing store
380 * (Synchronized by caller)
381 */
382 void
383 vm_backing_store_disable(
384 boolean_t disable)
385 {
386 if(disable) {
387 vm_backing_store_low = 1;
388 } else {
389 if(vm_backing_store_low) {
390 vm_backing_store_low = 0;
391 thread_wakeup((event_t) &vm_backing_store_low);
392 }
393 }
394 }
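/*
 * Hypothetical waiter-side sketch (not taken from this file): a
 * non-privileged thread that finds vm_backing_store_low set would
 * block on the same event that vm_backing_store_disable(FALSE) wakes
 * up.  The helper name below is illustrative only.
 */
#if 0
static void
backing_store_wait_example(void)
{
	while (vm_backing_store_low) {
		assert_wait((event_t) &vm_backing_store_low, THREAD_UNINT);
		if (vm_backing_store_low)
			thread_block(THREAD_CONTINUE_NULL);
		else
			clear_wait(current_thread(), THREAD_AWAKENED);
	}
}
#endif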
395
396
397 #if MACH_CLUSTER_STATS
398 unsigned long vm_pageout_cluster_dirtied = 0;
399 unsigned long vm_pageout_cluster_cleaned = 0;
400 unsigned long vm_pageout_cluster_collisions = 0;
401 unsigned long vm_pageout_cluster_clusters = 0;
402 unsigned long vm_pageout_cluster_conversions = 0;
403 unsigned long vm_pageout_target_collisions = 0;
404 unsigned long vm_pageout_target_page_dirtied = 0;
405 unsigned long vm_pageout_target_page_freed = 0;
406 #define CLUSTER_STAT(clause) clause
407 #else /* MACH_CLUSTER_STATS */
408 #define CLUSTER_STAT(clause)
409 #endif /* MACH_CLUSTER_STATS */
410
411 /*
412 * Routine: vm_pageout_object_terminate
413 * Purpose:
414 * Destroy the pageout_object, and perform all of the
415 * required cleanup actions.
416 *
417 * In/Out conditions:
418 * The object must be locked, and will be returned locked.
419 */
420 void
421 vm_pageout_object_terminate(
422 vm_object_t object)
423 {
424 vm_object_t shadow_object;
425
426 /*
427 * Deal with the deallocation (last reference) of a pageout object
428 * (used for cleaning-in-place) by dropping the paging references/
429 * freeing pages in the original object.
430 */
431
432 assert(object->pageout);
433 shadow_object = object->shadow;
434 vm_object_lock(shadow_object);
435
436 while (!queue_empty(&object->memq)) {
437 vm_page_t p, m;
438 vm_object_offset_t offset;
439
440 p = (vm_page_t) queue_first(&object->memq);
441
442 assert(p->private);
443 assert(p->pageout);
444 p->pageout = FALSE;
445 assert(!p->cleaning);
446
447 offset = p->offset;
448 VM_PAGE_FREE(p);
449 p = VM_PAGE_NULL;
450
451 m = vm_page_lookup(shadow_object,
452 offset + object->shadow_offset);
453
454 if(m == VM_PAGE_NULL)
455 continue;
456 assert(m->cleaning);
457 /* used as a trigger on upl_commit etc to recognize the */
458 /* pageout daemon's subsequent desire to pageout a cleaning */
459 /* page. When the bit is on the upl commit code will */
460 /* respect the pageout bit in the target page over the */
461 /* caller's page list indication */
462 m->dump_cleaning = FALSE;
463
464 assert((m->dirty) || (m->precious) ||
465 (m->busy && m->cleaning));
466
467 /*
468 * Handle the trusted pager throttle.
469 * Also decrement the burst throttle (if external).
470 */
471 vm_page_lock_queues();
472 if (m->laundry) {
473 vm_pageout_throttle_up(m);
474 }
475
476 /*
477 * Handle the "target" page(s). These pages are to be freed if
478 * successfully cleaned. Target pages are always busy, and are
479 * wired exactly once. The initial target pages are not mapped,
480 * (so cannot be referenced or modified) but converted target
481 * pages may have been modified between the selection as an
482 * adjacent page and conversion to a target.
483 */
484 if (m->pageout) {
485 assert(m->busy);
486 assert(m->wire_count == 1);
487 m->cleaning = FALSE;
488 m->encrypted_cleaning = FALSE;
489 m->pageout = FALSE;
490 #if MACH_CLUSTER_STATS
491 if (m->wanted) vm_pageout_target_collisions++;
492 #endif
493 /*
494 * Revoke all access to the page. Since the object is
495 * locked, and the page is busy, this prevents the page
496 * from being dirtied after the pmap_disconnect() call
497 * returns.
498 *
499 * Since the page is left "dirty" but "not modified", we
500 * can detect whether the page was redirtied during
501 * pageout by checking the modify state.
502 */
503 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
504 m->dirty = TRUE;
505 else
506 m->dirty = FALSE;
507
508 if (m->dirty) {
509 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
510 vm_page_unwire(m, TRUE); /* reactivates */
511 VM_STAT_INCR(reactivations);
512 PAGE_WAKEUP_DONE(m);
513 } else {
514 CLUSTER_STAT(vm_pageout_target_page_freed++;)
515 vm_page_free(m);/* clears busy, etc. */
516 }
517 vm_page_unlock_queues();
518 continue;
519 }
520 /*
521 * Handle the "adjacent" pages. These pages were cleaned in
522 * place, and should be left alone.
523 * If prep_pin_count is nonzero, then someone is using the
524 * page, so make it active.
525 */
526 if (!m->active && !m->inactive && !m->throttled && !m->private) {
527 if (m->reference)
528 vm_page_activate(m);
529 else
530 vm_page_deactivate(m);
531 }
532 if((m->busy) && (m->cleaning)) {
533
534 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
535 m->busy = FALSE;
536
537 /* We do not re-set m->dirty ! */
538 /* The page was busy so no extraneous activity */
539 /* could have occurred. COPY_INTO is a read into the */
540 /* new pages. CLEAN_IN_PLACE does actually write */
541 /* out the pages but handling outside of this code */
542 /* will take care of resetting dirty. We clear the */
543 /* modify however for the Programmed I/O case. */
544 pmap_clear_modify(m->phys_page);
545
546 m->absent = FALSE;
547 m->overwriting = FALSE;
548 } else if (m->overwriting) {
549 /* alternate request page list, write to page_list */
550 /* case. Occurs when the original page was wired */
551 /* at the time of the list request */
552 assert(VM_PAGE_WIRED(m));
553 vm_page_unwire(m, TRUE); /* reactivates */
554 m->overwriting = FALSE;
555 } else {
556 /*
557 * Set the dirty state according to whether or not the page was
558 * modified during the pageout. Note that we purposefully do
559 * NOT call pmap_clear_modify since the page is still mapped.
560 * If the page were to be dirtied between the 2 calls, this
561 * fact would be lost. This code is only necessary to
562 * maintain statistics, since the pmap module is always
563 * consulted if m->dirty is false.
564 */
565 #if MACH_CLUSTER_STATS
566 m->dirty = pmap_is_modified(m->phys_page);
567
568 if (m->dirty) vm_pageout_cluster_dirtied++;
569 else vm_pageout_cluster_cleaned++;
570 if (m->wanted) vm_pageout_cluster_collisions++;
571 #else
572 m->dirty = 0;
573 #endif
574 }
575 m->cleaning = FALSE;
576 m->encrypted_cleaning = FALSE;
577
578 /*
579 * Wakeup any thread waiting for the page to be un-cleaning.
580 */
581 PAGE_WAKEUP(m);
582 vm_page_unlock_queues();
583 }
584 /*
585 * Account for the paging reference taken in vm_paging_object_allocate.
586 */
587 vm_object_activity_end(shadow_object);
588 vm_object_unlock(shadow_object);
589
590 assert(object->ref_count == 0);
591 assert(object->paging_in_progress == 0);
592 assert(object->activity_in_progress == 0);
593 assert(object->resident_page_count == 0);
594 return;
595 }
596
597 /*
598 * Routine: vm_pageclean_setup
599 *
600 * Purpose: setup a page to be cleaned (made non-dirty), but not
601 * necessarily flushed from the VM page cache.
602 * This is accomplished by cleaning in place.
603 *
604 * The page must not be busy, and new_object
605 * must be locked.
606 *
607 */
608 void
609 vm_pageclean_setup(
610 vm_page_t m,
611 vm_page_t new_m,
612 vm_object_t new_object,
613 vm_object_offset_t new_offset)
614 {
615 assert(!m->busy);
616 #if 0
617 assert(!m->cleaning);
618 #endif
619
620 XPR(XPR_VM_PAGEOUT,
621 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
622 m->object, m->offset, m,
623 new_m, new_offset);
624
625 pmap_clear_modify(m->phys_page);
626
627 /*
628 * Mark original page as cleaning in place.
629 */
630 m->cleaning = TRUE;
631 m->dirty = TRUE;
632 m->precious = FALSE;
633
634 /*
635 * Convert the fictitious page to a private shadow of
636 * the real page.
637 */
638 assert(new_m->fictitious);
639 assert(new_m->phys_page == vm_page_fictitious_addr);
640 new_m->fictitious = FALSE;
641 new_m->private = TRUE;
642 new_m->pageout = TRUE;
643 new_m->phys_page = m->phys_page;
644
645 vm_page_lockspin_queues();
646 vm_page_wire(new_m);
647 vm_page_unlock_queues();
648
649 vm_page_insert(new_m, new_object, new_offset);
650 assert(!new_m->wanted);
651 new_m->busy = FALSE;
652 }
653
654 /*
655 * Routine: vm_pageout_initialize_page
656 * Purpose:
657 * Causes the specified page to be initialized in
658 * the appropriate memory object. This routine is used to push
659 * pages into a copy-object when they are modified in the
660 * permanent object.
661 *
662 * The page is moved to a temporary object and paged out.
663 *
664 * In/out conditions:
665 * The page in question must not be on any pageout queues.
666 * The object to which it belongs must be locked.
667 * The page must be busy, but not hold a paging reference.
668 *
669 * Implementation:
670 * Move this page to a completely new object.
671 */
672 void
673 vm_pageout_initialize_page(
674 vm_page_t m)
675 {
676 vm_object_t object;
677 vm_object_offset_t paging_offset;
678 vm_page_t holding_page;
679 memory_object_t pager;
680
681 XPR(XPR_VM_PAGEOUT,
682 "vm_pageout_initialize_page, page 0x%X\n",
683 m, 0, 0, 0, 0);
684 assert(m->busy);
685
686 /*
687 * Verify that we really want to clean this page
688 */
689 assert(!m->absent);
690 assert(!m->error);
691 assert(m->dirty);
692
693 /*
694 * Create a paging reference to let us play with the object.
695 */
696 object = m->object;
697 paging_offset = m->offset + object->paging_offset;
698
699 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
700 VM_PAGE_FREE(m);
701 panic("reservation without pageout?"); /* alan */
702 vm_object_unlock(object);
703
704 return;
705 }
706
707 /*
708 * If there's no pager, then we can't clean the page. This should
709 * never happen since this should be a copy object and therefore not
710 * an external object, so the pager should always be there.
711 */
712
713 pager = object->pager;
714
715 if (pager == MEMORY_OBJECT_NULL) {
716 VM_PAGE_FREE(m);
717 panic("missing pager for copy object");
718 return;
719 }
720
721 /* set the page for future call to vm_fault_list_request */
722 vm_object_paging_begin(object);
723 holding_page = NULL;
724
725 pmap_clear_modify(m->phys_page);
726 m->dirty = TRUE;
727 m->busy = TRUE;
728 m->list_req_pending = TRUE;
729 m->cleaning = TRUE;
730 m->pageout = TRUE;
731
732 vm_page_lockspin_queues();
733 vm_page_wire(m);
734 vm_page_unlock_queues();
735
736 vm_object_unlock(object);
737
738 /*
739 * Write the data to its pager.
740 * Note that the data is passed by naming the new object,
741 * not a virtual address; the pager interface has been
742 * manipulated to use the "internal memory" data type.
743 * [The object reference from its allocation is donated
744 * to the eventual recipient.]
745 */
746 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
747
748 vm_object_lock(object);
749 vm_object_paging_end(object);
750 }
751
752 #if MACH_CLUSTER_STATS
753 #define MAXCLUSTERPAGES 16
754 struct {
755 unsigned long pages_in_cluster;
756 unsigned long pages_at_higher_offsets;
757 unsigned long pages_at_lower_offsets;
758 } cluster_stats[MAXCLUSTERPAGES];
759 #endif /* MACH_CLUSTER_STATS */
760
761
762 /*
763 * vm_pageout_cluster:
764 *
765 * Given a page, queue it to the appropriate I/O thread,
766 * which will page it out and attempt to clean adjacent pages
767 * in the same operation.
768 *
769 * The page must be busy, and the object and queues locked. We will take a
770 * paging reference to prevent deallocation or collapse when we
771 * release the object lock back at the call site. The I/O thread
772 * is responsible for consuming this reference
773 *
774 * The page must not be on any pageout queue.
775 */
776
777 void
778 vm_pageout_cluster(vm_page_t m)
779 {
780 vm_object_t object = m->object;
781 struct vm_pageout_queue *q;
782
783
784 XPR(XPR_VM_PAGEOUT,
785 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
786 object, m->offset, m, 0, 0);
787
788 VM_PAGE_CHECK(m);
789
790 /*
791 * Only a certain kind of page is appreciated here.
792 */
793 assert(m->busy && (m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
794 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
795 assert(!m->throttled);
796
797 /*
798 * protect the object from collapse -
799 * locking in the object's paging_offset.
800 */
801 vm_object_paging_begin(object);
802
803 /*
804 * set the page for future call to vm_fault_list_request
805 * page should already be marked busy
806 */
807 vm_page_wire(m);
808 m->list_req_pending = TRUE;
809 m->cleaning = TRUE;
810 m->pageout = TRUE;
811
812 if (object->internal == TRUE)
813 q = &vm_pageout_queue_internal;
814 else
815 q = &vm_pageout_queue_external;
816
817 /*
818 * pgo_laundry count is tied to the laundry bit
819 */
820 m->laundry = TRUE;
821 q->pgo_laundry++;
822
823 m->pageout_queue = TRUE;
824 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
825
826 if (q->pgo_idle == TRUE) {
827 q->pgo_idle = FALSE;
828 thread_wakeup((event_t) &q->pgo_pending);
829 }
830
831 VM_PAGE_CHECK(m);
832 }
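/*
 * Note: the laundry accounting established here (m->laundry = TRUE,
 * q->pgo_laundry++) is reversed by vm_pageout_throttle_up() below when
 * the page comes back from the I/O thread or is stolen back;
 * pgo_laundry is also what vm_pageout_scan() consults to decide
 * whether a pageout queue is throttled.
 */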
833
834
835 unsigned long vm_pageout_throttle_up_count = 0;
836
837 /*
838 * A page is back from laundry or we are stealing it back from
839 * the laundering state. See if there are some pages waiting to
840 * go to laundry and if we can let some of them go now.
841 *
842 * Object and page queues must be locked.
843 */
844 void
845 vm_pageout_throttle_up(
846 vm_page_t m)
847 {
848 struct vm_pageout_queue *q;
849
850 assert(m->object != VM_OBJECT_NULL);
851 assert(m->object != kernel_object);
852
853 vm_pageout_throttle_up_count++;
854
855 if (m->object->internal == TRUE)
856 q = &vm_pageout_queue_internal;
857 else
858 q = &vm_pageout_queue_external;
859
860 if (m->pageout_queue == TRUE) {
861
862 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
863 m->pageout_queue = FALSE;
864
865 m->pageq.next = NULL;
866 m->pageq.prev = NULL;
867
868 vm_object_paging_end(m->object);
869 }
870 if (m->laundry == TRUE) {
871 m->laundry = FALSE;
872 q->pgo_laundry--;
873
874 if (q->pgo_throttled == TRUE) {
875 q->pgo_throttled = FALSE;
876 thread_wakeup((event_t) &q->pgo_laundry);
877 }
878 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
879 q->pgo_draining = FALSE;
880 thread_wakeup((event_t) (&q->pgo_laundry+1));
881 }
882 }
883 }
884
885
886 /*
887 * vm_pageout_scan does the dirty work for the pageout daemon.
888 * It returns with vm_page_queue_free_lock held and
889 * vm_page_free_wanted == 0.
890 */
891
892 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
893
894 #define FCS_IDLE 0
895 #define FCS_DELAYED 1
896 #define FCS_DEADLOCK_DETECTED 2
897
898 struct flow_control {
899 int state;
900 mach_timespec_t ts;
901 };
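/*
 * Note: flow_control is a small state machine used by vm_pageout_scan()
 * when the default-pager (internal) pageout queue stays throttled:
 *
 *	FCS_IDLE:	queue just became throttled; arm "ts" to
 *			now + vm_pageout_deadlock_wait, go to FCS_DELAYED.
 *	FCS_DELAYED:	still throttled; once the deadline in "ts" passes,
 *			assume a potential deadlock, compute a relief target
 *			of pages to move and go to FCS_DEADLOCK_DETECTED.
 *	FCS_DEADLOCK_DETECTED: keep moving pages until the relief target
 *			is consumed, then re-arm the timer.
 *
 * The state drops back to FCS_IDLE once the queue is no longer throttled.
 */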
902
903
904 /*
905 * VM memory pressure monitoring.
906 *
907 * vm_pageout_scan() keeps track of the number of pages it considers and
908 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
909 *
910 * compute_memory_pressure() is called every second from compute_averages()
911 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
912 * of reclaimed pages in a new vm_pageout_stat[] bucket.
913 *
914 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
915 * The caller provides the number of seconds ("nsecs") worth of statistics
916 * it wants, up to 30 seconds.
917 * It computes the number of pages reclaimed in the past "nsecs" seconds and
918 * also returns the number of pages the system still needs to reclaim at this
919 * moment in time.
920 */
921 #define VM_PAGEOUT_STAT_SIZE 31
922 struct vm_pageout_stat {
923 unsigned int considered;
924 unsigned int reclaimed;
925 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
926 unsigned int vm_pageout_stat_now = 0;
927 unsigned int vm_memory_pressure = 0;
928
929 #define VM_PAGEOUT_STAT_BEFORE(i) \
930 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
931 #define VM_PAGEOUT_STAT_AFTER(i) \
932 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
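/*
 * Illustrative sketch (hypothetical helper, not part of this file):
 * with VM_PAGEOUT_STAT_SIZE == 31, VM_PAGEOUT_STAT_BEFORE(0) == 30 and
 * VM_PAGEOUT_STAT_AFTER(30) == 0, so the ring always holds the 30 most
 * recently completed seconds plus the bucket currently being filled.
 * Summing the last "nsecs" buckets mirrors the loop in
 * mach_vm_pressure_monitor() below.
 */
#if 0
static unsigned int
vm_pageout_stat_ring_sum(unsigned int nsecs)
{
	unsigned int i, sum = 0;

	for (i = VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now);
	     i != vm_pageout_stat_now && nsecs-- != 0;
	     i = VM_PAGEOUT_STAT_BEFORE(i))
		sum += vm_pageout_stats[i].reclaimed;

	return sum;
}
#endif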
933
934 /*
935 * Called from compute_averages().
936 */
937 void
938 compute_memory_pressure(
939 __unused void *arg)
940 {
941 unsigned int vm_pageout_next;
942
943 vm_memory_pressure =
944 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
945
946 commpage_set_memory_pressure( vm_memory_pressure );
947
948 /* move "now" forward */
949 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
950 vm_pageout_stats[vm_pageout_next].considered = 0;
951 vm_pageout_stats[vm_pageout_next].reclaimed = 0;
952 vm_pageout_stat_now = vm_pageout_next;
953 }
954
955 unsigned int
956 mach_vm_ctl_page_free_wanted(void)
957 {
958 unsigned int page_free_target, page_free_count, page_free_wanted;
959
960 page_free_target = vm_page_free_target;
961 page_free_count = vm_page_free_count;
962 if (page_free_target > page_free_count) {
963 page_free_wanted = page_free_target - page_free_count;
964 } else {
965 page_free_wanted = 0;
966 }
967
968 return page_free_wanted;
969 }
970
971 kern_return_t
972 mach_vm_pressure_monitor(
973 boolean_t wait_for_pressure,
974 unsigned int nsecs_monitored,
975 unsigned int *pages_reclaimed_p,
976 unsigned int *pages_wanted_p)
977 {
978 wait_result_t wr;
979 unsigned int vm_pageout_then, vm_pageout_now;
980 unsigned int pages_reclaimed;
981
982 /*
983 * We don't take the vm_page_queue_lock here because we don't want
984 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
985 * thread when it's trying to reclaim memory. We don't need fully
986 * accurate monitoring anyway...
987 */
988
989 if (wait_for_pressure) {
990 /* wait until there's memory pressure */
991 while (vm_page_free_count >= vm_page_free_target) {
992 wr = assert_wait((event_t) &vm_page_free_wanted,
993 THREAD_INTERRUPTIBLE);
994 if (wr == THREAD_WAITING) {
995 wr = thread_block(THREAD_CONTINUE_NULL);
996 }
997 if (wr == THREAD_INTERRUPTED) {
998 return KERN_ABORTED;
999 }
1000 if (wr == THREAD_AWAKENED) {
1001 /*
1002 * The memory pressure might have already
1003 * been relieved but let's not block again
1004 * and let's report that there was memory
1005 * pressure at some point.
1006 */
1007 break;
1008 }
1009 }
1010 }
1011
1012 /* provide the number of pages the system wants to reclaim */
1013 if (pages_wanted_p != NULL) {
1014 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1015 }
1016
1017 if (pages_reclaimed_p == NULL) {
1018 return KERN_SUCCESS;
1019 }
1020
1021 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1022 do {
1023 vm_pageout_now = vm_pageout_stat_now;
1024 pages_reclaimed = 0;
1025 for (vm_pageout_then =
1026 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1027 vm_pageout_then != vm_pageout_now &&
1028 nsecs_monitored-- != 0;
1029 vm_pageout_then =
1030 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1031 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1032 }
1033 } while (vm_pageout_now != vm_pageout_stat_now);
1034 *pages_reclaimed_p = pages_reclaimed;
1035
1036 return KERN_SUCCESS;
1037 }
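/*
 * Hypothetical usage sketch (caller name, 10-second window and printf
 * are illustrative only, not from this file): poll the monitor without
 * blocking and report recent reclaim activity.
 */
#if 0
static void
vm_pressure_poll_example(void)
{
	unsigned int reclaimed = 0, wanted = 0;

	if (mach_vm_pressure_monitor(FALSE,	/* don't block waiting for pressure */
				     10,	/* last 10 seconds of statistics */
				     &reclaimed,
				     &wanted) == KERN_SUCCESS)
		printf("reclaimed %u pages in the last 10s, %u still wanted\n",
		       reclaimed, wanted);
}
#endif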
1038
1039 /* Page States: Used below to maintain the page state
1040 before it's removed from it's Q. This saved state
1041 helps us do the right accounting in certain cases
1042 */
1043
1044 #define PAGE_STATE_SPECULATIVE 1
1045 #define PAGE_STATE_THROTTLED 2
1046 #define PAGE_STATE_ZEROFILL 3
1047 #define PAGE_STATE_INACTIVE 4
1048
1049 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m) \
1050 MACRO_BEGIN \
1051 /* \
1052 * If a "reusable" page somehow made it back into \
1053 * the active queue, it's been re-used and is not \
1054 * quite re-usable. \
1055 * If the VM object was "all_reusable", consider it \
1056 * as "all re-used" instead of converting it to \
1057 * "partially re-used", which could be expensive. \
1058 */ \
1059 if ((m)->reusable || \
1060 (m)->object->all_reusable) { \
1061 vm_object_reuse_pages((m)->object, \
1062 (m)->offset, \
1063 (m)->offset + PAGE_SIZE_64, \
1064 FALSE); \
1065 } \
1066 MACRO_END
1067
1068 void
1069 vm_pageout_scan(void)
1070 {
1071 unsigned int loop_count = 0;
1072 unsigned int inactive_burst_count = 0;
1073 unsigned int active_burst_count = 0;
1074 unsigned int reactivated_this_call;
1075 unsigned int reactivate_limit;
1076 vm_page_t local_freeq = NULL;
1077 int local_freed = 0;
1078 int delayed_unlock;
1079 int refmod_state = 0;
1080 int vm_pageout_deadlock_target = 0;
1081 struct vm_pageout_queue *iq;
1082 struct vm_pageout_queue *eq;
1083 struct vm_speculative_age_q *sq;
1084 struct flow_control flow_control = { 0, { 0, 0 } };
1085 boolean_t inactive_throttled = FALSE;
1086 boolean_t try_failed;
1087 mach_timespec_t ts;
1088 unsigned int msecs = 0;
1089 vm_object_t object;
1090 vm_object_t last_object_tried;
1091 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
1092 unsigned int zf_ratio;
1093 unsigned int zf_run_count;
1094 #else
1095 uint64_t zf_ratio;
1096 uint64_t zf_run_count;
1097 #endif
1098 uint32_t catch_up_count = 0;
1099 uint32_t inactive_reclaim_run;
1100 boolean_t forced_reclaim;
1101 int page_prev_state = 0;
1102
1103 flow_control.state = FCS_IDLE;
1104 iq = &vm_pageout_queue_internal;
1105 eq = &vm_pageout_queue_external;
1106 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1107
1108
1109 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1110
1111
1112 vm_page_lock_queues();
1113 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
1114
1115 /*
1116 * Calculate the max number of referenced pages on the inactive
1117 * queue that we will reactivate.
1118 */
1119 reactivated_this_call = 0;
1120 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1121 vm_page_inactive_count);
1122 inactive_reclaim_run = 0;
1123
1124
1125 /*???*/ /*
1126 * We want to gradually dribble pages from the active queue
1127 * to the inactive queue. If we let the inactive queue get
1128 * very small, and then suddenly dump many pages into it,
1129 * those pages won't get a sufficient chance to be referenced
1130 * before we start taking them from the inactive queue.
1131 *
1132 * We must limit the rate at which we send pages to the pagers.
1133 * data_write messages consume memory, for message buffers and
1134 * for map-copy objects. If we get too far ahead of the pagers,
1135 * we can potentially run out of memory.
1136 *
1137 * We can use the laundry count to limit directly the number
1138 * of pages outstanding to the default pager. A similar
1139 * strategy for external pagers doesn't work, because
1140 * external pagers don't have to deallocate the pages sent them,
1141 * and because we might have to send pages to external pagers
1142 * even if they aren't processing writes. So we also
1143 * use a burst count to limit writes to external pagers.
1144 *
1145 * When memory is very tight, we can't rely on external pagers to
1146 * clean pages. They probably aren't running, because they
1147 * aren't vm-privileged. If we kept sending dirty pages to them,
1148 * we could exhaust the free list.
1149 */
1150
1151
1152 Restart:
1153 assert(delayed_unlock!=0);
1154
1155 /*
1156 * A page is "zero-filled" if it was not paged in from somewhere,
1157 * and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
1158 * Recalculate the zero-filled page ratio. We use this to apportion
1159 * victimized pages between the normal and zero-filled inactive
1160 * queues according to their relative abundance in memory. Thus if a task
1161 * is flooding memory with zf pages, we begin to hunt them down.
1162 * It would be better to throttle greedy tasks at a higher level,
1163 * but at the moment mach vm cannot do this.
1164 */
1165 {
1166 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
1167 uint32_t total = vm_page_active_count + vm_page_inactive_count;
1168 uint32_t normal = total - vm_zf_count;
1169 #else
1170 uint64_t total = vm_page_active_count + vm_page_inactive_count;
1171 uint64_t normal = total - vm_zf_count;
1172 #endif
1173
1174 /* zf_ratio is the number of zf pages we victimize per normal page */
1175
1176 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
1177 zf_ratio = 0;
1178 else if ((vm_zf_count <= normal) || (normal == 0))
1179 zf_ratio = 1;
1180 else
1181 zf_ratio = vm_zf_count / normal;
1182
1183 zf_run_count = 0;
1184 }
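		/*
		 * Illustrative arithmetic (hypothetical counts): with
		 * vm_zf_count == 6000 and normal == 2000, zf_ratio == 3,
		 * so the victim selection below takes up to three
		 * zero-filled pages for every normal inactive page
		 * (provided the zf queue holds at least
		 * zf_queue_min_count pages).  With fewer than
		 * vm_accellerate_zf_pageout_trigger (400) zf pages in
		 * the system, zf_ratio == 0 and zf pages are only taken
		 * when the normal inactive queue is empty.
		 */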
1185
1186 /*
1187 * Recalculate vm_page_inactive_target.
1188 */
1189 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1190 vm_page_inactive_count +
1191 vm_page_speculative_count);
1192 /*
1193 * don't want to wake the pageout_scan thread up every time we fall below
1194 * the targets... set a low water mark at 0.25% below the target
1195 */
1196 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1197
1198 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1199 vm_page_inactive_count);
1200 object = NULL;
1201 last_object_tried = NULL;
1202 try_failed = FALSE;
1203
1204 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1205 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1206 else
1207 catch_up_count = 0;
1208
1209 for (;;) {
1210 vm_page_t m;
1211
1212 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1213
1214 if (delayed_unlock == 0) {
1215 vm_page_lock_queues();
1216 delayed_unlock = 1;
1217 }
1218
1219 /*
1220 * Don't sweep through active queue more than the throttle
1221 * which should be kept relatively low
1222 */
1223 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1224 vm_page_active_count);
1225
1226 /*
1227 * Move pages from active to inactive.
1228 */
1229 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1230 goto done_moving_active_pages;
1231
1232 while (!queue_empty(&vm_page_queue_active) && active_burst_count) {
1233
1234 if (active_burst_count)
1235 active_burst_count--;
1236
1237 vm_pageout_active++;
1238
1239 m = (vm_page_t) queue_first(&vm_page_queue_active);
1240
1241 assert(m->active && !m->inactive);
1242 assert(!m->laundry);
1243 assert(m->object != kernel_object);
1244 assert(m->phys_page != vm_page_guard_addr);
1245
1246 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1247
1248 /*
1249 * Try to lock object; since we've already got the
1250 * page queues lock, we can only 'try' for this one.
1251 * if the 'try' fails, we need to do a mutex_pause
1252 * to allow the owner of the object lock a chance to
1253 * run... otherwise, we're likely to trip over this
1254 * object in the same state as we work our way through
1255 * the queue... clumps of pages associated with the same
1256 * object are fairly typical on the inactive and active queues
1257 */
1258 if (m->object != object) {
1259 if (object != NULL) {
1260 vm_object_unlock(object);
1261 object = NULL;
1262 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1263 }
1264 if (!vm_object_lock_try_scan(m->object)) {
1265 /*
1266 * move page to end of active queue and continue
1267 */
1268 queue_remove(&vm_page_queue_active, m,
1269 vm_page_t, pageq);
1270 queue_enter(&vm_page_queue_active, m,
1271 vm_page_t, pageq);
1272
1273 try_failed = TRUE;
1274
1275 m = (vm_page_t) queue_first(&vm_page_queue_active);
1276 /*
1277 * this is the next object we're going to be interested in
1278 * try to make sure it's available after the mutex_yield
1279 * returns control
1280 */
1281 vm_pageout_scan_wants_object = m->object;
1282
1283 goto done_with_activepage;
1284 }
1285 object = m->object;
1286
1287 try_failed = FALSE;
1288 }
1289
1290 /*
1291 * if the page is BUSY, then we pull it
1292 * off the active queue and leave it alone.
1293 * when BUSY is cleared, it will get stuck
1294 * back on the appropriate queue
1295 */
1296 if (m->busy) {
1297 queue_remove(&vm_page_queue_active, m,
1298 vm_page_t, pageq);
1299 m->pageq.next = NULL;
1300 m->pageq.prev = NULL;
1301
1302 if (!m->fictitious)
1303 vm_page_active_count--;
1304 m->active = FALSE;
1305
1306 goto done_with_activepage;
1307 }
1308
1309 /* deal with a rogue "reusable" page */
1310 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
1311
1312 /*
1313 * Deactivate the page while holding the object
1314 * locked, so we know the page is still not busy.
1315 * This should prevent races between pmap_enter
1316 * and pmap_clear_reference. The page might be
1317 * absent or fictitious, but vm_page_deactivate
1318 * can handle that.
1319 */
1320 vm_page_deactivate(m);
1321
1322 done_with_activepage:
1323 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1324
1325 if (object != NULL) {
1326 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1327 vm_object_unlock(object);
1328 object = NULL;
1329 }
1330 if (local_freeq) {
1331 vm_page_unlock_queues();
1332 vm_page_free_list(local_freeq, TRUE);
1333
1334 local_freeq = NULL;
1335 local_freed = 0;
1336 vm_page_lock_queues();
1337 } else
1338 lck_mtx_yield(&vm_page_queue_lock);
1339
1340 delayed_unlock = 1;
1341
1342 /*
1343 * continue the while loop processing
1344 * the active queue... need to hold
1345 * the page queues lock
1346 */
1347 }
1348 }
1349
1350
1351
1352 /**********************************************************************
1353 * above this point we're playing with the active queue
1354 * below this point we're playing with the throttling mechanisms
1355 * and the inactive queue
1356 **********************************************************************/
1357
1358 done_moving_active_pages:
1359
1360 /*
1361 * We are done if we have met our target *and*
1362 * nobody is still waiting for a page.
1363 */
1364 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1365 if (object != NULL) {
1366 vm_object_unlock(object);
1367 object = NULL;
1368 }
1369 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1370
1371 if (local_freeq) {
1372 vm_page_unlock_queues();
1373 vm_page_free_list(local_freeq, TRUE);
1374
1375 local_freeq = NULL;
1376 local_freed = 0;
1377 vm_page_lock_queues();
1378 }
1379 /*
1380 * inactive target still not met... keep going
1381 * until we get the queues balanced
1382 */
1383
1384 /*
1385 * Recalculate vm_page_inactive_target.
1386 */
1387 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1388 vm_page_inactive_count +
1389 vm_page_speculative_count);
1390
1391 #ifndef CONFIG_EMBEDDED
1392 /*
1393 * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying
1394 * to balance the queues
1395 */
1396 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1397 !queue_empty(&vm_page_queue_active))
1398 continue;
1399 #endif
1400
1401 lck_mtx_lock(&vm_page_queue_free_lock);
1402
1403 if ((vm_page_free_count >= vm_page_free_target) &&
1404 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1405
1406 vm_page_unlock_queues();
1407
1408 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1409
1410 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1411
1412 return;
1413 }
1414 lck_mtx_unlock(&vm_page_queue_free_lock);
1415 }
1416
1417 /*
1418 * Before anything, we check if we have any ripe volatile
1419 * objects around. If so, try to purge the first object.
1420 * If the purge fails, fall through to reclaim a page instead.
1421 * If the purge succeeds, go back to the top and reevaluate
1422 * the new memory situation.
1423 */
1424 assert (available_for_purge>=0);
1425 if (available_for_purge)
1426 {
1427 if (object != NULL) {
1428 vm_object_unlock(object);
1429 object = NULL;
1430 }
1431 if(TRUE == vm_purgeable_object_purge_one()) {
1432 continue;
1433 }
1434 }
1435
1436 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1437 /*
1438 * try to pull pages from the aging bins
1439 * see vm_page.h for an explanation of how
1440 * this mechanism works
1441 */
1442 struct vm_speculative_age_q *aq;
1443 mach_timespec_t ts_fully_aged;
1444 boolean_t can_steal = FALSE;
1445 int num_scanned_queues;
1446
1447 aq = &vm_page_queue_speculative[speculative_steal_index];
1448
1449 num_scanned_queues = 0;
1450 while (queue_empty(&aq->age_q) &&
1451 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1452
1453 speculative_steal_index++;
1454
1455 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1456 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1457
1458 aq = &vm_page_queue_speculative[speculative_steal_index];
1459 }
1460
1461 if (num_scanned_queues ==
1462 VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1463 /*
1464 * XXX We've scanned all the speculative
1465 * queues but still haven't found one
1466 * that is not empty, even though
1467 * vm_page_speculative_count is not 0.
1468 */
1469 /* report the anomaly... */
1470 printf("vm_pageout_scan: "
1471 "all speculative queues empty "
1472 "but count=%d. Re-adjusting.\n",
1473 vm_page_speculative_count);
1474 if (vm_page_speculative_count >
1475 vm_page_speculative_count_drift_max)
1476 vm_page_speculative_count_drift_max = vm_page_speculative_count;
1477 vm_page_speculative_count_drifts++;
1478 #if 6553678
1479 Debugger("vm_pageout_scan: no speculative pages");
1480 #endif
1481 /* readjust... */
1482 vm_page_speculative_count = 0;
1483 /* ... and continue */
1484 continue;
1485 }
1486
1487 if (vm_page_speculative_count > vm_page_speculative_target)
1488 can_steal = TRUE;
1489 else {
1490 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1491 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1492 * 1000 * NSEC_PER_USEC;
1493
1494 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1495
1496 clock_sec_t sec;
1497 clock_nsec_t nsec;
1498 clock_get_system_nanotime(&sec, &nsec);
1499 ts.tv_sec = (unsigned int) sec;
1500 ts.tv_nsec = nsec;
1501
1502 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1503 can_steal = TRUE;
1504 }
1505 if (can_steal == TRUE)
1506 vm_page_speculate_ageit(aq);
1507 }
1508
1509 /*
1510 * Sometimes we have to pause:
1511 * 1) No inactive pages - nothing to do.
1512 * 2) Flow control - default pageout queue is full
1513 * 3) Loop control - no acceptable pages found on the inactive queue
1514 * within the last vm_pageout_burst_inactive_throttle iterations
1515 */
1516 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1517 (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1518 vm_pageout_scan_empty_throttle++;
1519 msecs = vm_pageout_empty_wait;
1520 goto vm_pageout_scan_delay;
1521
1522 } else if (inactive_burst_count >=
1523 MIN(vm_pageout_burst_inactive_throttle,
1524 (vm_page_inactive_count +
1525 vm_page_speculative_count))) {
1526 vm_pageout_scan_burst_throttle++;
1527 msecs = vm_pageout_burst_wait;
1528 goto vm_pageout_scan_delay;
1529
1530 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1531 clock_sec_t sec;
1532 clock_nsec_t nsec;
1533
1534 switch (flow_control.state) {
1535
1536 case FCS_IDLE:
1537 reset_deadlock_timer:
1538 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1539 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1540 clock_get_system_nanotime(&sec, &nsec);
1541 flow_control.ts.tv_sec = (unsigned int) sec;
1542 flow_control.ts.tv_nsec = nsec;
1543 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1544
1545 flow_control.state = FCS_DELAYED;
1546 msecs = vm_pageout_deadlock_wait;
1547
1548 break;
1549
1550 case FCS_DELAYED:
1551 clock_get_system_nanotime(&sec, &nsec);
1552 ts.tv_sec = (unsigned int) sec;
1553 ts.tv_nsec = nsec;
1554
1555 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1556 /*
1557 * the pageout thread for the default pager is potentially
1558 * deadlocked since the
1559 * default pager queue has been throttled for more than the
1560 * allowable time... we need to move some clean pages or dirty
1561 * pages belonging to the external pagers if they aren't throttled
1562 * vm_page_free_wanted represents the number of threads currently
1563 * blocked waiting for pages... we'll move one page for each of
1564 * these plus a fixed amount to break the logjam... once we're done
1565 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1566 * with a new timeout target since we have no way of knowing
1567 * whether we've broken the deadlock except through observation
1568 * of the queue associated with the default pager... we need to
1569 * stop moving pages and allow the system to run to see what
1570 * state it settles into.
1571 */
1572 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1573 vm_pageout_scan_deadlock_detected++;
1574 flow_control.state = FCS_DEADLOCK_DETECTED;
1575
1576 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1577 goto consider_inactive;
1578 }
1579 /*
1580 * just resniff instead of trying
1581 * to compute a new delay time... we're going to be
1582 * awakened immediately upon a laundry completion,
1583 * so we won't wait any longer than necessary
1584 */
1585 msecs = vm_pageout_idle_wait;
1586 break;
1587
1588 case FCS_DEADLOCK_DETECTED:
1589 if (vm_pageout_deadlock_target)
1590 goto consider_inactive;
1591 goto reset_deadlock_timer;
1592
1593 }
1594 vm_pageout_scan_throttle++;
1595 iq->pgo_throttled = TRUE;
1596 vm_pageout_scan_delay:
1597 if (object != NULL) {
1598 vm_object_unlock(object);
1599 object = NULL;
1600 }
1601 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1602
1603 if (local_freeq) {
1604 vm_page_unlock_queues();
1605 vm_page_free_list(local_freeq, TRUE);
1606
1607 local_freeq = NULL;
1608 local_freed = 0;
1609 vm_page_lock_queues();
1610
1611 if (flow_control.state == FCS_DELAYED &&
1612 !VM_PAGE_Q_THROTTLED(iq)) {
1613 flow_control.state = FCS_IDLE;
1614 vm_pageout_scan_throttle_aborted++;
1615 goto consider_inactive;
1616 }
1617 }
1618 #if CONFIG_EMBEDDED
1619 {
1620 int percent_avail;
1621
1622 /*
1623 * Decide if we need to send a memory status notification.
1624 */
1625 percent_avail =
1626 (vm_page_active_count + vm_page_inactive_count +
1627 vm_page_speculative_count + vm_page_free_count +
1628 (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1629 atop_64(max_mem);
1630 if (percent_avail >= (kern_memorystatus_level + 5) ||
1631 percent_avail <= (kern_memorystatus_level - 5)) {
1632 kern_memorystatus_level = percent_avail;
1633 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1634 }
1635 }
1636 #endif
1637 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1638 counter(c_vm_pageout_scan_block++);
1639
1640 vm_page_unlock_queues();
1641
1642 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1643
1644 thread_block(THREAD_CONTINUE_NULL);
1645
1646 vm_page_lock_queues();
1647 delayed_unlock = 1;
1648
1649 iq->pgo_throttled = FALSE;
1650
1651 if (loop_count >= vm_page_inactive_count)
1652 loop_count = 0;
1653 inactive_burst_count = 0;
1654
1655 goto Restart;
1656 /*NOTREACHED*/
1657 }
1658
1659
1660 flow_control.state = FCS_IDLE;
1661 consider_inactive:
1662 loop_count++;
1663 inactive_burst_count++;
1664 vm_pageout_inactive++;
1665
1666 /* Choose a victim. */
1667
1668 while (1) {
1669 m = NULL;
1670
1671 if (IP_VALID(memory_manager_default)) {
1672 assert(vm_page_throttled_count == 0);
1673 assert(queue_empty(&vm_page_queue_throttled));
1674 }
1675
1676 /*
1677 * The most eligible pages are ones we paged in speculatively,
1678 * but which have not yet been touched.
1679 */
1680 if ( !queue_empty(&sq->age_q) ) {
1681 m = (vm_page_t) queue_first(&sq->age_q);
1682 break;
1683 }
1684 /*
1685 * Time for a zero-filled inactive page?
1686 */
1687 if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1688 queue_empty(&vm_page_queue_inactive)) {
1689 if ( !queue_empty(&vm_page_queue_zf) ) {
1690 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1691 zf_run_count++;
1692 break;
1693 }
1694 }
1695 /*
1696 * It's either a normal inactive page or nothing.
1697 */
1698 if ( !queue_empty(&vm_page_queue_inactive) ) {
1699 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1700 zf_run_count = 0;
1701 break;
1702 }
1703
1704 panic("vm_pageout: no victim");
1705 }
1706
1707 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1708 assert(!m->laundry);
1709 assert(m->object != kernel_object);
1710 assert(m->phys_page != vm_page_guard_addr);
1711
1712 if (!m->speculative) {
1713 vm_pageout_stats[vm_pageout_stat_now].considered++;
1714 }
1715
1716 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1717
1718 /*
1719 * check to see if we currently are working
1720 * with the same object... if so, we've
1721 * already got the lock
1722 */
1723 if (m->object != object) {
1724 /*
1725 * the object associated with candidate page is
1726 * different from the one we were just working
1727 * with... dump the lock if we still own it
1728 */
1729 if (object != NULL) {
1730 vm_object_unlock(object);
1731 object = NULL;
1732 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1733 }
1734 /*
1735 * Try to lock object; since we've already got the
1736 * page queues lock, we can only 'try' for this one.
1737 * if the 'try' fails, we need to do a mutex_pause
1738 * to allow the owner of the object lock a chance to
1739 * run... otherwise, we're likely to trip over this
1740 * object in the same state as we work our way through
1741 * the queue... clumps of pages associated with the same
1742 * object are fairly typical on the inactive and active queues
1743 */
1744 if (!vm_object_lock_try_scan(m->object)) {
1745 vm_pageout_inactive_nolock++;
1746
1747 requeue_page:
1748 /*
1749 * Move page to end and continue.
1750 * Don't re-issue ticket
1751 */
1752 if (m->zero_fill) {
1753 if (m->speculative) {
1754 panic("vm_pageout_scan(): page %p speculative and zero-fill !?\n", m);
1755 }
1756 assert(!m->speculative);
1757 queue_remove(&vm_page_queue_zf, m,
1758 vm_page_t, pageq);
1759 queue_enter(&vm_page_queue_zf, m,
1760 vm_page_t, pageq);
1761 } else if (m->speculative) {
1762 remque(&m->pageq);
1763 m->speculative = FALSE;
1764 vm_page_speculative_count--;
1765
1766 /*
1767 * move to the head of the inactive queue
1768 * to get it out of the way... the speculative
1769 * queue is generally too small to depend
1770 * on there being enough pages from other
1771 * objects to make cycling it back on the
1772 * same queue a winning proposition
1773 */
1774 queue_enter_first(&vm_page_queue_inactive, m,
1775 vm_page_t, pageq);
1776 m->inactive = TRUE;
1777 vm_page_inactive_count++;
1778 token_new_pagecount++;
1779 } else if (m->throttled) {
1780 queue_remove(&vm_page_queue_throttled, m,
1781 vm_page_t, pageq);
1782 m->throttled = FALSE;
1783 vm_page_throttled_count--;
1784
1785 /*
1786 * not throttled any more, so can stick
1787 * it on the inactive queue.
1788 */
1789 queue_enter(&vm_page_queue_inactive, m,
1790 vm_page_t, pageq);
1791 m->inactive = TRUE;
1792 vm_page_inactive_count++;
1793 token_new_pagecount++;
1794 } else {
1795 queue_remove(&vm_page_queue_inactive, m,
1796 vm_page_t, pageq);
1797 #if MACH_ASSERT
1798 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1799 #endif
1800 vm_purgeable_q_advance_all();
1801
1802 queue_enter(&vm_page_queue_inactive, m,
1803 vm_page_t, pageq);
1804 #if MACH_ASSERT
1805 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1806 #endif
1807 token_new_pagecount++;
1808 }
1809 pmap_clear_reference(m->phys_page);
1810 m->reference = FALSE;
1811
1812 if ( !queue_empty(&sq->age_q) )
1813 m = (vm_page_t) queue_first(&sq->age_q);
1814 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1815 queue_empty(&vm_page_queue_inactive)) {
1816 if ( !queue_empty(&vm_page_queue_zf) )
1817 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1818 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1819 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1820 }
1821 /*
1822 * this is the next object we're going to be interested in
1823 * try to make sure it's available after the mutex_yield
1824 * returns control
1825 */
1826 vm_pageout_scan_wants_object = m->object;
1827
1828 /*
1829 * force us to dump any collected free pages
1830 * and to pause before moving on
1831 */
1832 try_failed = TRUE;
1833
1834 goto done_with_inactivepage;
1835 }
1836 object = m->object;
1837 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1838
1839 try_failed = FALSE;
1840 }
1841
1842 /*
1843 * Paging out pages of external objects which
1844 * are currently being created must be avoided.
1845 * The pager may claim memory, leading to a possible
1846 * deadlock between it and the pageout thread if such
1847 * pages are chosen. The assumption is that there will
1848 * eventually be enough available pages in the inactive
1849 * pool to page out to satisfy all memory claimed by
1850 * the thread which concurrently creates the pager.
1851 */
1852 if (!object->pager_initialized && object->pager_created) {
1853 /*
1854 * Move page to end and continue, hoping that
1855 * there will be enough other inactive pages to
1856 * page out so that the thread which currently
1857 * initializes the pager will succeed.
1858 * Don't re-grant the ticket; the page should be
1859 * pulled from the queue and paged out whenever
1860 * one of its logically adjacent fellows is
1861 * targeted.
1862 */
1863 vm_pageout_inactive_avoid++;
1864 goto requeue_page;
1865 }
1866 /*
1867 * Remove the page from its list.
1868 */
1869 if (m->speculative) {
1870 remque(&m->pageq);
1871 page_prev_state = PAGE_STATE_SPECULATIVE;
1872 m->speculative = FALSE;
1873 vm_page_speculative_count--;
1874 } else if (m->throttled) {
1875 queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1876 page_prev_state = PAGE_STATE_THROTTLED;
1877 m->throttled = FALSE;
1878 vm_page_throttled_count--;
1879 } else {
1880 if (m->zero_fill) {
1881 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1882 page_prev_state = PAGE_STATE_ZEROFILL;
1883 vm_zf_queue_count--;
1884 } else {
1885 page_prev_state = PAGE_STATE_INACTIVE;
1886 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1887 }
1888 m->inactive = FALSE;
1889 if (!m->fictitious)
1890 vm_page_inactive_count--;
1891 vm_purgeable_q_advance_all();
1892 }
1893
1894 m->pageq.next = NULL;
1895 m->pageq.prev = NULL;
1896
1897 if ( !m->fictitious && catch_up_count)
1898 catch_up_count--;
1899
1900 /*
1901 * ENCRYPTED SWAP:
1902 * if this page has already been picked up as part of a
1903 * page-out cluster, it will be busy because it is being
1904 * encrypted (see vm_object_upl_request()). But we still
1905 * want to demote it from "clean-in-place" (aka "adjacent")
1906 * to "clean-and-free" (aka "target"), so let's ignore its
1907 * "busy" bit here and proceed to check for "cleaning" a
1908 * little bit below...
1909 */
1910 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1911 /*
1912 * Somebody is already playing with this page.
1913 * Leave it off the pageout queues.
1914 *
1915 */
1916 vm_pageout_inactive_busy++;
1917
1918 goto done_with_inactivepage;
1919 }
1920
1921 /*
1922 * If it's absent or in error, we can reclaim the page.
1923 */
1924
1925 if (m->absent || m->error) {
1926 vm_pageout_inactive_absent++;
1927 reclaim_page:
1928 if (vm_pageout_deadlock_target) {
1929 vm_pageout_scan_inactive_throttle_success++;
1930 vm_pageout_deadlock_target--;
1931 }
1932
1933 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1934
1935 if (object->internal) {
1936 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1937 } else {
1938 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1939 }
1940 vm_page_free_prepare_queues(m);
1941
1942 /*
1943 * remove page from object here since we're already
1944 * behind the object lock... defer the rest of the work
1945 * we'd normally do in vm_page_free_prepare_object
1946 * until 'vm_page_free_list' is called
1947 */
1948 if (m->tabled)
1949 vm_page_remove(m, TRUE);
1950
1951 assert(m->pageq.next == NULL &&
1952 m->pageq.prev == NULL);
1953 m->pageq.next = (queue_entry_t)local_freeq;
1954 local_freeq = m;
1955 local_freed++;
1956
1957 inactive_burst_count = 0;
1958
1959 if(page_prev_state != PAGE_STATE_SPECULATIVE) {
1960 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
1961 page_prev_state = 0;
1962 }
1963
1964 goto done_with_inactivepage;
1965 }
1966
1967 assert(!m->private);
1968 assert(!m->fictitious);
1969
1970 /*
1971 * If already cleaning this page in place, convert from
1972 * "adjacent" to "target". We can leave the page mapped,
1973 * and vm_pageout_object_terminate will determine whether
1974 * to free or reactivate.
1975 */
1976
1977 if (m->cleaning) {
1978 m->busy = TRUE;
1979 m->pageout = TRUE;
1980 m->dump_cleaning = TRUE;
1981 vm_page_wire(m);
1982
1983 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1984
1985 inactive_burst_count = 0;
1986
1987 goto done_with_inactivepage;
1988 }
1989
1990 /*
1991 * If the object is empty, the page must be reclaimed even
1992 * if dirty or used.
1993 * If the page belongs to a volatile object, we stick it back
1994 * on.
1995 */
1996 if (object->copy == VM_OBJECT_NULL) {
1997 if (object->purgable == VM_PURGABLE_EMPTY) {
1998 m->busy = TRUE;
1999 if (m->pmapped == TRUE) {
2000 /* unmap the page */
2001 refmod_state = pmap_disconnect(m->phys_page);
2002 if (refmod_state & VM_MEM_MODIFIED) {
2003 m->dirty = TRUE;
2004 }
2005 }
2006 if (m->dirty || m->precious) {
2007 /* we saved the cost of cleaning this page ! */
2008 vm_page_purged_count++;
2009 }
2010 goto reclaim_page;
2011 }
2012 if (object->purgable == VM_PURGABLE_VOLATILE) {
2013 /* if it's wired, we can't put it on our queue */
2014 assert(!VM_PAGE_WIRED(m));
2015 /* just stick it back on! */
2016 goto reactivate_page;
2017 }
2018 }
2019
2020 /*
2021 * If it's being used, reactivate.
2022 * (Fictitious pages are either busy or absent.)
2023 * First, update the reference and dirty bits
2024 * to make sure the page is unreferenced.
2025 */
2026 refmod_state = -1;
2027
2028 if (m->reference == FALSE && m->pmapped == TRUE) {
2029 refmod_state = pmap_get_refmod(m->phys_page);
2030
2031 if (refmod_state & VM_MEM_REFERENCED)
2032 m->reference = TRUE;
2033 if (refmod_state & VM_MEM_MODIFIED)
2034 m->dirty = TRUE;
2035 }
2036
2037 if (m->reference || m->dirty) {
2038 /* deal with a rogue "reusable" page */
2039 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
2040 }
2041
2042 if (m->reference && !m->no_cache) {
2043 /*
2044 * The page we pulled off the inactive list has
2045 * been referenced. It is possible for other
2046 * processors to be touching pages faster than we
2047 * can clear the referenced bit and traverse the
2048 * inactive queue, so we limit the number of
2049 * reactivations.
2050 */
2051 if (++reactivated_this_call >= reactivate_limit) {
2052 vm_pageout_reactivation_limit_exceeded++;
2053 } else if (catch_up_count) {
2054 vm_pageout_catch_ups++;
2055 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
2056 vm_pageout_inactive_force_reclaim++;
2057 } else {
2058 uint32_t isinuse;
2059 reactivate_page:
2060 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
2061 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
2062 /*
2063 * no explicit mappings of this object exist
2064 * and it's not open via the filesystem
2065 */
2066 vm_page_deactivate(m);
2067 vm_pageout_inactive_deactivated++;
2068 } else {
2069 /*
2070 * The page was/is being used, so put back on active list.
2071 */
2072 vm_page_activate(m);
2073 VM_STAT_INCR(reactivations);
2074 }
2075 vm_pageout_inactive_used++;
2076 inactive_burst_count = 0;
2077
2078 goto done_with_inactivepage;
2079 }
2080 /*
2081 * Make sure we call pmap_get_refmod() if it
2082 * wasn't already called just above, to update
2083 * the dirty bit.
2084 */
2085 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
2086 refmod_state = pmap_get_refmod(m->phys_page);
2087 if (refmod_state & VM_MEM_MODIFIED)
2088 m->dirty = TRUE;
2089 }
2090 forced_reclaim = TRUE;
2091 } else {
2092 forced_reclaim = FALSE;
2093 }
2094
2095 XPR(XPR_VM_PAGEOUT,
2096 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
2097 object, m->offset, m, 0,0);
2098
2099 /*
2100 * we've got a candidate page to steal...
2101 *
2102 * m->dirty is up to date courtesy of the
2103 * preceding check for m->reference... if
2104 * we get here, then m->reference had to be
2105 * FALSE (or possibly "reactivate_limit" was
2106 * exceeded), but in either case we called
2107 * pmap_get_refmod() and updated both
2108 * m->reference and m->dirty
2109 *
2110 * if it's dirty or precious we need to
2111 * see if the target queue is throttled...
2112 * if it is, we need to skip over it by moving it back
2113 * to the end of the inactive queue
2114 */
2115
2116 inactive_throttled = FALSE;
2117
2118 if (m->dirty || m->precious) {
2119 if (object->internal) {
2120 if (VM_PAGE_Q_THROTTLED(iq))
2121 inactive_throttled = TRUE;
2122 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2123 inactive_throttled = TRUE;
2124 }
2125 }
2126 if (inactive_throttled == TRUE) {
2127 throttle_inactive:
2128 if (!IP_VALID(memory_manager_default) &&
2129 object->internal && m->dirty &&
2130 (object->purgable == VM_PURGABLE_DENY ||
2131 object->purgable == VM_PURGABLE_NONVOLATILE ||
2132 object->purgable == VM_PURGABLE_VOLATILE)) {
2133 queue_enter(&vm_page_queue_throttled, m,
2134 vm_page_t, pageq);
2135 m->throttled = TRUE;
2136 vm_page_throttled_count++;
2137 } else {
2138 if (m->zero_fill) {
2139 queue_enter(&vm_page_queue_zf, m,
2140 vm_page_t, pageq);
2141 vm_zf_queue_count++;
2142 } else
2143 queue_enter(&vm_page_queue_inactive, m,
2144 vm_page_t, pageq);
2145 m->inactive = TRUE;
2146 if (!m->fictitious) {
2147 vm_page_inactive_count++;
2148 token_new_pagecount++;
2149 }
2150 }
2151 vm_pageout_scan_inactive_throttled++;
2152 goto done_with_inactivepage;
2153 }
2154
2155 /*
2156 * we've got a page that we can steal...
2157 * eliminate all mappings and make sure
2158 * we have the up-to-date modified state...
2159 * first take the page BUSY, so that no new
2160 * mappings can be made
2161 */
2162 m->busy = TRUE;
2163
2164 /*
2165 * if we need to do a pmap_disconnect then we
2166 * need to re-evaluate m->dirty since the pmap_disconnect
2167 * provides the true state atomically... the
2168 * page was still mapped up to the pmap_disconnect
2169 * and may have been dirtied at the last microsecond
2170 *
2171 * we also check for the page being referenced 'late'
2172 * if it was, we first need to do a WAKEUP_DONE on it
2173 * since we already set m->busy = TRUE, before
2174 * going off to reactivate it
2175 *
2176 * Note that if 'pmapped' is FALSE then the page is not,
2177 * and has not been, in any map, so there is no point calling
2178 * pmap_disconnect(). m->dirty and/or m->reference could
2179 * have been set in anticipation of likely usage of the page.
2180 */
2181 if (m->pmapped == TRUE) {
2182 refmod_state = pmap_disconnect(m->phys_page);
2183
2184 if (refmod_state & VM_MEM_MODIFIED)
2185 m->dirty = TRUE;
2186 if (refmod_state & VM_MEM_REFERENCED) {
2187
2188 /* If m->reference is already set, this page must have
2189 * already failed the reactivate_limit test, so don't
2190 * bump the counts twice.
2191 */
2192 if ( ! m->reference ) {
2193 m->reference = TRUE;
2194 if (forced_reclaim ||
2195 ++reactivated_this_call >= reactivate_limit)
2196 vm_pageout_reactivation_limit_exceeded++;
2197 else {
2198 PAGE_WAKEUP_DONE(m);
2199 goto reactivate_page;
2200 }
2201 }
2202 }
2203 }
2204 /*
2205 * reset our count of pages that have been reclaimed
2206 * since the last page was 'stolen'
2207 */
2208 inactive_reclaim_run = 0;
2209
2210 /*
2211 * If it's clean and not precious, we can free the page.
2212 */
2213 if (!m->dirty && !m->precious) {
2214 if (m->zero_fill)
2215 vm_pageout_inactive_zf++;
2216 vm_pageout_inactive_clean++;
2217
2218 goto reclaim_page;
2219 }
2220
2221 /*
2222 * The page may have been dirtied since the last check
2223 * for a throttled target queue (which may have been skipped
2224 * if the page was clean then). With the dirty page
2225 * disconnected here, we can make one final check.
2226 */
2227 {
2228 boolean_t disconnect_throttled = FALSE;
2229 if (object->internal) {
2230 if (VM_PAGE_Q_THROTTLED(iq))
2231 disconnect_throttled = TRUE;
2232 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2233 disconnect_throttled = TRUE;
2234 }
2235
2236 if (disconnect_throttled == TRUE) {
2237 PAGE_WAKEUP_DONE(m);
2238 goto throttle_inactive;
2239 }
2240 }
2241
2242 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
2243
2244 vm_pageout_cluster(m);
2245
2246 if (m->zero_fill)
2247 vm_pageout_inactive_zf++;
2248 vm_pageout_inactive_dirty++;
2249
2250 inactive_burst_count = 0;
2251
2252 done_with_inactivepage:
2253 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
2254
2255 if (object != NULL) {
2256 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2257 vm_object_unlock(object);
2258 object = NULL;
2259 }
2260 if (local_freeq) {
2261 vm_page_unlock_queues();
2262 vm_page_free_list(local_freeq, TRUE);
2263
2264 local_freeq = NULL;
2265 local_freed = 0;
2266 vm_page_lock_queues();
2267 } else
2268 lck_mtx_yield(&vm_page_queue_lock);
2269
2270 delayed_unlock = 1;
2271 }
2272 /*
2273 * back to top of pageout scan loop
2274 */
2275 }
2276 }
2277
2278
2279 int vm_page_free_count_init;
2280
2281 void
2282 vm_page_free_reserve(
2283 int pages)
2284 {
2285 int free_after_reserve;
2286
2287 vm_page_free_reserved += pages;
2288
2289 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2290
2291 vm_page_free_min = vm_page_free_reserved +
2292 VM_PAGE_FREE_MIN(free_after_reserve);
2293
2294 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2295 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2296
2297 vm_page_free_target = vm_page_free_reserved +
2298 VM_PAGE_FREE_TARGET(free_after_reserve);
2299
2300 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2301 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2302
2303 if (vm_page_free_target < vm_page_free_min + 5)
2304 vm_page_free_target = vm_page_free_min + 5;
2305
2306 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
2307 vm_page_creation_throttle = vm_page_free_target / 2;
2308 }
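/*
 * For illustration only (the exact VM_PAGE_FREE_MIN/TARGET scaling comes
 * from macros defined elsewhere), the relationships established above are:
 *
 *	vm_page_free_target       >= vm_page_free_min + 5
 *	vm_page_throttle_limit     = free_target - free_target/3   (~2/3 of target)
 *	vm_page_creation_throttle  = free_target / 2
 *
 * e.g. if free_target works out to 3000 pages, the throttle limit is
 * 2000 pages and the creation throttle is 1500 pages.
 */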
2309
2310 /*
2311 * vm_pageout is the high level pageout daemon.
2312 */
2313
2314 void
2315 vm_pageout_continue(void)
2316 {
2317 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2318 vm_pageout_scan_event_counter++;
2319 vm_pageout_scan();
2320 /* we hold vm_page_queue_free_lock now */
2321 assert(vm_page_free_wanted == 0);
2322 assert(vm_page_free_wanted_privileged == 0);
2323 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2324 lck_mtx_unlock(&vm_page_queue_free_lock);
2325
2326 counter(c_vm_pageout_block++);
2327 thread_block((thread_continue_t)vm_pageout_continue);
2328 /*NOTREACHED*/
2329 }
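/*
 * Note: because thread_block() is given vm_pageout_continue itself as the
 * continuation, the daemon's kernel stack can be discarded while it sleeps
 * and, on a free-page wakeup, execution restarts at the top of
 * vm_pageout_continue(), re-running vm_pageout_scan().
 */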
2330
2331
2332 #ifdef FAKE_DEADLOCK
2333
2334 #define FAKE_COUNT 5000
2335
2336 int internal_count = 0;
2337 int fake_deadlock = 0;
2338
2339 #endif
2340
2341 static void
2342 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2343 {
2344 vm_page_t m = NULL;
2345 vm_object_t object;
2346 memory_object_t pager;
2347 thread_t self = current_thread();
2348
2349 if ((vm_pageout_internal_iothread != THREAD_NULL)
2350 && (self == vm_pageout_external_iothread )
2351 && (self->options & TH_OPT_VMPRIV))
2352 self->options &= ~TH_OPT_VMPRIV;
2353
2354 vm_page_lockspin_queues();
2355
2356 while ( !queue_empty(&q->pgo_pending) ) {
2357
2358 q->pgo_busy = TRUE;
2359 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2360 VM_PAGE_CHECK(m);
2361 m->pageout_queue = FALSE;
2362 m->pageq.next = NULL;
2363 m->pageq.prev = NULL;
2364 vm_page_unlock_queues();
2365
2366 #ifdef FAKE_DEADLOCK
2367 if (q == &vm_pageout_queue_internal) {
2368 vm_offset_t addr;
2369 int pg_count;
2370
2371 internal_count++;
2372
2373 if ((internal_count == FAKE_COUNT)) {
2374
2375 pg_count = vm_page_free_count + vm_page_free_reserved;
2376
2377 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2378 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2379 }
2380 internal_count = 0;
2381 fake_deadlock++;
2382 }
2383 }
2384 #endif
2385 object = m->object;
2386
2387 vm_object_lock(object);
2388
2389 if (!object->pager_initialized) {
2390
2391 /*
2392 * If there is no memory object for the page, create
2393 * one and hand it to the default pager.
2394 */
2395
2396 if (!object->pager_initialized)
2397 vm_object_collapse(object,
2398 (vm_object_offset_t) 0,
2399 TRUE);
2400 if (!object->pager_initialized)
2401 vm_object_pager_create(object);
2402 if (!object->pager_initialized) {
2403 /*
2404 * Still no pager for the object.
2405 * Reactivate the page.
2406 *
2407 * Should only happen if there is no
2408 * default pager.
2409 */
2410 vm_page_lockspin_queues();
2411
2412 vm_pageout_queue_steal(m, TRUE);
2413 vm_pageout_dirty_no_pager++;
2414 vm_page_activate(m);
2415
2416 vm_page_unlock_queues();
2417
2418 /*
2419 * And we are done with it.
2420 */
2421 PAGE_WAKEUP_DONE(m);
2422
2423 vm_object_paging_end(object);
2424 vm_object_unlock(object);
2425
2426 vm_page_lockspin_queues();
2427 continue;
2428 }
2429 }
2430 pager = object->pager;
2431 if (pager == MEMORY_OBJECT_NULL) {
2432 /*
2433 * This pager has been destroyed by either
2434 * memory_object_destroy or vm_object_destroy, and
2435 * so there is nowhere for the page to go.
2436 */
2437 if (m->pageout) {
2438 /*
2439 * Just free the page... VM_PAGE_FREE takes
2440 * care of cleaning up all the state...
2441 * including doing the vm_pageout_throttle_up
2442 */
2443 VM_PAGE_FREE(m);
2444 } else {
2445 vm_page_lockspin_queues();
2446
2447 vm_pageout_queue_steal(m, TRUE);
2448 vm_page_activate(m);
2449
2450 vm_page_unlock_queues();
2451
2452 /*
2453 * And we are done with it.
2454 */
2455 PAGE_WAKEUP_DONE(m);
2456 }
2457 vm_object_paging_end(object);
2458 vm_object_unlock(object);
2459
2460 vm_page_lockspin_queues();
2461 continue;
2462 }
2463 VM_PAGE_CHECK(m);
2464 vm_object_unlock(object);
2465 /*
2466 * we expect the paging_in_progress reference to have
2467 * already been taken on the object before it was added
2468 * to the appropriate pageout I/O queue... this will
2469 * keep the object from being terminated and/or the
2470 * paging_offset from changing until the I/O has
2471 * completed... therefore no need to lock the object to
2472 * pull the paging_offset from it.
2473 *
2474 * Send the data to the pager.
2475 * any pageout clustering happens there
2476 */
2477 memory_object_data_return(pager,
2478 m->offset + object->paging_offset,
2479 PAGE_SIZE,
2480 NULL,
2481 NULL,
2482 FALSE,
2483 FALSE,
2484 0);
2485
2486 vm_object_lock(object);
2487 vm_object_paging_end(object);
2488 vm_object_unlock(object);
2489
2490 vm_page_lockspin_queues();
2491 }
2492 assert_wait((event_t) q, THREAD_UNINT);
2493
2494 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2495 q->pgo_throttled = FALSE;
2496 thread_wakeup((event_t) &q->pgo_laundry);
2497 }
2498 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
2499 q->pgo_draining = FALSE;
2500 thread_wakeup((event_t) (&q->pgo_laundry+1));
2501 }
2502 q->pgo_busy = FALSE;
2503 q->pgo_idle = TRUE;
2504 vm_page_unlock_queues();
2505
2506 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2507 /*NOTREACHED*/
2508 }
2509
2510
2511 static void
2512 vm_pageout_iothread_external(void)
2513 {
2514 thread_t self = current_thread();
2515
2516 self->options |= TH_OPT_VMPRIV;
2517
2518 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2519 /*NOTREACHED*/
2520 }
2521
2522
2523 static void
2524 vm_pageout_iothread_internal(void)
2525 {
2526 thread_t self = current_thread();
2527
2528 self->options |= TH_OPT_VMPRIV;
2529
2530 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2531 /*NOTREACHED*/
2532 }
2533
2534 kern_return_t
2535 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
2536 {
2537 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
2538 return KERN_SUCCESS;
2539 } else {
2540 return KERN_FAILURE; /* Already set */
2541 }
2542 }
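/*
 * A sketch of the intended registration (the callback name below is
 * hypothetical; in practice the buffer cache layer registers its own
 * routine once during startup):
 *
 *	extern boolean_t my_buffer_cache_gc(int);	/* hypothetical */
 *
 *	if (vm_set_buffer_cleanup_callout(my_buffer_cache_gc) != KERN_SUCCESS) {
 *		/* a callout was already registered; only the first caller wins */
 *	}
 */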
2543
2544 static void
2545 vm_pageout_garbage_collect(int collect)
2546 {
2547 if (collect) {
2548 boolean_t buf_large_zfree = FALSE;
2549 stack_collect();
2550
2551 /*
2552 * consider_zone_gc should be last, because the other operations
2553 * might return memory to zones.
2554 */
2555 consider_machine_collect();
2556 if (consider_buffer_cache_collect != NULL) {
2557 buf_large_zfree = (*consider_buffer_cache_collect)(0);
2558 }
2559 consider_zone_gc(buf_large_zfree);
2560
2561 consider_machine_adjust();
2562 }
2563
2564 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2565
2566 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2567 /*NOTREACHED*/
2568 }
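/*
 * Note: the continuation above is re-entered with a parameter of 1, so every
 * wakeup delivered on &vm_pageout_garbage_collect runs with 'collect'
 * non-zero and performs a full collection pass; only the initial invocation
 * (started with a NULL parameter in vm_pageout() below) sees collect == 0.
 */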
2569
2570
2571
2572 void
2573 vm_pageout(void)
2574 {
2575 thread_t self = current_thread();
2576 thread_t thread;
2577 kern_return_t result;
2578 spl_t s;
2579
2580 /*
2581 * Set thread privileges.
2582 */
2583 s = splsched();
2584 thread_lock(self);
2585 self->priority = BASEPRI_PREEMPT - 1;
2586 set_sched_pri(self, self->priority);
2587 thread_unlock(self);
2588
2589 if (!self->reserved_stack)
2590 self->reserved_stack = self->kernel_stack;
2591
2592 splx(s);
2593
2594 /*
2595 * Initialize some paging parameters.
2596 */
2597
2598 if (vm_pageout_idle_wait == 0)
2599 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2600
2601 if (vm_pageout_burst_wait == 0)
2602 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2603
2604 if (vm_pageout_empty_wait == 0)
2605 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2606
2607 if (vm_pageout_deadlock_wait == 0)
2608 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2609
2610 if (vm_pageout_deadlock_relief == 0)
2611 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2612
2613 if (vm_pageout_inactive_relief == 0)
2614 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2615
2616 if (vm_pageout_burst_active_throttle == 0)
2617 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2618
2619 if (vm_pageout_burst_inactive_throttle == 0)
2620 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2621
2622 /*
2623 * Set kernel task to low backing store privileged
2624 * status
2625 */
2626 task_lock(kernel_task);
2627 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2628 task_unlock(kernel_task);
2629
2630 vm_page_free_count_init = vm_page_free_count;
2631
2632 /*
2633 * even if we've already called vm_page_free_reserve,
2634 * call it again here to ensure that the targets are
2635 * accurately calculated (it uses vm_page_free_count_init)...
2636 * calling it with an arg of 0 will not change the reserve
2637 * but will re-calculate free_min and free_target
2638 */
2639 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2640 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2641 } else
2642 vm_page_free_reserve(0);
2643
2644
2645 queue_init(&vm_pageout_queue_external.pgo_pending);
2646 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2647 vm_pageout_queue_external.pgo_laundry = 0;
2648 vm_pageout_queue_external.pgo_idle = FALSE;
2649 vm_pageout_queue_external.pgo_busy = FALSE;
2650 vm_pageout_queue_external.pgo_throttled = FALSE;
2651 vm_pageout_queue_external.pgo_draining = FALSE;
2652
2653 queue_init(&vm_pageout_queue_internal.pgo_pending);
2654 vm_pageout_queue_internal.pgo_maxlaundry = 0;
2655 vm_pageout_queue_internal.pgo_laundry = 0;
2656 vm_pageout_queue_internal.pgo_idle = FALSE;
2657 vm_pageout_queue_internal.pgo_busy = FALSE;
2658 vm_pageout_queue_internal.pgo_throttled = FALSE;
2659 vm_pageout_queue_internal.pgo_draining = FALSE;
2660
2661
2662 /* internal pageout thread is started when the default pager registers for the first time */
2663 /* external pageout and garbage collection threads started here */
2664
2665 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2666 BASEPRI_PREEMPT - 1,
2667 &vm_pageout_external_iothread);
2668 if (result != KERN_SUCCESS)
2669 panic("vm_pageout_iothread_external: create failed");
2670
2671 thread_deallocate(vm_pageout_external_iothread);
2672
2673 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2674 MINPRI_KERNEL,
2675 &thread);
2676 if (result != KERN_SUCCESS)
2677 panic("vm_pageout_garbage_collect: create failed");
2678
2679 thread_deallocate(thread);
2680
2681 vm_object_reaper_init();
2682
2683
2684 vm_pageout_continue();
2685
2686 /*
2687 * Unreached code!
2688 *
2689 * The vm_pageout_continue() call above never returns, so the code below is never
2690 * executed. We take advantage of this to declare several DTrace VM related probe
2691 * points that our kernel doesn't have an analog for. These are probe points that
2692 * exist in Solaris and are in the DTrace documentation, so people may have written
2693 * scripts that use them. Declaring the probe points here means their scripts will
2694 * compile and execute which we want for portability of the scripts, but since this
2695 * section of code is never reached, the probe points will simply never fire. Yes,
2696 * this is basically a hack. The problem is the DTrace probe points were chosen with
2697 * Solaris specific VM events in mind, not portability to different VM implementations.
2698 */
2699
2700 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2701 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2702 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2703 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2704 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2705 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2706 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2707 /*NOTREACHED*/
2708 }
2709
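/*
 * Called once the default pager has registered (see the note in vm_pageout()
 * above); only then does the internal queue get a real pgo_maxlaundry and
 * its own I/O thread.
 */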
2710 kern_return_t
2711 vm_pageout_internal_start(void)
2712 {
2713 kern_return_t result;
2714
2715 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2716 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2717 if (result == KERN_SUCCESS)
2718 thread_deallocate(vm_pageout_internal_iothread);
2719 return result;
2720 }
2721
2722
2723 /*
2724 * when marshalling pages into a UPL and subsequently committing
2725 * or aborting them, it is necessary to hold
2726 * the vm_page_queue_lock (a hot global lock) for certain operations
2727 * on the page... however, the majority of the work can be done
2728 * while merely holding the object lock... in fact there are certain
2729 * collections of pages that don't require any work brokered by the
2730 * vm_page_queue_lock... to mitigate the time spent behind the global
2731 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
2732 * while doing all of the work that doesn't require the vm_page_queue_lock...
2733 * then call dw_do_work to acquire the vm_page_queue_lock and do the
2734 * necessary work for each page... we will grab the busy bit on the page
2735 * if it's not already held so that dw_do_work can drop the object lock
2736 * if it can't immediately take the vm_page_queue_lock in order to compete
2737 * for the locks in the same order that vm_pageout_scan takes them.
2738 * the operation names are modeled after the names of the routines that
2739 * need to be called in order to make the changes very obvious in the
2740 * original loop
2741 */
2742
2743 #define DELAYED_WORK_LIMIT 32
2744
2745 #define DW_vm_page_unwire 0x01
2746 #define DW_vm_page_wire 0x02
2747 #define DW_vm_page_free 0x04
2748 #define DW_vm_page_activate 0x08
2749 #define DW_vm_page_deactivate_internal 0x10
2750 #define DW_vm_page_speculate 0x20
2751 #define DW_vm_page_lru 0x40
2752 #define DW_vm_pageout_throttle_up 0x80
2753 #define DW_PAGE_WAKEUP 0x100
2754 #define DW_clear_busy 0x200
2755 #define DW_clear_reference 0x400
2756 #define DW_set_reference 0x800
2757
2758 struct dw {
2759 vm_page_t dw_m;
2760 int dw_mask;
2761 };
2762
2763
2764 static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count);
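/*
 * A sketch of the batching pattern the UPL routines below follow (see
 * vm_object_upl_request); the object lock is held, the vm_page_queue_lock
 * is not:
 *
 *	struct dw	dw_array[DELAYED_WORK_LIMIT];
 *	struct dw	*dwp = &dw_array[0];
 *	int		dw_count = 0;
 *
 *	for each page {
 *		... do the lock-free work, then record the deferred ops ...
 *		dwp->dw_m = dst_page;
 *		dwp->dw_mask |= DW_vm_page_wire | DW_clear_busy | DW_PAGE_WAKEUP;
 *		dwp++;
 *		dw_count++;
 *
 *		if (dw_count >= DELAYED_WORK_LIMIT) {
 *			dw_do_work(object, &dw_array[0], dw_count);	// takes vm_page_queue_lock
 *			dwp = &dw_array[0];
 *			dw_count = 0;
 *		}
 *	}
 *	if (dw_count)
 *		dw_do_work(object, &dw_array[0], dw_count);
 */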
2765
2766
2767
2768 static upl_t
2769 upl_create(int type, int flags, upl_size_t size)
2770 {
2771 upl_t upl;
2772 int page_field_size = 0;
2773 int upl_flags = 0;
2774 int upl_size = sizeof(struct upl);
2775
2776 size = round_page_32(size);
2777
2778 if (type & UPL_CREATE_LITE) {
2779 page_field_size = (atop(size) + 7) >> 3;
2780 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2781
2782 upl_flags |= UPL_LITE;
2783 }
2784 if (type & UPL_CREATE_INTERNAL) {
2785 upl_size += (int) sizeof(struct upl_page_info) * atop(size);
2786
2787 upl_flags |= UPL_INTERNAL;
2788 }
2789 upl = (upl_t)kalloc(upl_size + page_field_size);
2790
2791 if (page_field_size)
2792 bzero((char *)upl + upl_size, page_field_size);
2793
2794 upl->flags = upl_flags | flags;
2795 upl->src_object = NULL;
2796 upl->kaddr = (vm_offset_t)0;
2797 upl->size = 0;
2798 upl->map_object = NULL;
2799 upl->ref_count = 1;
2800 upl->highest_page = 0;
2801 upl_lock_init(upl);
2802 upl->vector_upl = NULL;
2803 #if UPL_DEBUG
2804 upl->ubc_alias1 = 0;
2805 upl->ubc_alias2 = 0;
2806
2807 upl->upl_creator = current_thread();
2808 upl->upl_state = 0;
2809 upl->upl_commit_index = 0;
2810 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
2811
2812 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
2813 #endif /* UPL_DEBUG */
2814
2815 return(upl);
2816 }
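/*
 * Roughly, the single kalloc'd allocation above is laid out as:
 *
 *	+------------+----------------------------+--------------------+
 *	| struct upl | upl_page_info[atop(size)]  | lite-list bitmap   |
 *	|            | (UPL_CREATE_INTERNAL only) | (UPL_CREATE_LITE   |
 *	|            |                            |  only, 1 bit/page) |
 *	+------------+----------------------------+--------------------+
 *
 * vm_object_upl_request() below recovers the user_page_list and lite_list
 * pointers from these fixed offsets.
 */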
2817
2818 static void
2819 upl_destroy(upl_t upl)
2820 {
2821 int page_field_size; /* bit field in word size buf */
2822 int size;
2823
2824 #if UPL_DEBUG
2825 {
2826 vm_object_t object;
2827
2828 if (upl->flags & UPL_SHADOWED) {
2829 object = upl->map_object->shadow;
2830 } else {
2831 object = upl->map_object;
2832 }
2833 vm_object_lock(object);
2834 queue_remove(&object->uplq, upl, upl_t, uplq);
2835 vm_object_unlock(object);
2836 }
2837 #endif /* UPL_DEBUG */
2838 /*
2839 * drop a reference on the map_object whether or
2840 * not a pageout object is inserted
2841 */
2842 if (upl->flags & UPL_SHADOWED)
2843 vm_object_deallocate(upl->map_object);
2844
2845 if (upl->flags & UPL_DEVICE_MEMORY)
2846 size = PAGE_SIZE;
2847 else
2848 size = upl->size;
2849 page_field_size = 0;
2850
2851 if (upl->flags & UPL_LITE) {
2852 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2853 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2854 }
2855 upl_lock_destroy(upl);
2856 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
2857 if (upl->flags & UPL_INTERNAL) {
2858 kfree(upl,
2859 sizeof(struct upl) +
2860 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2861 + page_field_size);
2862 } else {
2863 kfree(upl, sizeof(struct upl) + page_field_size);
2864 }
2865 }
2866
2867 void uc_upl_dealloc(upl_t upl);
2868 __private_extern__ void
2869 uc_upl_dealloc(upl_t upl)
2870 {
2871 if (--upl->ref_count == 0)
2872 upl_destroy(upl);
2873 }
2874
2875 void
2876 upl_deallocate(upl_t upl)
2877 {
2878 if (--upl->ref_count == 0) {
2879 if(vector_upl_is_valid(upl))
2880 vector_upl_deallocate(upl);
2881 upl_destroy(upl);
2882 }
2883 }
2884
2885 #if DEVELOPMENT || DEBUG
2886 /*
2887 * Statistics about UPL enforcement of copy-on-write obligations.
2888 */
2889 unsigned long upl_cow = 0;
2890 unsigned long upl_cow_again = 0;
2891 unsigned long upl_cow_pages = 0;
2892 unsigned long upl_cow_again_pages = 0;
2893
2894 unsigned long iopl_cow = 0;
2895 unsigned long iopl_cow_pages = 0;
2896 #endif
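/*
 * upl_cow/upl_cow_again count the copy-on-write synchronizations forced
 * while gathering pages into a UPL (see the UPL_WILL_MODIFY paths in
 * vm_object_upl_request below); the *_pages variants count the pages
 * involved, and the iopl_* counters serve the same purpose for the IOPL path.
 */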
2897
2898 /*
2899 * Routine: vm_object_upl_request
2900 * Purpose:
2901 * Cause the population of a portion of a vm_object.
2902 * Depending on the nature of the request, the pages
2903 * returned may contain valid data or be uninitialized.
2904 * A page list structure, listing the physical pages,
2905 * will be returned upon request.
2906 * This function is called by the file system or any other
2907 * supplier of backing store to a pager.
2908 * IMPORTANT NOTE: The caller must still respect the relationship
2909 * between the vm_object and its backing memory object. The
2910 * caller MUST NOT substitute changes in the backing file
2911 * without first doing a memory_object_lock_request on the
2912 * target range unless it is known that the pages are not
2913 * shared with another entity at the pager level.
2914 * Copy_in_to:
2915 * if a page list structure is present
2916 * return the mapped physical pages; where a
2917 * page is not present, return a non-initialized
2918 * one. If the no_sync bit is turned on, don't
2919 * call the pager unlock to synchronize with other
2920 * possible copies of the page. Leave pages busy
2921 * in the original object, if a page list structure
2922 * was specified. When a commit of the page list
2923 * pages is done, the dirty bit will be set for each one.
2924 * Copy_out_from:
2925 * If a page list structure is present, return
2926 * all mapped pages. Where a page does not exist,
2927 * map a zero-filled one. Leave pages busy in
2928 * the original object. If a page list structure
2929 * is not specified, this call is a no-op.
2930 *
2931 * Note: access of default pager objects has a rather interesting
2932 * twist. The caller of this routine, presumably the file system
2933 * page cache handling code, will never actually make a request
2934 * against a default pager backed object. Only the default
2935 * pager will make requests on backing store related vm_objects.
2936 * In this way the default pager can maintain the relationship
2937 * between backing store files (abstract memory objects) and
2938 * the vm_objects (cache objects) they support.
2939 *
2940 */
2941
2942 __private_extern__ kern_return_t
2943 vm_object_upl_request(
2944 vm_object_t object,
2945 vm_object_offset_t offset,
2946 upl_size_t size,
2947 upl_t *upl_ptr,
2948 upl_page_info_array_t user_page_list,
2949 unsigned int *page_list_count,
2950 int cntrl_flags)
2951 {
2952 vm_page_t dst_page = VM_PAGE_NULL;
2953 vm_object_offset_t dst_offset;
2954 upl_size_t xfer_size;
2955 boolean_t dirty;
2956 boolean_t hw_dirty;
2957 upl_t upl = NULL;
2958 unsigned int entry;
2959 #if MACH_CLUSTER_STATS
2960 boolean_t encountered_lrp = FALSE;
2961 #endif
2962 vm_page_t alias_page = NULL;
2963 int refmod_state = 0;
2964 wpl_array_t lite_list = NULL;
2965 vm_object_t last_copy_object;
2966 struct dw dw_array[DELAYED_WORK_LIMIT];
2967 struct dw *dwp;
2968 int dw_count;
2969
2970 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2971 /*
2972 * For forward compatibility's sake,
2973 * reject any unknown flag.
2974 */
2975 return KERN_INVALID_VALUE;
2976 }
2977 if ( (!object->internal) && (object->paging_offset != 0) )
2978 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2979 if (object->phys_contiguous)
2980 panic("vm_object_upl_request: contiguous object specified\n");
2981
2982
2983 if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2984 size = MAX_UPL_SIZE * PAGE_SIZE;
2985
2986 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2987 *page_list_count = MAX_UPL_SIZE;
2988
2989 if (cntrl_flags & UPL_SET_INTERNAL) {
2990 if (cntrl_flags & UPL_SET_LITE) {
2991
2992 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2993
2994 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2995 lite_list = (wpl_array_t)
2996 (((uintptr_t)user_page_list) +
2997 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2998 if (size == 0) {
2999 user_page_list = NULL;
3000 lite_list = NULL;
3001 }
3002 } else {
3003 upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
3004
3005 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
3006 if (size == 0) {
3007 user_page_list = NULL;
3008 }
3009 }
3010 } else {
3011 if (cntrl_flags & UPL_SET_LITE) {
3012
3013 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
3014
3015 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3016 if (size == 0) {
3017 lite_list = NULL;
3018 }
3019 } else {
3020 upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
3021 }
3022 }
3023 *upl_ptr = upl;
3024
3025 if (user_page_list)
3026 user_page_list[0].device = FALSE;
3027
3028 if (cntrl_flags & UPL_SET_LITE) {
3029 upl->map_object = object;
3030 } else {
3031 upl->map_object = vm_object_allocate(size);
3032 /*
3033 * No need to lock the new object: nobody else knows
3034 * about it yet, so it's all ours so far.
3035 */
3036 upl->map_object->shadow = object;
3037 upl->map_object->pageout = TRUE;
3038 upl->map_object->can_persist = FALSE;
3039 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3040 upl->map_object->shadow_offset = offset;
3041 upl->map_object->wimg_bits = object->wimg_bits;
3042
3043 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3044
3045 upl->flags |= UPL_SHADOWED;
3046 }
3047 /*
3048 * ENCRYPTED SWAP:
3049 * Just mark the UPL as "encrypted" here.
3050 * We'll actually encrypt the pages later,
3051 * in upl_encrypt(), when the caller has
3052 * selected which pages need to go to swap.
3053 */
3054 if (cntrl_flags & UPL_ENCRYPT)
3055 upl->flags |= UPL_ENCRYPTED;
3056
3057 if (cntrl_flags & UPL_FOR_PAGEOUT)
3058 upl->flags |= UPL_PAGEOUT;
3059
3060 vm_object_lock(object);
3061 vm_object_activity_begin(object);
3062
3063 /*
3064 * we can lock in the paging_offset once paging_in_progress is set
3065 */
3066 upl->size = size;
3067 upl->offset = offset + object->paging_offset;
3068
3069 #if UPL_DEBUG
3070 queue_enter(&object->uplq, upl, upl_t, uplq);
3071 #endif /* UPL_DEBUG */
3072
3073 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
3074 /*
3075 * Honor copy-on-write obligations
3076 *
3077 * The caller is gathering these pages and
3078 * might modify their contents. We need to
3079 * make sure that the copy object has its own
3080 * private copies of these pages before we let
3081 * the caller modify them.
3082 */
3083 vm_object_update(object,
3084 offset,
3085 size,
3086 NULL,
3087 NULL,
3088 FALSE, /* should_return */
3089 MEMORY_OBJECT_COPY_SYNC,
3090 VM_PROT_NO_CHANGE);
3091 #if DEVELOPMENT || DEBUG
3092 upl_cow++;
3093 upl_cow_pages += size >> PAGE_SHIFT;
3094 #endif
3095 }
3096 /*
3097 * remember which copy object we synchronized with
3098 */
3099 last_copy_object = object->copy;
3100 entry = 0;
3101
3102 xfer_size = size;
3103 dst_offset = offset;
3104
3105 dwp = &dw_array[0];
3106 dw_count = 0;
3107
3108 while (xfer_size) {
3109
3110 dwp->dw_mask = 0;
3111
3112 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
3113 vm_object_unlock(object);
3114 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3115 vm_object_lock(object);
3116 }
3117 if (cntrl_flags & UPL_COPYOUT_FROM) {
3118 upl->flags |= UPL_PAGE_SYNC_DONE;
3119
3120 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
3121 dst_page->fictitious ||
3122 dst_page->absent ||
3123 dst_page->error ||
3124 (VM_PAGE_WIRED(dst_page) && !dst_page->pageout && !dst_page->list_req_pending)) {
3125
3126 if (user_page_list)
3127 user_page_list[entry].phys_addr = 0;
3128
3129 goto try_next_page;
3130 }
3131 /*
3132 * grab this up front...
3133 * a high percentage of the time we're going to
3134 * need the hardware modification state a bit later
3135 * anyway... so we can eliminate an extra call into
3136 * the pmap layer by grabbing it here and recording it
3137 */
3138 if (dst_page->pmapped)
3139 refmod_state = pmap_get_refmod(dst_page->phys_page);
3140 else
3141 refmod_state = 0;
3142
3143 if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
3144 /*
3145 * page is on inactive list and referenced...
3146 * reactivate it now... this gets it out of the
3147 * way of vm_pageout_scan which would have to
3148 * reactivate it upon tripping over it
3149 */
3150 dwp->dw_mask |= DW_vm_page_activate;
3151 }
3152 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
3153 /*
3154 * we're only asking for DIRTY pages to be returned
3155 */
3156 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
3157 /*
3158 * if this is the page stolen by vm_pageout_scan to be
3159 * cleaned (as opposed to a buddy being clustered in),
3160 * or this request is not being driven by a PAGEOUT cluster,
3161 * then we only need to check for the page being dirty or
3162 * precious to decide whether to return it
3163 */
3164 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
3165 goto check_busy;
3166 goto dont_return;
3167 }
3168 /*
3169 * this is a request for a PAGEOUT cluster and this page
3170 * is merely along for the ride as a 'buddy'... not only
3171 * does it have to be dirty to be returned, but it also
3172 * can't have been referenced recently... note that we've
3173 * already filtered above based on whether this page is
3174 * currently on the inactive queue or it meets the page
3175 * ticket (generation count) check
3176 */
3177 if ( (cntrl_flags & UPL_CLEAN_IN_PLACE || !(refmod_state & VM_MEM_REFERENCED)) &&
3178 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
3179 goto check_busy;
3180 }
3181 dont_return:
3182 /*
3183 * if we reach here, we're not to return
3184 * the page... go on to the next one
3185 */
3186 if (user_page_list)
3187 user_page_list[entry].phys_addr = 0;
3188
3189 goto try_next_page;
3190 }
3191 check_busy:
3192 if (dst_page->busy && (!(dst_page->list_req_pending && (dst_page->pageout || dst_page->cleaning)))) {
3193 if (cntrl_flags & UPL_NOBLOCK) {
3194 if (user_page_list)
3195 user_page_list[entry].phys_addr = 0;
3196
3197 goto try_next_page;
3198 }
3199 /*
3200 * someone else is playing with the
3201 * page. We will have to wait.
3202 */
3203 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3204
3205 continue;
3206 }
3207 /*
3208 * Someone else already cleaning the page?
3209 */
3210 if ((dst_page->cleaning || dst_page->absent || VM_PAGE_WIRED(dst_page)) && !dst_page->list_req_pending) {
3211 if (user_page_list)
3212 user_page_list[entry].phys_addr = 0;
3213
3214 goto try_next_page;
3215 }
3216 /*
3217 * ENCRYPTED SWAP:
3218 * The caller is gathering this page and might
3219 * access its contents later on. Decrypt the
3220 * page before adding it to the UPL, so that
3221 * the caller never sees encrypted data.
3222 */
3223 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
3224 int was_busy;
3225
3226 /*
3227 * save the current state of busy...
3228 * mark the page as busy while decrypt
3229 * is in progress since it will drop
3230 * the object lock...
3231 */
3232 was_busy = dst_page->busy;
3233 dst_page->busy = TRUE;
3234
3235 vm_page_decrypt(dst_page, 0);
3236 vm_page_decrypt_for_upl_counter++;
3237 /*
3238 * restore to original busy state
3239 */
3240 dst_page->busy = was_busy;
3241 }
3242 if (dst_page->pageout_queue == TRUE) {
3243
3244 vm_page_lockspin_queues();
3245
3246 #if CONFIG_EMBEDDED
3247 if (dst_page->laundry)
3248 #else
3249 if (dst_page->pageout_queue == TRUE)
3250 #endif
3251 {
3252 /*
3253 * we've buddied up a page for a clustered pageout
3254 * that has already been moved to the pageout
3255 * queue by pageout_scan... we need to remove
3256 * it from the queue and drop the laundry count
3257 * on that queue
3258 */
3259 vm_pageout_throttle_up(dst_page);
3260 }
3261 vm_page_unlock_queues();
3262 }
3263 #if MACH_CLUSTER_STATS
3264 /*
3265 * pageout statistics gathering. count
3266 * all the pages we will page out that
3267 * were not counted in the initial
3268 * vm_pageout_scan work
3269 */
3270 if (dst_page->list_req_pending)
3271 encountered_lrp = TRUE;
3272 if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
3273 if (encountered_lrp)
3274 CLUSTER_STAT(pages_at_higher_offsets++;)
3275 else
3276 CLUSTER_STAT(pages_at_lower_offsets++;)
3277 }
3278 #endif
3279 /*
3280 * Turn off busy indication on pending
3281 * pageout. Note: we can only get here
3282 * in the request pending case.
3283 */
3284 dst_page->list_req_pending = FALSE;
3285 dst_page->busy = FALSE;
3286
3287 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3288 dirty = hw_dirty ? TRUE : dst_page->dirty;
3289
3290 if (dst_page->phys_page > upl->highest_page)
3291 upl->highest_page = dst_page->phys_page;
3292
3293 if (cntrl_flags & UPL_SET_LITE) {
3294 unsigned int pg_num;
3295
3296 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3297 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
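/*
 * the lite list is a bitmap, 32 pages per 32-bit word:
 * pg_num >> 5 selects the word, pg_num & 31 the bit within it
 */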
3298 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3299
3300 if (hw_dirty)
3301 pmap_clear_modify(dst_page->phys_page);
3302
3303 /*
3304 * Mark original page as cleaning
3305 * in place.
3306 */
3307 dst_page->cleaning = TRUE;
3308 dst_page->precious = FALSE;
3309 } else {
3310 /*
3311 * use pageclean setup; it is more
3312 * convenient even for the pageout
3313 * cases here
3314 */
3315 vm_object_lock(upl->map_object);
3316 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3317 vm_object_unlock(upl->map_object);
3318
3319 alias_page->absent = FALSE;
3320 alias_page = NULL;
3321 }
3322 #if MACH_PAGEMAP
3323 /*
3324 * Record that this page has been
3325 * written out
3326 */
3327 vm_external_state_set(object->existence_map, dst_page->offset);
3328 #endif /*MACH_PAGEMAP*/
3329 dst_page->dirty = dirty;
3330
3331 if (!dirty)
3332 dst_page->precious = TRUE;
3333
3334 if (dst_page->pageout)
3335 dst_page->busy = TRUE;
3336
3337 if ( (cntrl_flags & UPL_ENCRYPT) ) {
3338 /*
3339 * ENCRYPTED SWAP:
3340 * We want to deny access to the target page
3341 * because its contents are about to be
3342 * encrypted and the user would be very
3343 * confused to see encrypted data instead
3344 * of their data.
3345 * We also set "encrypted_cleaning" to allow
3346 * vm_pageout_scan() to demote that page
3347 * from "adjacent/clean-in-place" to
3348 * "target/clean-and-free" if it bumps into
3349 * this page during its scanning while we're
3350 * still processing this cluster.
3351 */
3352 dst_page->busy = TRUE;
3353 dst_page->encrypted_cleaning = TRUE;
3354 }
3355 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3356 /*
3357 * deny access to the target page
3358 * while it is being worked on
3359 */
3360 if ((!dst_page->pageout) && ( !VM_PAGE_WIRED(dst_page))) {
3361 dst_page->busy = TRUE;
3362 dst_page->pageout = TRUE;
3363
3364 dwp->dw_mask |= DW_vm_page_wire;
3365 }
3366 }
3367 } else {
3368 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3369 /*
3370 * Honor copy-on-write obligations
3371 *
3372 * The copy object has changed since we
3373 * last synchronized for copy-on-write.
3374 * Another copy object might have been
3375 * inserted while we released the object's
3376 * lock. Since someone could have seen the
3377 * original contents of the remaining pages
3378 * through that new object, we have to
3379 * synchronize with it again for the remaining
3380 * pages only. The previous pages are "busy"
3381 * so they can not be seen through the new
3382 * mapping. The new mapping will see our
3383 * upcoming changes for those previous pages,
3384 * but that's OK since they couldn't see what
3385 * was there before. It's just a race anyway
3386 * and there's no guarantee of consistency or
3387 * atomicity. We just don't want new mappings
3388 * to see both the *before* and *after* pages.
3389 */
3390 if (object->copy != VM_OBJECT_NULL) {
3391 vm_object_update(
3392 object,
3393 dst_offset,/* current offset */
3394 xfer_size, /* remaining size */
3395 NULL,
3396 NULL,
3397 FALSE, /* should_return */
3398 MEMORY_OBJECT_COPY_SYNC,
3399 VM_PROT_NO_CHANGE);
3400
3401 #if DEVELOPMENT || DEBUG
3402 upl_cow_again++;
3403 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3404 #endif
3405 }
3406 /*
3407 * remember the copy object we synced with
3408 */
3409 last_copy_object = object->copy;
3410 }
3411 dst_page = vm_page_lookup(object, dst_offset);
3412
3413 if (dst_page != VM_PAGE_NULL) {
3414
3415 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3416
3417 if ( !(dst_page->absent && dst_page->list_req_pending) ) {
3418 /*
3419 * skip over pages already present in the cache
3420 */
3421 if (user_page_list)
3422 user_page_list[entry].phys_addr = 0;
3423
3424 goto try_next_page;
3425 }
3426 }
3427 if ( !(dst_page->list_req_pending) ) {
3428
3429 if (dst_page->cleaning) {
3430 /*
3431 * someone else is writing to the page... wait...
3432 */
3433 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3434
3435 continue;
3436 }
3437 } else {
3438 if (dst_page->fictitious &&
3439 dst_page->phys_page == vm_page_fictitious_addr) {
3440 assert( !dst_page->speculative);
3441 /*
3442 * dump the fictitious page
3443 */
3444 dst_page->list_req_pending = FALSE;
3445
3446 VM_PAGE_FREE(dst_page);
3447
3448 dst_page = NULL;
3449
3450 } else if (dst_page->absent) {
3451 /*
3452 * the default_pager case
3453 */
3454 dst_page->list_req_pending = FALSE;
3455 dst_page->busy = FALSE;
3456
3457 } else if (dst_page->pageout || dst_page->cleaning) {
3458 /*
3459 * page was earmarked by vm_pageout_scan
3460 * to be cleaned and stolen... we're going
3461 * to take it back since we are not attempting
3462 * to read that page and we don't want to stall
3463 * waiting for it to be cleaned for 2 reasons...
3464 * 1 - no use paging it out and back in
3465 * 2 - if we stall, we may cause a deadlock in
3466 * the FS trying to acquire its locks
3467 * on the VNOP_PAGEOUT path presuming that
3468 * those locks are already held on the read
3469 * path before trying to create this UPL
3470 *
3471 * so undo all of the state that vm_pageout_scan
3472 * hung on this page
3473 */
3474 dst_page->busy = FALSE;
3475
3476 vm_pageout_queue_steal(dst_page, FALSE);
3477 }
3478 }
3479 }
3480 if (dst_page == VM_PAGE_NULL) {
3481 if (object->private) {
3482 /*
3483 * This is a nasty wrinkle for users
3484 * of upl who encounter device or
3485 * private memory... however, it is
3486 * unavoidable; only a fault can
3487 * resolve the actual backing
3488 * physical page by asking the
3489 * backing device.
3490 */
3491 if (user_page_list)
3492 user_page_list[entry].phys_addr = 0;
3493
3494 goto try_next_page;
3495 }
3496 /*
3497 * need to allocate a page
3498 */
3499 dst_page = vm_page_grab();
3500
3501 if (dst_page == VM_PAGE_NULL) {
3502 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3503 /*
3504 * we don't want to stall waiting for pages to come onto the free list
3505 * while we're already holding absent pages in this UPL...
3506 * the caller will deal with the empty slots
3507 */
3508 if (user_page_list)
3509 user_page_list[entry].phys_addr = 0;
3510
3511 goto try_next_page;
3512 }
3513 /*
3514 * no pages available... wait
3515 * then try again for the same
3516 * offset...
3517 */
3518 vm_object_unlock(object);
3519 VM_PAGE_WAIT();
3520 vm_object_lock(object);
3521
3522 continue;
3523 }
3524 vm_page_insert(dst_page, object, dst_offset);
3525
3526 dst_page->absent = TRUE;
3527 dst_page->busy = FALSE;
3528
3529 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3530 /*
3531 * if UPL_RET_ONLY_ABSENT was specified,
3532 * than we're definitely setting up a
3533 * upl for a clustered read/pagein
3534 * operation... mark the pages as clustered
3535 * so upl_commit_range can put them on the
3536 * speculative list
3537 */
3538 dst_page->clustered = TRUE;
3539 }
3540 }
3541 if (dst_page->fictitious) {
3542 panic("need corner case for fictitious page");
3543 }
3544 if (dst_page->busy) {
3545 /*
3546 * someone else is playing with the
3547 * page. We will have to wait.
3548 */
3549 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3550
3551 continue;
3552 }
3553 /*
3554 * ENCRYPTED SWAP:
3555 */
3556 if (cntrl_flags & UPL_ENCRYPT) {
3557 /*
3558 * The page is going to be encrypted when we
3559 * get it from the pager, so mark it so.
3560 */
3561 dst_page->encrypted = TRUE;
3562 } else {
3563 /*
3564 * Otherwise, the page will not contain
3565 * encrypted data.
3566 */
3567 dst_page->encrypted = FALSE;
3568 }
3569 dst_page->overwriting = TRUE;
3570
3571 if (dst_page->pmapped) {
3572 if ( !(cntrl_flags & UPL_FILE_IO))
3573 /*
3574 * eliminate all mappings from the
3575 * original object and its progeny
3576 */
3577 refmod_state = pmap_disconnect(dst_page->phys_page);
3578 else
3579 refmod_state = pmap_get_refmod(dst_page->phys_page);
3580 } else
3581 refmod_state = 0;
3582
3583 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3584 dirty = hw_dirty ? TRUE : dst_page->dirty;
3585
3586 if (cntrl_flags & UPL_SET_LITE) {
3587 unsigned int pg_num;
3588
3589 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3590 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
3591 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3592
3593 if (hw_dirty)
3594 pmap_clear_modify(dst_page->phys_page);
3595
3596 /*
3597 * Mark original page as cleaning
3598 * in place.
3599 */
3600 dst_page->cleaning = TRUE;
3601 dst_page->precious = FALSE;
3602 } else {
3603 /*
3604 * use pageclean setup; it is more
3605 * convenient even for the pageout
3606 * cases here
3607 */
3608 vm_object_lock(upl->map_object);
3609 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3610 vm_object_unlock(upl->map_object);
3611
3612 alias_page->absent = FALSE;
3613 alias_page = NULL;
3614 }
3615
3616 if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3617 /*
3618 * clean in place for read implies
3619 * that a write will be done on all
3620 * the pages that are dirty before
3621 * a upl commit is done. The caller
3622 * is obligated to preserve the
3623 * contents of all pages marked dirty
3624 */
3625 upl->flags |= UPL_CLEAR_DIRTY;
3626 }
3627 dst_page->dirty = dirty;
3628
3629 if (!dirty)
3630 dst_page->precious = TRUE;
3631
3632 if ( !VM_PAGE_WIRED(dst_page)) {
3633 /*
3634 * deny access to the target page while
3635 * it is being worked on
3636 */
3637 dst_page->busy = TRUE;
3638 } else
3639 dwp->dw_mask |= DW_vm_page_wire;
3640
3641 /*
3642 * We might be about to satisfy a fault which has been
3643 * requested. So no need for the "restart" bit.
3644 */
3645 dst_page->restart = FALSE;
3646 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
3647 /*
3648 * expect the page to be used
3649 */
3650 dwp->dw_mask |= DW_set_reference;
3651 }
3652 dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3653 }
3654 if (dst_page->busy)
3655 upl->flags |= UPL_HAS_BUSY;
3656
3657 if (dst_page->phys_page > upl->highest_page)
3658 upl->highest_page = dst_page->phys_page;
3659 if (user_page_list) {
3660 user_page_list[entry].phys_addr = dst_page->phys_page;
3661 user_page_list[entry].pageout = dst_page->pageout;
3662 user_page_list[entry].absent = dst_page->absent;
3663 user_page_list[entry].dirty = dst_page->dirty;
3664 user_page_list[entry].precious = dst_page->precious;
3665 user_page_list[entry].device = FALSE;
3666 if (dst_page->clustered == TRUE)
3667 user_page_list[entry].speculative = dst_page->speculative;
3668 else
3669 user_page_list[entry].speculative = FALSE;
3670 user_page_list[entry].cs_validated = dst_page->cs_validated;
3671 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
3672 }
3673 /*
3674 * if UPL_RET_ONLY_ABSENT is set, then
3675 * we are working with a fresh page and we've
3676 * just set the clustered flag on it to
3677 * indicate that it was drug in as part of a
3678 * speculative cluster... so leave it alone
3679 */
3680 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3681 /*
3682 * someone is explicitly grabbing this page...
3683 * update clustered and speculative state
3684 *
3685 */
3686 VM_PAGE_CONSUME_CLUSTERED(dst_page);
3687 }
3688 try_next_page:
3689 if (dwp->dw_mask) {
3690 if (dwp->dw_mask & DW_vm_page_activate)
3691 VM_STAT_INCR(reactivations);
3692
3693 if (dst_page->busy == FALSE) {
3694 /*
3695 * dw_do_work may need to drop the object lock...
3696 * if it does, we need the pages it's looking at to
3697 * be held stable via the busy bit.
3698 */
3699 dst_page->busy = TRUE;
3700 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
3701 }
3702 dwp->dw_m = dst_page;
3703 dwp++;
3704 dw_count++;
3705
3706 if (dw_count >= DELAYED_WORK_LIMIT) {
3707 dw_do_work(object, &dw_array[0], dw_count);
3708
3709 dwp = &dw_array[0];
3710 dw_count = 0;
3711 }
3712 }
3713 entry++;
3714 dst_offset += PAGE_SIZE_64;
3715 xfer_size -= PAGE_SIZE;
3716 }
3717 if (dw_count)
3718 dw_do_work(object, &dw_array[0], dw_count);
3719
3720 if (alias_page != NULL) {
3721 VM_PAGE_FREE(alias_page);
3722 }
3723
3724 if (page_list_count != NULL) {
3725 if (upl->flags & UPL_INTERNAL)
3726 *page_list_count = 0;
3727 else if (*page_list_count > entry)
3728 *page_list_count = entry;
3729 }
3730 #if UPL_DEBUG
3731 upl->upl_state = 1;
3732 #endif
3733 vm_object_unlock(object);
3734
3735 return KERN_SUCCESS;
3736 }
3737
3738 /* JMM - Backward compatibility for now */
3739 kern_return_t
3740 vm_fault_list_request( /* forward */
3741 memory_object_control_t control,
3742 vm_object_offset_t offset,
3743 upl_size_t size,
3744 upl_t *upl_ptr,
3745 upl_page_info_t **user_page_list_ptr,
3746 unsigned int page_list_count,
3747 int cntrl_flags);
3748 kern_return_t
3749 vm_fault_list_request(
3750 memory_object_control_t control,
3751 vm_object_offset_t offset,
3752 upl_size_t size,
3753 upl_t *upl_ptr,
3754 upl_page_info_t **user_page_list_ptr,
3755 unsigned int page_list_count,
3756 int cntrl_flags)
3757 {
3758 unsigned int local_list_count;
3759 upl_page_info_t *user_page_list;
3760 kern_return_t kr;
3761
3762 if((cntrl_flags & UPL_VECTOR)==UPL_VECTOR)
3763 return KERN_INVALID_ARGUMENT;
3764
3765 if (user_page_list_ptr != NULL) {
3766 local_list_count = page_list_count;
3767 user_page_list = *user_page_list_ptr;
3768 } else {
3769 local_list_count = 0;
3770 user_page_list = NULL;
3771 }
3772 kr = memory_object_upl_request(control,
3773 offset,
3774 size,
3775 upl_ptr,
3776 user_page_list,
3777 &local_list_count,
3778 cntrl_flags);
3779
3780 if(kr != KERN_SUCCESS)
3781 return kr;
3782
3783 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3784 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3785 }
3786
3787 return KERN_SUCCESS;
3788 }
3789
3790
3791
3792 /*
3793 * Routine: vm_object_super_upl_request
3794 * Purpose:
3795 * Cause the population of a portion of a vm_object
3796 * in much the same way as memory_object_upl_request.
3797 * Depending on the nature of the request, the pages
3798 * returned may contain valid data or be uninitialized.
3799 * However, the region may be expanded up to the super
3800 * cluster size provided.
3801 */
3802
3803 __private_extern__ kern_return_t
3804 vm_object_super_upl_request(
3805 vm_object_t object,
3806 vm_object_offset_t offset,
3807 upl_size_t size,
3808 upl_size_t super_cluster,
3809 upl_t *upl,
3810 upl_page_info_t *user_page_list,
3811 unsigned int *page_list_count,
3812 int cntrl_flags)
3813 {
3814 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
3815 return KERN_FAILURE;
3816
3817 assert(object->paging_in_progress);
3818 offset = offset - object->paging_offset;
3819
3820 if (super_cluster > size) {
3821
3822 vm_object_offset_t base_offset;
3823 upl_size_t super_size;
3824 vm_object_size_t super_size_64;
3825
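/*
 * Round the start of the request down to a super_cluster
 * boundary (the mask arithmetic assumes super_cluster is a
 * power of two), then clip the expanded region so it does
 * not run past the end of the object.
 */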
3826 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3827 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3828 super_size_64 = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3829 super_size = (upl_size_t) super_size_64;
3830 assert(super_size == super_size_64);
3831
3832 if (offset > (base_offset + super_size)) {
3833 panic("vm_object_super_upl_request: Missed target pageout"
3834 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3835 offset, base_offset, super_size, super_cluster,
3836 size, object->paging_offset);
3837 }
3838 /*
3839 * apparently there is a case where the vm requests a
3840 * page to be written out whose offset is beyond the
3841 * object size
3842 */
3843 if ((offset + size) > (base_offset + super_size)) {
3844 super_size_64 = (offset + size) - base_offset;
3845 super_size = (upl_size_t) super_size_64;
3846 assert(super_size == super_size_64);
3847 }
3848
3849 offset = base_offset;
3850 size = super_size;
3851 }
3852 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3853 }
3854
3855
3856 kern_return_t
3857 vm_map_create_upl(
3858 vm_map_t map,
3859 vm_map_address_t offset,
3860 upl_size_t *upl_size,
3861 upl_t *upl,
3862 upl_page_info_array_t page_list,
3863 unsigned int *count,
3864 int *flags)
3865 {
3866 vm_map_entry_t entry;
3867 int caller_flags;
3868 int force_data_sync;
3869 int sync_cow_data;
3870 vm_object_t local_object;
3871 vm_map_offset_t local_offset;
3872 vm_map_offset_t local_start;
3873 kern_return_t ret;
3874
3875 caller_flags = *flags;
3876
3877 if (caller_flags & ~UPL_VALID_FLAGS) {
3878 /*
3879 * For forward compatibility's sake,
3880 * reject any unknown flag.
3881 */
3882 return KERN_INVALID_VALUE;
3883 }
3884 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3885 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3886
3887 if (upl == NULL)
3888 return KERN_INVALID_ARGUMENT;
3889
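/*
 * Whenever the map lock has to be dropped below (to allocate
 * a backing object, resolve copy-on-write, or sync COW/shadow
 * data), we jump back here and re-lookup the entry, since the
 * map may have changed while it was unlocked.
 */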
3890 REDISCOVER_ENTRY:
3891 vm_map_lock_read(map);
3892
3893 if (vm_map_lookup_entry(map, offset, &entry)) {
3894
3895 if ((entry->vme_end - offset) < *upl_size) {
3896 *upl_size = (upl_size_t) (entry->vme_end - offset);
3897 assert(*upl_size == entry->vme_end - offset);
3898 }
3899
3900 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3901 *flags = 0;
3902
3903 if ( !entry->is_sub_map && entry->object.vm_object != VM_OBJECT_NULL) {
3904 if (entry->object.vm_object->private)
3905 *flags = UPL_DEV_MEMORY;
3906
3907 if (entry->object.vm_object->phys_contiguous)
3908 *flags |= UPL_PHYS_CONTIG;
3909 }
3910 vm_map_unlock_read(map);
3911
3912 return KERN_SUCCESS;
3913 }
3914 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3915 if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
3916 *upl_size = MAX_UPL_SIZE * PAGE_SIZE;
3917 }
3918 /*
3919 * Create an object if necessary.
3920 */
3921 if (entry->object.vm_object == VM_OBJECT_NULL) {
3922
3923 if (vm_map_lock_read_to_write(map))
3924 goto REDISCOVER_ENTRY;
3925
3926 entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3927 entry->offset = 0;
3928
3929 vm_map_lock_write_to_read(map);
3930 }
3931 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3932 if (!(entry->protection & VM_PROT_WRITE)) {
3933 vm_map_unlock_read(map);
3934 return KERN_PROTECTION_FAILURE;
3935 }
3936 if (entry->needs_copy) {
3937 /*
3938 * Honor copy-on-write for COPY_SYMMETRIC
3939 * strategy.
3940 */
3941 vm_map_t local_map;
3942 vm_object_t object;
3943 vm_object_offset_t new_offset;
3944 vm_prot_t prot;
3945 boolean_t wired;
3946 vm_map_version_t version;
3947 vm_map_t real_map;
3948
3949 local_map = map;
3950
3951 if (vm_map_lookup_locked(&local_map,
3952 offset, VM_PROT_WRITE,
3953 OBJECT_LOCK_EXCLUSIVE,
3954 &version, &object,
3955 &new_offset, &prot, &wired,
3956 NULL,
3957 &real_map) != KERN_SUCCESS) {
3958 vm_map_unlock_read(local_map);
3959 return KERN_FAILURE;
3960 }
3961 if (real_map != map)
3962 vm_map_unlock(real_map);
3963 vm_map_unlock_read(local_map);
3964
3965 vm_object_unlock(object);
3966
3967 goto REDISCOVER_ENTRY;
3968 }
3969 }
3970 if (entry->is_sub_map) {
3971 vm_map_t submap;
3972
3973 submap = entry->object.sub_map;
3974 local_start = entry->vme_start;
3975 local_offset = entry->offset;
3976
3977 vm_map_reference(submap);
3978 vm_map_unlock_read(map);
3979
3980 ret = vm_map_create_upl(submap,
3981 local_offset + (offset - local_start),
3982 upl_size, upl, page_list, count, flags);
3983 vm_map_deallocate(submap);
3984
3985 return ret;
3986 }
3987 if (sync_cow_data) {
3988 if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3989 local_object = entry->object.vm_object;
3990 local_start = entry->vme_start;
3991 local_offset = entry->offset;
3992
3993 vm_object_reference(local_object);
3994 vm_map_unlock_read(map);
3995
3996 if (local_object->shadow && local_object->copy) {
3997 vm_object_lock_request(
3998 local_object->shadow,
3999 (vm_object_offset_t)
4000 ((offset - local_start) +
4001 local_offset) +
4002 local_object->shadow_offset,
4003 *upl_size, FALSE,
4004 MEMORY_OBJECT_DATA_SYNC,
4005 VM_PROT_NO_CHANGE);
4006 }
4007 sync_cow_data = FALSE;
4008 vm_object_deallocate(local_object);
4009
4010 goto REDISCOVER_ENTRY;
4011 }
4012 }
4013 if (force_data_sync) {
4014 local_object = entry->object.vm_object;
4015 local_start = entry->vme_start;
4016 local_offset = entry->offset;
4017
4018 vm_object_reference(local_object);
4019 vm_map_unlock_read(map);
4020
4021 vm_object_lock_request(
4022 local_object,
4023 (vm_object_offset_t)
4024 ((offset - local_start) + local_offset),
4025 (vm_object_size_t)*upl_size, FALSE,
4026 MEMORY_OBJECT_DATA_SYNC,
4027 VM_PROT_NO_CHANGE);
4028
4029 force_data_sync = FALSE;
4030 vm_object_deallocate(local_object);
4031
4032 goto REDISCOVER_ENTRY;
4033 }
4034 if (entry->object.vm_object->private)
4035 *flags = UPL_DEV_MEMORY;
4036 else
4037 *flags = 0;
4038
4039 if (entry->object.vm_object->phys_contiguous)
4040 *flags |= UPL_PHYS_CONTIG;
4041
4042 local_object = entry->object.vm_object;
4043 local_offset = entry->offset;
4044 local_start = entry->vme_start;
4045
4046 vm_object_reference(local_object);
4047 vm_map_unlock_read(map);
4048
4049 ret = vm_object_iopl_request(local_object,
4050 (vm_object_offset_t) ((offset - local_start) + local_offset),
4051 *upl_size,
4052 upl,
4053 page_list,
4054 count,
4055 caller_flags);
4056 vm_object_deallocate(local_object);
4057
4058 return(ret);
4059 }
4060 vm_map_unlock_read(map);
4061
4062 return(KERN_FAILURE);
4063 }
4064
4065 /*
4066 * Internal routine to enter a UPL into a VM map.
4067 *
4068 * JMM - This should just be doable through the standard
4069 * vm_map_enter() API.
4070 */
4071 kern_return_t
4072 vm_map_enter_upl(
4073 vm_map_t map,
4074 upl_t upl,
4075 vm_map_offset_t *dst_addr)
4076 {
4077 vm_map_size_t size;
4078 vm_object_offset_t offset;
4079 vm_map_offset_t addr;
4080 vm_page_t m;
4081 kern_return_t kr;
4082 int isVectorUPL = 0, curr_upl=0;
4083 upl_t vector_upl = NULL;
4084 vm_offset_t vector_upl_dst_addr = 0;
4085 vm_map_t vector_upl_submap = NULL;
4086 upl_offset_t subupl_offset = 0;
4087 upl_size_t subupl_size = 0;
4088
4089 if (upl == UPL_NULL)
4090 return KERN_INVALID_ARGUMENT;
4091
4092 if((isVectorUPL = vector_upl_is_valid(upl))) {
4093 int mapped=0,valid_upls=0;
4094 vector_upl = upl;
4095
4096 upl_lock(vector_upl);
4097 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4098 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4099 if(upl == NULL)
4100 continue;
4101 valid_upls++;
4102 if (UPL_PAGE_LIST_MAPPED & upl->flags)
4103 mapped++;
4104 }
4105
4106 if(mapped) {
4107 if(mapped != valid_upls)
4108 panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
4109 else {
4110 upl_unlock(vector_upl);
4111 return KERN_FAILURE;
4112 }
4113 }
4114
4115 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
4116 if( kr != KERN_SUCCESS )
4117 panic("Vector UPL submap allocation failed\n");
4118 map = vector_upl_submap;
4119 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
4120 curr_upl=0;
4121 }
4122 else
4123 upl_lock(upl);
4124
4125 process_upl_to_enter:
4126 if(isVectorUPL){
4127 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4128 *dst_addr = vector_upl_dst_addr;
4129 upl_unlock(vector_upl);
4130 return KERN_SUCCESS;
4131 }
4132 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4133 if(upl == NULL)
4134 goto process_upl_to_enter;
4135 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
4136 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
4137 } else {
4138 /*
4139 * check to see if already mapped
4140 */
4141 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
4142 upl_unlock(upl);
4143 return KERN_FAILURE;
4144 }
4145 }
4146 if ((!(upl->flags & UPL_SHADOWED)) &&
4147 ((upl->flags & UPL_HAS_BUSY) ||
4148 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
4149
4150 vm_object_t object;
4151 vm_page_t alias_page;
4152 vm_object_offset_t new_offset;
4153 unsigned int pg_num;
4154 wpl_array_t lite_list;
4155
4156 if (upl->flags & UPL_INTERNAL) {
4157 lite_list = (wpl_array_t)
4158 ((((uintptr_t)upl) + sizeof(struct upl))
4159 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4160 } else {
4161 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
4162 }
4163 object = upl->map_object;
4164 upl->map_object = vm_object_allocate(upl->size);
4165
4166 vm_object_lock(upl->map_object);
4167
4168 upl->map_object->shadow = object;
4169 upl->map_object->pageout = TRUE;
4170 upl->map_object->can_persist = FALSE;
4171 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
4172 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
4173 upl->map_object->wimg_bits = object->wimg_bits;
4174 offset = upl->map_object->shadow_offset;
4175 new_offset = 0;
4176 size = upl->size;
4177
4178 upl->flags |= UPL_SHADOWED;
4179
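/*
 * For every page marked in the lite list, grab a fictitious
 * page and turn it into a private alias of the real page, so
 * the UPL can be mapped without disturbing the original
 * object's pages.
 */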
4180 while (size) {
4181 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
4182 assert(pg_num == new_offset / PAGE_SIZE);
4183
4184 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4185
4186 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4187
4188 vm_object_lock(object);
4189
4190 m = vm_page_lookup(object, offset);
4191 if (m == VM_PAGE_NULL) {
4192 panic("vm_upl_map: page missing\n");
4193 }
4194
4195 /*
4196 * Convert the fictitious page to a private
4197 * shadow of the real page.
4198 */
4199 assert(alias_page->fictitious);
4200 alias_page->fictitious = FALSE;
4201 alias_page->private = TRUE;
4202 alias_page->pageout = TRUE;
4203 /*
4204 * since m is a page in the upl it must
4205 * already be wired or BUSY, so it's
4206 * safe to assign the underlying physical
4207 * page to the alias
4208 */
4209 alias_page->phys_page = m->phys_page;
4210
4211 vm_object_unlock(object);
4212
4213 vm_page_lockspin_queues();
4214 vm_page_wire(alias_page);
4215 vm_page_unlock_queues();
4216
4217 /*
4218 * ENCRYPTED SWAP:
4219 * The virtual page ("m") has to be wired in some way
4220 * here or its physical page ("m->phys_page") could
4221 * be recycled at any time.
4222 * Assuming this is enforced by the caller, we can't
4223 * get an encrypted page here. Since the encryption
4224 * key depends on the VM page's "pager" object and
4225 * the "paging_offset", we couldn't handle 2 pageable
4226 * VM pages (with different pagers and paging_offsets)
4227 * sharing the same physical page: we could end up
4228 * encrypting with one key (via one VM page) and
4229 * decrypting with another key (via the alias VM page).
4230 */
4231 ASSERT_PAGE_DECRYPTED(m);
4232
4233 vm_page_insert(alias_page, upl->map_object, new_offset);
4234
4235 assert(!alias_page->wanted);
4236 alias_page->busy = FALSE;
4237 alias_page->absent = FALSE;
4238 }
4239 size -= PAGE_SIZE;
4240 offset += PAGE_SIZE_64;
4241 new_offset += PAGE_SIZE_64;
4242 }
4243 vm_object_unlock(upl->map_object);
4244 }
4245 if (upl->flags & UPL_SHADOWED)
4246 offset = 0;
4247 else
4248 offset = upl->offset - upl->map_object->paging_offset;
4249 size = upl->size;
4250
4251 vm_object_reference(upl->map_object);
4252
4253 if(!isVectorUPL) {
4254 *dst_addr = 0;
4255 /*
4256 * NEED A UPL_MAP ALIAS
4257 */
4258 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4259 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
4260 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4261
4262 if (kr != KERN_SUCCESS) {
4263 upl_unlock(upl);
4264 return(kr);
4265 }
4266 }
4267 else {
4268 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4269 VM_FLAGS_FIXED, upl->map_object, offset, FALSE,
4270 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4271 if(kr)
4272 panic("vm_map_enter failed for a Vector UPL\n");
4273 }
4274 vm_object_lock(upl->map_object);
4275
4276 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
4277 m = vm_page_lookup(upl->map_object, offset);
4278
4279 if (m) {
4280 unsigned int cache_attr;
4281 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
4282
4283 m->pmapped = TRUE;
4284
4285 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
4286 * but only in kernel space. If this was on a user map,
4287 * we'd have to set the wpmapped bit. */
4288 /* m->wpmapped = TRUE; */
4289 assert(map==kernel_map);
4290
4291 PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
4292 }
4293 offset += PAGE_SIZE_64;
4294 }
4295 vm_object_unlock(upl->map_object);
4296
4297 /*
4298 * hold a reference for the mapping
4299 */
4300 upl->ref_count++;
4301 upl->flags |= UPL_PAGE_LIST_MAPPED;
4302 upl->kaddr = (vm_offset_t) *dst_addr;
4303 assert(upl->kaddr == *dst_addr);
4304
4305 if(isVectorUPL)
4306 goto process_upl_to_enter;
4307
4308 upl_unlock(upl);
4309
4310 return KERN_SUCCESS;
4311 }
4312
4313 /*
4314 * Internal routine to remove a UPL mapping from a VM map.
4315 *
4316 * XXX - This should just be doable through a standard
4317 * vm_map_remove() operation. Otherwise, implicit clean-up
4318 * of the target map won't be able to correctly remove
4319 * these (and release the reference on the UPL). Having
4320 * to do this means we can't map these into user-space
4321 * maps yet.
4322 */
4323 kern_return_t
4324 vm_map_remove_upl(
4325 vm_map_t map,
4326 upl_t upl)
4327 {
4328 vm_address_t addr;
4329 upl_size_t size;
4330 int isVectorUPL = 0, curr_upl = 0;
4331 upl_t vector_upl = NULL;
4332
4333 if (upl == UPL_NULL)
4334 return KERN_INVALID_ARGUMENT;
4335
4336 if((isVectorUPL = vector_upl_is_valid(upl))) {
4337 int unmapped=0, valid_upls=0;
4338 vector_upl = upl;
4339 upl_lock(vector_upl);
4340 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4341 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4342 if(upl == NULL)
4343 continue;
4344 valid_upls++;
4345 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
4346 unmapped++;
4347 }
4348
4349 if(unmapped) {
4350 if(unmapped != valid_upls)
4351 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
4352 else {
4353 upl_unlock(vector_upl);
4354 return KERN_FAILURE;
4355 }
4356 }
4357 curr_upl=0;
4358 }
4359 else
4360 upl_lock(upl);
4361
4362 process_upl_to_remove:
4363 if(isVectorUPL) {
4364 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4365 vm_map_t v_upl_submap;
4366 vm_offset_t v_upl_submap_dst_addr;
4367 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
4368
4369 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
4370 vm_map_deallocate(v_upl_submap);
4371 upl_unlock(vector_upl);
4372 return KERN_SUCCESS;
4373 }
4374
4375 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4376 if(upl == NULL)
4377 goto process_upl_to_remove;
4378 }
4379
4380 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
4381 addr = upl->kaddr;
4382 size = upl->size;
4383
4384 assert(upl->ref_count > 1);
4385 upl->ref_count--; /* removing mapping ref */
4386
4387 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
4388 upl->kaddr = (vm_offset_t) 0;
4389
4390 if(!isVectorUPL) {
4391 upl_unlock(upl);
4392
4393 vm_map_remove(map,
4394 vm_map_trunc_page(addr),
4395 vm_map_round_page(addr + size),
4396 VM_MAP_NO_FLAGS);
4397
4398 return KERN_SUCCESS;
4399 }
4400 else {
4401 /*
4402 * If it's a Vectored UPL, we'll be removing the entire
4403 * submap anyway, so no need to remove individual UPL
4404 * element mappings from within the submap
4405 */
4406 goto process_upl_to_remove;
4407 }
4408 }
4409 upl_unlock(upl);
4410
4411 return KERN_FAILURE;
4412 }
4413
4414 static void
4415 dw_do_work(
4416 vm_object_t object,
4417 struct dw *dwp,
4418 int dw_count)
4419 {
4420 int j;
4421 boolean_t held_as_spin = TRUE;
4422
4423 /*
4424 * pageout_scan takes the vm_page_lock_queues first
4425 * then tries for the object lock... to avoid what
4426 * is effectively a lock inversion, we'll go to the
4427 * trouble of taking them in that same order... otherwise
4428 * if this object contains the majority of the pages resident
4429 * in the UBC (or a small set of large objects actively being
4430 * worked on contain the majority of the pages), we could
4431 * cause the pageout_scan thread to 'starve' in its attempt
4432 * to find pages to move to the free queue, since it has to
4433 * successfully acquire the object lock of any candidate page
4434 * before it can steal/clean it.
4435 */
4436 if (!vm_page_trylockspin_queues()) {
4437 vm_object_unlock(object);
4438
4439 vm_page_lockspin_queues();
4440
4441 for (j = 0; ; j++) {
4442 if (!vm_object_lock_avoid(object) &&
4443 _vm_object_lock_try(object))
4444 break;
4445 vm_page_unlock_queues();
4446 mutex_pause(j);
4447 vm_page_lockspin_queues();
4448 }
4449 }
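/*
 * With both the page queues lock and the object lock held,
 * apply each queued entry's deferred operations as encoded
 * in its dw_mask.
 */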
4450 for (j = 0; j < dw_count; j++, dwp++) {
4451
4452 if (dwp->dw_mask & DW_vm_pageout_throttle_up)
4453 vm_pageout_throttle_up(dwp->dw_m);
4454
4455 if (dwp->dw_mask & DW_vm_page_wire)
4456 vm_page_wire(dwp->dw_m);
4457 else if (dwp->dw_mask & DW_vm_page_unwire) {
4458 boolean_t queueit;
4459
4460 queueit = (dwp->dw_mask & DW_vm_page_free) ? FALSE : TRUE;
4461
4462 vm_page_unwire(dwp->dw_m, queueit);
4463 }
4464 if (dwp->dw_mask & DW_vm_page_free) {
4465 if (held_as_spin == TRUE) {
4466 vm_page_lockconvert_queues();
4467 held_as_spin = FALSE;
4468 }
4469 vm_page_free(dwp->dw_m);
4470 } else {
4471 if (dwp->dw_mask & DW_vm_page_deactivate_internal)
4472 vm_page_deactivate_internal(dwp->dw_m, FALSE);
4473 else if (dwp->dw_mask & DW_vm_page_activate)
4474 vm_page_activate(dwp->dw_m);
4475 else if (dwp->dw_mask & DW_vm_page_speculate)
4476 vm_page_speculate(dwp->dw_m, TRUE);
4477 else if (dwp->dw_mask & DW_vm_page_lru)
4478 vm_page_lru(dwp->dw_m);
4479
4480 if (dwp->dw_mask & DW_set_reference)
4481 dwp->dw_m->reference = TRUE;
4482 else if (dwp->dw_mask & DW_clear_reference)
4483 dwp->dw_m->reference = FALSE;
4484
4485 if (dwp->dw_mask & DW_clear_busy)
4486 dwp->dw_m->busy = FALSE;
4487
4488 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4489 PAGE_WAKEUP(dwp->dw_m);
4490 }
4491 }
4492 vm_page_unlock_queues();
4493 }
4494
4495
4496
4497 kern_return_t
4498 upl_commit_range(
4499 upl_t upl,
4500 upl_offset_t offset,
4501 upl_size_t size,
4502 int flags,
4503 upl_page_info_t *page_list,
4504 mach_msg_type_number_t count,
4505 boolean_t *empty)
4506 {
4507 upl_size_t xfer_size, subupl_size = size;
4508 vm_object_t shadow_object;
4509 vm_object_t object;
4510 vm_object_offset_t target_offset;
4511 upl_offset_t subupl_offset = offset;
4512 int entry;
4513 wpl_array_t lite_list;
4514 int occupied;
4515 int clear_refmod = 0;
4516 int pgpgout_count = 0;
4517 struct dw dw_array[DELAYED_WORK_LIMIT];
4518 struct dw *dwp;
4519 int dw_count, isVectorUPL = 0;
4520 upl_t vector_upl = NULL;
4521
4522 *empty = FALSE;
4523
4524 if (upl == UPL_NULL)
4525 return KERN_INVALID_ARGUMENT;
4526
4527 if (count == 0)
4528 page_list = NULL;
4529
4530 if((isVectorUPL = vector_upl_is_valid(upl))) {
4531 vector_upl = upl;
4532 upl_lock(vector_upl);
4533 }
4534 else
4535 upl_lock(upl);
4536
4537 process_upl_to_commit:
4538
4539 if(isVectorUPL) {
4540 size = subupl_size;
4541 offset = subupl_offset;
4542 if(size == 0) {
4543 upl_unlock(vector_upl);
4544 return KERN_SUCCESS;
4545 }
4546 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
4547 if(upl == NULL) {
4548 upl_unlock(vector_upl);
4549 return KERN_FAILURE;
4550 }
4551 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
4552 subupl_size -= size;
4553 subupl_offset += size;
4554 }
4555
4556 #if UPL_DEBUG
4557 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
4558 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4559
4560 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
4561 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
4562
4563 upl->upl_commit_index++;
4564 }
4565 #endif
4566 if (upl->flags & UPL_DEVICE_MEMORY)
4567 xfer_size = 0;
4568 else if ((offset + size) <= upl->size)
4569 xfer_size = size;
4570 else {
4571 if(!isVectorUPL)
4572 upl_unlock(upl);
4573 else {
4574 upl_unlock(vector_upl);
4575 }
4576 return KERN_FAILURE;
4577 }
4578 if (upl->flags & UPL_CLEAR_DIRTY)
4579 flags |= UPL_COMMIT_CLEAR_DIRTY;
4580
4581 if (upl->flags & UPL_INTERNAL)
4582 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
4583 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4584 else
4585 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4586
4587 object = upl->map_object;
4588
4589 if (upl->flags & UPL_SHADOWED) {
4590 vm_object_lock(object);
4591 shadow_object = object->shadow;
4592 } else {
4593 shadow_object = object;
4594 }
4595 entry = offset/PAGE_SIZE;
4596 target_offset = (vm_object_offset_t)offset;
4597
4598 if (upl->flags & UPL_KERNEL_OBJECT)
4599 vm_object_lock_shared(shadow_object);
4600 else
4601 vm_object_lock(shadow_object);
4602
4603 if (upl->flags & UPL_ACCESS_BLOCKED) {
4604 assert(shadow_object->blocked_access);
4605 shadow_object->blocked_access = FALSE;
4606 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4607 }
4608
4609 if (shadow_object->code_signed) {
4610 /*
4611 * CODE SIGNING:
4612 * If the object is code-signed, do not let this UPL tell
4613 * us if the pages are valid or not. Let the pages be
4614 * validated by VM the normal way (when they get mapped or
4615 * copied).
4616 */
4617 flags &= ~UPL_COMMIT_CS_VALIDATED;
4618 }
4619 if (! page_list) {
4620 /*
4621 * No page list to get the code-signing info from !?
4622 */
4623 flags &= ~UPL_COMMIT_CS_VALIDATED;
4624 }
4625
4626 dwp = &dw_array[0];
4627 dw_count = 0;
4628
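/*
 * Walk the committed range one page at a time, clearing the
 * corresponding lite-list bits and queueing any page state
 * changes as delayed work so they can be applied in batches.
 */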
4629 while (xfer_size) {
4630 vm_page_t t, m;
4631
4632 dwp->dw_mask = 0;
4633 clear_refmod = 0;
4634
4635 m = VM_PAGE_NULL;
4636
4637 if (upl->flags & UPL_LITE) {
4638 unsigned int pg_num;
4639
4640 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
4641 assert(pg_num == target_offset/PAGE_SIZE);
4642
4643 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4644 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4645
4646 if (!(upl->flags & UPL_KERNEL_OBJECT))
4647 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4648 }
4649 }
4650 if (upl->flags & UPL_SHADOWED) {
4651 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4652
4653 t->pageout = FALSE;
4654
4655 VM_PAGE_FREE(t);
4656
4657 if (m == VM_PAGE_NULL)
4658 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4659 }
4660 }
4661 if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL)
4662 goto commit_next_page;
4663
4664 if (flags & UPL_COMMIT_CS_VALIDATED) {
4665 /*
4666 * CODE SIGNING:
4667 * Set the code signing bits according to
4668 * what the UPL says they should be.
4669 */
4670 m->cs_validated = page_list[entry].cs_validated;
4671 m->cs_tainted = page_list[entry].cs_tainted;
4672 }
4673 if (upl->flags & UPL_IO_WIRE) {
4674
4675 if (page_list)
4676 page_list[entry].phys_addr = 0;
4677
4678 if (flags & UPL_COMMIT_SET_DIRTY)
4679 m->dirty = TRUE;
4680 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4681 m->dirty = FALSE;
4682
4683 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4684 m->cs_validated && !m->cs_tainted) {
4685 /*
4686 * CODE SIGNING:
4687 * This page is no longer dirty
4688 * but could have been modified,
4689 * so it will need to be
4690 * re-validated.
4691 */
4692 m->cs_validated = FALSE;
4693 #if DEVELOPMENT || DEBUG
4694 vm_cs_validated_resets++;
4695 #endif
4696 pmap_disconnect(m->phys_page);
4697 }
4698 clear_refmod |= VM_MEM_MODIFIED;
4699 }
4700 if (flags & UPL_COMMIT_INACTIVATE) {
4701 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4702 clear_refmod |= VM_MEM_REFERENCED;
4703 }
4704 if (upl->flags & UPL_ACCESS_BLOCKED) {
4705 /*
4706 * We blocked access to the pages in this UPL.
4707 * Clear the "busy" bit and wake up any waiter
4708 * for this page.
4709 */
4710 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4711 }
4712 if (m->absent) {
4713 if (flags & UPL_COMMIT_FREE_ABSENT)
4714 dwp->dw_mask |= DW_vm_page_free;
4715 else {
4716 m->absent = FALSE;
4717 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4718 }
4719 } else
4720 dwp->dw_mask |= DW_vm_page_unwire;
4721
4722 goto commit_next_page;
4723 }
4724 /*
4725 * make sure to clear the hardware
4726 * modify or reference bits before
4727 * releasing the BUSY bit on this page
4728 * otherwise we risk losing a legitimate
4729 * change of state
4730 */
4731 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4732 m->dirty = FALSE;
4733
4734 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4735 m->cs_validated && !m->cs_tainted) {
4736 /*
4737 * CODE SIGNING:
4738 * This page is no longer dirty
4739 * but could have been modified,
4740 * so it will need to be
4741 * re-validated.
4742 */
4743 m->cs_validated = FALSE;
4744 #if DEVELOPMENT || DEBUG
4745 vm_cs_validated_resets++;
4746 #endif
4747 pmap_disconnect(m->phys_page);
4748 }
4749 clear_refmod |= VM_MEM_MODIFIED;
4750 }
4751 if (page_list) {
4752 upl_page_info_t *p;
4753
4754 p = &(page_list[entry]);
4755
4756 if (p->phys_addr && p->pageout && !m->pageout) {
4757 m->busy = TRUE;
4758 m->pageout = TRUE;
4759
4760 dwp->dw_mask |= DW_vm_page_wire;
4761
4762 } else if (p->phys_addr &&
4763 !p->pageout && m->pageout &&
4764 !m->dump_cleaning) {
4765 m->pageout = FALSE;
4766 m->absent = FALSE;
4767 m->overwriting = FALSE;
4768
4769 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4770 }
4771 page_list[entry].phys_addr = 0;
4772 }
4773 m->dump_cleaning = FALSE;
4774
4775 if (m->laundry)
4776 dwp->dw_mask |= DW_vm_pageout_throttle_up;
4777
4778 if (m->pageout) {
4779 m->cleaning = FALSE;
4780 m->encrypted_cleaning = FALSE;
4781 m->pageout = FALSE;
4782 #if MACH_CLUSTER_STATS
4783 if (m->wanted) vm_pageout_target_collisions++;
4784 #endif
4785 m->dirty = FALSE;
4786
4787 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4788 m->cs_validated && !m->cs_tainted) {
4789 /*
4790 * CODE SIGNING:
4791 * This page is no longer dirty
4792 * but could have been modified,
4793 * so it will need to be
4794 * re-validated.
4795 */
4796 m->cs_validated = FALSE;
4797 #if DEVELOPMENT || DEBUG
4798 vm_cs_validated_resets++;
4799 #endif
4800 pmap_disconnect(m->phys_page);
4801 }
4802
4803 if ((flags & UPL_COMMIT_SET_DIRTY) ||
4804 (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)))
4805 m->dirty = TRUE;
4806
4807 if (m->dirty) {
4808 /*
4809 * page was re-dirtied after we started
4810 * the pageout... reactivate it since
4811 * we don't know whether the on-disk
4812 * copy matches what is now in memory
4813 */
4814 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4815
4816 if (upl->flags & UPL_PAGEOUT) {
4817 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4818 VM_STAT_INCR(reactivations);
4819 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4820 }
4821 } else {
4822 /*
4823 * page has been successfully cleaned
4824 * go ahead and free it for other use
4825 */
4826
4827 if (m->object->internal) {
4828 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4829 } else {
4830 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4831 }
4832 dwp->dw_mask |= DW_vm_page_free;
4833
4834 if (upl->flags & UPL_PAGEOUT) {
4835 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4836
4837 if (page_list[entry].dirty) {
4838 VM_STAT_INCR(pageouts);
4839 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4840 pgpgout_count++;
4841 }
4842 }
4843 }
4844 goto commit_next_page;
4845 }
4846 #if MACH_CLUSTER_STATS
4847 if (m->wpmapped)
4848 m->dirty = pmap_is_modified(m->phys_page);
4849
4850 if (m->dirty) vm_pageout_cluster_dirtied++;
4851 else vm_pageout_cluster_cleaned++;
4852 if (m->wanted) vm_pageout_cluster_collisions++;
4853 #endif
4854 m->dirty = FALSE;
4855
4856 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4857 m->cs_validated && !m->cs_tainted) {
4858 /*
4859 * CODE SIGNING:
4860 * This page is no longer dirty
4861 * but could have been modified,
4862 * so it will need to be
4863 * re-validated.
4864 */
4865 m->cs_validated = FALSE;
4866 #if DEVELOPMENT || DEBUG
4867 vm_cs_validated_resets++;
4868 #endif
4869 pmap_disconnect(m->phys_page);
4870 }
4871
4872 if ((m->busy) && (m->cleaning)) {
4873 /*
4874 * the request_page_list case
4875 */
4876 m->absent = FALSE;
4877 m->overwriting = FALSE;
4878
4879 dwp->dw_mask |= DW_clear_busy;
4880
4881 } else if (m->overwriting) {
4882 /*
4883 * alternate request page list, write to
4884 * page_list case. Occurs when the original
4885 * page was wired at the time of the list
4886 * request
4887 */
4888 assert(VM_PAGE_WIRED(m));
4889 m->overwriting = FALSE;
4890
4891 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
4892 }
4893 m->cleaning = FALSE;
4894 m->encrypted_cleaning = FALSE;
4895
4896 /*
4897 * It is a part of the semantic of COPYOUT_FROM
4898 * UPLs that a commit implies cache sync
4899 * between the vm page and the backing store
4900 * this can be used to strip the precious bit
4901 * as well as clean
4902 */
4903 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
4904 m->precious = FALSE;
4905
4906 if (flags & UPL_COMMIT_SET_DIRTY)
4907 m->dirty = TRUE;
4908
4909 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4910 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4911 clear_refmod |= VM_MEM_REFERENCED;
4912
4913 } else if (!m->active && !m->inactive && !m->speculative) {
4914
4915 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
4916 dwp->dw_mask |= DW_vm_page_speculate;
4917 else if (m->reference)
4918 dwp->dw_mask |= DW_vm_page_activate;
4919 else {
4920 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4921 clear_refmod |= VM_MEM_REFERENCED;
4922 }
4923 }
4924 if (upl->flags & UPL_ACCESS_BLOCKED) {
4925 /*
4926 * We blocked access to the pages in this UPL.
4927 * Clear the "busy" bit on this page before we
4928 * wake up any waiter.
4929 */
4930 dwp->dw_mask |= DW_clear_busy;
4931 }
4932 /*
4933 * Wakeup any thread waiting for the page to be un-cleaning.
4934 */
4935 dwp->dw_mask |= DW_PAGE_WAKEUP;
4936
4937 commit_next_page:
4938 if (clear_refmod)
4939 pmap_clear_refmod(m->phys_page, clear_refmod);
4940
4941 target_offset += PAGE_SIZE_64;
4942 xfer_size -= PAGE_SIZE;
4943 entry++;
4944
4945 if (dwp->dw_mask) {
4946 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
4947 if (m->busy == FALSE) {
4948 /*
4949 * dw_do_work may need to drop the object lock;
4950 * if it does, we need the pages it's looking at to
4951 * be held stable via the busy bit.
4952 */
4953 m->busy = TRUE;
4954 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4955 }
4956 dwp->dw_m = m;
4957 dwp++;
4958 dw_count++;
4959
4960 if (dw_count >= DELAYED_WORK_LIMIT) {
4961 dw_do_work(shadow_object, &dw_array[0], dw_count);
4962
4963 dwp = &dw_array[0];
4964 dw_count = 0;
4965 }
4966 } else {
4967 if (dwp->dw_mask & DW_clear_busy)
4968 m->busy = FALSE;
4969
4970 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4971 PAGE_WAKEUP(m);
4972 }
4973 }
4974 }
4975 if (dw_count)
4976 dw_do_work(shadow_object, &dw_array[0], dw_count);
4977
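/*
 * Determine whether any pages are still associated with this
 * UPL: for a lite UPL, scan the bitmap for remaining set bits;
 * otherwise check whether the map object still has resident
 * pages.
 */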
4978 occupied = 1;
4979
4980 if (upl->flags & UPL_DEVICE_MEMORY) {
4981 occupied = 0;
4982 } else if (upl->flags & UPL_LITE) {
4983 int pg_num;
4984 int i;
4985
4986 pg_num = upl->size/PAGE_SIZE;
4987 pg_num = (pg_num + 31) >> 5;
4988 occupied = 0;
4989
4990 for (i = 0; i < pg_num; i++) {
4991 if (lite_list[i] != 0) {
4992 occupied = 1;
4993 break;
4994 }
4995 }
4996 } else {
4997 if (queue_empty(&upl->map_object->memq))
4998 occupied = 0;
4999 }
5000 if (occupied == 0) {
5001 /*
5002 * If this UPL element belongs to a Vector UPL and is
5003 * empty, then this is the right function to deallocate
5004 * it. So go ahead and set the *empty variable. The flag
5005 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
5006 * should be considered relevant for the Vector UPL and not
5007 * the internal UPLs.
5008 */
5009 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5010 *empty = TRUE;
5011
5012 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5013 /*
5014 * this is not a paging object
5015 * so we need to drop the paging reference
5016 * that was taken when we created the UPL
5017 * against this object
5018 */
5019 vm_object_activity_end(shadow_object);
5020 } else {
5021 /*
5022 * we donated the paging reference to
5023 * the map object... vm_pageout_object_terminate
5024 * will drop this reference
5025 */
5026 }
5027 }
5028 vm_object_unlock(shadow_object);
5029 if (object != shadow_object)
5030 vm_object_unlock(object);
5031
5032 if(!isVectorUPL)
5033 upl_unlock(upl);
5034 else {
5035 /*
5036 * If we completed our operations on an UPL that is
5037 * part of a Vectored UPL and if empty is TRUE, then
5038 * we should go ahead and deallocate this UPL element.
5039 * Then we check if this was the last of the UPL elements
5040 * within that Vectored UPL. If so, set empty to TRUE
5041 * so that in ubc_upl_commit_range or ubc_upl_commit, we
5042 * can go ahead and deallocate the Vector UPL too.
5043 */
5044 if(*empty==TRUE) {
5045 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
5046 upl_deallocate(upl);
5047 }
5048 goto process_upl_to_commit;
5049 }
5050
5051 if (pgpgout_count) {
5052 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
5053 }
5054
5055 return KERN_SUCCESS;
5056 }
5057
5058 kern_return_t
5059 upl_abort_range(
5060 upl_t upl,
5061 upl_offset_t offset,
5062 upl_size_t size,
5063 int error,
5064 boolean_t *empty)
5065 {
5066 upl_size_t xfer_size, subupl_size = size;
5067 vm_object_t shadow_object;
5068 vm_object_t object;
5069 vm_object_offset_t target_offset;
5070 upl_offset_t subupl_offset = offset;
5071 int entry;
5072 wpl_array_t lite_list;
5073 int occupied;
5074 struct dw dw_array[DELAYED_WORK_LIMIT];
5075 struct dw *dwp;
5076 int dw_count, isVectorUPL = 0;
5077 upl_t vector_upl = NULL;
5078
5079 *empty = FALSE;
5080
5081 if (upl == UPL_NULL)
5082 return KERN_INVALID_ARGUMENT;
5083
5084 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
5085 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
5086
5087 if((isVectorUPL = vector_upl_is_valid(upl))) {
5088 vector_upl = upl;
5089 upl_lock(vector_upl);
5090 }
5091 else
5092 upl_lock(upl);
5093
5094 process_upl_to_abort:
5095 if(isVectorUPL) {
5096 size = subupl_size;
5097 offset = subupl_offset;
5098 if(size == 0) {
5099 upl_unlock(vector_upl);
5100 return KERN_SUCCESS;
5101 }
5102 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
5103 if(upl == NULL) {
5104 upl_unlock(vector_upl);
5105 return KERN_FAILURE;
5106 }
5107 subupl_size -= size;
5108 subupl_offset += size;
5109 }
5110
5111 *empty = FALSE;
5112
5113 #if UPL_DEBUG
5114 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
5115 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5116
5117 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
5118 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
5119 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
5120
5121 upl->upl_commit_index++;
5122 }
5123 #endif
5124 if (upl->flags & UPL_DEVICE_MEMORY)
5125 xfer_size = 0;
5126 else if ((offset + size) <= upl->size)
5127 xfer_size = size;
5128 else {
5129 if(!isVectorUPL)
5130 upl_unlock(upl);
5131 else {
5132 upl_unlock(vector_upl);
5133 }
5134
5135 return KERN_FAILURE;
5136 }
5137 if (upl->flags & UPL_INTERNAL) {
5138 lite_list = (wpl_array_t)
5139 ((((uintptr_t)upl) + sizeof(struct upl))
5140 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5141 } else {
5142 lite_list = (wpl_array_t)
5143 (((uintptr_t)upl) + sizeof(struct upl));
5144 }
5145 object = upl->map_object;
5146
5147 if (upl->flags & UPL_SHADOWED) {
5148 vm_object_lock(object);
5149 shadow_object = object->shadow;
5150 } else
5151 shadow_object = object;
5152
5153 entry = offset/PAGE_SIZE;
5154 target_offset = (vm_object_offset_t)offset;
5155
5156 if (upl->flags & UPL_KERNEL_OBJECT)
5157 vm_object_lock_shared(shadow_object);
5158 else
5159 vm_object_lock(shadow_object);
5160
5161 if (upl->flags & UPL_ACCESS_BLOCKED) {
5162 assert(shadow_object->blocked_access);
5163 shadow_object->blocked_access = FALSE;
5164 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
5165 }
5166
5167 dwp = &dw_array[0];
5168 dw_count = 0;
5169
5170 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
5171 panic("upl_abort_range: kernel_object being DUMPED");
5172
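/*
 * Walk the aborted range one page at a time. Absent pages are
 * either freed or kept with restart/error state for the page's
 * customer; resident pages have their cleaning/pageout state
 * undone and are requeued or dumped, depending on the abort
 * flags.
 */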
5173 while (xfer_size) {
5174 vm_page_t t, m;
5175
5176 dwp->dw_mask = 0;
5177
5178 m = VM_PAGE_NULL;
5179
5180 if (upl->flags & UPL_LITE) {
5181 unsigned int pg_num;
5182
5183 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5184 assert(pg_num == target_offset/PAGE_SIZE);
5185
5186
5187 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5188 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
5189
5190 if ( !(upl->flags & UPL_KERNEL_OBJECT))
5191 m = vm_page_lookup(shadow_object, target_offset +
5192 (upl->offset - shadow_object->paging_offset));
5193 }
5194 }
5195 if (upl->flags & UPL_SHADOWED) {
5196 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
5197 t->pageout = FALSE;
5198
5199 VM_PAGE_FREE(t);
5200
5201 if (m == VM_PAGE_NULL)
5202 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
5203 }
5204 }
5205 if ((upl->flags & UPL_KERNEL_OBJECT))
5206 goto abort_next_page;
5207
5208 if (m != VM_PAGE_NULL) {
5209
5210 if (m->absent) {
5211 boolean_t must_free = TRUE;
5212
5213 m->clustered = FALSE;
5214 /*
5215 * COPYOUT = FALSE case
5216 * check for error conditions which must
5217 * be passed back to the page's customer
5218 */
5219 if (error & UPL_ABORT_RESTART) {
5220 m->restart = TRUE;
5221 m->absent = FALSE;
5222 m->unusual = TRUE;
5223 must_free = FALSE;
5224 } else if (error & UPL_ABORT_UNAVAILABLE) {
5225 m->restart = FALSE;
5226 m->unusual = TRUE;
5227 must_free = FALSE;
5228 } else if (error & UPL_ABORT_ERROR) {
5229 m->restart = FALSE;
5230 m->absent = FALSE;
5231 m->error = TRUE;
5232 m->unusual = TRUE;
5233 must_free = FALSE;
5234 }
5235
5236 /*
5237 * ENCRYPTED SWAP:
5238 * If the page was already encrypted,
5239 * we don't really need to decrypt it
5240 * now. It will get decrypted later,
5241 * on demand, as soon as someone needs
5242 * to access its contents.
5243 */
5244
5245 m->cleaning = FALSE;
5246 m->encrypted_cleaning = FALSE;
5247 m->overwriting = FALSE;
5248
5249 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5250
5251 if (must_free == TRUE)
5252 dwp->dw_mask |= DW_vm_page_free;
5253 else
5254 dwp->dw_mask |= DW_vm_page_activate;
5255 } else {
5256 /*
5257 * Handle the trusted pager throttle.
5258 */
5259 if (m->laundry)
5260 dwp->dw_mask |= DW_vm_pageout_throttle_up;
5261
5262 if (m->pageout) {
5263 assert(m->busy);
5264 assert(m->wire_count == 1);
5265 m->pageout = FALSE;
5266
5267 dwp->dw_mask |= DW_vm_page_unwire;
5268 }
5269 m->dump_cleaning = FALSE;
5270 m->cleaning = FALSE;
5271 m->encrypted_cleaning = FALSE;
5272 m->overwriting = FALSE;
5273 #if MACH_PAGEMAP
5274 vm_external_state_clr(m->object->existence_map, m->offset);
5275 #endif /* MACH_PAGEMAP */
5276 if (error & UPL_ABORT_DUMP_PAGES) {
5277 pmap_disconnect(m->phys_page);
5278
5279 dwp->dw_mask |= DW_vm_page_free;
5280 } else {
5281 if (error & UPL_ABORT_REFERENCE) {
5282 /*
5283 * we've been told to explicitly
5284 * reference this page... for
5285 * file I/O, this is done by
5286 * implementing an LRU on the inactive q
5287 */
5288 dwp->dw_mask |= DW_vm_page_lru;
5289 }
5290 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5291 }
5292 }
5293 }
5294 abort_next_page:
5295 target_offset += PAGE_SIZE_64;
5296 xfer_size -= PAGE_SIZE;
5297 entry++;
5298
5299 if (dwp->dw_mask) {
5300 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
5301 if (m->busy == FALSE) {
5302 /*
5303 * dw_do_work may need to drop the object lock;
5304 * if it does, we need the pages it's looking at to
5305 * be held stable via the busy bit.
5306 */
5307 m->busy = TRUE;
5308 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5309 }
5310 dwp->dw_m = m;
5311 dwp++;
5312 dw_count++;
5313
5314 if (dw_count >= DELAYED_WORK_LIMIT) {
5315 dw_do_work(shadow_object, &dw_array[0], dw_count);
5316
5317 dwp = &dw_array[0];
5318 dw_count = 0;
5319 }
5320 } else {
5321 if (dwp->dw_mask & DW_clear_busy)
5322 m->busy = FALSE;
5323
5324 if (dwp->dw_mask & DW_PAGE_WAKEUP)
5325 PAGE_WAKEUP(m);
5326 }
5327 }
5328 }
5329 if (dw_count)
5330 dw_do_work(shadow_object, &dw_array[0], dw_count);
5331
5332 occupied = 1;
5333
5334 if (upl->flags & UPL_DEVICE_MEMORY) {
5335 occupied = 0;
5336 } else if (upl->flags & UPL_LITE) {
5337 int pg_num;
5338 int i;
5339
5340 pg_num = upl->size/PAGE_SIZE;
5341 pg_num = (pg_num + 31) >> 5;
5342 occupied = 0;
5343
5344 for (i = 0; i < pg_num; i++) {
5345 if (lite_list[i] != 0) {
5346 occupied = 1;
5347 break;
5348 }
5349 }
5350 } else {
5351 if (queue_empty(&upl->map_object->memq))
5352 occupied = 0;
5353 }
5354 if (occupied == 0) {
5355 /*
5356 * If this UPL element belongs to a Vector UPL and is
5357 * empty, then this is the right function to deallocate
5358 * it. So go ahead and set the *empty variable. The flag
5359 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
5360 * should be considered relevant for the Vector UPL and
5361 * not the internal UPLs.
5362 */
5363 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5364 *empty = TRUE;
5365
5366 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5367 /*
5368 * this is not a paging object
5369 * so we need to drop the paging reference
5370 * that was taken when we created the UPL
5371 * against this object
5372 */
5373 vm_object_activity_end(shadow_object);
5374 } else {
5375 /*
5376 * we donated the paging reference to
5377 * the map object... vm_pageout_object_terminate
5378 * will drop this reference
5379 */
5380 }
5381 }
5382 vm_object_unlock(shadow_object);
5383 if (object != shadow_object)
5384 vm_object_unlock(object);
5385
5386 if(!isVectorUPL)
5387 upl_unlock(upl);
5388 else {
5389 /*
5390 * If we completed our operations on an UPL that is
5391 * part of a Vectored UPL and if empty is TRUE, then
5392 * we should go ahead and deallocate this UPL element.
5393 * Then we check if this was the last of the UPL elements
5394 * within that Vectored UPL. If so, set empty to TRUE
5395 * so that in ubc_upl_abort_range or ubc_upl_abort, we
5396 * can go ahead and deallocate the Vector UPL too.
5397 */
5398 if(*empty == TRUE) {
5399 *empty = vector_upl_set_subupl(vector_upl, upl,0);
5400 upl_deallocate(upl);
5401 }
5402 goto process_upl_to_abort;
5403 }
5404
5405 return KERN_SUCCESS;
5406 }
5407
5408
5409 kern_return_t
5410 upl_abort(
5411 upl_t upl,
5412 int error)
5413 {
5414 boolean_t empty;
5415
5416 return upl_abort_range(upl, 0, upl->size, error, &empty);
5417 }
5418
5419
5420 /* an option on commit should be wire */
5421 kern_return_t
5422 upl_commit(
5423 upl_t upl,
5424 upl_page_info_t *page_list,
5425 mach_msg_type_number_t count)
5426 {
5427 boolean_t empty;
5428
5429 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
5430 }
5431
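/*
 * Illustrative sketch only, not part of this file's code: one
 * possible lifecycle of a map-backed UPL as seen by a kernel
 * caller.  The control flags, the error handling, and the
 * "map", "offset" and "pl" variables below are assumptions
 * supplied by that hypothetical caller.
 *
 *	upl_t		upl;
 *	upl_size_t	upl_size = PAGE_SIZE;
 *	upl_page_info_t	pl[1];
 *	unsigned int	count = 1;
 *	int		flags = UPL_COPYOUT_FROM;
 *	vm_map_offset_t	kaddr;
 *
 *	vm_map_create_upl(map, offset, &upl_size, &upl, pl, &count, &flags);
 *	vm_map_enter_upl(kernel_map, upl, &kaddr);	wire and map the pages
 *	... operate on the pages through kaddr ...
 *	vm_map_remove_upl(kernel_map, upl);		tear the mapping down
 *	upl_commit(upl, pl, count);			or upl_abort() on error
 *	upl_deallocate(upl);
 */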
5432
5433 unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
5434
5435 kern_return_t
5436 vm_object_iopl_request(
5437 vm_object_t object,
5438 vm_object_offset_t offset,
5439 upl_size_t size,
5440 upl_t *upl_ptr,
5441 upl_page_info_array_t user_page_list,
5442 unsigned int *page_list_count,
5443 int cntrl_flags)
5444 {
5445 vm_page_t dst_page;
5446 vm_object_offset_t dst_offset;
5447 upl_size_t xfer_size;
5448 upl_t upl = NULL;
5449 unsigned int entry;
5450 wpl_array_t lite_list = NULL;
5451 int no_zero_fill = FALSE;
5452 u_int32_t psize;
5453 kern_return_t ret;
5454 vm_prot_t prot;
5455 struct vm_object_fault_info fault_info;
5456 struct dw dw_array[DELAYED_WORK_LIMIT];
5457 struct dw *dwp;
5458 int dw_count;
5459 int dw_index;
5460
5461 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5462 /*
5463 * For forward compatibility's sake,
5464 * reject any unknown flag.
5465 */
5466 return KERN_INVALID_VALUE;
5467 }
5468 if (vm_lopage_needed == FALSE)
5469 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
5470
5471 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
5472 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
5473 return KERN_INVALID_VALUE;
5474
5475 if (object->phys_contiguous) {
5476 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
5477 return KERN_INVALID_ADDRESS;
5478
5479 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
5480 return KERN_INVALID_ADDRESS;
5481 }
5482 }
5483
5484 if (cntrl_flags & UPL_ENCRYPT) {
5485 /*
5486 * ENCRYPTED SWAP:
5487 * The paging path doesn't use this interface,
5488 * so we don't support the UPL_ENCRYPT flag
5489 * here. We won't encrypt the pages.
5490 */
5491 assert(! (cntrl_flags & UPL_ENCRYPT));
5492 }
5493 if (cntrl_flags & UPL_NOZEROFILL)
5494 no_zero_fill = TRUE;
5495
5496 if (cntrl_flags & UPL_COPYOUT_FROM)
5497 prot = VM_PROT_READ;
5498 else
5499 prot = VM_PROT_READ | VM_PROT_WRITE;
5500
5501 if (((size/PAGE_SIZE) > MAX_UPL_SIZE) && !object->phys_contiguous)
5502 size = MAX_UPL_SIZE * PAGE_SIZE;
5503
5504 if (cntrl_flags & UPL_SET_INTERNAL) {
5505 if (page_list_count != NULL)
5506 *page_list_count = MAX_UPL_SIZE;
5507 }
5508 if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
5509 ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
5510 return KERN_INVALID_ARGUMENT;
5511
5512 if ((!object->internal) && (object->paging_offset != 0))
5513 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
5514
5515
5516 if (object->phys_contiguous)
5517 psize = PAGE_SIZE;
5518 else
5519 psize = size;
5520
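/*
 * For an internal UPL the page-info array is allocated
 * immediately after the upl structure itself, with the lite
 * bitmap following the page-info array; for a non-internal
 * UPL only the lite bitmap follows the structure.
 */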
5521 if (cntrl_flags & UPL_SET_INTERNAL) {
5522 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5523
5524 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5525 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
5526 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
5527 if (size == 0) {
5528 user_page_list = NULL;
5529 lite_list = NULL;
5530 }
5531 } else {
5532 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5533
5534 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5535 if (size == 0) {
5536 lite_list = NULL;
5537 }
5538 }
5539 if (user_page_list)
5540 user_page_list[0].device = FALSE;
5541 *upl_ptr = upl;
5542
5543 upl->map_object = object;
5544 upl->size = size;
5545
5546 if (object == kernel_object &&
5547 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
5548 upl->flags |= UPL_KERNEL_OBJECT;
5549 #if UPL_DEBUG
5550 vm_object_lock(object);
5551 #else
5552 vm_object_lock_shared(object);
5553 #endif
5554 } else {
5555 vm_object_lock(object);
5556 vm_object_activity_begin(object);
5557 }
5558 /*
5559 * paging in progress also protects the paging_offset
5560 */
5561 upl->offset = offset + object->paging_offset;
5562
5563 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5564 /*
5565 * The user requested that access to the pages in this UPL
5566 * be blocked until the UPL is committed or aborted.
5567 */
5568 upl->flags |= UPL_ACCESS_BLOCKED;
5569 }
5570
5571 if (object->phys_contiguous) {
5572 #if UPL_DEBUG
5573 queue_enter(&object->uplq, upl, upl_t, uplq);
5574 #endif /* UPL_DEBUG */
5575
5576 if (upl->flags & UPL_ACCESS_BLOCKED) {
5577 assert(!object->blocked_access);
5578 object->blocked_access = TRUE;
5579 }
5580
5581 vm_object_unlock(object);
5582
5583 /*
5584 * don't need any shadow mappings for this one
5585 * since it is already I/O memory
5586 */
5587 upl->flags |= UPL_DEVICE_MEMORY;
5588
5589 upl->highest_page = (ppnum_t) ((offset + object->shadow_offset + size - 1)>>PAGE_SHIFT);
5590
5591 if (user_page_list) {
5592 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->shadow_offset)>>PAGE_SHIFT);
5593 user_page_list[0].device = TRUE;
5594 }
5595 if (page_list_count != NULL) {
5596 if (upl->flags & UPL_INTERNAL)
5597 *page_list_count = 0;
5598 else
5599 *page_list_count = 1;
5600 }
5601 return KERN_SUCCESS;
5602 }
5603 if (object != kernel_object) {
5604 /*
5605 * Protect user space from future COW operations
5606 */
5607 object->true_share = TRUE;
5608
5609 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
5610 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
5611 }
5612
5613 #if UPL_DEBUG
5614 queue_enter(&object->uplq, upl, upl_t, uplq);
5615 #endif /* UPL_DEBUG */
5616
5617 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
5618 object->copy != VM_OBJECT_NULL) {
5619 /*
5620 * Honor copy-on-write obligations
5621 *
5622 * The caller is gathering these pages and
5623 * might modify their contents. We need to
5624 * make sure that the copy object has its own
5625 * private copies of these pages before we let
5626 * the caller modify them.
5627 *
5628 * NOTE: someone else could map the original object
5629 * after we've done this copy-on-write here, and they
5630 * could then see an inconsistent picture of the memory
5631 * while it's being modified via the UPL. To prevent this,
5632 * we would have to block access to these pages until the
5633 * UPL is released. We could use the UPL_BLOCK_ACCESS
5634 * code path for that...
5635 */
5636 vm_object_update(object,
5637 offset,
5638 size,
5639 NULL,
5640 NULL,
5641 FALSE, /* should_return */
5642 MEMORY_OBJECT_COPY_SYNC,
5643 VM_PROT_NO_CHANGE);
5644 #if DEVELOPMENT || DEBUG
5645 iopl_cow++;
5646 iopl_cow_pages += size >> PAGE_SHIFT;
5647 #endif
5648 }
5649
5650
5651 entry = 0;
5652
5653 xfer_size = size;
5654 dst_offset = offset;
5655
5656 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
5657 fault_info.user_tag = 0;
5658 fault_info.lo_offset = offset;
5659 fault_info.hi_offset = offset + xfer_size;
5660 fault_info.no_cache = FALSE;
5661 fault_info.stealth = FALSE;
5662 fault_info.mark_zf_absent = TRUE;
5663
5664 dwp = &dw_array[0];
5665 dw_count = 0;
5666
5667 while (xfer_size) {
5668 vm_fault_return_t result;
5669 unsigned int pg_num;
5670
5671 dwp->dw_mask = 0;
5672
5673 dst_page = vm_page_lookup(object, dst_offset);
5674
5675 /*
5676 * ENCRYPTED SWAP:
5677 * If the page is encrypted, we need to decrypt it,
5678 * so force a soft page fault.
5679 */
5680 if (dst_page == VM_PAGE_NULL ||
5681 dst_page->busy ||
5682 dst_page->encrypted ||
5683 dst_page->error ||
5684 dst_page->restart ||
5685 dst_page->absent ||
5686 dst_page->fictitious) {
5687
5688 if (object == kernel_object)
5689 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
5690
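/*
 * The resident page (if any) isn't immediately usable, so
 * fault it in, retrying until vm_fault_page() either
 * succeeds or reports a hard error.
 */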
5691 do {
5692 vm_page_t top_page;
5693 kern_return_t error_code;
5694 int interruptible;
5695
5696 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
5697 interruptible = THREAD_ABORTSAFE;
5698 else
5699 interruptible = THREAD_UNINT;
5700
5701 fault_info.interruptible = interruptible;
5702 fault_info.cluster_size = xfer_size;
5703
5704 vm_object_paging_begin(object);
5705
5706 result = vm_fault_page(object, dst_offset,
5707 prot | VM_PROT_WRITE, FALSE,
5708 &prot, &dst_page, &top_page,
5709 (int *)0,
5710 &error_code, no_zero_fill,
5711 FALSE, &fault_info);
5712
5713 switch (result) {
5714
5715 case VM_FAULT_SUCCESS:
5716
5717 if ( !dst_page->absent) {
5718 PAGE_WAKEUP_DONE(dst_page);
5719 } else {
5720 /*
5721 * we only get back an absent page if we
5722 * requested that it not be zero-filled
5723 * because we are about to fill it via I/O
5724 *
5725 * absent pages should be left BUSY
5726 * to prevent them from being faulted
5727 * into an address space before we've
5728 * had a chance to complete the I/O on
5729 * them since they may contain info that
5730 * shouldn't be seen by the faulting task
5731 */
5732 }
5733 /*
5734 * Release paging references and
5735 * top-level placeholder page, if any.
5736 */
5737 if (top_page != VM_PAGE_NULL) {
5738 vm_object_t local_object;
5739
5740 local_object = top_page->object;
5741
5742 if (top_page->object != dst_page->object) {
5743 vm_object_lock(local_object);
5744 VM_PAGE_FREE(top_page);
5745 vm_object_paging_end(local_object);
5746 vm_object_unlock(local_object);
5747 } else {
5748 VM_PAGE_FREE(top_page);
5749 vm_object_paging_end(local_object);
5750 }
5751 }
5752 vm_object_paging_end(object);
5753 break;
5754
5755 case VM_FAULT_RETRY:
5756 vm_object_lock(object);
5757 break;
5758
5759 case VM_FAULT_FICTITIOUS_SHORTAGE:
5760 vm_page_more_fictitious();
5761
5762 vm_object_lock(object);
5763 break;
5764
5765 case VM_FAULT_MEMORY_SHORTAGE:
5766 if (vm_page_wait(interruptible)) {
5767 vm_object_lock(object);
5768 break;
5769 }
5770 /* fall thru */
5771
5772 case VM_FAULT_INTERRUPTED:
5773 error_code = MACH_SEND_INTERRUPTED;
5774 case VM_FAULT_MEMORY_ERROR:
5775 memory_error:
5776 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
5777
5778 vm_object_lock(object);
5779 goto return_err;
5780
5781 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5782 /* success but no page: fail */
5783 vm_object_paging_end(object);
5784 vm_object_unlock(object);
5785 goto memory_error;
5786
5787 default:
5788 panic("vm_object_iopl_request: unexpected error"
5789 " 0x%x from vm_fault_page()\n", result);
5790 }
5791 } while (result != VM_FAULT_SUCCESS);
5792
5793 }
5794
5795 if (upl->flags & UPL_KERNEL_OBJECT)
5796 goto record_phys_addr;
5797
5798 if (dst_page->cleaning) {
5799 /*
5800 * Someone else is cleaning this page in place.
5801 * In theory, we could proceed and use this page,
5802 * but they'll probably end up clearing the "busy"
5803 * bit on it in upl_commit_range() even though they
5804 * didn't set it, which would clear our "busy" bit
5805 * and open us up to race conditions.
5806 * We'd better wait for the cleaning to complete and
5807 * then try again.
5808 */
5809 vm_object_iopl_request_sleep_for_cleaning++;
5810 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5811 continue;
5812 }
5813 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5814 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5815 vm_page_t low_page;
5816 int refmod;
5817
5818 /*
5819 * support devices that can't DMA above 32 bits by
5820 * substituting pages from a pool of low-address memory
5821 * for any pages we find above the 4G mark.
5822 * We can't substitute if the page is already wired, because
5823 * we don't know whether that physical address has been
5824 * handed out to some other 64-bit-capable DMA device.
5825 */
5826 if (VM_PAGE_WIRED(dst_page)) {
5827 ret = KERN_PROTECTION_FAILURE;
5828 goto return_err;
5829 }
5830 low_page = vm_page_grablo();
5831
5832 if (low_page == VM_PAGE_NULL) {
5833 ret = KERN_RESOURCE_SHORTAGE;
5834 goto return_err;
5835 }
5836 /*
5837 * from here until the vm_page_replace completes
5838 * we mustn't drop the object lock... we don't
5839 * want anyone refaulting this page in and using
5840 * it after we disconnect it... we want the fault
5841 * to find the new page being substituted.
5842 */
5843 if (dst_page->pmapped)
5844 refmod = pmap_disconnect(dst_page->phys_page);
5845 else
5846 refmod = 0;
5847
5848 if ( !dst_page->absent)
5849 vm_page_copy(dst_page, low_page);
5850
5851 low_page->reference = dst_page->reference;
5852 low_page->dirty = dst_page->dirty;
5853 low_page->absent = dst_page->absent;
5854
5855 if (refmod & VM_MEM_REFERENCED)
5856 low_page->reference = TRUE;
5857 if (refmod & VM_MEM_MODIFIED)
5858 low_page->dirty = TRUE;
5859
5860 vm_page_replace(low_page, object, dst_offset);
5861
5862 dst_page = low_page;
5863 /*
5864 * vm_page_grablo returned the page marked
5865 * BUSY... we don't need a PAGE_WAKEUP_DONE
5866 * here, because we've never dropped the object lock
5867 */
5868 if ( !dst_page->absent)
5869 dst_page->busy = FALSE;
5870 }
5871 if ( !dst_page->busy)
5872 dwp->dw_mask |= DW_vm_page_wire;
5873
5874 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5875 /*
5876 * Mark the page "busy" to block any future page fault
5877 * on this page. We'll also remove the mapping
5878 * of all these pages before leaving this routine.
5879 */
5880 assert(!dst_page->fictitious);
5881 dst_page->busy = TRUE;
5882 }
5883 /*
5884 * expect the page to be used
5885 * page queues lock must be held to set 'reference'
5886 */
5887 dwp->dw_mask |= DW_set_reference;
5888
5889 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5890 dst_page->dirty = TRUE;
5891 record_phys_addr:
5892 if (dst_page->busy)
5893 upl->flags |= UPL_HAS_BUSY;
5894
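/*
 * Record this page in the "lite" bitmap: pg_num is the page's index
 * within the UPL, and lite_list packs 32 pages per 32-bit word, so
 * pg_num >> 5 selects the word and (pg_num & 31) the bit within it.
 * For example, the 70th page (pg_num == 69) sets bit 5 of word 2.
 */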
5895 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5896 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5897 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5898
5899 if (dst_page->phys_page > upl->highest_page)
5900 upl->highest_page = dst_page->phys_page;
5901
5902 if (user_page_list) {
5903 user_page_list[entry].phys_addr = dst_page->phys_page;
5904 user_page_list[entry].pageout = dst_page->pageout;
5905 user_page_list[entry].absent = dst_page->absent;
5906 user_page_list[entry].dirty = dst_page->dirty;
5907 user_page_list[entry].precious = dst_page->precious;
5908 user_page_list[entry].device = FALSE;
5909 if (dst_page->clustered == TRUE)
5910 user_page_list[entry].speculative = dst_page->speculative;
5911 else
5912 user_page_list[entry].speculative = FALSE;
5913 user_page_list[entry].cs_validated = dst_page->cs_validated;
5914 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
5915 }
5916 if (object != kernel_object) {
5917 /*
5918 * someone is explicitly grabbing this page...
5919 * update clustered and speculative state
5920 *
5921 */
5922 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5923 }
5924 entry++;
5925 dst_offset += PAGE_SIZE_64;
5926 xfer_size -= PAGE_SIZE;
5927
5928 if (dwp->dw_mask) {
5929 if (dst_page->busy == FALSE) {
5930 /*
5931 * dw_do_work may need to drop the object lock
5932 * if it does, we need the pages it's looking at to
5933 * be held stable via the busy bit.
5934 */
5935 dst_page->busy = TRUE;
5936 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5937 }
5938 dwp->dw_m = dst_page;
5939 dwp++;
5940 dw_count++;
5941
5942 if (dw_count >= DELAYED_WORK_LIMIT) {
5943 dw_do_work(object, &dw_array[0], dw_count);
5944
5945 dwp = &dw_array[0];
5946 dw_count = 0;
5947 }
5948 }
5949 }
5950 if (dw_count)
5951 dw_do_work(object, &dw_array[0], dw_count);
5952
5953 if (page_list_count != NULL) {
5954 if (upl->flags & UPL_INTERNAL)
5955 *page_list_count = 0;
5956 else if (*page_list_count > entry)
5957 *page_list_count = entry;
5958 }
5959 vm_object_unlock(object);
5960
5961 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5962 /*
5963 * We've marked all the pages "busy" so that future
5964 * page faults will block.
5965 * Now remove the mapping for these pages, so that they
5966 * can't be accessed without causing a page fault.
5967 */
5968 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5969 PMAP_NULL, 0, VM_PROT_NONE);
5970 assert(!object->blocked_access);
5971 object->blocked_access = TRUE;
5972 }
5973 return KERN_SUCCESS;
5974
5975 return_err:
5976 dw_index = 0;
5977
5978 for (; offset < dst_offset; offset += PAGE_SIZE) {
5979 boolean_t need_unwire;
5980
5981 dst_page = vm_page_lookup(object, offset);
5982
5983 if (dst_page == VM_PAGE_NULL)
5984 panic("vm_object_iopl_request: Wired page missing. \n");
5985
5986 /*
5987 * if we've already processed this page in an earlier
5988 * dw_do_work, we need to undo the wiring... we will
5989 * leave the dirty and reference bits on if they
5990 * were set, since we don't have a good way of knowing
5991 * what the previous state was and we won't get here
5992 * under any normal circumstances... we will always
5993 * clear BUSY and wake up any waiters via vm_page_free
5994 * or PAGE_WAKEUP_DONE
5995 */
5996 need_unwire = TRUE;
5997
5998 if (dw_count) {
5999 if (dw_array[dw_index].dw_m == dst_page) {
6000 /*
6001 * still in the deferred work list
6002 * which means we haven't yet called
6003 * vm_page_wire on this page
6004 */
6005 need_unwire = FALSE;
6006
6007 dw_index++;
6008 dw_count--;
6009 }
6010 }
6011 vm_page_lock_queues();
6012
6013 if (dst_page->absent) {
6014 vm_page_free(dst_page);
6015
6016 need_unwire = FALSE;
6017 } else {
6018 if (need_unwire == TRUE)
6019 vm_page_unwire(dst_page, TRUE);
6020
6021 PAGE_WAKEUP_DONE(dst_page);
6022 }
6023 vm_page_unlock_queues();
6024
6025 if (need_unwire == TRUE)
6026 VM_STAT_INCR(reactivations);
6027 }
6028 #if UPL_DEBUG
6029 upl->upl_state = 2;
6030 #endif
6031 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
6032 vm_object_activity_end(object);
6033 }
6034 vm_object_unlock(object);
6035 upl_destroy(upl);
6036
6037 return ret;
6038 }
6039
6040 kern_return_t
6041 upl_transpose(
6042 upl_t upl1,
6043 upl_t upl2)
6044 {
6045 kern_return_t retval;
6046 boolean_t upls_locked;
6047 vm_object_t object1, object2;
6048
6049 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
6050 return KERN_INVALID_ARGUMENT;
6051 }
6052
6053 upls_locked = FALSE;
6054
6055 /*
6056 * Since we need to lock both UPLs at the same time,
6057 * avoid deadlocks by always taking locks in the same order.
6058 */
6059 if (upl1 < upl2) {
6060 upl_lock(upl1);
6061 upl_lock(upl2);
6062 } else {
6063 upl_lock(upl2);
6064 upl_lock(upl1);
6065 }
6066 upls_locked = TRUE; /* the UPLs will need to be unlocked */
6067
6068 object1 = upl1->map_object;
6069 object2 = upl2->map_object;
6070
6071 if (upl1->offset != 0 || upl2->offset != 0 ||
6072 upl1->size != upl2->size) {
6073 /*
6074 * We deal only with full objects, not subsets.
6075 * That's because we exchange the entire backing store info
6076 * for the objects: pager, resident pages, etc... We can't do
6077 * only part of it.
6078 */
6079 retval = KERN_INVALID_VALUE;
6080 goto done;
6081 }
6082
6083 /*
6084 * Transpose the VM objects' backing store.
6085 */
6086 retval = vm_object_transpose(object1, object2,
6087 (vm_object_size_t) upl1->size);
6088
6089 if (retval == KERN_SUCCESS) {
6090 /*
6091 * Make each UPL point to the correct VM object, i.e. the
6092 * object holding the pages that the UPL refers to...
6093 */
6094 #if UPL_DEBUG
6095 queue_remove(&object1->uplq, upl1, upl_t, uplq);
6096 queue_remove(&object2->uplq, upl2, upl_t, uplq);
6097 #endif
6098 upl1->map_object = object2;
6099 upl2->map_object = object1;
6100 #if UPL_DEBUG
6101 queue_enter(&object1->uplq, upl2, upl_t, uplq);
6102 queue_enter(&object2->uplq, upl1, upl_t, uplq);
6103 #endif
6104 }
6105
6106 done:
6107 /*
6108 * Cleanup.
6109 */
6110 if (upls_locked) {
6111 upl_unlock(upl1);
6112 upl_unlock(upl2);
6113 upls_locked = FALSE;
6114 }
6115
6116 return retval;
6117 }
6118
6119 /*
6120 * ENCRYPTED SWAP:
6121 *
6122 * Rationale: the user might have some encrypted data on disk (via
6123 * FileVault or any other mechanism). That data is then decrypted in
6124 * memory, which is safe as long as the machine is secure. But that
6125 * decrypted data in memory could be paged out to disk by the default
6126 * pager. The data would then be stored on disk in the clear (not encrypted)
6127 * and it could be accessed by anyone who gets physical access to the
6128 * disk (if the laptop or the disk gets stolen for example). This weakens
6129 * the security offered by FileVault.
6130 *
6131 * Solution: the default pager will optionally request that all the
6132 * pages it gathers for pageout be encrypted, via the UPL interfaces,
6133 * before it sends this UPL to disk via the vnode_pageout() path.
6134 *
6135 * Notes:
6136 *
6137 * To avoid disrupting the VM LRU algorithms, we want to keep the
6138 * clean-in-place mechanisms, which allow us to send some extra pages to
6139 * swap (clustering) without actually removing them from the user's
6140 * address space. We don't want the user to unknowingly access encrypted
6141 * data, so we have to actually remove the encrypted pages from the page
6142 * table. When the user accesses the data, the hardware will fail to
6143 * locate the virtual page in its page table and will trigger a page
6144 * fault. We can then decrypt the page and enter it in the page table
6145 * again. Whenever we allow the user to access the contents of a page,
6146 * we have to make sure it's not encrypted.
6147 *
6148 *
6149 */
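/*
 * ENCRYPTED SWAP:
 * In outline (see the routines below): when the default pager asks for its
 * pageout UPL to be encrypted, upl_encrypt() walks the UPL's pages,
 * disconnects each one from all pmaps and runs it through vm_page_encrypt(),
 * marking it "encrypted". Any later attempt to use such a page (for example
 * the vm_object_iopl_request() path above) notices the "encrypted" bit and
 * forces a soft fault, and the page-fault path decrypts the page via
 * vm_page_decrypt() before it can be mapped again.
 */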
6150 /*
6151 * ENCRYPTED SWAP:
6152 * Reserve of virtual addresses in the kernel address space.
6153 * We need to map the physical pages in the kernel, so that we
6154 * can call the encryption/decryption routines with a kernel
6155 * virtual address. We keep this pool of pre-allocated kernel
6156 * virtual addresses so that we don't have to scan the kernel's
6157 * virtual address space each time we need to encrypt or decrypt
6158 * a physical page.
6159 * It would be nice to be able to encrypt and decrypt in physical
6160 * mode but that might not always be more efficient...
6161 */
6162 decl_simple_lock_data(,vm_paging_lock)
6163 #define VM_PAGING_NUM_PAGES 64
6164 vm_map_offset_t vm_paging_base_address = 0;
6165 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
6166 int vm_paging_max_index = 0;
6167 int vm_paging_page_waiter = 0;
6168 int vm_paging_page_waiter_total = 0;
6169 unsigned long vm_paging_no_kernel_page = 0;
6170 unsigned long vm_paging_objects_mapped = 0;
6171 unsigned long vm_paging_pages_mapped = 0;
6172 unsigned long vm_paging_objects_mapped_slow = 0;
6173 unsigned long vm_paging_pages_mapped_slow = 0;
6174
6175 void
6176 vm_paging_map_init(void)
6177 {
6178 kern_return_t kr;
6179 vm_map_offset_t page_map_offset;
6180 vm_map_entry_t map_entry;
6181
6182 assert(vm_paging_base_address == 0);
6183
6184 /*
6185 * Initialize our pool of pre-allocated kernel
6186 * virtual addresses.
6187 */
6188 page_map_offset = 0;
6189 kr = vm_map_find_space(kernel_map,
6190 &page_map_offset,
6191 VM_PAGING_NUM_PAGES * PAGE_SIZE,
6192 0,
6193 0,
6194 &map_entry);
6195 if (kr != KERN_SUCCESS) {
6196 panic("vm_paging_map_init: kernel_map full\n");
6197 }
6198 map_entry->object.vm_object = kernel_object;
6199 map_entry->offset = page_map_offset;
6200 vm_object_reference(kernel_object);
6201 vm_map_unlock(kernel_map);
6202
6203 assert(vm_paging_base_address == 0);
6204 vm_paging_base_address = page_map_offset;
6205 }
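/*
 * Illustrative sketch only (not part of the original file, kept under
 * "#if 0"): how a slot index in the pre-allocated pool set up above maps
 * to a kernel virtual address and back. The helper names are hypothetical;
 * vm_paging_map_object() and vm_paging_unmap_object() below inline this
 * arithmetic directly.
 */
#if 0
static vm_map_offset_t
vm_paging_slot_to_address(int i)
{
	assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
	return vm_paging_base_address + ((vm_map_offset_t)i * PAGE_SIZE);
}

static int
vm_paging_address_to_slot(vm_map_offset_t addr)
{
	assert(addr >= vm_paging_base_address);
	return (int)((addr - vm_paging_base_address) >> PAGE_SHIFT);
}
#endif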
6206
6207 /*
6208 * ENCRYPTED SWAP:
6209 * vm_paging_map_object:
6210 * Maps part of a VM object's pages in the kernel
6211 * virtual address space, using the pre-allocated
6212 * kernel virtual addresses, if possible.
6213 * Context:
6214 * The VM object is locked. This lock will get
6215 * dropped and re-acquired though, so the caller
6216 * must make sure the VM object is kept alive
6217 * (by holding a VM map that has a reference
6218 * on it, for example, or taking an extra reference).
6219 * The page should also be kept busy to prevent
6220 * it from being reclaimed.
6221 */
6222 kern_return_t
6223 vm_paging_map_object(
6224 vm_map_offset_t *address,
6225 vm_page_t page,
6226 vm_object_t object,
6227 vm_object_offset_t offset,
6228 vm_map_size_t *size,
6229 vm_prot_t protection,
6230 boolean_t can_unlock_object)
6231 {
6232 kern_return_t kr;
6233 vm_map_offset_t page_map_offset;
6234 vm_map_size_t map_size;
6235 vm_object_offset_t object_offset;
6236 int i;
6237
6238
6239 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
6240 assert(page->busy);
6241 /*
6242 * Use one of the pre-allocated kernel virtual addresses
6243 * and just enter the VM page in the kernel address space
6244 * at that virtual address.
6245 */
6246 simple_lock(&vm_paging_lock);
6247
6248 /*
6249 * Try and find an available kernel virtual address
6250 * from our pre-allocated pool.
6251 */
6252 page_map_offset = 0;
6253 for (;;) {
6254 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
6255 if (vm_paging_page_inuse[i] == FALSE) {
6256 page_map_offset =
6257 vm_paging_base_address +
6258 (i * PAGE_SIZE);
6259 break;
6260 }
6261 }
6262 if (page_map_offset != 0) {
6263 /* found a space to map our page ! */
6264 break;
6265 }
6266
6267 if (can_unlock_object) {
6268 /*
6269 * If we can afford to unlock the VM object,
6270 * let's take the slow path now...
6271 */
6272 break;
6273 }
6274 /*
6275 * We can't afford to unlock the VM object, so
6276 * let's wait for a space to become available...
6277 */
6278 vm_paging_page_waiter_total++;
6279 vm_paging_page_waiter++;
6280 thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
6281 &vm_paging_lock,
6282 THREAD_UNINT);
6283 vm_paging_page_waiter--;
6284 /* ... and try again */
6285 }
6286
6287 if (page_map_offset != 0) {
6288 /*
6289 * We found a kernel virtual address;
6290 * map the physical page to that virtual address.
6291 */
6292 if (i > vm_paging_max_index) {
6293 vm_paging_max_index = i;
6294 }
6295 vm_paging_page_inuse[i] = TRUE;
6296 simple_unlock(&vm_paging_lock);
6297
6298 if (page->pmapped == FALSE) {
6299 pmap_sync_page_data_phys(page->phys_page);
6300 }
6301 page->pmapped = TRUE;
6302
6303 /*
6304 * Keep the VM object locked over the PMAP_ENTER
6305 * and the actual use of the page by the kernel,
6306 * or this pmap mapping might get undone by a
6307 * vm_object_pmap_protect() call...
6308 */
6309 PMAP_ENTER(kernel_pmap,
6310 page_map_offset,
6311 page,
6312 protection,
6313 ((int) page->object->wimg_bits &
6314 VM_WIMG_MASK),
6315 TRUE);
6316 vm_paging_objects_mapped++;
6317 vm_paging_pages_mapped++;
6318 *address = page_map_offset;
6319
6320 /* all done and mapped, ready to use ! */
6321 return KERN_SUCCESS;
6322 }
6323
6324 /*
6325 * We ran out of pre-allocated kernel virtual
6326 * addresses. Just map the page in the kernel
6327 * the slow and regular way.
6328 */
6329 vm_paging_no_kernel_page++;
6330 simple_unlock(&vm_paging_lock);
6331 }
6332
6333 if (! can_unlock_object) {
6334 return KERN_NOT_SUPPORTED;
6335 }
6336
6337 object_offset = vm_object_trunc_page(offset);
6338 map_size = vm_map_round_page(*size);
6339
6340 /*
6341 * Try and map the required range of the object
6342 * in the kernel_map
6343 */
6344
6345 vm_object_reference_locked(object); /* for the map entry */
6346 vm_object_unlock(object);
6347
6348 kr = vm_map_enter(kernel_map,
6349 address,
6350 map_size,
6351 0,
6352 VM_FLAGS_ANYWHERE,
6353 object,
6354 object_offset,
6355 FALSE,
6356 protection,
6357 VM_PROT_ALL,
6358 VM_INHERIT_NONE);
6359 if (kr != KERN_SUCCESS) {
6360 *address = 0;
6361 *size = 0;
6362 vm_object_deallocate(object); /* for the map entry */
6363 vm_object_lock(object);
6364 return kr;
6365 }
6366
6367 *size = map_size;
6368
6369 /*
6370 * Enter the mapped pages in the page table now.
6371 */
6372 vm_object_lock(object);
6373 /*
6374 * VM object must be kept locked from before PMAP_ENTER()
6375 * until after the kernel is done accessing the page(s).
6376 * Otherwise, the pmap mappings in the kernel could be
6377 * undone by a call to vm_object_pmap_protect().
6378 */
6379
6380 for (page_map_offset = 0;
6381 map_size != 0;
6382 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
6383 unsigned int cache_attr;
6384
6385 page = vm_page_lookup(object, offset + page_map_offset);
6386 if (page == VM_PAGE_NULL) {
6387 printf("vm_paging_map_object: no page !?");
6388 vm_object_unlock(object);
6389 kr = vm_map_remove(kernel_map, *address, *size,
6390 VM_MAP_NO_FLAGS);
6391 assert(kr == KERN_SUCCESS);
6392 *address = 0;
6393 *size = 0;
6394 vm_object_lock(object);
6395 return KERN_MEMORY_ERROR;
6396 }
6397 if (page->pmapped == FALSE) {
6398 pmap_sync_page_data_phys(page->phys_page);
6399 }
6400 page->pmapped = TRUE;
6401 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
6402
6403 //assert(pmap_verify_free(page->phys_page));
6404 PMAP_ENTER(kernel_pmap,
6405 *address + page_map_offset,
6406 page,
6407 protection,
6408 cache_attr,
6409 TRUE);
6410 }
6411
6412 vm_paging_objects_mapped_slow++;
6413 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
6414
6415 return KERN_SUCCESS;
6416 }
6417
6418 /*
6419 * ENCRYPTED SWAP:
6420 * vm_paging_unmap_object:
6421 * Unmaps part of a VM object's pages from the kernel
6422 * virtual address space.
6423 * Context:
6424 * The VM object is locked. This lock will get
6425 * dropped and re-acquired though.
6426 */
6427 void
6428 vm_paging_unmap_object(
6429 vm_object_t object,
6430 vm_map_offset_t start,
6431 vm_map_offset_t end)
6432 {
6433 kern_return_t kr;
6434 int i;
6435
6436 if ((vm_paging_base_address == 0) ||
6437 (start < vm_paging_base_address) ||
6438 (end > (vm_paging_base_address
6439 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
6440 /*
6441 * We didn't use our pre-allocated pool of
6442 * kernel virtual addresses. Deallocate the
6443 * virtual memory.
6444 */
6445 if (object != VM_OBJECT_NULL) {
6446 vm_object_unlock(object);
6447 }
6448 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
6449 if (object != VM_OBJECT_NULL) {
6450 vm_object_lock(object);
6451 }
6452 assert(kr == KERN_SUCCESS);
6453 } else {
6454 /*
6455 * We used a kernel virtual address from our
6456 * pre-allocated pool. Put it back in the pool
6457 * for next time.
6458 */
6459 assert(end - start == PAGE_SIZE);
6460 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
6461 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
6462
6463 /* undo the pmap mapping */
6464 pmap_remove(kernel_pmap, start, end);
6465
6466 simple_lock(&vm_paging_lock);
6467 vm_paging_page_inuse[i] = FALSE;
6468 if (vm_paging_page_waiter) {
6469 thread_wakeup(&vm_paging_page_waiter);
6470 }
6471 simple_unlock(&vm_paging_lock);
6472 }
6473 }
6474
6475 #if CRYPTO
6476 /*
6477 * Encryption data.
6478 * "iv" is the "initial vector". Ideally, we want to
6479 * have a different one for each page we encrypt, so that
6480 * crackers can't find encryption patterns too easily.
6481 */
6482 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
6483 boolean_t swap_crypt_ctx_initialized = FALSE;
6484 aes_32t swap_crypt_key[8]; /* big enough for a 256-bit key */
6485 aes_ctx swap_crypt_ctx;
6486 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
6487
6488 #if DEBUG
6489 boolean_t swap_crypt_ctx_tested = FALSE;
6490 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
6491 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
6492 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
6493 #endif /* DEBUG */
6494
6495 /*
6496 * Initialize the encryption context: key and key size.
6497 */
6498 void swap_crypt_ctx_initialize(void); /* forward */
6499 void
6500 swap_crypt_ctx_initialize(void)
6501 {
6502 unsigned int i;
6503
6504 /*
6505 * No need for locking to protect swap_crypt_ctx_initialized
6506 * because the first use of encryption will come from the
6507 * pageout thread (we won't pagein before there's been a pageout)
6508 * and there's only one pageout thread.
6509 */
6510 if (swap_crypt_ctx_initialized == FALSE) {
6511 for (i = 0;
6512 i < (sizeof (swap_crypt_key) /
6513 sizeof (swap_crypt_key[0]));
6514 i++) {
6515 swap_crypt_key[i] = random();
6516 }
6517 aes_encrypt_key((const unsigned char *) swap_crypt_key,
6518 SWAP_CRYPT_AES_KEY_SIZE,
6519 &swap_crypt_ctx.encrypt);
6520 aes_decrypt_key((const unsigned char *) swap_crypt_key,
6521 SWAP_CRYPT_AES_KEY_SIZE,
6522 &swap_crypt_ctx.decrypt);
6523 swap_crypt_ctx_initialized = TRUE;
6524 }
6525
6526 #if DEBUG
6527 /*
6528 * Validate the encryption algorithms.
6529 */
6530 if (swap_crypt_ctx_tested == FALSE) {
6531 /* initialize */
6532 for (i = 0; i < 4096; i++) {
6533 swap_crypt_test_page_ref[i] = (char) i;
6534 }
6535 /* encrypt */
6536 aes_encrypt_cbc(swap_crypt_test_page_ref,
6537 swap_crypt_null_iv,
6538 PAGE_SIZE / AES_BLOCK_SIZE,
6539 swap_crypt_test_page_encrypt,
6540 &swap_crypt_ctx.encrypt);
6541 /* decrypt */
6542 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
6543 swap_crypt_null_iv,
6544 PAGE_SIZE / AES_BLOCK_SIZE,
6545 swap_crypt_test_page_decrypt,
6546 &swap_crypt_ctx.decrypt);
6547 /* compare result with original */
6548 for (i = 0; i < 4096; i ++) {
6549 if (swap_crypt_test_page_decrypt[i] !=
6550 swap_crypt_test_page_ref[i]) {
6551 panic("encryption test failed");
6552 }
6553 }
6554
6555 /* encrypt again */
6556 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
6557 swap_crypt_null_iv,
6558 PAGE_SIZE / AES_BLOCK_SIZE,
6559 swap_crypt_test_page_decrypt,
6560 &swap_crypt_ctx.encrypt);
6561 /* decrypt in place */
6562 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
6563 swap_crypt_null_iv,
6564 PAGE_SIZE / AES_BLOCK_SIZE,
6565 swap_crypt_test_page_decrypt,
6566 &swap_crypt_ctx.decrypt);
6567 for (i = 0; i < 4096; i ++) {
6568 if (swap_crypt_test_page_decrypt[i] !=
6569 swap_crypt_test_page_ref[i]) {
6570 panic("in place encryption test failed");
6571 }
6572 }
6573
6574 swap_crypt_ctx_tested = TRUE;
6575 }
6576 #endif /* DEBUG */
6577 }
6578
6579 /*
6580 * ENCRYPTED SWAP:
6581 * vm_page_encrypt:
6582 * Encrypt the given page, for secure paging.
6583 * The page might already be mapped at kernel virtual
6584 * address "kernel_mapping_offset". Otherwise, we need
6585 * to map it.
6586 *
6587 * Context:
6588 * The page's object is locked, but this lock will be released
6589 * and re-acquired.
6590 * The page is busy and not accessible by users (not entered in any pmap).
6591 */
6592 void
6593 vm_page_encrypt(
6594 vm_page_t page,
6595 vm_map_offset_t kernel_mapping_offset)
6596 {
6597 kern_return_t kr;
6598 vm_map_size_t kernel_mapping_size;
6599 vm_offset_t kernel_vaddr;
6600 union {
6601 unsigned char aes_iv[AES_BLOCK_SIZE];
6602 struct {
6603 memory_object_t pager_object;
6604 vm_object_offset_t paging_offset;
6605 } vm;
6606 } encrypt_iv;
6607
6608 if (! vm_pages_encrypted) {
6609 vm_pages_encrypted = TRUE;
6610 }
6611
6612 assert(page->busy);
6613 assert(page->dirty || page->precious);
6614
6615 if (page->encrypted) {
6616 /*
6617 * Already encrypted: no need to do it again.
6618 */
6619 vm_page_encrypt_already_encrypted_counter++;
6620 return;
6621 }
6622 ASSERT_PAGE_DECRYPTED(page);
6623
6624 /*
6625 * Take a paging-in-progress reference to keep the object
6626 * alive even if we have to unlock it (in vm_paging_map_object()
6627 * for example)...
6628 */
6629 vm_object_paging_begin(page->object);
6630
6631 if (kernel_mapping_offset == 0) {
6632 /*
6633 * The page hasn't already been mapped in kernel space
6634 * by the caller. Map it now, so that we can access
6635 * its contents and encrypt them.
6636 */
6637 kernel_mapping_size = PAGE_SIZE;
6638 kr = vm_paging_map_object(&kernel_mapping_offset,
6639 page,
6640 page->object,
6641 page->offset,
6642 &kernel_mapping_size,
6643 VM_PROT_READ | VM_PROT_WRITE,
6644 FALSE);
6645 if (kr != KERN_SUCCESS) {
6646 panic("vm_page_encrypt: "
6647 "could not map page in kernel: 0x%x\n",
6648 kr);
6649 }
6650 } else {
6651 kernel_mapping_size = 0;
6652 }
6653 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6654
6655 if (swap_crypt_ctx_initialized == FALSE) {
6656 swap_crypt_ctx_initialize();
6657 }
6658 assert(swap_crypt_ctx_initialized);
6659
6660 /*
6661 * Prepare an "initial vector" for the encryption.
6662 * We use the "pager" and the "paging_offset" for that
6663 * page to obfuscate the encrypted data a bit more and
6664 * prevent crackers from finding patterns that they could
6665 * use to break the key.
6666 */
6667 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
6668 encrypt_iv.vm.pager_object = page->object->pager;
6669 encrypt_iv.vm.paging_offset =
6670 page->object->paging_offset + page->offset;
6671
6672 /* encrypt the "initial vector" */
6673 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
6674 swap_crypt_null_iv,
6675 1,
6676 &encrypt_iv.aes_iv[0],
6677 &swap_crypt_ctx.encrypt);
6678
6679 /*
6680 * Encrypt the page.
6681 */
6682 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
6683 &encrypt_iv.aes_iv[0],
6684 PAGE_SIZE / AES_BLOCK_SIZE,
6685 (unsigned char *) kernel_vaddr,
6686 &swap_crypt_ctx.encrypt);
6687
6688 vm_page_encrypt_counter++;
6689
6690 /*
6691 * Unmap the page from the kernel's address space,
6692 * if we had to map it ourselves. Otherwise, let
6693 * the caller undo the mapping if needed.
6694 */
6695 if (kernel_mapping_size != 0) {
6696 vm_paging_unmap_object(page->object,
6697 kernel_mapping_offset,
6698 kernel_mapping_offset + kernel_mapping_size);
6699 }
6700
6701 /*
6702 * Clear the "reference" and "modified" bits.
6703 * This should clean up any impact the encryption had
6704 * on them.
6705 * The page was kept busy and disconnected from all pmaps,
6706 * so it can't have been referenced or modified from user
6707 * space.
6708 * The software bits will be reset later after the I/O
6709 * has completed (in upl_commit_range()).
6710 */
6711 pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
6712
6713 page->encrypted = TRUE;
6714
6715 vm_object_paging_end(page->object);
6716 }
6717
6718 /*
6719 * ENCRYPTED SWAP:
6720 * vm_page_decrypt:
6721 * Decrypt the given page.
6722 * The page might already be mapped at kernel virtual
6723 * address "kernel_mapping_offset". Otherwise, we need
6724 * to map it.
6725 *
6726 * Context:
6727 * The page's VM object is locked but will be unlocked and relocked.
6728 * The page is busy and not accessible by users (not entered in any pmap).
6729 */
6730 void
6731 vm_page_decrypt(
6732 vm_page_t page,
6733 vm_map_offset_t kernel_mapping_offset)
6734 {
6735 kern_return_t kr;
6736 vm_map_size_t kernel_mapping_size;
6737 vm_offset_t kernel_vaddr;
6738 union {
6739 unsigned char aes_iv[AES_BLOCK_SIZE];
6740 struct {
6741 memory_object_t pager_object;
6742 vm_object_offset_t paging_offset;
6743 } vm;
6744 } decrypt_iv;
6745
6746 assert(page->busy);
6747 assert(page->encrypted);
6748
6749 /*
6750 * Take a paging-in-progress reference to keep the object
6751 * alive even if we have to unlock it (in vm_paging_map_object()
6752 * for example)...
6753 */
6754 vm_object_paging_begin(page->object);
6755
6756 if (kernel_mapping_offset == 0) {
6757 /*
6758 * The page hasn't already been mapped in kernel space
6759 * by the caller. Map it now, so that we can access
6760 * its contents and decrypt them.
6761 */
6762 kernel_mapping_size = PAGE_SIZE;
6763 kr = vm_paging_map_object(&kernel_mapping_offset,
6764 page,
6765 page->object,
6766 page->offset,
6767 &kernel_mapping_size,
6768 VM_PROT_READ | VM_PROT_WRITE,
6769 FALSE);
6770 if (kr != KERN_SUCCESS) {
6771 panic("vm_page_decrypt: "
6772 "could not map page in kernel: 0x%x\n",
6773 kr);
6774 }
6775 } else {
6776 kernel_mapping_size = 0;
6777 }
6778 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6779
6780 assert(swap_crypt_ctx_initialized);
6781
6782 /*
6783 * Prepare an "initial vector" for the decryption.
6784 * It has to be the same as the "initial vector" we
6785 * used to encrypt that page.
6786 */
6787 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
6788 decrypt_iv.vm.pager_object = page->object->pager;
6789 decrypt_iv.vm.paging_offset =
6790 page->object->paging_offset + page->offset;
6791
6792 /* encrypt the "initial vector" */
6793 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
6794 swap_crypt_null_iv,
6795 1,
6796 &decrypt_iv.aes_iv[0],
6797 &swap_crypt_ctx.encrypt);
6798
6799 /*
6800 * Decrypt the page.
6801 */
6802 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
6803 &decrypt_iv.aes_iv[0],
6804 PAGE_SIZE / AES_BLOCK_SIZE,
6805 (unsigned char *) kernel_vaddr,
6806 &swap_crypt_ctx.decrypt);
6807 vm_page_decrypt_counter++;
6808
6809 /*
6810 * Unmap the page from the kernel's address space,
6811 * if we had to map it ourselves. Otherwise, let
6812 * the caller undo the mapping if needed.
6813 */
6814 if (kernel_mapping_size != 0) {
6815 vm_paging_unmap_object(page->object,
6816 kernel_vaddr,
6817 kernel_vaddr + PAGE_SIZE);
6818 }
6819
6820 /*
6821 * After decryption, the page is actually clean.
6822 * It was encrypted as part of paging, which "cleans"
6823 * the "dirty" pages.
6824 * No one could access it after it was encrypted
6825 * and the decryption doesn't count.
6826 */
6827 page->dirty = FALSE;
6828 assert (page->cs_validated == FALSE);
6829 pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
6830 page->encrypted = FALSE;
6831
6832 /*
6833 * We've just modified the page's contents via the data cache and part
6834 * of the new contents might still be in the cache and not yet in RAM.
6835 * Since the page is now available and might get gathered in a UPL to
6836 * be part of a DMA transfer from a driver that expects the memory to
6837 * be coherent at this point, we have to flush the data cache.
6838 */
6839 pmap_sync_page_attributes_phys(page->phys_page);
6840 /*
6841 * Since the page is not mapped yet, some code might assume that it
6842 * doesn't need to invalidate the instruction cache when writing to
6843 * that page. That code relies on "pmapped" being FALSE, so that the
6844 * caches get synchronized when the page is first mapped.
6845 */
6846 assert(pmap_verify_free(page->phys_page));
6847 page->pmapped = FALSE;
6848 page->wpmapped = FALSE;
6849
6850 vm_object_paging_end(page->object);
6851 }
6852
6853 #if DEVELOPMENT || DEBUG
6854 unsigned long upl_encrypt_upls = 0;
6855 unsigned long upl_encrypt_pages = 0;
6856 #endif
6857
6858 /*
6859 * ENCRYPTED SWAP:
6860 *
6861 * upl_encrypt:
6862 * Encrypts all the pages in the UPL, within the specified range.
6863 *
6864 */
6865 void
6866 upl_encrypt(
6867 upl_t upl,
6868 upl_offset_t crypt_offset,
6869 upl_size_t crypt_size)
6870 {
6871 upl_size_t upl_size, subupl_size=crypt_size;
6872 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
6873 vm_object_t upl_object;
6874 vm_object_offset_t upl_offset;
6875 vm_page_t page;
6876 vm_object_t shadow_object;
6877 vm_object_offset_t shadow_offset;
6878 vm_object_offset_t paging_offset;
6879 vm_object_offset_t base_offset;
6880 int isVectorUPL = 0;
6881 upl_t vector_upl = NULL;
6882
6883 if((isVectorUPL = vector_upl_is_valid(upl)))
6884 vector_upl = upl;
6885
6886 process_upl_to_encrypt:
6887 if(isVectorUPL) {
6888 crypt_size = subupl_size;
6889 crypt_offset = subupl_offset;
6890 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
6891 if(upl == NULL)
6892 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
6893 subupl_size -= crypt_size;
6894 subupl_offset += crypt_size;
6895 }
6896
6897 #if DEVELOPMENT || DEBUG
6898 upl_encrypt_upls++;
6899 upl_encrypt_pages += crypt_size / PAGE_SIZE;
6900 #endif
6901 upl_object = upl->map_object;
6902 upl_offset = upl->offset;
6903 upl_size = upl->size;
6904
6905 vm_object_lock(upl_object);
6906
6907 /*
6908 * Find the VM object that contains the actual pages.
6909 */
6910 if (upl_object->pageout) {
6911 shadow_object = upl_object->shadow;
6912 /*
6913 * The offset in the shadow object is actually also
6914 * accounted for in upl->offset. It possibly shouldn't be
6915 * this way, but for now don't account for it twice.
6916 */
6917 shadow_offset = 0;
6918 assert(upl_object->paging_offset == 0); /* XXX ? */
6919 vm_object_lock(shadow_object);
6920 } else {
6921 shadow_object = upl_object;
6922 shadow_offset = 0;
6923 }
6924
6925 paging_offset = shadow_object->paging_offset;
6926 vm_object_paging_begin(shadow_object);
6927
6928 if (shadow_object != upl_object)
6929 vm_object_unlock(upl_object);
6930
6931
6932 base_offset = shadow_offset;
6933 base_offset += upl_offset;
6934 base_offset += crypt_offset;
6935 base_offset -= paging_offset;
6936
6937 assert(crypt_offset + crypt_size <= upl_size);
6938
6939 for (offset_in_upl = 0;
6940 offset_in_upl < crypt_size;
6941 offset_in_upl += PAGE_SIZE) {
6942 page = vm_page_lookup(shadow_object,
6943 base_offset + offset_in_upl);
6944 if (page == VM_PAGE_NULL) {
6945 panic("upl_encrypt: "
6946 "no page for (obj=%p,off=%lld+%d)!\n",
6947 shadow_object,
6948 base_offset,
6949 offset_in_upl);
6950 }
6951 /*
6952 * Disconnect the page from all pmaps, so that nobody can
6953 * access it while it's encrypted. After that point, all
6954 * accesses to this page will cause a page fault and block
6955 * while the page is busy being encrypted. After the
6956 * encryption completes, any access will cause a
6957 * page fault and the page gets decrypted at that time.
6958 */
6959 pmap_disconnect(page->phys_page);
6960 vm_page_encrypt(page, 0);
6961
6962 if (vm_object_lock_avoid(shadow_object)) {
6963 /*
6964 * Give vm_pageout_scan() a chance to convert more
6965 * pages from "clean-in-place" to "clean-and-free",
6966 * if it's interested in the same pages we selected
6967 * in this cluster.
6968 */
6969 vm_object_unlock(shadow_object);
6970 mutex_pause(2);
6971 vm_object_lock(shadow_object);
6972 }
6973 }
6974
6975 vm_object_paging_end(shadow_object);
6976 vm_object_unlock(shadow_object);
6977
6978 if(isVectorUPL && subupl_size)
6979 goto process_upl_to_encrypt;
6980 }
6981
6982 #else /* CRYPTO */
6983 void
6984 upl_encrypt(
6985 __unused upl_t upl,
6986 __unused upl_offset_t crypt_offset,
6987 __unused upl_size_t crypt_size)
6988 {
6989 }
6990
6991 void
6992 vm_page_encrypt(
6993 __unused vm_page_t page,
6994 __unused vm_map_offset_t kernel_mapping_offset)
6995 {
6996 }
6997
6998 void
6999 vm_page_decrypt(
7000 __unused vm_page_t page,
7001 __unused vm_map_offset_t kernel_mapping_offset)
7002 {
7003 }
7004
7005 #endif /* CRYPTO */
7006
7007 void
7008 vm_pageout_queue_steal(vm_page_t page, boolean_t queues_locked)
7009 {
7010 boolean_t pageout;
7011
7012 pageout = page->pageout;
7013
7014 page->list_req_pending = FALSE;
7015 page->cleaning = FALSE;
7016 page->pageout = FALSE;
7017
7018 if (!queues_locked) {
7019 vm_page_lockspin_queues();
7020 }
7021
7022 /*
7023 * need to drop the laundry count...
7024 * we may also need to remove it
7025 * from the I/O paging queue...
7026 * vm_pageout_throttle_up handles both cases
7027 *
7028 * the laundry and pageout_queue flags are cleared...
7029 */
7030 vm_pageout_throttle_up(page);
7031
7032 if (pageout == TRUE) {
7033 /*
7034 * toss the wire count we picked up
7035 * when we initially set this page up
7036 * to be cleaned...
7037 */
7038 vm_page_unwire(page, TRUE);
7039 }
7040 vm_page_steal_pageout_page++;
7041
7042 if (!queues_locked) {
7043 vm_page_unlock_queues();
7044 }
7045 }
7046
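/*
 * Allocate an empty vector UPL: a container UPL flagged UPL_VECTOR plus a
 * struct _vector_upl that will track the sub-UPLs and their I/O states as
 * they are added via vector_upl_set_subupl().
 */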
7047 upl_t
7048 vector_upl_create(vm_offset_t upl_offset)
7049 {
7050 int vector_upl_size = sizeof(struct _vector_upl);
7051 int i=0;
7052 upl_t upl;
7053 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
7054
7055 upl = upl_create(0,UPL_VECTOR,0);
7056 upl->vector_upl = vector_upl;
7057 upl->offset = upl_offset;
7058 vector_upl->size = 0;
7059 vector_upl->offset = upl_offset;
7060 vector_upl->invalid_upls=0;
7061 vector_upl->num_upls=0;
7062 vector_upl->pagelist = NULL;
7063
7064 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
7065 vector_upl->upl_iostates[i].size = 0;
7066 vector_upl->upl_iostates[i].offset = 0;
7067
7068 }
7069 return upl;
7070 }
7071
7072 void
7073 vector_upl_deallocate(upl_t upl)
7074 {
7075 if(upl) {
7076 vector_upl_t vector_upl = upl->vector_upl;
7077 if(vector_upl) {
7078 if(vector_upl->invalid_upls != vector_upl->num_upls)
7079 panic("Deallocating non-empty Vectored UPL\n");
7080 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
7081 vector_upl->invalid_upls=0;
7082 vector_upl->num_upls = 0;
7083 vector_upl->pagelist = NULL;
7084 vector_upl->size = 0;
7085 vector_upl->offset = 0;
7086 kfree(vector_upl, sizeof(struct _vector_upl));
7087 vector_upl = (vector_upl_t)0xdeadbeef;
7088 }
7089 else
7090 panic("vector_upl_deallocate was passed a non-vectored upl\n");
7091 }
7092 else
7093 panic("vector_upl_deallocate was passed a NULL upl\n");
7094 }
7095
7096 boolean_t
7097 vector_upl_is_valid(upl_t upl)
7098 {
7099 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
7100 vector_upl_t vector_upl = upl->vector_upl;
7101 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xdeadbeef || vector_upl == (vector_upl_t)0xfeedbeef)
7102 return FALSE;
7103 else
7104 return TRUE;
7105 }
7106 return FALSE;
7107 }
7108
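/*
 * Add or remove a sub-UPL. With a non-zero io_size (rounded up to at least
 * one page), "subupl" is appended and the vector UPL grows by io_size.
 * With io_size == 0, the matching element is invalidated instead; the
 * return value is TRUE once every sub-UPL has been invalidated, i.e. the
 * vector UPL itself can be deallocated.
 */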
7109 boolean_t
7110 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
7111 {
7112 if(vector_upl_is_valid(upl)) {
7113 vector_upl_t vector_upl = upl->vector_upl;
7114
7115 if(vector_upl) {
7116 if(subupl) {
7117 if(io_size) {
7118 if(io_size < PAGE_SIZE)
7119 io_size = PAGE_SIZE;
7120 subupl->vector_upl = (void*)vector_upl;
7121 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
7122 vector_upl->size += io_size;
7123 upl->size += io_size;
7124 }
7125 else {
7126 uint32_t i=0,invalid_upls=0;
7127 for(i = 0; i < vector_upl->num_upls; i++) {
7128 if(vector_upl->upl_elems[i] == subupl)
7129 break;
7130 }
7131 if(i == vector_upl->num_upls)
7132 panic("Trying to remove sub-upl when none exists");
7133
7134 vector_upl->upl_elems[i] = NULL;
7135 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
7136 if(invalid_upls == vector_upl->num_upls)
7137 return TRUE;
7138 else
7139 return FALSE;
7140 }
7141 }
7142 else
7143 panic("vector_upl_set_subupl was passed a NULL upl element\n");
7144 }
7145 else
7146 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
7147 }
7148 else
7149 panic("vector_upl_set_subupl was passed a NULL upl\n");
7150
7151 return FALSE;
7152 }
7153
7154 void
7155 vector_upl_set_pagelist(upl_t upl)
7156 {
7157 if(vector_upl_is_valid(upl)) {
7158 uint32_t i=0;
7159 vector_upl_t vector_upl = upl->vector_upl;
7160
7161 if(vector_upl) {
7162 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
7163
7164 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
7165
7166 for(i=0; i < vector_upl->num_upls; i++) {
7167 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
7168 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
7169 pagelist_size += cur_upl_pagelist_size;
7170 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
7171 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
7172 }
7173 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
7174 }
7175 else
7176 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
7177 }
7178 else
7179 panic("vector_upl_set_pagelist was passed a NULL upl\n");
7180
7181 }
7182
7183 upl_t
7184 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
7185 {
7186 if(vector_upl_is_valid(upl)) {
7187 vector_upl_t vector_upl = upl->vector_upl;
7188 if(vector_upl) {
7189 if(index < vector_upl->num_upls)
7190 return vector_upl->upl_elems[index];
7191 }
7192 else
7193 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
7194 }
7195 return NULL;
7196 }
7197
7198 upl_t
7199 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
7200 {
7201 if(vector_upl_is_valid(upl)) {
7202 uint32_t i=0;
7203 vector_upl_t vector_upl = upl->vector_upl;
7204
7205 if(vector_upl) {
7206 upl_t subupl = NULL;
7207 vector_upl_iostates_t subupl_state;
7208
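/*
 * Walk the sub-UPLs in order. Each iostate records the sub-UPL's extent
 * within the vector UPL, so pick the first one whose extent covers
 * *upl_offset, clamp *upl_size to that extent, and convert *upl_offset so
 * it is relative to the returned sub-UPL.
 */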
7209 for(i=0; i < vector_upl->num_upls; i++) {
7210 subupl = vector_upl->upl_elems[i];
7211 subupl_state = vector_upl->upl_iostates[i];
7212 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
7213 /* We could have been passed an offset/size pair that belongs
7214 * to an UPL element that has already been committed/aborted.
7215 * If so, return NULL.
7216 */
7217 if(subupl == NULL)
7218 return NULL;
7219 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
7220 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
7221 if(*upl_size > subupl_state.size)
7222 *upl_size = subupl_state.size;
7223 }
7224 if(*upl_offset >= subupl_state.offset)
7225 *upl_offset -= subupl_state.offset;
7226 else if(i)
7227 panic("Vector UPL offset miscalculation\n");
7228 return subupl;
7229 }
7230 }
7231 }
7232 else
7233 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
7234 }
7235 return NULL;
7236 }
7237
7238 void
7239 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
7240 {
7241 *v_upl_submap = NULL;
7242
7243 if(vector_upl_is_valid(upl)) {
7244 vector_upl_t vector_upl = upl->vector_upl;
7245 if(vector_upl) {
7246 *v_upl_submap = vector_upl->submap;
7247 *submap_dst_addr = vector_upl->submap_dst_addr;
7248 }
7249 else
7250 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7251 }
7252 else
7253 panic("vector_upl_get_submap was passed a null UPL\n");
7254 }
7255
7256 void
7257 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
7258 {
7259 if(vector_upl_is_valid(upl)) {
7260 vector_upl_t vector_upl = upl->vector_upl;
7261 if(vector_upl) {
7262 vector_upl->submap = submap;
7263 vector_upl->submap_dst_addr = submap_dst_addr;
7264 }
7265 else
7266 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7267 }
7268 else
7269 panic("vector_upl_get_submap was passed a NULL UPL\n");
7270 }
7271
7272 void
7273 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
7274 {
7275 if(vector_upl_is_valid(upl)) {
7276 uint32_t i = 0;
7277 vector_upl_t vector_upl = upl->vector_upl;
7278
7279 if(vector_upl) {
7280 for(i = 0; i < vector_upl->num_upls; i++) {
7281 if(vector_upl->upl_elems[i] == subupl)
7282 break;
7283 }
7284
7285 if(i == vector_upl->num_upls)
7286 panic("setting sub-upl iostate when none exists");
7287
7288 vector_upl->upl_iostates[i].offset = offset;
7289 if(size < PAGE_SIZE)
7290 size = PAGE_SIZE;
7291 vector_upl->upl_iostates[i].size = size;
7292 }
7293 else
7294 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
7295 }
7296 else
7297 panic("vector_upl_set_iostate was passed a NULL UPL\n");
7298 }
7299
7300 void
7301 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
7302 {
7303 if(vector_upl_is_valid(upl)) {
7304 uint32_t i = 0;
7305 vector_upl_t vector_upl = upl->vector_upl;
7306
7307 if(vector_upl) {
7308 for(i = 0; i < vector_upl->num_upls; i++) {
7309 if(vector_upl->upl_elems[i] == subupl)
7310 break;
7311 }
7312
7313 if(i == vector_upl->num_upls)
7314 panic("getting sub-upl iostate when none exists");
7315
7316 *offset = vector_upl->upl_iostates[i].offset;
7317 *size = vector_upl->upl_iostates[i].size;
7318 }
7319 else
7320 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
7321 }
7322 else
7323 panic("vector_upl_get_iostate was passed a NULL UPL\n");
7324 }
7325
7326 void
7327 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
7328 {
7329 if(vector_upl_is_valid(upl)) {
7330 vector_upl_t vector_upl = upl->vector_upl;
7331 if(vector_upl) {
7332 if(index < vector_upl->num_upls) {
7333 *offset = vector_upl->upl_iostates[index].offset;
7334 *size = vector_upl->upl_iostates[index].size;
7335 }
7336 else
7337 *offset = *size = 0;
7338 }
7339 else
7340 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
7341 }
7342 else
7343 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
7344 }
7345
7346 upl_page_info_t *
7347 upl_get_internal_vectorupl_pagelist(upl_t upl)
7348 {
7349 return ((vector_upl_t)(upl->vector_upl))->pagelist;
7350 }
7351
7352 void *
7353 upl_get_internal_vectorupl(upl_t upl)
7354 {
7355 return upl->vector_upl;
7356 }
7357
7358 vm_size_t
7359 upl_get_internal_pagelist_offset(void)
7360 {
7361 return sizeof(struct upl);
7362 }
7363
7364 void
7365 upl_clear_dirty(
7366 upl_t upl,
7367 boolean_t value)
7368 {
7369 if (value) {
7370 upl->flags |= UPL_CLEAR_DIRTY;
7371 } else {
7372 upl->flags &= ~UPL_CLEAR_DIRTY;
7373 }
7374 }
7375
7376
7377 #ifdef MACH_BSD
7378
7379 boolean_t upl_device_page(upl_page_info_t *upl)
7380 {
7381 return(UPL_DEVICE_PAGE(upl));
7382 }
7383 boolean_t upl_page_present(upl_page_info_t *upl, int index)
7384 {
7385 return(UPL_PAGE_PRESENT(upl, index));
7386 }
7387 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
7388 {
7389 return(UPL_SPECULATIVE_PAGE(upl, index));
7390 }
7391 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
7392 {
7393 return(UPL_DIRTY_PAGE(upl, index));
7394 }
7395 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
7396 {
7397 return(UPL_VALID_PAGE(upl, index));
7398 }
7399 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
7400 {
7401 return(UPL_PHYS_PAGE(upl, index));
7402 }
7403
7404
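/*
 * Debug helper: walk the inactive, throttled, zero-fill and active page
 * queues and print, for each group, how many pages are dirty, marked for
 * pageout and precious (the "IN Q" and "AC Q" lines below).
 */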
7405 void
7406 vm_countdirtypages(void)
7407 {
7408 vm_page_t m;
7409 int dpages;
7410 int pgopages;
7411 int precpages;
7412
7413
7414 dpages=0;
7415 pgopages=0;
7416 precpages=0;
7417
7418 vm_page_lock_queues();
7419 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
7420 do {
7421 if (m ==(vm_page_t )0) break;
7422
7423 if(m->dirty) dpages++;
7424 if(m->pageout) pgopages++;
7425 if(m->precious) precpages++;
7426
7427 assert(m->object != kernel_object);
7428 m = (vm_page_t) queue_next(&m->pageq);
7429 if (m ==(vm_page_t )0) break;
7430
7431 } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
7432 vm_page_unlock_queues();
7433
7434 vm_page_lock_queues();
7435 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
7436 do {
7437 if (m ==(vm_page_t )0) break;
7438
7439 dpages++;
7440 assert(m->dirty);
7441 assert(!m->pageout);
7442 assert(m->object != kernel_object);
7443 m = (vm_page_t) queue_next(&m->pageq);
7444 if (m ==(vm_page_t )0) break;
7445
7446 } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
7447 vm_page_unlock_queues();
7448
7449 vm_page_lock_queues();
7450 m = (vm_page_t) queue_first(&vm_page_queue_zf);
7451 do {
7452 if (m ==(vm_page_t )0) break;
7453
7454 if(m->dirty) dpages++;
7455 if(m->pageout) pgopages++;
7456 if(m->precious) precpages++;
7457
7458 assert(m->object != kernel_object);
7459 m = (vm_page_t) queue_next(&m->pageq);
7460 if (m ==(vm_page_t )0) break;
7461
7462 } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
7463 vm_page_unlock_queues();
7464
7465 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
7466
7467 dpages=0;
7468 pgopages=0;
7469 precpages=0;
7470
7471 vm_page_lock_queues();
7472 m = (vm_page_t) queue_first(&vm_page_queue_active);
7473
7474 do {
7475 if(m == (vm_page_t )0) break;
7476 if(m->dirty) dpages++;
7477 if(m->pageout) pgopages++;
7478 if(m->precious) precpages++;
7479
7480 assert(m->object != kernel_object);
7481 m = (vm_page_t) queue_next(&m->pageq);
7482 if(m == (vm_page_t )0) break;
7483
7484 } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
7485 vm_page_unlock_queues();
7486
7487 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
7488
7489 }
7490 #endif /* MACH_BSD */
7491
7492 ppnum_t upl_get_highest_page(
7493 upl_t upl)
7494 {
7495 return upl->highest_page;
7496 }
7497
7498 upl_size_t upl_get_size(
7499 upl_t upl)
7500 {
7501 return upl->size;
7502 }
7503
7504 #if UPL_DEBUG
7505 kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
7506 {
7507 upl->ubc_alias1 = alias1;
7508 upl->ubc_alias2 = alias2;
7509 return KERN_SUCCESS;
7510 }
7511 int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
7512 {
7513 if(al)
7514 *al = upl->ubc_alias1;
7515 if(al2)
7516 *al2 = upl->ubc_alias2;
7517 return KERN_SUCCESS;
7518 }
7519 #endif /* UPL_DEBUG */
7520
7521
7522
7523 #if MACH_KDB
7524 #include <ddb/db_output.h>
7525 #include <ddb/db_print.h>
7526 #include <vm/vm_print.h>
7527
7528 #define printf kdbprintf
7529 void db_pageout(void);
7530
7531 void
7532 db_vm(void)
7533 {
7534
7535 iprintf("VM Statistics:\n");
7536 db_indent += 2;
7537 iprintf("pages:\n");
7538 db_indent += 2;
7539 iprintf("activ %5d inact %5d free %5d",
7540 vm_page_active_count, vm_page_inactive_count,
7541 vm_page_free_count);
7542 printf(" wire %5d gobbl %5d\n",
7543 vm_page_wire_count, vm_page_gobble_count);
7544 db_indent -= 2;
7545 iprintf("target:\n");
7546 db_indent += 2;
7547 iprintf("min %5d inact %5d free %5d",
7548 vm_page_free_min, vm_page_inactive_target,
7549 vm_page_free_target);
7550 printf(" resrv %5d\n", vm_page_free_reserved);
7551 db_indent -= 2;
7552 iprintf("pause:\n");
7553 db_pageout();
7554 db_indent -= 2;
7555 }
7556
7557 #if MACH_COUNTERS
7558 extern int c_laundry_pages_freed;
7559 #endif /* MACH_COUNTERS */
7560
7561 void
7562 db_pageout(void)
7563 {
7564 iprintf("Pageout Statistics:\n");
7565 db_indent += 2;
7566 iprintf("active %5d inactv %5d\n",
7567 vm_pageout_active, vm_pageout_inactive);
7568 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
7569 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
7570 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
7571 iprintf("used %5d clean %5d dirty %5d\n",
7572 vm_pageout_inactive_used, vm_pageout_inactive_clean,
7573 vm_pageout_inactive_dirty);
7574 #if MACH_COUNTERS
7575 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
7576 #endif /* MACH_COUNTERS */
7577 #if MACH_CLUSTER_STATS
7578 iprintf("Cluster Statistics:\n");
7579 db_indent += 2;
7580 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
7581 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
7582 vm_pageout_cluster_collisions);
7583 iprintf("clusters %5d conversions %5d\n",
7584 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
7585 db_indent -= 2;
7586 iprintf("Target Statistics:\n");
7587 db_indent += 2;
7588 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
7589 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
7590 vm_pageout_target_page_freed);
7591 db_indent -= 2;
7592 #endif /* MACH_CLUSTER_STATS */
7593 db_indent -= 2;
7594 }
7595
7596 #endif /* MACH_KDB */