1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83 #include <mach/sdt.h>
84
85 #include <kern/kern_types.h>
86 #include <kern/counters.h>
87 #include <kern/host_statistics.h>
88 #include <kern/machine.h>
89 #include <kern/misc_protos.h>
90 #include <kern/sched.h>
91 #include <kern/thread.h>
92 #include <kern/xpr.h>
93 #include <kern/kalloc.h>
94
95 #include <machine/vm_tuning.h>
96 #include <machine/commpage.h>
97
98 #if CONFIG_EMBEDDED
99 #include <sys/kern_memorystatus.h>
100 #endif
101
102 #include <vm/pmap.h>
103 #include <vm/vm_fault.h>
104 #include <vm/vm_map.h>
105 #include <vm/vm_object.h>
106 #include <vm/vm_page.h>
107 #include <vm/vm_pageout.h>
108 #include <vm/vm_protos.h> /* must be last */
109 #include <vm/memory_object.h>
110 #include <vm/vm_purgeable_internal.h>
111
112 /*
113 * ENCRYPTED SWAP:
114 */
115 #include <../bsd/crypto/aes/aes.h>
116 extern u_int32_t random(void); /* from <libkern/libkern.h> */
117
118 #if UPL_DEBUG
119 #include <libkern/OSDebug.h>
120 #endif
121
122 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
123 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
124 #endif
125
126 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
127 #ifdef CONFIG_EMBEDDED
128 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
129 #else
130 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
131 #endif
132 #endif
133
134 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
135 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
136 #endif
137
138 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
139 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
140 #endif
141
142 #ifndef VM_PAGE_LAUNDRY_MAX
143 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
144 #endif /* VM_PAGE_LAUNDRY_MAX */
145
146 #ifndef VM_PAGEOUT_BURST_WAIT
147 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
148 #endif /* VM_PAGEOUT_BURST_WAIT */
149
150 #ifndef VM_PAGEOUT_EMPTY_WAIT
151 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
152 #endif /* VM_PAGEOUT_EMPTY_WAIT */
153
154 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
155 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
156 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
157
158 #ifndef VM_PAGEOUT_IDLE_WAIT
159 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
160 #endif /* VM_PAGEOUT_IDLE_WAIT */
161
162 #ifndef VM_PAGE_SPECULATIVE_TARGET
163 #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / 20)
164 #endif /* VM_PAGE_SPECULATIVE_TARGET */
165
166 #ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
167 #define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
168 #endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
169
170
171 /*
172 * To obtain a reasonable LRU approximation, the inactive queue
173 * needs to be large enough to give pages on it a chance to be
174 * referenced a second time. This macro defines the fraction
175 * of active+inactive pages that should be inactive.
176 * The pageout daemon uses it to update vm_page_inactive_target.
177 *
178 * If vm_page_free_count falls below vm_page_free_target and
179 * vm_page_inactive_count is below vm_page_inactive_target,
180 * then the pageout daemon starts running.
181 */
182
183 #ifndef VM_PAGE_INACTIVE_TARGET
184 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
185 #endif /* VM_PAGE_INACTIVE_TARGET */
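/*
 * Worked example (illustrative count, not taken from this file): with
 * 300000 active + inactive + speculative pages,
 *
 *	VM_PAGE_INACTIVE_TARGET(300000) = 300000 / 3 = 100000
 *
 * i.e. vm_pageout_scan() tries to keep roughly one third of the
 * "available" pages on the inactive queue.
 */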
186
187 /*
188 * Once the pageout daemon starts running, it keeps going
189 * until vm_page_free_count meets or exceeds vm_page_free_target.
190 */
191
192 #ifndef VM_PAGE_FREE_TARGET
193 #ifdef CONFIG_EMBEDDED
194 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
195 #else
196 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
197 #endif
198 #endif /* VM_PAGE_FREE_TARGET */
199
200 /*
201 * The pageout daemon always starts running once vm_page_free_count
202 * falls below vm_page_free_min.
203 */
204
205 #ifndef VM_PAGE_FREE_MIN
206 #ifdef CONFIG_EMBEDDED
207 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
208 #else
209 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
210 #endif
211 #endif /* VM_PAGE_FREE_MIN */
212
213 #define VM_PAGE_FREE_MIN_LIMIT 1500
214 #define VM_PAGE_FREE_TARGET_LIMIT 2000
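/*
 * Worked example for the non-CONFIG_EMBEDDED variants above
 * (illustrative count, not taken from this file): with 80000 pages,
 *
 *	VM_PAGE_FREE_TARGET(80000) = 15 + 80000 / 80  = 1015 pages
 *	VM_PAGE_FREE_MIN(80000)    = 10 + 80000 / 100 =  810 pages
 *
 * VM_PAGE_FREE_MIN_LIMIT and VM_PAGE_FREE_TARGET_LIMIT presumably cap
 * the computed values where they are applied; that use is outside this
 * excerpt.
 */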
215
216
217 /*
218 * When vm_page_free_count falls below vm_page_free_reserved,
219 * only vm-privileged threads can allocate pages. vm-privilege
220 * allows the pageout daemon and default pager (and any other
221 * associated threads needed for default pageout) to continue
222 * operation by dipping into the reserved pool of pages.
223 */
224
225 #ifndef VM_PAGE_FREE_RESERVED
226 #define VM_PAGE_FREE_RESERVED(n) \
227 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
228 #endif /* VM_PAGE_FREE_RESERVED */
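/*
 * Worked example: with VM_PAGE_LAUNDRY_MAX at its default of 16,
 * VM_PAGE_FREE_RESERVED(n) evaluates to 96 + n pages held back for
 * vm-privileged threads. The value of "n" is chosen by the caller
 * (commonly a per-processor count, but that reading is an assumption;
 * the call site is outside this excerpt).
 */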
229
230 /*
231 * When we dequeue pages from the inactive list, they are
232 * reactivated (i.e., put back on the active queue) if referenced.
233 * However, it is possible to starve the free list if other
234 * processors are referencing pages faster than we can turn off
235 * the referenced bit. So we limit the number of reactivations
236 * we will make per call of vm_pageout_scan().
237 */
238 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
239 #ifndef VM_PAGE_REACTIVATE_LIMIT
240 #ifdef CONFIG_EMBEDDED
241 #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
242 #else
243 #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
244 #endif
245 #endif /* VM_PAGE_REACTIVATE_LIMIT */
246 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 100
247
248
249 /*
250 * Exported variable used to broadcast the activation of the pageout scan.
251 * The Working Set code uses this to throttle its use of pmap removes. In
252 * this way, code which runs within memory in an uncontested context does
253 * not keep encountering soft faults.
254 */
255
256 unsigned int vm_pageout_scan_event_counter = 0;
257
258 /*
259 * Forward declarations for internal routines.
260 */
261
262 static void vm_pageout_garbage_collect(int);
263 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
264 static void vm_pageout_iothread_external(void);
265 static void vm_pageout_iothread_internal(void);
266
267 extern void vm_pageout_continue(void);
268 extern void vm_pageout_scan(void);
269
270 static thread_t vm_pageout_external_iothread = THREAD_NULL;
271 static thread_t vm_pageout_internal_iothread = THREAD_NULL;
272
273 unsigned int vm_pageout_reserved_internal = 0;
274 unsigned int vm_pageout_reserved_really = 0;
275
276 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
277 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
278 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
279 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
280 unsigned int vm_pageout_deadlock_relief = 0;
281 unsigned int vm_pageout_inactive_relief = 0;
282 unsigned int vm_pageout_burst_active_throttle = 0;
283 unsigned int vm_pageout_burst_inactive_throttle = 0;
284
285 /*
286 * Protection against zero fill flushing live working sets derived
287 * from existing backing store and files
288 */
289 unsigned int vm_accellerate_zf_pageout_trigger = 400;
290 unsigned int zf_queue_min_count = 100;
291 unsigned int vm_zf_queue_count = 0;
292
293 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
294 unsigned int vm_zf_count = 0;
295 #else
296 uint64_t vm_zf_count __attribute__((aligned(8))) = 0;
297 #endif
298
299 /*
300 * These variables record the pageout daemon's actions:
301 * how many pages it looks at and what happens to those pages.
302 * No locking needed because only one thread modifies the variables.
303 */
304
305 unsigned int vm_pageout_active = 0; /* debugging */
306 unsigned int vm_pageout_inactive = 0; /* debugging */
307 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
308 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
309 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
310 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
311 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
312 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
313 unsigned int vm_pageout_inactive_used = 0; /* debugging */
314 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
315 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
316 unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
317 unsigned int vm_pageout_inactive_zf = 0; /* debugging */
318 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
319 unsigned int vm_pageout_purged_objects = 0; /* debugging */
320 unsigned int vm_stat_discard = 0; /* debugging */
321 unsigned int vm_stat_discard_sent = 0; /* debugging */
322 unsigned int vm_stat_discard_failure = 0; /* debugging */
323 unsigned int vm_stat_discard_throttle = 0; /* debugging */
324 unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
325 unsigned int vm_pageout_catch_ups = 0; /* debugging */
326 unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
327
328 unsigned int vm_pageout_scan_active_throttled = 0;
329 unsigned int vm_pageout_scan_inactive_throttled = 0;
330 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
331 unsigned int vm_pageout_scan_throttle_aborted = 0; /* debugging */
332 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
333 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
334 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
335 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
336 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
337
338 unsigned int vm_page_speculative_count_drifts = 0;
339 unsigned int vm_page_speculative_count_drift_max = 0;
340
341 /*
342 * Backing store throttle when BS is exhausted
343 */
344 unsigned int vm_backing_store_low = 0;
345
346 unsigned int vm_pageout_out_of_line = 0;
347 unsigned int vm_pageout_in_place = 0;
348
349 unsigned int vm_page_steal_pageout_page = 0;
350
351 /*
352 * ENCRYPTED SWAP:
353 * counters and statistics...
354 */
355 unsigned long vm_page_decrypt_counter = 0;
356 unsigned long vm_page_decrypt_for_upl_counter = 0;
357 unsigned long vm_page_encrypt_counter = 0;
358 unsigned long vm_page_encrypt_abort_counter = 0;
359 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
360 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
361
362 struct vm_pageout_queue vm_pageout_queue_internal;
363 struct vm_pageout_queue vm_pageout_queue_external;
364
365 unsigned int vm_page_speculative_target = 0;
366
367 vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
368
369 boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
370
371 #if DEVELOPMENT || DEBUG
372 unsigned long vm_cs_validated_resets = 0;
373 #endif
374
375 /*
376 * Routine: vm_backing_store_disable
377 * Purpose:
378 * Suspend non-privileged threads wishing to extend
379 * backing store when we are low on backing store
380 * (Synchronized by caller)
381 */
382 void
383 vm_backing_store_disable(
384 boolean_t disable)
385 {
386 if(disable) {
387 vm_backing_store_low = 1;
388 } else {
389 if(vm_backing_store_low) {
390 vm_backing_store_low = 0;
391 thread_wakeup((event_t) &vm_backing_store_low);
392 }
393 }
394 }
395
396
397 #if MACH_CLUSTER_STATS
398 unsigned long vm_pageout_cluster_dirtied = 0;
399 unsigned long vm_pageout_cluster_cleaned = 0;
400 unsigned long vm_pageout_cluster_collisions = 0;
401 unsigned long vm_pageout_cluster_clusters = 0;
402 unsigned long vm_pageout_cluster_conversions = 0;
403 unsigned long vm_pageout_target_collisions = 0;
404 unsigned long vm_pageout_target_page_dirtied = 0;
405 unsigned long vm_pageout_target_page_freed = 0;
406 #define CLUSTER_STAT(clause) clause
407 #else /* MACH_CLUSTER_STATS */
408 #define CLUSTER_STAT(clause)
409 #endif /* MACH_CLUSTER_STATS */
410
411 /*
412 * Routine: vm_pageout_object_terminate
413 * Purpose:
414 * Destroy the pageout_object, and perform all of the
415 * required cleanup actions.
416 *
417 * In/Out conditions:
418 * The object must be locked, and will be returned locked.
419 */
420 void
421 vm_pageout_object_terminate(
422 vm_object_t object)
423 {
424 vm_object_t shadow_object;
425
426 /*
427 * Deal with the deallocation (last reference) of a pageout object
428 * (used for cleaning-in-place) by dropping the paging references/
429 * freeing pages in the original object.
430 */
431
432 assert(object->pageout);
433 shadow_object = object->shadow;
434 vm_object_lock(shadow_object);
435
436 while (!queue_empty(&object->memq)) {
437 vm_page_t p, m;
438 vm_object_offset_t offset;
439
440 p = (vm_page_t) queue_first(&object->memq);
441
442 assert(p->private);
443 assert(p->pageout);
444 p->pageout = FALSE;
445 assert(!p->cleaning);
446
447 offset = p->offset;
448 VM_PAGE_FREE(p);
449 p = VM_PAGE_NULL;
450
451 m = vm_page_lookup(shadow_object,
452 offset + object->shadow_offset);
453
454 if(m == VM_PAGE_NULL)
455 continue;
456 assert(m->cleaning);
457 /* used as a trigger on upl_commit etc to recognize the */
458 /* pageout daemon's subsequent desire to pageout a cleaning */
459 /* page. When the bit is on, the upl commit code will */
460 /* respect the pageout bit in the target page over the */
461 /* caller's page list indication */
462 m->dump_cleaning = FALSE;
463
464 assert((m->dirty) || (m->precious) ||
465 (m->busy && m->cleaning));
466
467 /*
468 * Handle the trusted pager throttle.
469 * Also decrement the burst throttle (if external).
470 */
471 vm_page_lock_queues();
472 if (m->laundry) {
473 vm_pageout_throttle_up(m);
474 }
475
476 /*
477 * Handle the "target" page(s). These pages are to be freed if
478 * successfully cleaned. Target pages are always busy, and are
479 * wired exactly once. The initial target pages are not mapped,
480 * (so cannot be referenced or modified) but converted target
481 * pages may have been modified between the selection as an
482 * adjacent page and conversion to a target.
483 */
484 if (m->pageout) {
485 assert(m->busy);
486 assert(m->wire_count == 1);
487 m->cleaning = FALSE;
488 m->encrypted_cleaning = FALSE;
489 m->pageout = FALSE;
490 #if MACH_CLUSTER_STATS
491 if (m->wanted) vm_pageout_target_collisions++;
492 #endif
493 /*
494 * Revoke all access to the page. Since the object is
495 * locked, and the page is busy, this prevents the page
496 * from being dirtied after the pmap_disconnect() call
497 * returns.
498 *
499 * Since the page is left "dirty" but "not modified", we
500 * can detect whether the page was redirtied during
501 * pageout by checking the modify state.
502 */
503 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
504 m->dirty = TRUE;
505 else
506 m->dirty = FALSE;
507
508 if (m->dirty) {
509 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
510 vm_page_unwire(m, TRUE); /* reactivates */
511 VM_STAT_INCR(reactivations);
512 PAGE_WAKEUP_DONE(m);
513 } else {
514 CLUSTER_STAT(vm_pageout_target_page_freed++;)
515 vm_page_free(m);/* clears busy, etc. */
516 }
517 vm_page_unlock_queues();
518 continue;
519 }
520 /*
521 * Handle the "adjacent" pages. These pages were cleaned in
522 * place, and should be left alone.
523 * If the page was referenced while being cleaned,
524 * make it active again; otherwise deactivate it.
525 */
526 if (!m->active && !m->inactive && !m->throttled && !m->private) {
527 if (m->reference)
528 vm_page_activate(m);
529 else
530 vm_page_deactivate(m);
531 }
532 if((m->busy) && (m->cleaning)) {
533
534 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
535 m->busy = FALSE;
536
537 /* We do not re-set m->dirty ! */
538 /* The page was busy so no extraneous activity */
539 /* could have occurred. COPY_INTO is a read into the */
540 /* new pages. CLEAN_IN_PLACE does actually write */
541 /* out the pages but handling outside of this code */
542 /* will take care of resetting dirty. We clear the */
543 /* modify however for the Programmed I/O case. */
544 pmap_clear_modify(m->phys_page);
545
546 m->absent = FALSE;
547 m->overwriting = FALSE;
548 } else if (m->overwriting) {
549 /* alternate request page list, write to page_list */
550 /* case. Occurs when the original page was wired */
551 /* at the time of the list request */
552 assert(VM_PAGE_WIRED(m));
553 vm_page_unwire(m, TRUE); /* reactivates */
554 m->overwriting = FALSE;
555 } else {
556 /*
557 * Set the dirty state according to whether or not the page was
558 * modified during the pageout. Note that we purposefully do
559 * NOT call pmap_clear_modify since the page is still mapped.
560 * If the page were to be dirtied between the two calls,
561 * this fact would be lost. This code is only necessary to
562 * maintain statistics, since the pmap module is always
563 * consulted if m->dirty is false.
564 */
565 #if MACH_CLUSTER_STATS
566 m->dirty = pmap_is_modified(m->phys_page);
567
568 if (m->dirty) vm_pageout_cluster_dirtied++;
569 else vm_pageout_cluster_cleaned++;
570 if (m->wanted) vm_pageout_cluster_collisions++;
571 #else
572 m->dirty = 0;
573 #endif
574 }
575 m->cleaning = FALSE;
576 m->encrypted_cleaning = FALSE;
577
578 /*
579 * Wakeup any thread waiting for the page to be un-cleaning.
580 */
581 PAGE_WAKEUP(m);
582 vm_page_unlock_queues();
583 }
584 /*
585 * Account for the paging reference taken in vm_paging_object_allocate.
586 */
587 vm_object_activity_end(shadow_object);
588 vm_object_unlock(shadow_object);
589
590 assert(object->ref_count == 0);
591 assert(object->paging_in_progress == 0);
592 assert(object->activity_in_progress == 0);
593 assert(object->resident_page_count == 0);
594 return;
595 }
596
597 /*
598 * Routine: vm_pageclean_setup
599 *
600 * Purpose: setup a page to be cleaned (made non-dirty), but not
601 * necessarily flushed from the VM page cache.
602 * This is accomplished by cleaning in place.
603 *
604 * The page must not be busy, and new_object
605 * must be locked.
606 *
607 */
608 void
609 vm_pageclean_setup(
610 vm_page_t m,
611 vm_page_t new_m,
612 vm_object_t new_object,
613 vm_object_offset_t new_offset)
614 {
615 assert(!m->busy);
616 #if 0
617 assert(!m->cleaning);
618 #endif
619
620 XPR(XPR_VM_PAGEOUT,
621 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
622 m->object, m->offset, m,
623 new_m, new_offset);
624
625 pmap_clear_modify(m->phys_page);
626
627 /*
628 * Mark original page as cleaning in place.
629 */
630 m->cleaning = TRUE;
631 m->dirty = TRUE;
632 m->precious = FALSE;
633
634 /*
635 * Convert the fictitious page to a private shadow of
636 * the real page.
637 */
638 assert(new_m->fictitious);
639 assert(new_m->phys_page == vm_page_fictitious_addr);
640 new_m->fictitious = FALSE;
641 new_m->private = TRUE;
642 new_m->pageout = TRUE;
643 new_m->phys_page = m->phys_page;
644
645 vm_page_lockspin_queues();
646 vm_page_wire(new_m);
647 vm_page_unlock_queues();
648
649 vm_page_insert(new_m, new_object, new_offset);
650 assert(!new_m->wanted);
651 new_m->busy = FALSE;
652 }
653
654 /*
655 * Routine: vm_pageout_initialize_page
656 * Purpose:
657 * Causes the specified page to be initialized in
658 * the appropriate memory object. This routine is used to push
659 * pages into a copy-object when they are modified in the
660 * permanent object.
661 *
662 * The page is moved to a temporary object and paged out.
663 *
664 * In/out conditions:
665 * The page in question must not be on any pageout queues.
666 * The object to which it belongs must be locked.
667 * The page must be busy, but not hold a paging reference.
668 *
669 * Implementation:
670 * Move this page to a completely new object.
671 */
672 void
673 vm_pageout_initialize_page(
674 vm_page_t m)
675 {
676 vm_object_t object;
677 vm_object_offset_t paging_offset;
678 vm_page_t holding_page;
679 memory_object_t pager;
680
681 XPR(XPR_VM_PAGEOUT,
682 "vm_pageout_initialize_page, page 0x%X\n",
683 m, 0, 0, 0, 0);
684 assert(m->busy);
685
686 /*
687 * Verify that we really want to clean this page
688 */
689 assert(!m->absent);
690 assert(!m->error);
691 assert(m->dirty);
692
693 /*
694 * Create a paging reference to let us play with the object.
695 */
696 object = m->object;
697 paging_offset = m->offset + object->paging_offset;
698
699 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
700 VM_PAGE_FREE(m);
701 panic("reservation without pageout?"); /* alan */
702 vm_object_unlock(object);
703
704 return;
705 }
706
707 /*
708 * If there's no pager, then we can't clean the page. This should
709 * never happen since this should be a copy object and therefore not
710 * an external object, so the pager should always be there.
711 */
712
713 pager = object->pager;
714
715 if (pager == MEMORY_OBJECT_NULL) {
716 VM_PAGE_FREE(m);
717 panic("missing pager for copy object");
718 return;
719 }
720
721 /* set the page for future call to vm_fault_list_request */
722 vm_object_paging_begin(object);
723 holding_page = NULL;
724
725 pmap_clear_modify(m->phys_page);
726 m->dirty = TRUE;
727 m->busy = TRUE;
728 m->list_req_pending = TRUE;
729 m->cleaning = TRUE;
730 m->pageout = TRUE;
731
732 vm_page_lockspin_queues();
733 vm_page_wire(m);
734 vm_page_unlock_queues();
735
736 vm_object_unlock(object);
737
738 /*
739 * Write the data to its pager.
740 * Note that the data is passed by naming the new object,
741 * not a virtual address; the pager interface has been
742 * manipulated to use the "internal memory" data type.
743 * [The object reference from its allocation is donated
744 * to the eventual recipient.]
745 */
746 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
747
748 vm_object_lock(object);
749 vm_object_paging_end(object);
750 }
751
752 #if MACH_CLUSTER_STATS
753 #define MAXCLUSTERPAGES 16
754 struct {
755 unsigned long pages_in_cluster;
756 unsigned long pages_at_higher_offsets;
757 unsigned long pages_at_lower_offsets;
758 } cluster_stats[MAXCLUSTERPAGES];
759 #endif /* MACH_CLUSTER_STATS */
760
761
762 /*
763 * vm_pageout_cluster:
764 *
765 * Given a page, queue it to the appropriate I/O thread,
766 * which will page it out and attempt to clean adjacent pages
767 * in the same operation.
768 *
769 * The page must be busy, and the object and queues locked. We will take a
770 * paging reference to prevent deallocation or collapse when we
771 * release the object lock back at the call site. The I/O thread
772 * is responsible for consuming this reference
773 *
774 * The page must not be on any pageout queue.
775 */
776
777 void
778 vm_pageout_cluster(vm_page_t m)
779 {
780 vm_object_t object = m->object;
781 struct vm_pageout_queue *q;
782
783
784 XPR(XPR_VM_PAGEOUT,
785 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
786 object, m->offset, m, 0, 0);
787
788 VM_PAGE_CHECK(m);
789
790 /*
791 * Only a certain kind of page is appreciated here.
792 */
793 assert(m->busy && (m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
794 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
795 assert(!m->throttled);
796
797 /*
798 * protect the object from collapse -
799 * locking in the object's paging_offset.
800 */
801 vm_object_paging_begin(object);
802
803 /*
804 * set the page for future call to vm_fault_list_request
805 * page should already be marked busy
806 */
807 vm_page_wire(m);
808 m->list_req_pending = TRUE;
809 m->cleaning = TRUE;
810 m->pageout = TRUE;
811
812 if (object->internal == TRUE)
813 q = &vm_pageout_queue_internal;
814 else
815 q = &vm_pageout_queue_external;
816
817 /*
818 * pgo_laundry count is tied to the laundry bit
819 */
820 m->laundry = TRUE;
821 q->pgo_laundry++;
822
823 m->pageout_queue = TRUE;
824 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
825
826 if (q->pgo_idle == TRUE) {
827 q->pgo_idle = FALSE;
828 thread_wakeup((event_t) &q->pgo_pending);
829 }
830
831 VM_PAGE_CHECK(m);
832 }
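/*
 * Minimal usage sketch for vm_pageout_cluster(), based only on the
 * contract documented above (not on how the pageout daemon itself
 * calls it); kept under #if 0 since the locking context belongs to
 * the real caller.
 */
#if 0
static void
vm_pageout_cluster_usage_sketch(vm_page_t m)
{
	vm_object_t	object = m->object;

	vm_object_lock(object);			/* object must be locked */
	vm_page_lock_queues();			/* and so must the page queues */

	/* only dirty/precious, busy, unwired pages not already queued qualify */
	assert(m->busy && (m->dirty || m->precious) && !VM_PAGE_WIRED(m));

	vm_pageout_cluster(m);			/* takes a paging reference;
						 * the I/O thread consumes it */

	vm_page_unlock_queues();
	vm_object_unlock(object);		/* safe: the paging reference
						 * prevents collapse */
}
#endif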
833
834
835 unsigned long vm_pageout_throttle_up_count = 0;
836
837 /*
838 * A page is back from laundry or we are stealing it back from
839 * the laundering state. See if there are some pages waiting to
840 * go to laundry and if we can let some of them go now.
841 *
842 * Object and page queues must be locked.
843 */
844 void
845 vm_pageout_throttle_up(
846 vm_page_t m)
847 {
848 struct vm_pageout_queue *q;
849
850 assert(m->object != VM_OBJECT_NULL);
851 assert(m->object != kernel_object);
852
853 vm_pageout_throttle_up_count++;
854
855 if (m->object->internal == TRUE)
856 q = &vm_pageout_queue_internal;
857 else
858 q = &vm_pageout_queue_external;
859
860 if (m->pageout_queue == TRUE) {
861
862 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
863 m->pageout_queue = FALSE;
864
865 m->pageq.next = NULL;
866 m->pageq.prev = NULL;
867
868 vm_object_paging_end(m->object);
869 }
870 if (m->laundry == TRUE) {
871 m->laundry = FALSE;
872 q->pgo_laundry--;
873
874 if (q->pgo_throttled == TRUE) {
875 q->pgo_throttled = FALSE;
876 thread_wakeup((event_t) &q->pgo_laundry);
877 }
878 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
879 q->pgo_draining = FALSE;
880 thread_wakeup((event_t) (&q->pgo_laundry+1));
881 }
882 }
883 }
884
885
886 /*
887 * vm_pageout_scan does the dirty work for the pageout daemon.
888 * It returns with vm_page_queue_free_lock held and
889 * vm_page_free_wanted == 0.
890 */
891
892 #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
893
894 #define FCS_IDLE 0
895 #define FCS_DELAYED 1
896 #define FCS_DEADLOCK_DETECTED 2
897
898 struct flow_control {
899 int state;
900 mach_timespec_t ts;
901 };
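/*
 * A sketch of how vm_pageout_scan() (below) drives this state machine
 * when the default-pager (internal) queue is throttled:
 *
 *	FCS_IDLE -> FCS_DELAYED
 *		arm a timer vm_pageout_deadlock_wait milliseconds out
 *	FCS_DELAYED -> FCS_DEADLOCK_DETECTED
 *		the timer expired while the queue was still throttled;
 *		set vm_pageout_deadlock_target and start moving pages
 *	FCS_DEADLOCK_DETECTED -> FCS_DELAYED
 *		once that target is consumed, re-arm the timer
 *		(via the reset_deadlock_timer label)
 *
 * The state falls back to FCS_IDLE whenever the throttle clears or the
 * scan reaches consider_inactive normally.
 */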
902
903
904 /*
905 * VM memory pressure monitoring.
906 *
907 * vm_pageout_scan() keeps track of the number of pages it considers and
908 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
909 *
910 * compute_memory_pressure() is called every second from compute_averages()
911 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
912 * of reclaimed pages in a new vm_pageout_stat[] bucket.
913 *
914 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
915 * The caller provides the number of seconds ("nsecs") worth of statistics
916 * it wants, up to 30 seconds.
917 * It computes the number of pages reclaimed in the past "nsecs" seconds and
918 * also returns the number of pages the system still needs to reclaim at this
919 * moment in time.
920 */
921 #define VM_PAGEOUT_STAT_SIZE 31
922 struct vm_pageout_stat {
923 unsigned int considered;
924 unsigned int reclaimed;
925 } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
926 unsigned int vm_pageout_stat_now = 0;
927 unsigned int vm_memory_pressure = 0;
928
929 #define VM_PAGEOUT_STAT_BEFORE(i) \
930 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
931 #define VM_PAGEOUT_STAT_AFTER(i) \
932 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
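/*
 * Example of the ring arithmetic above, with VM_PAGEOUT_STAT_SIZE == 31:
 *
 *	VM_PAGEOUT_STAT_AFTER(30)  == 0		(wraps forward)
 *	VM_PAGEOUT_STAT_BEFORE(0)  == 30	(wraps backward)
 *
 * compute_memory_pressure() advances vm_pageout_stat_now once per
 * second, so the buckets behind "now" hold roughly the last 30 seconds
 * of "considered"/"reclaimed" counts, which mach_vm_pressure_monitor()
 * walks backwards through below.
 */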
933
934 /*
935 * Called from compute_averages().
936 */
937 void
938 compute_memory_pressure(
939 __unused void *arg)
940 {
941 unsigned int vm_pageout_next;
942
943 vm_memory_pressure =
944 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
945
946 commpage_set_memory_pressure( vm_memory_pressure );
947
948 /* move "now" forward */
949 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
950 vm_pageout_stats[vm_pageout_next].considered = 0;
951 vm_pageout_stats[vm_pageout_next].reclaimed = 0;
952 vm_pageout_stat_now = vm_pageout_next;
953 }
954
955 unsigned int
956 mach_vm_ctl_page_free_wanted(void)
957 {
958 unsigned int page_free_target, page_free_count, page_free_wanted;
959
960 page_free_target = vm_page_free_target;
961 page_free_count = vm_page_free_count;
962 if (page_free_target > page_free_count) {
963 page_free_wanted = page_free_target - page_free_count;
964 } else {
965 page_free_wanted = 0;
966 }
967
968 return page_free_wanted;
969 }
970
971 kern_return_t
972 mach_vm_pressure_monitor(
973 boolean_t wait_for_pressure,
974 unsigned int nsecs_monitored,
975 unsigned int *pages_reclaimed_p,
976 unsigned int *pages_wanted_p)
977 {
978 wait_result_t wr;
979 unsigned int vm_pageout_then, vm_pageout_now;
980 unsigned int pages_reclaimed;
981
982 /*
983 * We don't take the vm_page_queue_lock here because we don't want
984 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
985 * thread when it's trying to reclaim memory. We don't need fully
986 * accurate monitoring anyway...
987 */
988
989 if (wait_for_pressure) {
990 /* wait until there's memory pressure */
991 while (vm_page_free_count >= vm_page_free_target) {
992 wr = assert_wait((event_t) &vm_page_free_wanted,
993 THREAD_INTERRUPTIBLE);
994 if (wr == THREAD_WAITING) {
995 wr = thread_block(THREAD_CONTINUE_NULL);
996 }
997 if (wr == THREAD_INTERRUPTED) {
998 return KERN_ABORTED;
999 }
1000 if (wr == THREAD_AWAKENED) {
1001 /*
1002 * The memory pressure might have already
1003 * been relieved but let's not block again
1004 * and let's report that there was memory
1005 * pressure at some point.
1006 */
1007 break;
1008 }
1009 }
1010 }
1011
1012 /* provide the number of pages the system wants to reclaim */
1013 if (pages_wanted_p != NULL) {
1014 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1015 }
1016
1017 if (pages_reclaimed_p == NULL) {
1018 return KERN_SUCCESS;
1019 }
1020
1021 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1022 do {
1023 vm_pageout_now = vm_pageout_stat_now;
1024 pages_reclaimed = 0;
1025 for (vm_pageout_then =
1026 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1027 vm_pageout_then != vm_pageout_now &&
1028 nsecs_monitored-- != 0;
1029 vm_pageout_then =
1030 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1031 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1032 }
1033 } while (vm_pageout_now != vm_pageout_stat_now);
1034 *pages_reclaimed_p = pages_reclaimed;
1035
1036 return KERN_SUCCESS;
1037 }
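/*
 * Hedged usage sketch for mach_vm_pressure_monitor() above; the
 * enclosing function and the 10-second window are illustrative only,
 * so the sketch is kept under #if 0.
 */
#if 0
static void
vm_pressure_monitor_usage_sketch(void)
{
	unsigned int	pages_reclaimed;
	unsigned int	pages_wanted;
	kern_return_t	kr;

	/*
	 * Non-blocking query: how many pages were reclaimed over the
	 * last 10 seconds, and how many does the system still want?
	 */
	kr = mach_vm_pressure_monitor(FALSE, 10,
				      &pages_reclaimed, &pages_wanted);
	if (kr == KERN_SUCCESS && pages_wanted == 0) {
		/* the free target is currently being met */
	}
}
#endif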
1038
1039 /* Page States: Used below to maintain the page state
1040 before it is removed from its queue. This saved state
1041 helps us do the right accounting in certain cases.
1042 */
1043
1044 #define PAGE_STATE_SPECULATIVE 1
1045 #define PAGE_STATE_THROTTLED 2
1046 #define PAGE_STATE_ZEROFILL 3
1047 #define PAGE_STATE_INACTIVE 4
1048
1049 #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m) \
1050 MACRO_BEGIN \
1051 /* \
1052 * If a "reusable" page somehow made it back into \
1053 * the active queue, it's been re-used and is not \
1054 * quite re-usable. \
1055 * If the VM object was "all_reusable", consider it \
1056 * as "all re-used" instead of converting it to \
1057 * "partially re-used", which could be expensive. \
1058 */ \
1059 if ((m)->reusable || \
1060 (m)->object->all_reusable) { \
1061 vm_object_reuse_pages((m)->object, \
1062 (m)->offset, \
1063 (m)->offset + PAGE_SIZE_64, \
1064 FALSE); \
1065 } \
1066 MACRO_END
1067
1068 void
1069 vm_pageout_scan(void)
1070 {
1071 unsigned int loop_count = 0;
1072 unsigned int inactive_burst_count = 0;
1073 unsigned int active_burst_count = 0;
1074 unsigned int reactivated_this_call;
1075 unsigned int reactivate_limit;
1076 vm_page_t local_freeq = NULL;
1077 int local_freed = 0;
1078 int delayed_unlock;
1079 int refmod_state = 0;
1080 int vm_pageout_deadlock_target = 0;
1081 struct vm_pageout_queue *iq;
1082 struct vm_pageout_queue *eq;
1083 struct vm_speculative_age_q *sq;
1084 struct flow_control flow_control = { 0, { 0, 0 } };
1085 boolean_t inactive_throttled = FALSE;
1086 boolean_t try_failed;
1087 mach_timespec_t ts;
1088 unsigned int msecs = 0;
1089 vm_object_t object;
1090 vm_object_t last_object_tried;
1091 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
1092 unsigned int zf_ratio;
1093 unsigned int zf_run_count;
1094 #else
1095 uint64_t zf_ratio;
1096 uint64_t zf_run_count;
1097 #endif
1098 uint32_t catch_up_count = 0;
1099 uint32_t inactive_reclaim_run;
1100 boolean_t forced_reclaim;
1101 int page_prev_state = 0;
1102
1103 flow_control.state = FCS_IDLE;
1104 iq = &vm_pageout_queue_internal;
1105 eq = &vm_pageout_queue_external;
1106 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1107
1108
1109 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1110
1111
1112 vm_page_lock_queues();
1113 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
1114
1115 /*
1116 * Calculate the max number of referenced pages on the inactive
1117 * queue that we will reactivate.
1118 */
1119 reactivated_this_call = 0;
1120 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1121 vm_page_inactive_count);
1122 inactive_reclaim_run = 0;
1123
1124
1125 /*???*/ /*
1126 * We want to gradually dribble pages from the active queue
1127 * to the inactive queue. If we let the inactive queue get
1128 * very small, and then suddenly dump many pages into it,
1129 * those pages won't get a sufficient chance to be referenced
1130 * before we start taking them from the inactive queue.
1131 *
1132 * We must limit the rate at which we send pages to the pagers.
1133 * data_write messages consume memory, for message buffers and
1134 * for map-copy objects. If we get too far ahead of the pagers,
1135 * we can potentially run out of memory.
1136 *
1137 * We can use the laundry count to limit directly the number
1138 * of pages outstanding to the default pager. A similar
1139 * strategy for external pagers doesn't work, because
1140 * external pagers don't have to deallocate the pages sent them,
1141 * and because we might have to send pages to external pagers
1142 * even if they aren't processing writes. So we also
1143 * use a burst count to limit writes to external pagers.
1144 *
1145 * When memory is very tight, we can't rely on external pagers to
1146 * clean pages. They probably aren't running, because they
1147 * aren't vm-privileged. If we kept sending dirty pages to them,
1148 * we could exhaust the free list.
1149 */
1150
1151
1152 Restart:
1153 assert(delayed_unlock!=0);
1154
1155 /*
1156 * A page is "zero-filled" if it was not paged in from somewhere,
1157 * and it belongs to an object at least VM_ZF_OBJECT_SIZE_THRESHOLD big.
1158 * Recalculate the zero-filled page ratio. We use this to apportion
1159 * victimized pages between the normal and zero-filled inactive
1160 * queues according to their relative abundance in memory. Thus if a task
1161 * is flooding memory with zf pages, we begin to hunt them down.
1162 * It would be better to throttle greedy tasks at a higher level,
1163 * but at the moment mach vm cannot do this.
1164 */
1165 {
1166 #if defined(__ppc__) /* On ppc, vm statistics are still 32-bit */
1167 uint32_t total = vm_page_active_count + vm_page_inactive_count;
1168 uint32_t normal = total - vm_zf_count;
1169 #else
1170 uint64_t total = vm_page_active_count + vm_page_inactive_count;
1171 uint64_t normal = total - vm_zf_count;
1172 #endif
1173
1174 /* zf_ratio is the number of zf pages we victimize per normal page */
1175
1176 if (vm_zf_count < vm_accellerate_zf_pageout_trigger)
1177 zf_ratio = 0;
1178 else if ((vm_zf_count <= normal) || (normal == 0))
1179 zf_ratio = 1;
1180 else
1181 zf_ratio = vm_zf_count / normal;
1182
1183 zf_run_count = 0;
1184 }
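/*
 * Worked example (illustrative counts, not taken from this file):
 * with vm_zf_count == 60000 and normal == 20000, zf_ratio == 3, so
 * the victim-selection loop below will take up to three zero-filled
 * pages for every normal inactive page; zf_run_count tracks how many
 * zf pages have been taken since the last normal one.
 */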
1185
1186 /*
1187 * Recalculate vm_page_inactive_target.
1188 */
1189 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1190 vm_page_inactive_count +
1191 vm_page_speculative_count);
1192 /*
1193 * don't want to wake the pageout_scan thread up every time we fall below
1194 * the targets... set a low water mark at 0.25% below the target
1195 */
1196 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
1197
1198 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1199 vm_page_inactive_count);
1200 object = NULL;
1201 last_object_tried = NULL;
1202 try_failed = FALSE;
1203
1204 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1205 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1206 else
1207 catch_up_count = 0;
1208
1209 for (;;) {
1210 vm_page_t m;
1211
1212 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1213
1214 if (delayed_unlock == 0) {
1215 vm_page_lock_queues();
1216 delayed_unlock = 1;
1217 }
1218
1219 /*
1220 * Don't sweep through the active queue for more than the
1221 * throttle allows; it should be kept relatively low
1222 */
1223 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1224 vm_page_active_count);
1225
1226 /*
1227 * Move pages from active to inactive.
1228 */
1229 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
1230 goto done_moving_active_pages;
1231
1232 while (!queue_empty(&vm_page_queue_active) && active_burst_count) {
1233
1234 if (active_burst_count)
1235 active_burst_count--;
1236
1237 vm_pageout_active++;
1238
1239 m = (vm_page_t) queue_first(&vm_page_queue_active);
1240
1241 assert(m->active && !m->inactive);
1242 assert(!m->laundry);
1243 assert(m->object != kernel_object);
1244 assert(m->phys_page != vm_page_guard_addr);
1245
1246 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1247
1248 /*
1249 * Try to lock object; since we've already got the
1250 * page queues lock, we can only 'try' for this one.
1251 * if the 'try' fails, we need to do a mutex_pause
1252 * to allow the owner of the object lock a chance to
1253 * run... otherwise, we're likely to trip over this
1254 * object in the same state as we work our way through
1255 * the queue... clumps of pages associated with the same
1256 * object are fairly typical on the inactive and active queues
1257 */
1258 if (m->object != object) {
1259 if (object != NULL) {
1260 vm_object_unlock(object);
1261 object = NULL;
1262 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1263 }
1264 if (!vm_object_lock_try_scan(m->object)) {
1265 /*
1266 * move page to end of active queue and continue
1267 */
1268 queue_remove(&vm_page_queue_active, m,
1269 vm_page_t, pageq);
1270 queue_enter(&vm_page_queue_active, m,
1271 vm_page_t, pageq);
1272
1273 try_failed = TRUE;
1274
1275 m = (vm_page_t) queue_first(&vm_page_queue_active);
1276 /*
1277 * this is the next object we're going to be interested in
1278 * try to make sure it's available after the mutex_yield
1279 * returns control
1280 */
1281 vm_pageout_scan_wants_object = m->object;
1282
1283 goto done_with_activepage;
1284 }
1285 object = m->object;
1286
1287 try_failed = FALSE;
1288 }
1289
1290 /*
1291 * if the page is BUSY, then we pull it
1292 * off the active queue and leave it alone.
1293 * when BUSY is cleared, it will get stuck
1294 * back on the appropriate queue
1295 */
1296 if (m->busy) {
1297 queue_remove(&vm_page_queue_active, m,
1298 vm_page_t, pageq);
1299 m->pageq.next = NULL;
1300 m->pageq.prev = NULL;
1301
1302 if (!m->fictitious)
1303 vm_page_active_count--;
1304 m->active = FALSE;
1305
1306 goto done_with_activepage;
1307 }
1308
1309 /* deal with a rogue "reusable" page */
1310 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
1311
1312 /*
1313 * Deactivate the page while holding the object
1314 * locked, so we know the page is still not busy.
1315 * This should prevent races between pmap_enter
1316 * and pmap_clear_reference. The page might be
1317 * absent or fictitious, but vm_page_deactivate
1318 * can handle that.
1319 */
1320 vm_page_deactivate(m);
1321
1322 done_with_activepage:
1323 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
1324
1325 if (object != NULL) {
1326 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1327 vm_object_unlock(object);
1328 object = NULL;
1329 }
1330 if (local_freeq) {
1331 vm_page_unlock_queues();
1332 vm_page_free_list(local_freeq, TRUE);
1333
1334 local_freeq = NULL;
1335 local_freed = 0;
1336 vm_page_lock_queues();
1337 } else
1338 lck_mtx_yield(&vm_page_queue_lock);
1339
1340 delayed_unlock = 1;
1341
1342 /*
1343 * continue the while loop processing
1344 * the active queue... need to hold
1345 * the page queues lock
1346 */
1347 }
1348 }
1349
1350
1351
1352 /**********************************************************************
1353 * above this point we're playing with the active queue
1354 * below this point we're playing with the throttling mechanisms
1355 * and the inactive queue
1356 **********************************************************************/
1357
1358 done_moving_active_pages:
1359
1360 /*
1361 * We are done if we have met our target *and*
1362 * nobody is still waiting for a page.
1363 */
1364 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1365 if (object != NULL) {
1366 vm_object_unlock(object);
1367 object = NULL;
1368 }
1369 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1370
1371 if (local_freeq) {
1372 vm_page_unlock_queues();
1373 vm_page_free_list(local_freeq, TRUE);
1374
1375 local_freeq = NULL;
1376 local_freed = 0;
1377 vm_page_lock_queues();
1378 }
1379 /*
1380 * inactive target still not met... keep going
1381 * until we get the queues balanced
1382 */
1383
1384 /*
1385 * Recalculate vm_page_inactive_target.
1386 */
1387 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1388 vm_page_inactive_count +
1389 vm_page_speculative_count);
1390
1391 #ifndef CONFIG_EMBEDDED
1392 /*
1393 * XXX: if no active pages can be reclaimed, pageout scan can be stuck trying
1394 * to balance the queues
1395 */
1396 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
1397 !queue_empty(&vm_page_queue_active))
1398 continue;
1399 #endif
1400
1401 lck_mtx_lock(&vm_page_queue_free_lock);
1402
1403 if ((vm_page_free_count >= vm_page_free_target) &&
1404 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
1405
1406 vm_page_unlock_queues();
1407
1408 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1409
1410 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1411
1412 return;
1413 }
1414 lck_mtx_unlock(&vm_page_queue_free_lock);
1415 }
1416
1417 /*
1418 * Before anything, we check if we have any ripe volatile
1419 * objects around. If so, try to purge the first object.
1420 * If the purge fails, fall through to reclaim a page instead.
1421 * If the purge succeeds, go back to the top and re-evaluate
1422 * the new memory situation.
1423 */
1424 assert (available_for_purge>=0);
1425 if (available_for_purge)
1426 {
1427 if (object != NULL) {
1428 vm_object_unlock(object);
1429 object = NULL;
1430 }
1431 if(TRUE == vm_purgeable_object_purge_one()) {
1432 continue;
1433 }
1434 }
1435
1436 if (queue_empty(&sq->age_q) && vm_page_speculative_count) {
1437 /*
1438 * try to pull pages from the aging bins
1439 * see vm_page.h for an explanation of how
1440 * this mechanism works
1441 */
1442 struct vm_speculative_age_q *aq;
1443 mach_timespec_t ts_fully_aged;
1444 boolean_t can_steal = FALSE;
1445 int num_scanned_queues;
1446
1447 aq = &vm_page_queue_speculative[speculative_steal_index];
1448
1449 num_scanned_queues = 0;
1450 while (queue_empty(&aq->age_q) &&
1451 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
1452
1453 speculative_steal_index++;
1454
1455 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
1456 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
1457
1458 aq = &vm_page_queue_speculative[speculative_steal_index];
1459 }
1460
1461 if (num_scanned_queues ==
1462 VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
1463 /*
1464 * XXX We've scanned all the speculative
1465 * queues but still haven't found one
1466 * that is not empty, even though
1467 * vm_page_speculative_count is not 0.
1468 */
1469 /* report the anomaly... */
1470 printf("vm_pageout_scan: "
1471 "all speculative queues empty "
1472 "but count=%d. Re-adjusting.\n",
1473 vm_page_speculative_count);
1474 if (vm_page_speculative_count >
1475 vm_page_speculative_count_drift_max)
1476 vm_page_speculative_count_drift_max = vm_page_speculative_count;
1477 vm_page_speculative_count_drifts++;
1478 #if 6553678
1479 Debugger("vm_pageout_scan: no speculative pages");
1480 #endif
1481 /* readjust... */
1482 vm_page_speculative_count = 0;
1483 /* ... and continue */
1484 continue;
1485 }
1486
1487 if (vm_page_speculative_count > vm_page_speculative_target)
1488 can_steal = TRUE;
1489 else {
1490 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) / 1000;
1491 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * VM_PAGE_SPECULATIVE_Q_AGE_MS) % 1000)
1492 * 1000 * NSEC_PER_USEC;
1493
1494 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
1495
1496 clock_sec_t sec;
1497 clock_nsec_t nsec;
1498 clock_get_system_nanotime(&sec, &nsec);
1499 ts.tv_sec = (unsigned int) sec;
1500 ts.tv_nsec = nsec;
1501
1502 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
1503 can_steal = TRUE;
1504 }
1505 if (can_steal == TRUE)
1506 vm_page_speculate_ageit(aq);
1507 }
1508
1509 /*
1510 * Sometimes we have to pause:
1511 * 1) No inactive pages - nothing to do.
1512 * 2) Flow control - default pageout queue is full
1513 * 3) Loop control - no acceptable pages found on the inactive queue
1514 * within the last vm_pageout_burst_inactive_throttle iterations
1515 */
1516 if (queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf) && queue_empty(&sq->age_q) &&
1517 (VM_PAGE_Q_THROTTLED(iq) || queue_empty(&vm_page_queue_throttled))) {
1518 vm_pageout_scan_empty_throttle++;
1519 msecs = vm_pageout_empty_wait;
1520 goto vm_pageout_scan_delay;
1521
1522 } else if (inactive_burst_count >=
1523 MIN(vm_pageout_burst_inactive_throttle,
1524 (vm_page_inactive_count +
1525 vm_page_speculative_count))) {
1526 vm_pageout_scan_burst_throttle++;
1527 msecs = vm_pageout_burst_wait;
1528 goto vm_pageout_scan_delay;
1529
1530 } else if (VM_PAGE_Q_THROTTLED(iq) && IP_VALID(memory_manager_default)) {
1531 clock_sec_t sec;
1532 clock_nsec_t nsec;
1533
1534 switch (flow_control.state) {
1535
1536 case FCS_IDLE:
1537 reset_deadlock_timer:
1538 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1539 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1540 clock_get_system_nanotime(&sec, &nsec);
1541 flow_control.ts.tv_sec = (unsigned int) sec;
1542 flow_control.ts.tv_nsec = nsec;
1543 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1544
1545 flow_control.state = FCS_DELAYED;
1546 msecs = vm_pageout_deadlock_wait;
1547
1548 break;
1549
1550 case FCS_DELAYED:
1551 clock_get_system_nanotime(&sec, &nsec);
1552 ts.tv_sec = (unsigned int) sec;
1553 ts.tv_nsec = nsec;
1554
1555 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1556 /*
1557 * the pageout thread for the default pager is potentially
1558 * deadlocked since the
1559 * default pager queue has been throttled for more than the
1560 * allowable time... we need to move some clean pages or dirty
1561 * pages belonging to the external pagers if they aren't throttled
1562 * vm_page_free_wanted represents the number of threads currently
1563 * blocked waiting for pages... we'll move one page for each of
1564 * these plus a fixed amount to break the logjam... once we're done
1565 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1566 * with a new timeout target since we have no way of knowing
1567 * whether we've broken the deadlock except through observation
1568 * of the queue associated with the default pager... we need to
1569 * stop moving pages and allow the system to run to see what
1570 * state it settles into.
1571 */
1572 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
1573 vm_pageout_scan_deadlock_detected++;
1574 flow_control.state = FCS_DEADLOCK_DETECTED;
1575
1576 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1577 goto consider_inactive;
1578 }
1579 /*
1580 * just resniff instead of trying
1581 * to compute a new delay time... we're going to be
1582 * awakened immediately upon a laundry completion,
1583 * so we won't wait any longer than necessary
1584 */
1585 msecs = vm_pageout_idle_wait;
1586 break;
1587
1588 case FCS_DEADLOCK_DETECTED:
1589 if (vm_pageout_deadlock_target)
1590 goto consider_inactive;
1591 goto reset_deadlock_timer;
1592
1593 }
1594 vm_pageout_scan_throttle++;
1595 iq->pgo_throttled = TRUE;
1596 vm_pageout_scan_delay:
1597 if (object != NULL) {
1598 vm_object_unlock(object);
1599 object = NULL;
1600 }
1601 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1602
1603 if (local_freeq) {
1604 vm_page_unlock_queues();
1605 vm_page_free_list(local_freeq, TRUE);
1606
1607 local_freeq = NULL;
1608 local_freed = 0;
1609 vm_page_lock_queues();
1610
1611 if (flow_control.state == FCS_DELAYED &&
1612 !VM_PAGE_Q_THROTTLED(iq)) {
1613 flow_control.state = FCS_IDLE;
1614 vm_pageout_scan_throttle_aborted++;
1615 goto consider_inactive;
1616 }
1617 }
1618 #if CONFIG_EMBEDDED
1619 {
1620 int percent_avail;
1621
1622 /*
1623 * Decide if we need to send a memory status notification.
1624 */
1625 percent_avail =
1626 (vm_page_active_count + vm_page_inactive_count +
1627 vm_page_speculative_count + vm_page_free_count +
1628 (IP_VALID(memory_manager_default)?0:vm_page_purgeable_count) ) * 100 /
1629 atop_64(max_mem);
1630 if (percent_avail >= (kern_memorystatus_level + 5) ||
1631 percent_avail <= (kern_memorystatus_level - 5)) {
1632 kern_memorystatus_level = percent_avail;
1633 thread_wakeup((event_t)&kern_memorystatus_wakeup);
1634 }
1635 }
1636 #endif
1637 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1638 counter(c_vm_pageout_scan_block++);
1639
1640 vm_page_unlock_queues();
1641
1642 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
1643
1644 thread_block(THREAD_CONTINUE_NULL);
1645
1646 vm_page_lock_queues();
1647 delayed_unlock = 1;
1648
1649 iq->pgo_throttled = FALSE;
1650
1651 if (loop_count >= vm_page_inactive_count)
1652 loop_count = 0;
1653 inactive_burst_count = 0;
1654
1655 goto Restart;
1656 /*NOTREACHED*/
1657 }
1658
1659
1660 flow_control.state = FCS_IDLE;
1661 consider_inactive:
1662 loop_count++;
1663 inactive_burst_count++;
1664 vm_pageout_inactive++;
1665
1666 /* Choose a victim. */
1667
1668 while (1) {
1669 m = NULL;
1670
1671 if (IP_VALID(memory_manager_default)) {
1672 assert(vm_page_throttled_count == 0);
1673 assert(queue_empty(&vm_page_queue_throttled));
1674 }
1675
1676 /*
1677 * The most eligible pages are ones we paged in speculatively,
1678 * but which have not yet been touched.
1679 */
1680 if ( !queue_empty(&sq->age_q) ) {
1681 m = (vm_page_t) queue_first(&sq->age_q);
1682 break;
1683 }
1684 /*
1685 * Time for a zero-filled inactive page?
1686 */
1687 if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1688 queue_empty(&vm_page_queue_inactive)) {
1689 if ( !queue_empty(&vm_page_queue_zf) ) {
1690 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1691 zf_run_count++;
1692 break;
1693 }
1694 }
1695 /*
1696 * It's either a normal inactive page or nothing.
1697 */
1698 if ( !queue_empty(&vm_page_queue_inactive) ) {
1699 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1700 zf_run_count = 0;
1701 break;
1702 }
1703
1704 panic("vm_pageout: no victim");
1705 }
1706
1707 assert(!m->active && (m->inactive || m->speculative || m->throttled));
1708 assert(!m->laundry);
1709 assert(m->object != kernel_object);
1710 assert(m->phys_page != vm_page_guard_addr);
1711
1712 if (!m->speculative) {
1713 vm_pageout_stats[vm_pageout_stat_now].considered++;
1714 }
1715
1716 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1717
1718 /*
1719 * check to see if we currently are working
1720 * with the same object... if so, we've
1721 * already got the lock
1722 */
1723 if (m->object != object) {
1724 /*
1725 * the object associated with candidate page is
1726 * different from the one we were just working
1727 * with... dump the lock if we still own it
1728 */
1729 if (object != NULL) {
1730 vm_object_unlock(object);
1731 object = NULL;
1732 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1733 }
1734 /*
1735 * Try to lock object; since we've already got the
1736 * page queues lock, we can only 'try' for this one.
1737 * if the 'try' fails, we need to do a mutex_pause
1738 * to allow the owner of the object lock a chance to
1739 * run... otherwise, we're likely to trip over this
1740 * object in the same state as we work our way through
1741 * the queue... clumps of pages associated with the same
1742 * object are fairly typical on the inactive and active queues
1743 */
1744 if (!vm_object_lock_try_scan(m->object)) {
1745 vm_pageout_inactive_nolock++;
1746
1747 requeue_page:
1748 /*
1749 * Move page to end and continue.
1750 * Don't re-issue ticket
1751 */
1752 if (m->zero_fill) {
1753 if (m->speculative) {
1754 panic("vm_pageout_scan(): page %p speculative and zero-fill !?\n", m);
1755 }
1756 assert(!m->speculative);
1757 queue_remove(&vm_page_queue_zf, m,
1758 vm_page_t, pageq);
1759 queue_enter(&vm_page_queue_zf, m,
1760 vm_page_t, pageq);
1761 } else if (m->speculative) {
1762 remque(&m->pageq);
1763 m->speculative = FALSE;
1764 vm_page_speculative_count--;
1765
1766 /*
1767 * move to the head of the inactive queue
1768 * to get it out of the way... the speculative
1769 * queue is generally too small to depend
1770 * on there being enough pages from other
1771 * objects to make cycling it back on the
1772 * same queue a winning proposition
1773 */
1774 queue_enter_first(&vm_page_queue_inactive, m,
1775 vm_page_t, pageq);
1776 m->inactive = TRUE;
1777 vm_page_inactive_count++;
1778 token_new_pagecount++;
1779 } else if (m->throttled) {
1780 queue_remove(&vm_page_queue_throttled, m,
1781 vm_page_t, pageq);
1782 m->throttled = FALSE;
1783 vm_page_throttled_count--;
1784
1785 /*
1786 * not throttled any more, so can stick
1787 * it on the inactive queue.
1788 */
1789 queue_enter(&vm_page_queue_inactive, m,
1790 vm_page_t, pageq);
1791 m->inactive = TRUE;
1792 vm_page_inactive_count++;
1793 token_new_pagecount++;
1794 } else {
1795 queue_remove(&vm_page_queue_inactive, m,
1796 vm_page_t, pageq);
1797 #if MACH_ASSERT
1798 vm_page_inactive_count--; /* balance for purgeable queue asserts */
1799 #endif
1800 vm_purgeable_q_advance_all();
1801
1802 queue_enter(&vm_page_queue_inactive, m,
1803 vm_page_t, pageq);
1804 #if MACH_ASSERT
1805 vm_page_inactive_count++; /* balance for purgeable queue asserts */
1806 #endif
1807 token_new_pagecount++;
1808 }
1809 pmap_clear_reference(m->phys_page);
1810 m->reference = FALSE;
1811
1812 if ( !queue_empty(&sq->age_q) )
1813 m = (vm_page_t) queue_first(&sq->age_q);
1814 else if ( ((zf_run_count < zf_ratio) && vm_zf_queue_count >= zf_queue_min_count) ||
1815 queue_empty(&vm_page_queue_inactive)) {
1816 if ( !queue_empty(&vm_page_queue_zf) )
1817 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1818 } else if ( !queue_empty(&vm_page_queue_inactive) ) {
1819 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1820 }
1821 /*
1822 * this is the next object we're going to be interested in
1823 * try to make sure it's available after the mutex_yield
1824 * returns control
1825 */
1826 vm_pageout_scan_wants_object = m->object;
1827
1828 /*
1829 * force us to dump any collected free pages
1830 * and to pause before moving on
1831 */
1832 try_failed = TRUE;
1833
1834 goto done_with_inactivepage;
1835 }
1836 object = m->object;
1837 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1838
1839 try_failed = FALSE;
1840 }
1841
1842 /*
1843 * Paging out pages of external objects which
1844 * are currently being created must be avoided.
1845 * The pager may claim memory, possibly leading to a
1846 * deadlock between it and the pageout thread if such
1847 * pages are chosen. The assumption is that there will
1848 * eventually be enough available pages in the inactive
1849 * pool to page out, satisfying all the memory claimed
1850 * by the thread which concurrently creates the pager.
1851 */
1852 if (!object->pager_initialized && object->pager_created) {
1853 /*
1854 * Move page to end and continue, hoping that
1855 * there will be enough other inactive pages to
1856 * page out so that the thread which currently
1857 * initializes the pager will succeed.
1858 * Don't re-grant the ticket; the page should be
1859 * pulled from the queue and paged out whenever
1860 * one of its logically adjacent fellows is
1861 * targeted.
1862 */
1863 vm_pageout_inactive_avoid++;
1864 goto requeue_page;
1865 }
1866 /*
1867 * Remove the page from its list.
1868 */
1869 if (m->speculative) {
1870 remque(&m->pageq);
1871 page_prev_state = PAGE_STATE_SPECULATIVE;
1872 m->speculative = FALSE;
1873 vm_page_speculative_count--;
1874 } else if (m->throttled) {
1875 queue_remove(&vm_page_queue_throttled, m, vm_page_t, pageq);
1876 page_prev_state = PAGE_STATE_THROTTLED;
1877 m->throttled = FALSE;
1878 vm_page_throttled_count--;
1879 } else {
1880 if (m->zero_fill) {
1881 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1882 page_prev_state = PAGE_STATE_ZEROFILL;
1883 vm_zf_queue_count--;
1884 } else {
1885 page_prev_state = PAGE_STATE_INACTIVE;
1886 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1887 }
1888 m->inactive = FALSE;
1889 if (!m->fictitious)
1890 vm_page_inactive_count--;
1891 vm_purgeable_q_advance_all();
1892 }
1893
1894 m->pageq.next = NULL;
1895 m->pageq.prev = NULL;
1896
1897 if ( !m->fictitious && catch_up_count)
1898 catch_up_count--;
1899
1900 /*
1901 * ENCRYPTED SWAP:
1902 * if this page has already been picked up as part of a
1903 * page-out cluster, it will be busy because it is being
1904 * encrypted (see vm_object_upl_request()). But we still
1905 * want to demote it from "clean-in-place" (aka "adjacent")
1906 * to "clean-and-free" (aka "target"), so let's ignore its
1907 * "busy" bit here and proceed to check for "cleaning" a
1908 * little bit below...
1909 */
1910 if ( !m->encrypted_cleaning && (m->busy || !object->alive)) {
1911 /*
1912 * Somebody is already playing with this page.
1913 * Leave it off the pageout queues.
1914 *
1915 */
1916 vm_pageout_inactive_busy++;
1917
1918 goto done_with_inactivepage;
1919 }
1920
1921 /*
1922 * If it's absent or in error, we can reclaim the page.
1923 */
1924
1925 if (m->absent || m->error) {
1926 vm_pageout_inactive_absent++;
1927 reclaim_page:
1928 if (vm_pageout_deadlock_target) {
1929 vm_pageout_scan_inactive_throttle_success++;
1930 vm_pageout_deadlock_target--;
1931 }
1932
1933 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
1934
1935 if (object->internal) {
1936 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
1937 } else {
1938 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
1939 }
1940 vm_page_free_prepare_queues(m);
1941
1942 /*
1943 * remove page from object here since we're already
1944 * behind the object lock... defer the rest of the work
1945 * we'd normally do in vm_page_free_prepare_object
1946 * until 'vm_page_free_list' is called
1947 */
1948 if (m->tabled)
1949 vm_page_remove(m, TRUE);
1950
1951 assert(m->pageq.next == NULL &&
1952 m->pageq.prev == NULL);
1953 m->pageq.next = (queue_entry_t)local_freeq;
1954 local_freeq = m;
1955 local_freed++;
1956
1957 inactive_burst_count = 0;
1958
1959 if(page_prev_state != PAGE_STATE_SPECULATIVE) {
1960 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
1961 page_prev_state = 0;
1962 }
1963
1964 goto done_with_inactivepage;
1965 }
1966
1967 assert(!m->private);
1968 assert(!m->fictitious);
1969
1970 /*
1971 * If already cleaning this page in place, convert from
1972 * "adjacent" to "target". We can leave the page mapped,
1973 * and vm_pageout_object_terminate will determine whether
1974 * to free or reactivate.
1975 */
1976
1977 if (m->cleaning) {
1978 m->busy = TRUE;
1979 m->pageout = TRUE;
1980 m->dump_cleaning = TRUE;
1981 vm_page_wire(m);
1982
1983 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1984
1985 inactive_burst_count = 0;
1986
1987 goto done_with_inactivepage;
1988 }
1989
1990 /*
1991 * If the object is empty, the page must be reclaimed even
1992 * if dirty or used.
1993 * If the page belongs to a volatile object, we stick it back
1994 * on.
1995 */
1996 if (object->copy == VM_OBJECT_NULL) {
1997 if (object->purgable == VM_PURGABLE_EMPTY) {
1998 m->busy = TRUE;
1999 if (m->pmapped == TRUE) {
2000 /* unmap the page */
2001 refmod_state = pmap_disconnect(m->phys_page);
2002 if (refmod_state & VM_MEM_MODIFIED) {
2003 m->dirty = TRUE;
2004 }
2005 }
2006 if (m->dirty || m->precious) {
2007 /* we saved the cost of cleaning this page ! */
2008 vm_page_purged_count++;
2009 }
2010 goto reclaim_page;
2011 }
2012 if (object->purgable == VM_PURGABLE_VOLATILE) {
2013 /* if it's wired, we can't put it on our queue */
2014 assert(!VM_PAGE_WIRED(m));
2015 /* just stick it back on! */
2016 goto reactivate_page;
2017 }
2018 }
2019
2020 /*
2021 * If it's being used, reactivate.
2022 * (Fictitious pages are either busy or absent.)
2023 * First, update the reference and dirty bits
2024 * to make sure the page is unreferenced.
2025 */
2026 refmod_state = -1;
2027
2028 if (m->reference == FALSE && m->pmapped == TRUE) {
2029 refmod_state = pmap_get_refmod(m->phys_page);
2030
2031 if (refmod_state & VM_MEM_REFERENCED)
2032 m->reference = TRUE;
2033 if (refmod_state & VM_MEM_MODIFIED)
2034 m->dirty = TRUE;
2035 }
2036
2037 if (m->reference || m->dirty) {
2038 /* deal with a rogue "reusable" page */
2039 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m);
2040 }
2041
2042 if (m->reference && !m->no_cache) {
2043 /*
2044 * The page we pulled off the inactive list has
2045 * been referenced. It is possible for other
2046 * processors to be touching pages faster than we
2047 * can clear the referenced bit and traverse the
2048 * inactive queue, so we limit the number of
2049 * reactivations.
2050 */
2051 if (++reactivated_this_call >= reactivate_limit) {
2052 vm_pageout_reactivation_limit_exceeded++;
2053 } else if (catch_up_count) {
2054 vm_pageout_catch_ups++;
2055 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
2056 vm_pageout_inactive_force_reclaim++;
2057 } else {
2058 uint32_t isinuse;
2059 reactivate_page:
2060 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
2061 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
2062 /*
2063 * no explicit mappings of this object exist
2064 * and it's not open via the filesystem
2065 */
2066 vm_page_deactivate(m);
2067 vm_pageout_inactive_deactivated++;
2068 } else {
2069 /*
2070 * The page was/is being used, so put back on active list.
2071 */
2072 vm_page_activate(m);
2073 VM_STAT_INCR(reactivations);
2074 }
2075 vm_pageout_inactive_used++;
2076 inactive_burst_count = 0;
2077
2078 goto done_with_inactivepage;
2079 }
2080 /*
2081 * Make sure we call pmap_get_refmod() if it
2082 * wasn't already called just above, to update
2083 * the dirty bit.
2084 */
2085 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
2086 refmod_state = pmap_get_refmod(m->phys_page);
2087 if (refmod_state & VM_MEM_MODIFIED)
2088 m->dirty = TRUE;
2089 }
2090 forced_reclaim = TRUE;
2091 } else {
2092 forced_reclaim = FALSE;
2093 }
2094
2095 XPR(XPR_VM_PAGEOUT,
2096 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
2097 object, m->offset, m, 0,0);
2098
2099 /*
2100 * we've got a candidate page to steal...
2101 *
2102 * m->dirty is up to date courtesy of the
2103 * preceding check for m->reference... if
2104 * we get here, then m->reference had to be
2105 * FALSE (or possibly "reactivate_limit" was
2106 * exceeded), but in either case we called
2107 * pmap_get_refmod() and updated both
2108 * m->reference and m->dirty
2109 *
2110 * if it's dirty or precious we need to
2111 * see if the target queue is throttled
2112 * if it is, we need to skip over it by moving it back
2113 * to the end of the inactive queue
2114 */
2115
2116 inactive_throttled = FALSE;
2117
2118 if (m->dirty || m->precious) {
2119 if (object->internal) {
2120 if (VM_PAGE_Q_THROTTLED(iq))
2121 inactive_throttled = TRUE;
2122 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2123 inactive_throttled = TRUE;
2124 }
2125 }
2126 if (inactive_throttled == TRUE) {
2127 throttle_inactive:
2128 if (!IP_VALID(memory_manager_default) &&
2129 object->internal && m->dirty &&
2130 (object->purgable == VM_PURGABLE_DENY ||
2131 object->purgable == VM_PURGABLE_NONVOLATILE ||
2132 object->purgable == VM_PURGABLE_VOLATILE)) {
2133 queue_enter(&vm_page_queue_throttled, m,
2134 vm_page_t, pageq);
2135 m->throttled = TRUE;
2136 vm_page_throttled_count++;
2137 } else {
2138 if (m->zero_fill) {
2139 queue_enter(&vm_page_queue_zf, m,
2140 vm_page_t, pageq);
2141 vm_zf_queue_count++;
2142 } else
2143 queue_enter(&vm_page_queue_inactive, m,
2144 vm_page_t, pageq);
2145 m->inactive = TRUE;
2146 if (!m->fictitious) {
2147 vm_page_inactive_count++;
2148 token_new_pagecount++;
2149 }
2150 }
2151 vm_pageout_scan_inactive_throttled++;
2152 goto done_with_inactivepage;
2153 }
2154
2155 /*
2156 * we've got a page that we can steal...
2157 * eliminate all mappings and make sure
2158 * we have the up-to-date modified state...
2159 * first take the page BUSY, so that no new
2160 * mappings can be made
2161 */
2162 m->busy = TRUE;
2163
2164 /*
2165 * if we need to do a pmap_disconnect then we
2166 * need to re-evaluate m->dirty since the pmap_disconnect
2167 * provides the true state atomically... the
2168 * page was still mapped up to the pmap_disconnect
2169 * and may have been dirtied at the last microsecond
2170 *
2171 * we also check for the page being referenced 'late'
2172 * if it was, we first need to do a WAKEUP_DONE on it
2173 * since we already set m->busy = TRUE, before
2174 * going off to reactivate it
2175 *
2176 * Note that if 'pmapped' is FALSE then the page is not
2177 * and has not been in any map, so there is no point calling
2178 * pmap_disconnect(). m->dirty and/or m->reference could
2179 * have been set in anticipation of likely usage of the page.
2180 */
2181 if (m->pmapped == TRUE) {
2182 refmod_state = pmap_disconnect(m->phys_page);
2183
2184 if (refmod_state & VM_MEM_MODIFIED)
2185 m->dirty = TRUE;
2186 if (refmod_state & VM_MEM_REFERENCED) {
2187
2188 /* If m->reference is already set, this page must have
2189 * already failed the reactivate_limit test, so don't
2190 * bump the counts twice.
2191 */
2192 if ( ! m->reference ) {
2193 m->reference = TRUE;
2194 if (forced_reclaim ||
2195 ++reactivated_this_call >= reactivate_limit)
2196 vm_pageout_reactivation_limit_exceeded++;
2197 else {
2198 PAGE_WAKEUP_DONE(m);
2199 goto reactivate_page;
2200 }
2201 }
2202 }
2203 }
2204 /*
2205 * reset our count of pages that have been reclaimed
2206 * since the last page was 'stolen'
2207 */
2208 inactive_reclaim_run = 0;
2209
2210 /*
2211 * If it's clean and not precious, we can free the page.
2212 */
2213 if (!m->dirty && !m->precious) {
2214 if (m->zero_fill)
2215 vm_pageout_inactive_zf++;
2216 vm_pageout_inactive_clean++;
2217
2218 goto reclaim_page;
2219 }
2220
2221 /*
2222 * The page may have been dirtied since the last check
2223 * for a throttled target queue (which may have been skipped
2224 * if the page was clean then). With the dirty page
2225 * disconnected here, we can make one final check.
2226 */
2227 {
2228 boolean_t disconnect_throttled = FALSE;
2229 if (object->internal) {
2230 if (VM_PAGE_Q_THROTTLED(iq))
2231 disconnect_throttled = TRUE;
2232 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2233 disconnect_throttled = TRUE;
2234 }
2235
2236 if (disconnect_throttled == TRUE) {
2237 PAGE_WAKEUP_DONE(m);
2238 goto throttle_inactive;
2239 }
2240 }
2241
2242 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
2243
2244 vm_pageout_cluster(m);
2245
2246 if (m->zero_fill)
2247 vm_pageout_inactive_zf++;
2248 vm_pageout_inactive_dirty++;
2249
2250 inactive_burst_count = 0;
2251
2252 done_with_inactivepage:
2253 if (delayed_unlock++ > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT || try_failed == TRUE) {
2254
2255 if (object != NULL) {
2256 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2257 vm_object_unlock(object);
2258 object = NULL;
2259 }
2260 if (local_freeq) {
2261 vm_page_unlock_queues();
2262 vm_page_free_list(local_freeq, TRUE);
2263
2264 local_freeq = NULL;
2265 local_freed = 0;
2266 vm_page_lock_queues();
2267 } else
2268 lck_mtx_yield(&vm_page_queue_lock);
2269
2270 delayed_unlock = 1;
2271 }
2272 /*
2273 * back to top of pageout scan loop
2274 */
2275 }
2276 }
2277
2278
2279 int vm_page_free_count_init;
2280
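 /*
 * vm_page_free_reserve:
 *
 * Grow the reserved page pool by 'pages' and re-derive the paging
 * thresholds from it: free_min and free_target are computed from the
 * pages left after the reserve (and clamped to their limits), the
 * throttle limit is set to two thirds of free_target, and the page
 * creation throttle to half of free_target.
 */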
2281 void
2282 vm_page_free_reserve(
2283 int pages)
2284 {
2285 int free_after_reserve;
2286
2287 vm_page_free_reserved += pages;
2288
2289 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
2290
2291 vm_page_free_min = vm_page_free_reserved +
2292 VM_PAGE_FREE_MIN(free_after_reserve);
2293
2294 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
2295 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
2296
2297 vm_page_free_target = vm_page_free_reserved +
2298 VM_PAGE_FREE_TARGET(free_after_reserve);
2299
2300 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
2301 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
2302
2303 if (vm_page_free_target < vm_page_free_min + 5)
2304 vm_page_free_target = vm_page_free_min + 5;
2305
2306 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 3);
2307 vm_page_creation_throttle = vm_page_free_target / 2;
2308 }
2309
2310 /*
2311 * vm_pageout is the high level pageout daemon.
2312 */
2313
2314 void
2315 vm_pageout_continue(void)
2316 {
2317 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
2318 vm_pageout_scan_event_counter++;
2319 vm_pageout_scan();
2320 /* we hold vm_page_queue_free_lock now */
2321 assert(vm_page_free_wanted == 0);
2322 assert(vm_page_free_wanted_privileged == 0);
2323 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
2324 lck_mtx_unlock(&vm_page_queue_free_lock);
2325
2326 counter(c_vm_pageout_block++);
2327 thread_block((thread_continue_t)vm_pageout_continue);
2328 /*NOTREACHED*/
2329 }
2330
2331
2332 #ifdef FAKE_DEADLOCK
2333
2334 #define FAKE_COUNT 5000
2335
2336 int internal_count = 0;
2337 int fake_deadlock = 0;
2338
2339 #endif
2340
2341 static void
2342 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
2343 {
2344 vm_page_t m = NULL;
2345 vm_object_t object;
2346 memory_object_t pager;
2347 thread_t self = current_thread();
2348
2349 if ((vm_pageout_internal_iothread != THREAD_NULL)
2350 && (self == vm_pageout_external_iothread )
2351 && (self->options & TH_OPT_VMPRIV))
2352 self->options &= ~TH_OPT_VMPRIV;
2353
2354 vm_page_lockspin_queues();
2355
2356 while ( !queue_empty(&q->pgo_pending) ) {
2357
2358 q->pgo_busy = TRUE;
2359 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
2360 VM_PAGE_CHECK(m);
2361 m->pageout_queue = FALSE;
2362 m->pageq.next = NULL;
2363 m->pageq.prev = NULL;
2364 vm_page_unlock_queues();
2365
2366 #ifdef FAKE_DEADLOCK
2367 if (q == &vm_pageout_queue_internal) {
2368 vm_offset_t addr;
2369 int pg_count;
2370
2371 internal_count++;
2372
2373 if ((internal_count == FAKE_COUNT)) {
2374
2375 pg_count = vm_page_free_count + vm_page_free_reserved;
2376
2377 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
2378 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
2379 }
2380 internal_count = 0;
2381 fake_deadlock++;
2382 }
2383 }
2384 #endif
2385 object = m->object;
2386
2387 vm_object_lock(object);
2388
2389 if (!object->pager_initialized) {
2390
2391 /*
2392 * If there is no memory object for the page, create
2393 * one and hand it to the default pager.
2394 */
2395
2396 if (!object->pager_initialized)
2397 vm_object_collapse(object,
2398 (vm_object_offset_t) 0,
2399 TRUE);
2400 if (!object->pager_initialized)
2401 vm_object_pager_create(object);
2402 if (!object->pager_initialized) {
2403 /*
2404 * Still no pager for the object.
2405 * Reactivate the page.
2406 *
2407 * Should only happen if there is no
2408 * default pager.
2409 */
2410 vm_page_lockspin_queues();
2411
2412 vm_pageout_queue_steal(m, TRUE);
2413 vm_pageout_dirty_no_pager++;
2414 vm_page_activate(m);
2415
2416 vm_page_unlock_queues();
2417
2418 /*
2419 * And we are done with it.
2420 */
2421 PAGE_WAKEUP_DONE(m);
2422
2423 vm_object_paging_end(object);
2424 vm_object_unlock(object);
2425
2426 vm_page_lockspin_queues();
2427 continue;
2428 }
2429 }
2430 pager = object->pager;
2431 if (pager == MEMORY_OBJECT_NULL) {
2432 /*
2433 * This pager has been destroyed by either
2434 * memory_object_destroy or vm_object_destroy, and
2435 * so there is nowhere for the page to go.
2436 */
2437 if (m->pageout) {
2438 /*
2439 * Just free the page... VM_PAGE_FREE takes
2440 * care of cleaning up all the state...
2441 * including doing the vm_pageout_throttle_up
2442 */
2443 VM_PAGE_FREE(m);
2444 } else {
2445 vm_page_lockspin_queues();
2446
2447 vm_pageout_queue_steal(m, TRUE);
2448 vm_page_activate(m);
2449
2450 vm_page_unlock_queues();
2451
2452 /*
2453 * And we are done with it.
2454 */
2455 PAGE_WAKEUP_DONE(m);
2456 }
2457 vm_object_paging_end(object);
2458 vm_object_unlock(object);
2459
2460 vm_page_lockspin_queues();
2461 continue;
2462 }
2463 VM_PAGE_CHECK(m);
2464 vm_object_unlock(object);
2465 /*
2466 * we expect the paging_in_progress reference to have
2467 * already been taken on the object before it was added
2468 * to the appropriate pageout I/O queue... this will
2469 * keep the object from being terminated and/or the
2470 * paging_offset from changing until the I/O has
2471 * completed... therefore no need to lock the object to
2472 * pull the paging_offset from it.
2473 *
2474 * Send the data to the pager.
2475 * any pageout clustering happens there
2476 */
2477 memory_object_data_return(pager,
2478 m->offset + object->paging_offset,
2479 PAGE_SIZE,
2480 NULL,
2481 NULL,
2482 FALSE,
2483 FALSE,
2484 0);
2485
2486 vm_object_lock(object);
2487 vm_object_paging_end(object);
2488 vm_object_unlock(object);
2489
2490 vm_page_lockspin_queues();
2491 }
2492 assert_wait((event_t) q, THREAD_UNINT);
2493
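 /*
 * before blocking, let any waiters know the laundry state changed:
 * if the queue had been marked throttled and is no longer over its
 * limit, wake the thread waiting on pgo_laundry; if a drain was
 * requested and the laundry count has reached zero, wake the drain
 * waiter (which sleeps on pgo_laundry + 1).
 */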
2494 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2495 q->pgo_throttled = FALSE;
2496 thread_wakeup((event_t) &q->pgo_laundry);
2497 }
2498 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
2499 q->pgo_draining = FALSE;
2500 thread_wakeup((event_t) (&q->pgo_laundry+1));
2501 }
2502 q->pgo_busy = FALSE;
2503 q->pgo_idle = TRUE;
2504 vm_page_unlock_queues();
2505
2506 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2507 /*NOTREACHED*/
2508 }
2509
2510
2511 static void
2512 vm_pageout_iothread_external(void)
2513 {
2514 thread_t self = current_thread();
2515
2516 self->options |= TH_OPT_VMPRIV;
2517
2518 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2519 /*NOTREACHED*/
2520 }
2521
2522
2523 static void
2524 vm_pageout_iothread_internal(void)
2525 {
2526 thread_t self = current_thread();
2527
2528 self->options |= TH_OPT_VMPRIV;
2529
2530 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2531 /*NOTREACHED*/
2532 }
2533
2534 kern_return_t
2535 vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
2536 {
2537 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
2538 return KERN_SUCCESS;
2539 } else {
2540 return KERN_FAILURE; /* Already set */
2541 }
2542 }
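 /*
 * Illustrative sketch (not part of the original source): a buffer
 * cache layer would typically register its reclaim routine once at
 * init time.  The names below are hypothetical; only the first
 * registration succeeds because of the compare-and-swap above.
 *
 *	static boolean_t my_buffer_cache_collect(int);		// hypothetical
 *
 *	void
 *	my_buffer_cache_init(void)				// hypothetical
 *	{
 *		if (vm_set_buffer_cleanup_callout(my_buffer_cache_collect) != KERN_SUCCESS) {
 *			// someone registered a callout before us; ours is ignored
 *		}
 *	}
 */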
2543
2544 static void
2545 vm_pageout_garbage_collect(int collect)
2546 {
2547 if (collect) {
2548 boolean_t buf_large_zfree = FALSE;
2549 stack_collect();
2550
2551 /*
2552 * consider_zone_gc should be last, because the other operations
2553 * might return memory to zones.
2554 */
2555 consider_machine_collect();
2556 if (consider_buffer_cache_collect != NULL) {
2557 buf_large_zfree = (*consider_buffer_cache_collect)(0);
2558 }
2559 consider_zone_gc(buf_large_zfree);
2560
2561 consider_machine_adjust();
2562 }
2563
2564 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2565
2566 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2567 /*NOTREACHED*/
2568 }
2569
2570
2571
2572 void
2573 vm_pageout(void)
2574 {
2575 thread_t self = current_thread();
2576 thread_t thread;
2577 kern_return_t result;
2578 spl_t s;
2579
2580 /*
2581 * Set thread privileges.
2582 */
2583 s = splsched();
2584 thread_lock(self);
2585 self->priority = BASEPRI_PREEMPT - 1;
2586 set_sched_pri(self, self->priority);
2587 thread_unlock(self);
2588
2589 if (!self->reserved_stack)
2590 self->reserved_stack = self->kernel_stack;
2591
2592 splx(s);
2593
2594 /*
2595 * Initialize some paging parameters.
2596 */
2597
2598 if (vm_pageout_idle_wait == 0)
2599 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2600
2601 if (vm_pageout_burst_wait == 0)
2602 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2603
2604 if (vm_pageout_empty_wait == 0)
2605 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2606
2607 if (vm_pageout_deadlock_wait == 0)
2608 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2609
2610 if (vm_pageout_deadlock_relief == 0)
2611 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2612
2613 if (vm_pageout_inactive_relief == 0)
2614 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2615
2616 if (vm_pageout_burst_active_throttle == 0)
2617 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2618
2619 if (vm_pageout_burst_inactive_throttle == 0)
2620 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2621
2622 /*
2623 * Set kernel task to low backing store privileged
2624 * status
2625 */
2626 task_lock(kernel_task);
2627 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2628 task_unlock(kernel_task);
2629
2630 vm_page_free_count_init = vm_page_free_count;
2631
2632 /*
2633 * even if we've already called vm_page_free_reserve,
2634 * call it again here to ensure that the targets are
2635 * accurately calculated (it uses vm_page_free_count_init)
2636 * calling it with an arg of 0 will not change the reserve
2637 * but will re-calculate free_min and free_target
2638 */
2639 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2640 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2641 } else
2642 vm_page_free_reserve(0);
2643
2644
2645 queue_init(&vm_pageout_queue_external.pgo_pending);
2646 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2647 vm_pageout_queue_external.pgo_laundry = 0;
2648 vm_pageout_queue_external.pgo_idle = FALSE;
2649 vm_pageout_queue_external.pgo_busy = FALSE;
2650 vm_pageout_queue_external.pgo_throttled = FALSE;
2651 vm_pageout_queue_external.pgo_draining = FALSE;
2652
2653 queue_init(&vm_pageout_queue_internal.pgo_pending);
2654 vm_pageout_queue_internal.pgo_maxlaundry = 0;
2655 vm_pageout_queue_internal.pgo_laundry = 0;
2656 vm_pageout_queue_internal.pgo_idle = FALSE;
2657 vm_pageout_queue_internal.pgo_busy = FALSE;
2658 vm_pageout_queue_internal.pgo_throttled = FALSE;
2659 vm_pageout_queue_internal.pgo_draining = FALSE;
2660
2661
2662 /* internal pageout thread started when default pager registered first time */
2663 /* external pageout and garbage collection threads started here */
2664
2665 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
2666 BASEPRI_PREEMPT - 1,
2667 &vm_pageout_external_iothread);
2668 if (result != KERN_SUCCESS)
2669 panic("vm_pageout_iothread_external: create failed");
2670
2671 thread_deallocate(vm_pageout_external_iothread);
2672
2673 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
2674 MINPRI_KERNEL,
2675 &thread);
2676 if (result != KERN_SUCCESS)
2677 panic("vm_pageout_garbage_collect: create failed");
2678
2679 thread_deallocate(thread);
2680
2681 vm_object_reaper_init();
2682
2683
2684 vm_pageout_continue();
2685
2686 /*
2687 * Unreached code!
2688 *
2689 * The vm_pageout_continue() call above never returns, so the code below is never
2690 * executed. We take advantage of this to declare several DTrace VM related probe
2691 * points that our kernel doesn't have an analog for. These are probe points that
2692 * exist in Solaris and are in the DTrace documentation, so people may have written
2693 * scripts that use them. Declaring the probe points here means their scripts will
2694 * compile and execute which we want for portability of the scripts, but since this
2695 * section of code is never reached, the probe points will simply never fire. Yes,
2696 * this is basically a hack. The problem is the DTrace probe points were chosen with
2697 * Solaris specific VM events in mind, not portability to different VM implementations.
2698 */
2699
2700 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
2701 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
2702 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
2703 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
2704 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
2705 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
2706 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
2707 /*NOTREACHED*/
2708 }
2709
2710 kern_return_t
2711 vm_pageout_internal_start(void)
2712 {
2713 kern_return_t result;
2714
2715 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2716 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
2717 if (result == KERN_SUCCESS)
2718 thread_deallocate(vm_pageout_internal_iothread);
2719 return result;
2720 }
2721
2722
2723 /*
2724 * when marshalling pages into a UPL and subsequently committing
2725 * or aborting them, it is necessary to hold
2726 * the vm_page_queue_lock (a hot global lock) for certain operations
2727 * on the page... however, the majority of the work can be done
2728 * while merely holding the object lock... in fact there are certain
2729 * collections of pages that don't require any work brokered by the
2730 * vm_page_queue_lock... to mitigate the time spent behind the global
2731 * lock, go to a 2 pass algorithm... collect pages up to DELAYED_WORK_LIMIT
2732 * while doing all of the work that doesn't require the vm_page_queue_lock...
2733 * then call dw_do_work to acquire the vm_page_queue_lock and do the
2734 * necessary work for each page... we will grab the busy bit on the page
2735 * if it's not already held so that dw_do_work can drop the object lock
2736 * if it can't immediately take the vm_page_queue_lock in order to compete
2737 * for the locks in the same order that vm_pageout_scan takes them.
2738 * the operation names are modeled after the names of the routines that
2739 * need to be called in order to make the changes very obvious in the
2740 * original loop
2741 */
2742
2743 #define DELAYED_WORK_LIMIT 32
2744
2745 #define DW_vm_page_unwire 0x01
2746 #define DW_vm_page_wire 0x02
2747 #define DW_vm_page_free 0x04
2748 #define DW_vm_page_activate 0x08
2749 #define DW_vm_page_deactivate_internal 0x10
2750 #define DW_vm_page_speculate 0x20
2751 #define DW_vm_page_lru 0x40
2752 #define DW_vm_pageout_throttle_up 0x80
2753 #define DW_PAGE_WAKEUP 0x100
2754 #define DW_clear_busy 0x200
2755 #define DW_clear_reference 0x400
2756 #define DW_set_reference 0x800
2757
2758 struct dw {
2759 vm_page_t dw_m;
2760 int dw_mask;
2761 };
2762
2763
2764 static void dw_do_work(vm_object_t object, struct dw *dwp, int dw_count);
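 /*
 * Typical use of the delayed-work mechanism (a sketch of the pattern
 * used later in this file, not additional functionality): work is
 * batched per page while only the object lock is held, then flushed
 * by dw_do_work under the vm_page_queue_lock once the batch fills up.
 *
 *	struct dw	dw_array[DELAYED_WORK_LIMIT];
 *	struct dw	*dwp = &dw_array[0];
 *	int		dw_count = 0;
 *
 *	for each page m of interest {
 *		dwp->dw_mask = 0;
 *		... set DW_* bits describing what m needs ...
 *		if (dwp->dw_mask) {
 *			dwp->dw_m = m;
 *			dwp++;
 *			dw_count++;
 *			if (dw_count >= DELAYED_WORK_LIMIT) {
 *				dw_do_work(object, &dw_array[0], dw_count);
 *				dwp = &dw_array[0];
 *				dw_count = 0;
 *			}
 *		}
 *	}
 *	if (dw_count)
 *		dw_do_work(object, &dw_array[0], dw_count);
 */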
2765
2766
2767
2768 static upl_t
2769 upl_create(int type, int flags, upl_size_t size)
2770 {
2771 upl_t upl;
2772 int page_field_size = 0;
2773 int upl_flags = 0;
2774 int upl_size = sizeof(struct upl);
2775
2776 size = round_page_32(size);
2777
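 /*
 * for a LITE UPL we keep one bit per page: atop(size) bits, rounded
 * up to whole bytes ((atop(size) + 7) >> 3) and then up to a 4 byte
 * boundary so the bitmap can be walked a 32-bit word at a time.
 */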
2778 if (type & UPL_CREATE_LITE) {
2779 page_field_size = (atop(size) + 7) >> 3;
2780 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2781
2782 upl_flags |= UPL_LITE;
2783 }
2784 if (type & UPL_CREATE_INTERNAL) {
2785 upl_size += (int) sizeof(struct upl_page_info) * atop(size);
2786
2787 upl_flags |= UPL_INTERNAL;
2788 }
2789 upl = (upl_t)kalloc(upl_size + page_field_size);
2790
2791 if (page_field_size)
2792 bzero((char *)upl + upl_size, page_field_size);
2793
2794 upl->flags = upl_flags | flags;
2795 upl->src_object = NULL;
2796 upl->kaddr = (vm_offset_t)0;
2797 upl->size = 0;
2798 upl->map_object = NULL;
2799 upl->ref_count = 1;
2800 upl->highest_page = 0;
2801 upl_lock_init(upl);
2802 upl->vector_upl = NULL;
2803 #if UPL_DEBUG
2804 upl->ubc_alias1 = 0;
2805 upl->ubc_alias2 = 0;
2806
2807 upl->upl_creator = current_thread();
2808 upl->upl_state = 0;
2809 upl->upl_commit_index = 0;
2810 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
2811
2812 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
2813 #endif /* UPL_DEBUG */
2814
2815 return(upl);
2816 }
2817
2818 static void
2819 upl_destroy(upl_t upl)
2820 {
2821 int page_field_size; /* bit field in word size buf */
2822 int size;
2823
2824 #if UPL_DEBUG
2825 {
2826 vm_object_t object;
2827
2828 if (upl->flags & UPL_SHADOWED) {
2829 object = upl->map_object->shadow;
2830 } else {
2831 object = upl->map_object;
2832 }
2833 vm_object_lock(object);
2834 queue_remove(&object->uplq, upl, upl_t, uplq);
2835 vm_object_unlock(object);
2836 }
2837 #endif /* UPL_DEBUG */
2838 /*
2839 * drop a reference on the map_object whether or
2840 * not a pageout object is inserted
2841 */
2842 if (upl->flags & UPL_SHADOWED)
2843 vm_object_deallocate(upl->map_object);
2844
2845 if (upl->flags & UPL_DEVICE_MEMORY)
2846 size = PAGE_SIZE;
2847 else
2848 size = upl->size;
2849 page_field_size = 0;
2850
2851 if (upl->flags & UPL_LITE) {
2852 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2853 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2854 }
2855 upl_lock_destroy(upl);
2856 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
2857 if (upl->flags & UPL_INTERNAL) {
2858 kfree(upl,
2859 sizeof(struct upl) +
2860 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
2861 + page_field_size);
2862 } else {
2863 kfree(upl, sizeof(struct upl) + page_field_size);
2864 }
2865 }
2866
2867 void uc_upl_dealloc(upl_t upl);
2868 __private_extern__ void
2869 uc_upl_dealloc(upl_t upl)
2870 {
2871 if (--upl->ref_count == 0)
2872 upl_destroy(upl);
2873 }
2874
2875 void
2876 upl_deallocate(upl_t upl)
2877 {
2878 if (--upl->ref_count == 0) {
2879 if(vector_upl_is_valid(upl))
2880 vector_upl_deallocate(upl);
2881 upl_destroy(upl);
2882 }
2883 }
2884
2885 #if DEVELOPMENT || DEBUG
2886 /*
2887 * Statistics about UPL enforcement of copy-on-write obligations.
2888 */
2889 unsigned long upl_cow = 0;
2890 unsigned long upl_cow_again = 0;
2891 unsigned long upl_cow_pages = 0;
2892 unsigned long upl_cow_again_pages = 0;
2893
2894 unsigned long iopl_cow = 0;
2895 unsigned long iopl_cow_pages = 0;
2896 #endif
2897
2898 /*
2899 * Routine: vm_object_upl_request
2900 * Purpose:
2901 * Cause the population of a portion of a vm_object.
2902 * Depending on the nature of the request, the pages
2903 * returned may contain valid data or be uninitialized.
2904 * A page list structure, listing the physical pages
2905 * will be returned upon request.
2906 * This function is called by the file system or any other
2907 * supplier of backing store to a pager.
2908 * IMPORTANT NOTE: The caller must still respect the relationship
2909 * between the vm_object and its backing memory object. The
2910 * caller MUST NOT substitute changes in the backing file
2911 * without first doing a memory_object_lock_request on the
2912 * target range unless it is known that the pages are not
2913 * shared with another entity at the pager level.
2914 * Copy_in_to:
2915 * if a page list structure is present
2916 * return the mapped physical pages; where a
2917 * page is not present, return a non-initialized
2918 * one. If the no_sync bit is turned on, don't
2919 * call the pager unlock to synchronize with other
2920 * possible copies of the page. Leave pages busy
2921 * in the original object, if a page list structure
2922 * was specified. When a commit of the page list
2923 * pages is done, the dirty bit will be set for each one.
2924 * Copy_out_from:
2925 * If a page list structure is present, return
2926 * all mapped pages. Where a page does not exist
2927 * map a zero filled one. Leave pages busy in
2928 * the original object. If a page list structure
2929 * is not specified, this call is a no-op.
2930 *
2931 * Note: access of default pager objects has a rather interesting
2932 * twist. The caller of this routine, presumably the file system
2933 * page cache handling code, will never actually make a request
2934 * against a default pager backed object. Only the default
2935 * pager will make requests on backing store related vm_objects.
2936 * In this way the default pager can maintain the relationship
2937 * between backing store files (abstract memory objects) and
2938 * the vm_objects (cache objects) they support.
2939 *
2940 */
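 /*
 * Illustrative call (not from the original source; 'object', 'offset'
 * and 'n' are hypothetical): gather a range of an object for pageout
 * with an internal, lite page list, letting the routine size the
 * list for us.
 *
 *	upl_t			upl;
 *	upl_page_info_t		*pl;
 *	unsigned int		count = MAX_UPL_SIZE;
 *	kern_return_t		kr;
 *
 *	kr = vm_object_upl_request(object, offset, PAGE_SIZE * n,
 *				   &upl, NULL, &count,
 *				   UPL_SET_INTERNAL | UPL_SET_LITE |
 *				   UPL_COPYOUT_FROM);
 *	if (kr == KERN_SUCCESS)
 *		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 */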
2941
2942 __private_extern__ kern_return_t
2943 vm_object_upl_request(
2944 vm_object_t object,
2945 vm_object_offset_t offset,
2946 upl_size_t size,
2947 upl_t *upl_ptr,
2948 upl_page_info_array_t user_page_list,
2949 unsigned int *page_list_count,
2950 int cntrl_flags)
2951 {
2952 vm_page_t dst_page = VM_PAGE_NULL;
2953 vm_object_offset_t dst_offset;
2954 upl_size_t xfer_size;
2955 boolean_t dirty;
2956 boolean_t hw_dirty;
2957 upl_t upl = NULL;
2958 unsigned int entry;
2959 #if MACH_CLUSTER_STATS
2960 boolean_t encountered_lrp = FALSE;
2961 #endif
2962 vm_page_t alias_page = NULL;
2963 int refmod_state = 0;
2964 wpl_array_t lite_list = NULL;
2965 vm_object_t last_copy_object;
2966 struct dw dw_array[DELAYED_WORK_LIMIT];
2967 struct dw *dwp;
2968 int dw_count;
2969
2970 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2971 /*
2972 * For forward compatibility's sake,
2973 * reject any unknown flag.
2974 */
2975 return KERN_INVALID_VALUE;
2976 }
2977 if ( (!object->internal) && (object->paging_offset != 0) )
2978 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2979 if (object->phys_contiguous)
2980 panic("vm_object_upl_request: contiguous object specified\n");
2981
2982
2983 if ((size / PAGE_SIZE) > MAX_UPL_SIZE)
2984 size = MAX_UPL_SIZE * PAGE_SIZE;
2985
2986 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
2987 *page_list_count = MAX_UPL_SIZE;
2988
2989 if (cntrl_flags & UPL_SET_INTERNAL) {
2990 if (cntrl_flags & UPL_SET_LITE) {
2991
2992 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, 0, size);
2993
2994 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
2995 lite_list = (wpl_array_t)
2996 (((uintptr_t)user_page_list) +
2997 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
2998 if (size == 0) {
2999 user_page_list = NULL;
3000 lite_list = NULL;
3001 }
3002 } else {
3003 upl = upl_create(UPL_CREATE_INTERNAL, 0, size);
3004
3005 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
3006 if (size == 0) {
3007 user_page_list = NULL;
3008 }
3009 }
3010 } else {
3011 if (cntrl_flags & UPL_SET_LITE) {
3012
3013 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE, 0, size);
3014
3015 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
3016 if (size == 0) {
3017 lite_list = NULL;
3018 }
3019 } else {
3020 upl = upl_create(UPL_CREATE_EXTERNAL, 0, size);
3021 }
3022 }
3023 *upl_ptr = upl;
3024
3025 if (user_page_list)
3026 user_page_list[0].device = FALSE;
3027
3028 if (cntrl_flags & UPL_SET_LITE) {
3029 upl->map_object = object;
3030 } else {
3031 upl->map_object = vm_object_allocate(size);
3032 /*
3033 * No need to lock the new object: nobody else knows
3034 * about it yet, so it's all ours so far.
3035 */
3036 upl->map_object->shadow = object;
3037 upl->map_object->pageout = TRUE;
3038 upl->map_object->can_persist = FALSE;
3039 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3040 upl->map_object->shadow_offset = offset;
3041 upl->map_object->wimg_bits = object->wimg_bits;
3042
3043 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3044
3045 upl->flags |= UPL_SHADOWED;
3046 }
3047 /*
3048 * ENCRYPTED SWAP:
3049 * Just mark the UPL as "encrypted" here.
3050 * We'll actually encrypt the pages later,
3051 * in upl_encrypt(), when the caller has
3052 * selected which pages need to go to swap.
3053 */
3054 if (cntrl_flags & UPL_ENCRYPT)
3055 upl->flags |= UPL_ENCRYPTED;
3056
3057 if (cntrl_flags & UPL_FOR_PAGEOUT)
3058 upl->flags |= UPL_PAGEOUT;
3059
3060 vm_object_lock(object);
3061 vm_object_activity_begin(object);
3062
3063 /*
3064 * we can lock in the paging_offset once paging_in_progress is set
3065 */
3066 upl->size = size;
3067 upl->offset = offset + object->paging_offset;
3068
3069 #if UPL_DEBUG
3070 queue_enter(&object->uplq, upl, upl_t, uplq);
3071 #endif /* UPL_DEBUG */
3072
3073 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
3074 /*
3075 * Honor copy-on-write obligations
3076 *
3077 * The caller is gathering these pages and
3078 * might modify their contents. We need to
3079 * make sure that the copy object has its own
3080 * private copies of these pages before we let
3081 * the caller modify them.
3082 */
3083 vm_object_update(object,
3084 offset,
3085 size,
3086 NULL,
3087 NULL,
3088 FALSE, /* should_return */
3089 MEMORY_OBJECT_COPY_SYNC,
3090 VM_PROT_NO_CHANGE);
3091 #if DEVELOPMENT || DEBUG
3092 upl_cow++;
3093 upl_cow_pages += size >> PAGE_SHIFT;
3094 #endif
3095 }
3096 /*
3097 * remember which copy object we synchronized with
3098 */
3099 last_copy_object = object->copy;
3100 entry = 0;
3101
3102 xfer_size = size;
3103 dst_offset = offset;
3104
3105 dwp = &dw_array[0];
3106 dw_count = 0;
3107
3108 while (xfer_size) {
3109
3110 dwp->dw_mask = 0;
3111
3112 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
3113 vm_object_unlock(object);
3114 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3115 vm_object_lock(object);
3116 }
3117 if (cntrl_flags & UPL_COPYOUT_FROM) {
3118 upl->flags |= UPL_PAGE_SYNC_DONE;
3119
3120 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
3121 dst_page->fictitious ||
3122 dst_page->absent ||
3123 dst_page->error ||
3124 (VM_PAGE_WIRED(dst_page) && !dst_page->pageout && !dst_page->list_req_pending)) {
3125
3126 if (user_page_list)
3127 user_page_list[entry].phys_addr = 0;
3128
3129 goto try_next_page;
3130 }
3131 /*
3132 * grab this up front...
3133 * a high percentage of the time we're going to
3134 * need the hardware modification state a bit later
3135 * anyway... so we can eliminate an extra call into
3136 * the pmap layer by grabbing it here and recording it
3137 */
3138 if (dst_page->pmapped)
3139 refmod_state = pmap_get_refmod(dst_page->phys_page);
3140 else
3141 refmod_state = 0;
3142
3143 if ( (refmod_state & VM_MEM_REFERENCED) && dst_page->inactive ) {
3144 /*
3145 * page is on inactive list and referenced...
3146 * reactivate it now... this gets it out of the
3147 * way of vm_pageout_scan which would have to
3148 * reactivate it upon tripping over it
3149 */
3150 dwp->dw_mask |= DW_vm_page_activate;
3151 }
3152 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
3153 /*
3154 * we're only asking for DIRTY pages to be returned
3155 */
3156 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
3157 /*
3158 * if we were the page stolen by vm_pageout_scan to be
3159 * cleaned (as opposed to a buddy being clustered in),
3160 * or this request is not being driven by a PAGEOUT cluster,
3161 * then we only need to check for the page being dirty or
3162 * precious to decide whether to return it
3163 */
3164 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
3165 goto check_busy;
3166 goto dont_return;
3167 }
3168 /*
3169 * this is a request for a PAGEOUT cluster and this page
3170 * is merely along for the ride as a 'buddy'... not only
3171 * does it have to be dirty to be returned, but it also
3172 * can't have been referenced recently... note that we've
3173 * already filtered above based on whether this page is
3174 * currently on the inactive queue or it meets the page
3175 * ticket (generation count) check
3176 */
3177 if ( (cntrl_flags & UPL_CLEAN_IN_PLACE || !(refmod_state & VM_MEM_REFERENCED)) &&
3178 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
3179 goto check_busy;
3180 }
3181 dont_return:
3182 /*
3183 * if we reach here, we're not to return
3184 * the page... go on to the next one
3185 */
3186 if (user_page_list)
3187 user_page_list[entry].phys_addr = 0;
3188
3189 goto try_next_page;
3190 }
3191 check_busy:
3192 if (dst_page->busy && (!(dst_page->list_req_pending && (dst_page->pageout || dst_page->cleaning)))) {
3193 if (cntrl_flags & UPL_NOBLOCK) {
3194 if (user_page_list)
3195 user_page_list[entry].phys_addr = 0;
3196
3197 goto try_next_page;
3198 }
3199 /*
3200 * someone else is playing with the
3201 * page. We will have to wait.
3202 */
3203 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3204
3205 continue;
3206 }
3207 /*
3208 * Someone else already cleaning the page?
3209 */
3210 if ((dst_page->cleaning || dst_page->absent || VM_PAGE_WIRED(dst_page)) && !dst_page->list_req_pending) {
3211 if (user_page_list)
3212 user_page_list[entry].phys_addr = 0;
3213
3214 goto try_next_page;
3215 }
3216 /*
3217 * ENCRYPTED SWAP:
3218 * The caller is gathering this page and might
3219 * access its contents later on. Decrypt the
3220 * page before adding it to the UPL, so that
3221 * the caller never sees encrypted data.
3222 */
3223 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
3224 int was_busy;
3225
3226 /*
3227 * save the current state of busy
3228 * mark page as busy while decrypt
3229 * is in progress since it will drop
3230 * the object lock...
3231 */
3232 was_busy = dst_page->busy;
3233 dst_page->busy = TRUE;
3234
3235 vm_page_decrypt(dst_page, 0);
3236 vm_page_decrypt_for_upl_counter++;
3237 /*
3238 * restore to original busy state
3239 */
3240 dst_page->busy = was_busy;
3241 }
3242 if (dst_page->pageout_queue == TRUE) {
3243
3244 vm_page_lockspin_queues();
3245
3246 #if CONFIG_EMBEDDED
3247 if (dst_page->laundry)
3248 #else
3249 if (dst_page->pageout_queue == TRUE)
3250 #endif
3251 {
3252 /*
3253 * we've buddied up a page for a clustered pageout
3254 * that has already been moved to the pageout
3255 * queue by pageout_scan... we need to remove
3256 * it from the queue and drop the laundry count
3257 * on that queue
3258 */
3259 vm_pageout_throttle_up(dst_page);
3260 }
3261 vm_page_unlock_queues();
3262 }
3263 #if MACH_CLUSTER_STATS
3264 /*
3265 * pageout statistics gathering. count
3266 * all the pages we will page out that
3267 * were not counted in the initial
3268 * vm_pageout_scan work
3269 */
3270 if (dst_page->list_req_pending)
3271 encountered_lrp = TRUE;
3272 if ((dst_page->dirty || (dst_page->object->internal && dst_page->precious)) && !dst_page->list_req_pending) {
3273 if (encountered_lrp)
3274 CLUSTER_STAT(pages_at_higher_offsets++;)
3275 else
3276 CLUSTER_STAT(pages_at_lower_offsets++;)
3277 }
3278 #endif
3279 /*
3280 * Turn off busy indication on pending
3281 * pageout. Note: we can only get here
3282 * in the request pending case.
3283 */
3284 dst_page->list_req_pending = FALSE;
3285 dst_page->busy = FALSE;
3286
3287 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3288 dirty = hw_dirty ? TRUE : dst_page->dirty;
3289
3290 if (dst_page->phys_page > upl->highest_page)
3291 upl->highest_page = dst_page->phys_page;
3292
3293 if (cntrl_flags & UPL_SET_LITE) {
3294 unsigned int pg_num;
3295
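 /*
 * compute this page's index within the UPL and mark its bit in
 * the lite list: bit (pg_num & 31) of 32-bit word (pg_num >> 5)
 */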
3296 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3297 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
3298 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3299
3300 if (hw_dirty)
3301 pmap_clear_modify(dst_page->phys_page);
3302
3303 /*
3304 * Mark original page as cleaning
3305 * in place.
3306 */
3307 dst_page->cleaning = TRUE;
3308 dst_page->precious = FALSE;
3309 } else {
3310 /*
3311 * use pageclean setup, it is more
3312 * convenient even for the pageout
3313 * cases here
3314 */
3315 vm_object_lock(upl->map_object);
3316 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3317 vm_object_unlock(upl->map_object);
3318
3319 alias_page->absent = FALSE;
3320 alias_page = NULL;
3321 }
3322 #if MACH_PAGEMAP
3323 /*
3324 * Record that this page has been
3325 * written out
3326 */
3327 vm_external_state_set(object->existence_map, dst_page->offset);
3328 #endif /*MACH_PAGEMAP*/
3329 dst_page->dirty = dirty;
3330
3331 if (!dirty)
3332 dst_page->precious = TRUE;
3333
3334 if (dst_page->pageout)
3335 dst_page->busy = TRUE;
3336
3337 if ( (cntrl_flags & UPL_ENCRYPT) ) {
3338 /*
3339 * ENCRYPTED SWAP:
3340 * We want to deny access to the target page
3341 * because its contents are about to be
3342 * encrypted and the user would be very
3343 * confused to see encrypted data instead
3344 * of their data.
3345 * We also set "encrypted_cleaning" to allow
3346 * vm_pageout_scan() to demote that page
3347 * from "adjacent/clean-in-place" to
3348 * "target/clean-and-free" if it bumps into
3349 * this page during its scanning while we're
3350 * still processing this cluster.
3351 */
3352 dst_page->busy = TRUE;
3353 dst_page->encrypted_cleaning = TRUE;
3354 }
3355 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
3356 /*
3357 * deny access to the target page
3358 * while it is being worked on
3359 */
3360 if ((!dst_page->pageout) && ( !VM_PAGE_WIRED(dst_page))) {
3361 dst_page->busy = TRUE;
3362 dst_page->pageout = TRUE;
3363
3364 dwp->dw_mask |= DW_vm_page_wire;
3365 }
3366 }
3367 } else {
3368 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
3369 /*
3370 * Honor copy-on-write obligations
3371 *
3372 * The copy object has changed since we
3373 * last synchronized for copy-on-write.
3374 * Another copy object might have been
3375 * inserted while we released the object's
3376 * lock. Since someone could have seen the
3377 * original contents of the remaining pages
3378 * through that new object, we have to
3379 * synchronize with it again for the remaining
3380 * pages only. The previous pages are "busy"
3381 * so they can not be seen through the new
3382 * mapping. The new mapping will see our
3383 * upcoming changes for those previous pages,
3384 * but that's OK since they couldn't see what
3385 * was there before. It's just a race anyway
3386 * and there's no guarantee of consistency or
3387 * atomicity. We just don't want new mappings
3388 * to see both the *before* and *after* pages.
3389 */
3390 if (object->copy != VM_OBJECT_NULL) {
3391 vm_object_update(
3392 object,
3393 dst_offset,/* current offset */
3394 xfer_size, /* remaining size */
3395 NULL,
3396 NULL,
3397 FALSE, /* should_return */
3398 MEMORY_OBJECT_COPY_SYNC,
3399 VM_PROT_NO_CHANGE);
3400
3401 #if DEVELOPMENT || DEBUG
3402 upl_cow_again++;
3403 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
3404 #endif
3405 }
3406 /*
3407 * remember the copy object we synced with
3408 */
3409 last_copy_object = object->copy;
3410 }
3411 dst_page = vm_page_lookup(object, dst_offset);
3412
3413 if (dst_page != VM_PAGE_NULL) {
3414
3415 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3416
3417 if ( !(dst_page->absent && dst_page->list_req_pending) ) {
3418 /*
3419 * skip over pages already present in the cache
3420 */
3421 if (user_page_list)
3422 user_page_list[entry].phys_addr = 0;
3423
3424 goto try_next_page;
3425 }
3426 }
3427 if ( !(dst_page->list_req_pending) ) {
3428
3429 if (dst_page->cleaning) {
3430 /*
3431 * someone else is writing to the page... wait...
3432 */
3433 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3434
3435 continue;
3436 }
3437 } else {
3438 if (dst_page->fictitious &&
3439 dst_page->phys_page == vm_page_fictitious_addr) {
3440 assert( !dst_page->speculative);
3441 /*
3442 * dump the fictitious page
3443 */
3444 dst_page->list_req_pending = FALSE;
3445
3446 VM_PAGE_FREE(dst_page);
3447
3448 dst_page = NULL;
3449
3450 } else if (dst_page->absent) {
3451 /*
3452 * the default_pager case
3453 */
3454 dst_page->list_req_pending = FALSE;
3455 dst_page->busy = FALSE;
3456
3457 } else if (dst_page->pageout || dst_page->cleaning) {
3458 /*
3459 * page was earmarked by vm_pageout_scan
3460 * to be cleaned and stolen... we're going
3461 * to take it back since we are not attempting
3462 * to read that page and we don't want to stall
3463 * waiting for it to be cleaned for 2 reasons...
3464 * 1 - no use paging it out and back in
3465 * 2 - if we stall, we may cause a deadlock in
3466 * the FS trying to acquire its locks
3467 * on the VNOP_PAGEOUT path presuming that
3468 * those locks are already held on the read
3469 * path before trying to create this UPL
3470 *
3471 * so undo all of the state that vm_pageout_scan
3472 * hung on this page
3473 */
3474 dst_page->busy = FALSE;
3475
3476 vm_pageout_queue_steal(dst_page, FALSE);
3477 }
3478 }
3479 }
3480 if (dst_page == VM_PAGE_NULL) {
3481 if (object->private) {
3482 /*
3483 * This is a nasty wrinkle for users
3484 * of upl who encounter device or
3485 * private memory however, it is
3486 * unavoidable, only a fault can
3487 * resolve the actual backing
3488 * physical page by asking the
3489 * backing device.
3490 */
3491 if (user_page_list)
3492 user_page_list[entry].phys_addr = 0;
3493
3494 goto try_next_page;
3495 }
3496 /*
3497 * need to allocate a page
3498 */
3499 dst_page = vm_page_grab();
3500
3501 if (dst_page == VM_PAGE_NULL) {
3502 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
3503 /*
3504 * we don't want to stall waiting for pages to come onto the free list
3505 * while we're already holding absent pages in this UPL
3506 * the caller will deal with the empty slots
3507 */
3508 if (user_page_list)
3509 user_page_list[entry].phys_addr = 0;
3510
3511 goto try_next_page;
3512 }
3513 /*
3514 * no pages available... wait
3515 * then try again for the same
3516 * offset...
3517 */
3518 vm_object_unlock(object);
3519 VM_PAGE_WAIT();
3520 vm_object_lock(object);
3521
3522 continue;
3523 }
3524 vm_page_insert(dst_page, object, dst_offset);
3525
3526 dst_page->absent = TRUE;
3527 dst_page->busy = FALSE;
3528
3529 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
3530 /*
3531 * if UPL_RET_ONLY_ABSENT was specified,
3532 * then we're definitely setting up a
3533 * UPL for a clustered read/pagein
3534 * operation... mark the pages as clustered
3535 * so upl_commit_range can put them on the
3536 * speculative list
3537 */
3538 dst_page->clustered = TRUE;
3539 }
3540 }
3541 if (dst_page->fictitious) {
3542 panic("need corner case for fictitious page");
3543 }
3544 if (dst_page->busy) {
3545 /*
3546 * someone else is playing with the
3547 * page. We will have to wait.
3548 */
3549 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3550
3551 continue;
3552 }
3553 /*
3554 * ENCRYPTED SWAP:
3555 */
3556 if (cntrl_flags & UPL_ENCRYPT) {
3557 /*
3558 * The page is going to be encrypted when we
3559 * get it from the pager, so mark it so.
3560 */
3561 dst_page->encrypted = TRUE;
3562 } else {
3563 /*
3564 * Otherwise, the page will not contain
3565 * encrypted data.
3566 */
3567 dst_page->encrypted = FALSE;
3568 }
3569 dst_page->overwriting = TRUE;
3570
3571 if (dst_page->pmapped) {
3572 if ( !(cntrl_flags & UPL_FILE_IO))
3573 /*
3574 * eliminate all mappings from the
3575 * original object and its progeny
3576 */
3577 refmod_state = pmap_disconnect(dst_page->phys_page);
3578 else
3579 refmod_state = pmap_get_refmod(dst_page->phys_page);
3580 } else
3581 refmod_state = 0;
3582
3583 hw_dirty = refmod_state & VM_MEM_MODIFIED;
3584 dirty = hw_dirty ? TRUE : dst_page->dirty;
3585
3586 if (cntrl_flags & UPL_SET_LITE) {
3587 unsigned int pg_num;
3588
3589 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
3590 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
3591 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
3592
3593 if (hw_dirty)
3594 pmap_clear_modify(dst_page->phys_page);
3595
3596 /*
3597 * Mark original page as cleaning
3598 * in place.
3599 */
3600 dst_page->cleaning = TRUE;
3601 dst_page->precious = FALSE;
3602 } else {
3603 /*
3604 * use pageclean setup, it is more
3605 * convenient even for the pageout
3606 * cases here
3607 */
3608 vm_object_lock(upl->map_object);
3609 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
3610 vm_object_unlock(upl->map_object);
3611
3612 alias_page->absent = FALSE;
3613 alias_page = NULL;
3614 }
3615
3616 if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
3617 /*
3618 * clean in place for read implies
3619 * that a write will be done on all
3620 * the pages that are dirty before
3621 * a upl commit is done. The caller
3622 * is obligated to preserve the
3623 * contents of all pages marked dirty
3624 */
3625 upl->flags |= UPL_CLEAR_DIRTY;
3626 }
3627 dst_page->dirty = dirty;
3628
3629 if (!dirty)
3630 dst_page->precious = TRUE;
3631
3632 if ( !VM_PAGE_WIRED(dst_page)) {
3633 /*
3634 * deny access to the target page while
3635 * it is being worked on
3636 */
3637 dst_page->busy = TRUE;
3638 } else
3639 dwp->dw_mask |= DW_vm_page_wire;
3640
3641 /*
3642 * We might be about to satisfy a fault which has been
3643 * requested. So no need for the "restart" bit.
3644 */
3645 dst_page->restart = FALSE;
3646 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
3647 /*
3648 * expect the page to be used
3649 */
3650 dwp->dw_mask |= DW_set_reference;
3651 }
3652 dst_page->precious = (cntrl_flags & UPL_PRECIOUS) ? TRUE : FALSE;
3653 }
3654 if (dst_page->phys_page > upl->highest_page)
3655 upl->highest_page = dst_page->phys_page;
3656 if (user_page_list) {
3657 user_page_list[entry].phys_addr = dst_page->phys_page;
3658 user_page_list[entry].pageout = dst_page->pageout;
3659 user_page_list[entry].absent = dst_page->absent;
3660 user_page_list[entry].dirty = dst_page->dirty;
3661 user_page_list[entry].precious = dst_page->precious;
3662 user_page_list[entry].device = FALSE;
3663 if (dst_page->clustered == TRUE)
3664 user_page_list[entry].speculative = dst_page->speculative;
3665 else
3666 user_page_list[entry].speculative = FALSE;
3667 user_page_list[entry].cs_validated = dst_page->cs_validated;
3668 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
3669 }
3670 /*
3671 * if UPL_RET_ONLY_ABSENT is set, then
3672 * we are working with a fresh page and we've
3673 * just set the clustered flag on it to
3674 * indicate that it was dragged in as part of a
3675 * speculative cluster... so leave it alone
3676 */
3677 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
3678 /*
3679 * someone is explicitly grabbing this page...
3680 * update clustered and speculative state
3681 *
3682 */
3683 VM_PAGE_CONSUME_CLUSTERED(dst_page);
3684 }
3685 try_next_page:
3686 if (dwp->dw_mask) {
3687 if (dwp->dw_mask & DW_vm_page_activate)
3688 VM_STAT_INCR(reactivations);
3689
3690 if (dst_page->busy == FALSE) {
3691 /*
3692 * dw_do_work may need to drop the object lock
3693 * if it does, we need the pages it's looking at to
3694 * be held stable via the busy bit.
3695 */
3696 dst_page->busy = TRUE;
3697 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
3698 }
3699 dwp->dw_m = dst_page;
3700 dwp++;
3701 dw_count++;
3702
3703 if (dw_count >= DELAYED_WORK_LIMIT) {
3704 dw_do_work(object, &dw_array[0], dw_count);
3705
3706 dwp = &dw_array[0];
3707 dw_count = 0;
3708 }
3709 }
3710 entry++;
3711 dst_offset += PAGE_SIZE_64;
3712 xfer_size -= PAGE_SIZE;
3713 }
3714 if (dw_count)
3715 dw_do_work(object, &dw_array[0], dw_count);
3716
3717 if (alias_page != NULL) {
3718 VM_PAGE_FREE(alias_page);
3719 }
3720
3721 if (page_list_count != NULL) {
3722 if (upl->flags & UPL_INTERNAL)
3723 *page_list_count = 0;
3724 else if (*page_list_count > entry)
3725 *page_list_count = entry;
3726 }
3727 #if UPL_DEBUG
3728 upl->upl_state = 1;
3729 #endif
3730 vm_object_unlock(object);
3731
3732 return KERN_SUCCESS;
3733 }
3734
3735 /* JMM - Backward compatibility for now */
3736 kern_return_t
3737 vm_fault_list_request( /* forward */
3738 memory_object_control_t control,
3739 vm_object_offset_t offset,
3740 upl_size_t size,
3741 upl_t *upl_ptr,
3742 upl_page_info_t **user_page_list_ptr,
3743 unsigned int page_list_count,
3744 int cntrl_flags);
3745 kern_return_t
3746 vm_fault_list_request(
3747 memory_object_control_t control,
3748 vm_object_offset_t offset,
3749 upl_size_t size,
3750 upl_t *upl_ptr,
3751 upl_page_info_t **user_page_list_ptr,
3752 unsigned int page_list_count,
3753 int cntrl_flags)
3754 {
3755 unsigned int local_list_count;
3756 upl_page_info_t *user_page_list;
3757 kern_return_t kr;
3758
3759 if((cntrl_flags & UPL_VECTOR)==UPL_VECTOR)
3760 return KERN_INVALID_ARGUMENT;
3761
3762 if (user_page_list_ptr != NULL) {
3763 local_list_count = page_list_count;
3764 user_page_list = *user_page_list_ptr;
3765 } else {
3766 local_list_count = 0;
3767 user_page_list = NULL;
3768 }
3769 kr = memory_object_upl_request(control,
3770 offset,
3771 size,
3772 upl_ptr,
3773 user_page_list,
3774 &local_list_count,
3775 cntrl_flags);
3776
3777 if(kr != KERN_SUCCESS)
3778 return kr;
3779
3780 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3781 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3782 }
3783
3784 return KERN_SUCCESS;
3785 }
3786
3787
3788
3789 /*
3790 * Routine: vm_object_super_upl_request
3791 * Purpose:
3792 * Cause the population of a portion of a vm_object
3793 * in much the same way as memory_object_upl_request.
3794 * Depending on the nature of the request, the pages
3795 * returned may contain valid data or be uninitialized.
3796 * However, the region may be expanded up to the super
3797 * cluster size provided.
3798 */
3799
3800 __private_extern__ kern_return_t
3801 vm_object_super_upl_request(
3802 vm_object_t object,
3803 vm_object_offset_t offset,
3804 upl_size_t size,
3805 upl_size_t super_cluster,
3806 upl_t *upl,
3807 upl_page_info_t *user_page_list,
3808 unsigned int *page_list_count,
3809 int cntrl_flags)
3810 {
3811 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
3812 return KERN_FAILURE;
3813
3814 assert(object->paging_in_progress);
3815 offset = offset - object->paging_offset;
3816
3817 if (super_cluster > size) {
3818
3819 vm_object_offset_t base_offset;
3820 upl_size_t super_size;
3821 vm_object_size_t super_size_64;
3822
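/*
 * round the start of the request down to a super_cluster
 * boundary and, if the request straddles the next boundary,
 * double the cluster... then clip the expanded region so it
 * doesn't run past the end of the object
 */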
3823 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
3824 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
3825 super_size_64 = ((base_offset + super_size) > object->size) ? (object->size - base_offset) : super_size;
3826 super_size = (upl_size_t) super_size_64;
3827 assert(super_size == super_size_64);
3828
3829 if (offset > (base_offset + super_size)) {
3830 panic("vm_object_super_upl_request: Missed target pageout"
3831 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3832 offset, base_offset, super_size, super_cluster,
3833 size, object->paging_offset);
3834 }
3835 /*
3836 * apparently there is a case where the vm requests a
3837 * page to be written out whose offset is beyond the
3838 * object size
3839 */
3840 if ((offset + size) > (base_offset + super_size)) {
3841 super_size_64 = (offset + size) - base_offset;
3842 super_size = (upl_size_t) super_size_64;
3843 assert(super_size == super_size_64);
3844 }
3845
3846 offset = base_offset;
3847 size = super_size;
3848 }
3849 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
3850 }
3851
3852
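/*
 * Routine:	vm_map_create_upl
 * Purpose:
 *	Create a UPL for the physical pages backing a range of
 *	the given VM map.  The map entry is looked up (recursing
 *	into submaps), copy-on-write and data-sync obligations are
 *	honored, and the request is then handed to
 *	vm_object_iopl_request against the underlying object.
 *
 *	Illustrative caller sketch only, not taken from this file;
 *	map, user_addr, len and io_upl are hypothetical placeholders:
 *
 *		kern_return_t	kr;
 *		upl_t		io_upl = NULL;
 *		upl_size_t	io_size = len;
 *		upl_page_info_t	*pl;
 *		unsigned int	pl_count = 0;
 *		int		io_flags = UPL_SET_LITE | UPL_SET_IO_WIRE |
 *					   UPL_SET_INTERNAL;
 *		boolean_t	empty;
 *
 *		kr = vm_map_create_upl(map, user_addr, &io_size, &io_upl,
 *				       NULL, &pl_count, &io_flags);
 *		if (kr == KERN_SUCCESS) {
 *			pl = UPL_GET_INTERNAL_PAGE_LIST(io_upl);
 *			... perform the I/O against the wired pages ...
 *			upl_commit_range(io_upl, 0, io_size, 0, pl,
 *					 io_size / PAGE_SIZE, &empty);
 *			upl_deallocate(io_upl);
 *		}
 */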
3853 kern_return_t
3854 vm_map_create_upl(
3855 vm_map_t map,
3856 vm_map_address_t offset,
3857 upl_size_t *upl_size,
3858 upl_t *upl,
3859 upl_page_info_array_t page_list,
3860 unsigned int *count,
3861 int *flags)
3862 {
3863 vm_map_entry_t entry;
3864 int caller_flags;
3865 int force_data_sync;
3866 int sync_cow_data;
3867 vm_object_t local_object;
3868 vm_map_offset_t local_offset;
3869 vm_map_offset_t local_start;
3870 kern_return_t ret;
3871
3872 caller_flags = *flags;
3873
3874 if (caller_flags & ~UPL_VALID_FLAGS) {
3875 /*
3876 * For forward compatibility's sake,
3877 * reject any unknown flag.
3878 */
3879 return KERN_INVALID_VALUE;
3880 }
3881 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3882 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3883
3884 if (upl == NULL)
3885 return KERN_INVALID_ARGUMENT;
3886
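/*
 * any path below that has to drop the map lock (to allocate an
 * object, sync data or recurse into a submap) jumps back here so
 * the entry covering 'offset' can be looked up again under the
 * read lock
 */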
3887 REDISCOVER_ENTRY:
3888 vm_map_lock_read(map);
3889
3890 if (vm_map_lookup_entry(map, offset, &entry)) {
3891
3892 if ((entry->vme_end - offset) < *upl_size) {
3893 *upl_size = (upl_size_t) (entry->vme_end - offset);
3894 assert(*upl_size == entry->vme_end - offset);
3895 }
3896
3897 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3898 *flags = 0;
3899
3900 if ( !entry->is_sub_map && entry->object.vm_object != VM_OBJECT_NULL) {
3901 if (entry->object.vm_object->private)
3902 *flags = UPL_DEV_MEMORY;
3903
3904 if (entry->object.vm_object->phys_contiguous)
3905 *flags |= UPL_PHYS_CONTIG;
3906 }
3907 vm_map_unlock_read(map);
3908
3909 return KERN_SUCCESS;
3910 }
3911 if (entry->object.vm_object == VM_OBJECT_NULL || !entry->object.vm_object->phys_contiguous) {
3912 if ((*upl_size/PAGE_SIZE) > MAX_UPL_SIZE)
3913 *upl_size = MAX_UPL_SIZE * PAGE_SIZE;
3914 }
3915 /*
3916 * Create an object if necessary.
3917 */
3918 if (entry->object.vm_object == VM_OBJECT_NULL) {
3919
3920 if (vm_map_lock_read_to_write(map))
3921 goto REDISCOVER_ENTRY;
3922
3923 entry->object.vm_object = vm_object_allocate((vm_size_t)(entry->vme_end - entry->vme_start));
3924 entry->offset = 0;
3925
3926 vm_map_lock_write_to_read(map);
3927 }
3928 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3929 if (!(entry->protection & VM_PROT_WRITE)) {
3930 vm_map_unlock_read(map);
3931 return KERN_PROTECTION_FAILURE;
3932 }
3933 if (entry->needs_copy) {
3934 /*
3935 * Honor copy-on-write for COPY_SYMMETRIC
3936 * strategy.
3937 */
3938 vm_map_t local_map;
3939 vm_object_t object;
3940 vm_object_offset_t new_offset;
3941 vm_prot_t prot;
3942 boolean_t wired;
3943 vm_map_version_t version;
3944 vm_map_t real_map;
3945
3946 local_map = map;
3947
3948 if (vm_map_lookup_locked(&local_map,
3949 offset, VM_PROT_WRITE,
3950 OBJECT_LOCK_EXCLUSIVE,
3951 &version, &object,
3952 &new_offset, &prot, &wired,
3953 NULL,
3954 &real_map) != KERN_SUCCESS) {
3955 vm_map_unlock_read(local_map);
3956 return KERN_FAILURE;
3957 }
3958 if (real_map != map)
3959 vm_map_unlock(real_map);
3960 vm_map_unlock_read(local_map);
3961
3962 vm_object_unlock(object);
3963
3964 goto REDISCOVER_ENTRY;
3965 }
3966 }
3967 if (entry->is_sub_map) {
3968 vm_map_t submap;
3969
3970 submap = entry->object.sub_map;
3971 local_start = entry->vme_start;
3972 local_offset = entry->offset;
3973
3974 vm_map_reference(submap);
3975 vm_map_unlock_read(map);
3976
3977 ret = vm_map_create_upl(submap,
3978 local_offset + (offset - local_start),
3979 upl_size, upl, page_list, count, flags);
3980 vm_map_deallocate(submap);
3981
3982 return ret;
3983 }
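/*
 * if the entry's object has a shadow or a copy object, sync its
 * backing data (a DATA_SYNC against the shadow when both are
 * present) before building the UPL, then rediscover the entry...
 * this is only attempted once per request
 */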
3984 if (sync_cow_data) {
3985 if (entry->object.vm_object->shadow || entry->object.vm_object->copy) {
3986 local_object = entry->object.vm_object;
3987 local_start = entry->vme_start;
3988 local_offset = entry->offset;
3989
3990 vm_object_reference(local_object);
3991 vm_map_unlock_read(map);
3992
3993 if (local_object->shadow && local_object->copy) {
3994 vm_object_lock_request(
3995 local_object->shadow,
3996 (vm_object_offset_t)
3997 ((offset - local_start) +
3998 local_offset) +
3999 local_object->shadow_offset,
4000 *upl_size, FALSE,
4001 MEMORY_OBJECT_DATA_SYNC,
4002 VM_PROT_NO_CHANGE);
4003 }
4004 sync_cow_data = FALSE;
4005 vm_object_deallocate(local_object);
4006
4007 goto REDISCOVER_ENTRY;
4008 }
4009 }
4010 if (force_data_sync) {
4011 local_object = entry->object.vm_object;
4012 local_start = entry->vme_start;
4013 local_offset = entry->offset;
4014
4015 vm_object_reference(local_object);
4016 vm_map_unlock_read(map);
4017
4018 vm_object_lock_request(
4019 local_object,
4020 (vm_object_offset_t)
4021 ((offset - local_start) + local_offset),
4022 (vm_object_size_t)*upl_size, FALSE,
4023 MEMORY_OBJECT_DATA_SYNC,
4024 VM_PROT_NO_CHANGE);
4025
4026 force_data_sync = FALSE;
4027 vm_object_deallocate(local_object);
4028
4029 goto REDISCOVER_ENTRY;
4030 }
4031 if (entry->object.vm_object->private)
4032 *flags = UPL_DEV_MEMORY;
4033 else
4034 *flags = 0;
4035
4036 if (entry->object.vm_object->phys_contiguous)
4037 *flags |= UPL_PHYS_CONTIG;
4038
4039 local_object = entry->object.vm_object;
4040 local_offset = entry->offset;
4041 local_start = entry->vme_start;
4042
4043 vm_object_reference(local_object);
4044 vm_map_unlock_read(map);
4045
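/*
 * we've resolved the map entry down to its backing object and
 * offset... create and wire the UPL directly against that object
 * via the IOPL path
 */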
4046 ret = vm_object_iopl_request(local_object,
4047 (vm_object_offset_t) ((offset - local_start) + local_offset),
4048 *upl_size,
4049 upl,
4050 page_list,
4051 count,
4052 caller_flags);
4053 vm_object_deallocate(local_object);
4054
4055 return(ret);
4056 }
4057 vm_map_unlock_read(map);
4058
4059 return(KERN_FAILURE);
4060 }
4061
4062 /*
4063 * Internal routine to enter a UPL into a VM map.
4064 *
4065 * JMM - This should just be doable through the standard
4066 * vm_map_enter() API.
4067 */
4068 kern_return_t
4069 vm_map_enter_upl(
4070 vm_map_t map,
4071 upl_t upl,
4072 vm_map_offset_t *dst_addr)
4073 {
4074 vm_map_size_t size;
4075 vm_object_offset_t offset;
4076 vm_map_offset_t addr;
4077 vm_page_t m;
4078 kern_return_t kr;
4079 int isVectorUPL = 0, curr_upl=0;
4080 upl_t vector_upl = NULL;
4081 vm_offset_t vector_upl_dst_addr = 0;
4082 vm_map_t vector_upl_submap = NULL;
4083 upl_offset_t subupl_offset = 0;
4084 upl_size_t subupl_size = 0;
4085
4086 if (upl == UPL_NULL)
4087 return KERN_INVALID_ARGUMENT;
4088
4089 if((isVectorUPL = vector_upl_is_valid(upl))) {
4090 int mapped=0,valid_upls=0;
4091 vector_upl = upl;
4092
4093 upl_lock(vector_upl);
4094 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4095 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4096 if(upl == NULL)
4097 continue;
4098 valid_upls++;
4099 if (UPL_PAGE_LIST_MAPPED & upl->flags)
4100 mapped++;
4101 }
4102
4103 if(mapped) {
4104 if(mapped != valid_upls)
4105 panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
4106 else {
4107 upl_unlock(vector_upl);
4108 return KERN_FAILURE;
4109 }
4110 }
4111
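/*
 * reserve a single contiguous submap large enough to hold all of
 * the sub-UPL mappings so the vector UPL appears to the caller as
 * one contiguous range
 */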
4112 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
4113 if( kr != KERN_SUCCESS )
4114 panic("Vector UPL submap allocation failed\n");
4115 map = vector_upl_submap;
4116 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
4117 curr_upl=0;
4118 }
4119 else
4120 upl_lock(upl);
4121
4122 process_upl_to_enter:
4123 if(isVectorUPL){
4124 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4125 *dst_addr = vector_upl_dst_addr;
4126 upl_unlock(vector_upl);
4127 return KERN_SUCCESS;
4128 }
4129 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4130 if(upl == NULL)
4131 goto process_upl_to_enter;
4132 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
4133 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
4134 }
4135
4136 /*
4137 * check to see if already mapped
4138 */
4139 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
4140 upl_unlock(upl);
4141 return KERN_FAILURE;
4142 }
4143
4144 if ((!(upl->flags & UPL_SHADOWED)) && !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
4145 (upl->map_object->phys_contiguous))) {
4146 vm_object_t object;
4147 vm_page_t alias_page;
4148 vm_object_offset_t new_offset;
4149 unsigned int pg_num;
4150 wpl_array_t lite_list;
4151
4152 if (upl->flags & UPL_INTERNAL) {
4153 lite_list = (wpl_array_t)
4154 ((((uintptr_t)upl) + sizeof(struct upl))
4155 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4156 } else {
4157 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
4158 }
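/*
 * for a UPL that isn't device memory, I/O wired or already
 * shadowed, interpose a shadow object populated with private
 * fictitious 'alias' pages that reference the same physical
 * pages... the mapping is then made against this shadow so the
 * original pages aren't exposed directly
 */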
4159 object = upl->map_object;
4160 upl->map_object = vm_object_allocate(upl->size);
4161
4162 vm_object_lock(upl->map_object);
4163
4164 upl->map_object->shadow = object;
4165 upl->map_object->pageout = TRUE;
4166 upl->map_object->can_persist = FALSE;
4167 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
4168 upl->map_object->shadow_offset = upl->offset - object->paging_offset;
4169 upl->map_object->wimg_bits = object->wimg_bits;
4170 offset = upl->map_object->shadow_offset;
4171 new_offset = 0;
4172 size = upl->size;
4173
4174 upl->flags |= UPL_SHADOWED;
4175
4176 while (size) {
4177 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
4178 assert(pg_num == new_offset / PAGE_SIZE);
4179
4180 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4181
4182 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4183
4184 vm_object_lock(object);
4185
4186 m = vm_page_lookup(object, offset);
4187 if (m == VM_PAGE_NULL) {
4188 panic("vm_upl_map: page missing\n");
4189 }
4190
4191 /*
4192 * Convert the fictitious page to a private
4193 * shadow of the real page.
4194 */
4195 assert(alias_page->fictitious);
4196 alias_page->fictitious = FALSE;
4197 alias_page->private = TRUE;
4198 alias_page->pageout = TRUE;
4199 /*
4200 * since m is a page in the upl it must
4201 * already be wired or BUSY, so it's
4202 * safe to assign the underlying physical
4203 * page to the alias
4204 */
4205 alias_page->phys_page = m->phys_page;
4206
4207 vm_object_unlock(object);
4208
4209 vm_page_lockspin_queues();
4210 vm_page_wire(alias_page);
4211 vm_page_unlock_queues();
4212
4213 /*
4214 * ENCRYPTED SWAP:
4215 * The virtual page ("m") has to be wired in some way
4216 * here or its physical page ("m->phys_page") could
4217 * be recycled at any time.
4218 * Assuming this is enforced by the caller, we can't
4219 * get an encrypted page here. Since the encryption
4220 * key depends on the VM page's "pager" object and
4221 * the "paging_offset", we couldn't handle 2 pageable
4222 * VM pages (with different pagers and paging_offsets)
4223 * sharing the same physical page: we could end up
4224 * encrypting with one key (via one VM page) and
4225 * decrypting with another key (via the alias VM page).
4226 */
4227 ASSERT_PAGE_DECRYPTED(m);
4228
4229 vm_page_insert(alias_page, upl->map_object, new_offset);
4230
4231 assert(!alias_page->wanted);
4232 alias_page->busy = FALSE;
4233 alias_page->absent = FALSE;
4234 }
4235 size -= PAGE_SIZE;
4236 offset += PAGE_SIZE_64;
4237 new_offset += PAGE_SIZE_64;
4238 }
4239 vm_object_unlock(upl->map_object);
4240 }
4241 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
4242 offset = upl->offset - upl->map_object->paging_offset;
4243 else
4244 offset = 0;
4245 size = upl->size;
4246
4247 vm_object_reference(upl->map_object);
4248
4249 if(!isVectorUPL) {
4250 *dst_addr = 0;
4251 /*
4252 * NEED A UPL_MAP ALIAS
4253 */
4254 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4255 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
4256 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4257 }
4258 else {
4259 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
4260 VM_FLAGS_FIXED, upl->map_object, offset, FALSE,
4261 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
4262 if(kr)
4263 panic("vm_map_enter failed for a Vector UPL\n");
4264 }
4265
4266 if (kr != KERN_SUCCESS) {
4267 upl_unlock(upl);
4268 return(kr);
4269 }
4270 vm_object_lock(upl->map_object);
4271
4272 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
4273 m = vm_page_lookup(upl->map_object, offset);
4274
4275 if (m) {
4276 unsigned int cache_attr;
4277 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
4278
4279 m->pmapped = TRUE;
4280
4281 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
4282 * but only in kernel space. If this was on a user map,
4283 * we'd have to set the wpmapped bit. */
4284 /* m->wpmapped = TRUE; */
4285 assert(map==kernel_map);
4286
4287 PMAP_ENTER(map->pmap, addr, m, VM_PROT_ALL, cache_attr, TRUE);
4288 }
4289 offset += PAGE_SIZE_64;
4290 }
4291 vm_object_unlock(upl->map_object);
4292
4293 /*
4294 * hold a reference for the mapping
4295 */
4296 upl->ref_count++;
4297 upl->flags |= UPL_PAGE_LIST_MAPPED;
4298 upl->kaddr = (vm_offset_t) *dst_addr;
4299 assert(upl->kaddr == *dst_addr);
4300
4301 if(!isVectorUPL)
4302 upl_unlock(upl);
4303 else
4304 goto process_upl_to_enter;
4305
4306 return KERN_SUCCESS;
4307 }
4308
4309 /*
4310 * Internal routine to remove a UPL mapping from a VM map.
4311 *
4312 * XXX - This should just be doable through a standard
4313 * vm_map_remove() operation. Otherwise, implicit clean-up
4314 * of the target map won't be able to correctly remove
4315 * these (and release the reference on the UPL). Having
4316 * to do this means we can't map these into user-space
4317 * maps yet.
4318 */
4319 kern_return_t
4320 vm_map_remove_upl(
4321 vm_map_t map,
4322 upl_t upl)
4323 {
4324 vm_address_t addr;
4325 upl_size_t size;
4326 int isVectorUPL = 0, curr_upl = 0;
4327 upl_t vector_upl = NULL;
4328
4329 if (upl == UPL_NULL)
4330 return KERN_INVALID_ARGUMENT;
4331
4332 if((isVectorUPL = vector_upl_is_valid(upl))) {
4333 int unmapped=0, valid_upls=0;
4334 vector_upl = upl;
4335 upl_lock(vector_upl);
4336 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
4337 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
4338 if(upl == NULL)
4339 continue;
4340 valid_upls++;
4341 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
4342 unmapped++;
4343 }
4344
4345 if(unmapped) {
4346 if(unmapped != valid_upls)
4347 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
4348 else {
4349 upl_unlock(vector_upl);
4350 return KERN_FAILURE;
4351 }
4352 }
4353 curr_upl=0;
4354 }
4355 else
4356 upl_lock(upl);
4357
4358 process_upl_to_remove:
4359 if(isVectorUPL) {
4360 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
4361 vm_map_t v_upl_submap;
4362 vm_offset_t v_upl_submap_dst_addr;
4363 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
4364
4365 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
4366 vm_map_deallocate(v_upl_submap);
4367 upl_unlock(vector_upl);
4368 return KERN_SUCCESS;
4369 }
4370
4371 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
4372 if(upl == NULL)
4373 goto process_upl_to_remove;
4374 }
4375
4376 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
4377 addr = upl->kaddr;
4378 size = upl->size;
4379
4380 assert(upl->ref_count > 1);
4381 upl->ref_count--; /* removing mapping ref */
4382
4383 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
4384 upl->kaddr = (vm_offset_t) 0;
4385
4386 if(!isVectorUPL) {
4387 upl_unlock(upl);
4388
4389 vm_map_remove(map,
4390 vm_map_trunc_page(addr),
4391 vm_map_round_page(addr + size),
4392 VM_MAP_NO_FLAGS);
4393
4394 return KERN_SUCCESS;
4395 }
4396 else {
4397 /*
4398 * If it's a Vectored UPL, we'll be removing the entire
4399 * submap anyway, so no need to remove individual UPL
4400 * element mappings from within the submap
4401 */
4402 goto process_upl_to_remove;
4403 }
4404 }
4405 upl_unlock(upl);
4406
4407 return KERN_FAILURE;
4408 }
4409
4410 static void
4411 dw_do_work(
4412 vm_object_t object,
4413 struct dw *dwp,
4414 int dw_count)
4415 {
4416 int j;
4417 boolean_t held_as_spin = TRUE;
4418
4419 /*
4420 * pageout_scan takes the vm_page_lock_queues first
4421 * then tries for the object lock... to avoid what
4422 * is effectively a lock inversion, we'll go to the
4423 * trouble of taking them in that same order... otherwise
4424 * if this object contains the majority of the pages resident
4425 * in the UBC (or a small set of large objects actively being
4426 * worked on contain the majority of the pages), we could
4427 * cause the pageout_scan thread to 'starve' in its attempt
4428 * to find pages to move to the free queue, since it has to
4429 * successfully acquire the object lock of any candidate page
4430 * before it can steal/clean it.
4431 */
4432 if (!vm_page_trylockspin_queues()) {
4433 vm_object_unlock(object);
4434
4435 vm_page_lockspin_queues();
4436
4437 for (j = 0; ; j++) {
4438 if (!vm_object_lock_avoid(object) &&
4439 _vm_object_lock_try(object))
4440 break;
4441 vm_page_unlock_queues();
4442 mutex_pause(j);
4443 vm_page_lockspin_queues();
4444 }
4445 }
4446 for (j = 0; j < dw_count; j++, dwp++) {
4447
4448 if (dwp->dw_mask & DW_vm_pageout_throttle_up)
4449 vm_pageout_throttle_up(dwp->dw_m);
4450
4451 if (dwp->dw_mask & DW_vm_page_wire)
4452 vm_page_wire(dwp->dw_m);
4453 else if (dwp->dw_mask & DW_vm_page_unwire) {
4454 boolean_t queueit;
4455
4456 queueit = (dwp->dw_mask & DW_vm_page_free) ? FALSE : TRUE;
4457
4458 vm_page_unwire(dwp->dw_m, queueit);
4459 }
4460 if (dwp->dw_mask & DW_vm_page_free) {
4461 if (held_as_spin == TRUE) {
4462 vm_page_lockconvert_queues();
4463 held_as_spin = FALSE;
4464 }
4465 vm_page_free(dwp->dw_m);
4466 } else {
4467 if (dwp->dw_mask & DW_vm_page_deactivate_internal)
4468 vm_page_deactivate_internal(dwp->dw_m, FALSE);
4469 else if (dwp->dw_mask & DW_vm_page_activate)
4470 vm_page_activate(dwp->dw_m);
4471 else if (dwp->dw_mask & DW_vm_page_speculate)
4472 vm_page_speculate(dwp->dw_m, TRUE);
4473 else if (dwp->dw_mask & DW_vm_page_lru)
4474 vm_page_lru(dwp->dw_m);
4475
4476 if (dwp->dw_mask & DW_set_reference)
4477 dwp->dw_m->reference = TRUE;
4478 else if (dwp->dw_mask & DW_clear_reference)
4479 dwp->dw_m->reference = FALSE;
4480
4481 if (dwp->dw_mask & DW_clear_busy)
4482 dwp->dw_m->busy = FALSE;
4483
4484 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4485 PAGE_WAKEUP(dwp->dw_m);
4486 }
4487 }
4488 vm_page_unlock_queues();
4489 }
4490
4491
4492
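/*
 * Routine:	upl_commit_range
 * Purpose:
 *	Commit a sub-range of a UPL back to its object: per-page
 *	state (dirty, precious, wired, busy) is updated, pages that
 *	were successfully paged out are freed or reactivated, and
 *	any threads waiting on the pages are woken.  For a vector
 *	UPL, each sub-UPL covering the range is processed in turn.
 */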
4493 kern_return_t
4494 upl_commit_range(
4495 upl_t upl,
4496 upl_offset_t offset,
4497 upl_size_t size,
4498 int flags,
4499 upl_page_info_t *page_list,
4500 mach_msg_type_number_t count,
4501 boolean_t *empty)
4502 {
4503 upl_size_t xfer_size, subupl_size = size;
4504 vm_object_t shadow_object;
4505 vm_object_t object;
4506 vm_object_offset_t target_offset;
4507 upl_offset_t subupl_offset = offset;
4508 int entry;
4509 wpl_array_t lite_list;
4510 int occupied;
4511 int clear_refmod = 0;
4512 int pgpgout_count = 0;
4513 struct dw dw_array[DELAYED_WORK_LIMIT];
4514 struct dw *dwp;
4515 int dw_count, isVectorUPL = 0;
4516 upl_t vector_upl = NULL;
4517
4518 *empty = FALSE;
4519
4520 if (upl == UPL_NULL)
4521 return KERN_INVALID_ARGUMENT;
4522
4523 if (count == 0)
4524 page_list = NULL;
4525
4526 if((isVectorUPL = vector_upl_is_valid(upl))) {
4527 vector_upl = upl;
4528 upl_lock(vector_upl);
4529 }
4530 else
4531 upl_lock(upl);
4532
4533 process_upl_to_commit:
4534
4535 if(isVectorUPL) {
4536 size = subupl_size;
4537 offset = subupl_offset;
4538 if(size == 0) {
4539 upl_unlock(vector_upl);
4540 return KERN_SUCCESS;
4541 }
4542 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
4543 if(upl == NULL) {
4544 upl_unlock(vector_upl);
4545 return KERN_FAILURE;
4546 }
4547 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
4548 subupl_size -= size;
4549 subupl_offset += size;
4550 }
4551
4552 #if UPL_DEBUG
4553 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
4554 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
4555
4556 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
4557 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
4558
4559 upl->upl_commit_index++;
4560 }
4561 #endif
4562 if (upl->flags & UPL_DEVICE_MEMORY)
4563 xfer_size = 0;
4564 else if ((offset + size) <= upl->size)
4565 xfer_size = size;
4566 else {
4567 if(!isVectorUPL)
4568 upl_unlock(upl);
4569 else {
4570 upl_unlock(vector_upl);
4571 }
4572 return KERN_FAILURE;
4573 }
4574 if (upl->flags & UPL_CLEAR_DIRTY)
4575 flags |= UPL_COMMIT_CLEAR_DIRTY;
4576
4577 if (upl->flags & UPL_INTERNAL)
4578 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
4579 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4580 else
4581 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
4582
4583 object = upl->map_object;
4584
4585 if (upl->flags & UPL_SHADOWED) {
4586 vm_object_lock(object);
4587 shadow_object = object->shadow;
4588 } else {
4589 shadow_object = object;
4590 }
4591 entry = offset/PAGE_SIZE;
4592 target_offset = (vm_object_offset_t)offset;
4593
4594 if (upl->flags & UPL_KERNEL_OBJECT)
4595 vm_object_lock_shared(shadow_object);
4596 else
4597 vm_object_lock(shadow_object);
4598
4599 if (upl->flags & UPL_ACCESS_BLOCKED) {
4600 assert(shadow_object->blocked_access);
4601 shadow_object->blocked_access = FALSE;
4602 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4603 }
4604
4605 if (shadow_object->code_signed) {
4606 /*
4607 * CODE SIGNING:
4608 * If the object is code-signed, do not let this UPL tell
4609 * us if the pages are valid or not. Let the pages be
4610 * validated by VM the normal way (when they get mapped or
4611 * copied).
4612 */
4613 flags &= ~UPL_COMMIT_CS_VALIDATED;
4614 }
4615 if (! page_list) {
4616 /*
4617 * No page list to get the code-signing info from !?
4618 */
4619 flags &= ~UPL_COMMIT_CS_VALIDATED;
4620 }
4621
4622 dwp = &dw_array[0];
4623 dw_count = 0;
4624
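/*
 * walk the committed range a page at a time... state changes
 * that need the page queues lock are batched into dw_array and
 * flushed via dw_do_work whenever DELAYED_WORK_LIMIT entries
 * have accumulated
 */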
4625 while (xfer_size) {
4626 vm_page_t t, m;
4627
4628 dwp->dw_mask = 0;
4629 clear_refmod = 0;
4630
4631 m = VM_PAGE_NULL;
4632
4633 if (upl->flags & UPL_LITE) {
4634 unsigned int pg_num;
4635
4636 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
4637 assert(pg_num == target_offset/PAGE_SIZE);
4638
4639 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4640 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4641
4642 if (!(upl->flags & UPL_KERNEL_OBJECT))
4643 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
4644 }
4645 }
4646 if (upl->flags & UPL_SHADOWED) {
4647 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
4648
4649 t->pageout = FALSE;
4650
4651 VM_PAGE_FREE(t);
4652
4653 if (m == VM_PAGE_NULL)
4654 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
4655 }
4656 }
4657 if ((upl->flags & UPL_KERNEL_OBJECT) || m == VM_PAGE_NULL)
4658 goto commit_next_page;
4659
4660 if (flags & UPL_COMMIT_CS_VALIDATED) {
4661 /*
4662 * CODE SIGNING:
4663 * Set the code signing bits according to
4664 * what the UPL says they should be.
4665 */
4666 m->cs_validated = page_list[entry].cs_validated;
4667 m->cs_tainted = page_list[entry].cs_tainted;
4668 }
4669 if (upl->flags & UPL_IO_WIRE) {
4670
4671 dwp->dw_mask |= DW_vm_page_unwire;
4672
4673 if (page_list)
4674 page_list[entry].phys_addr = 0;
4675
4676 if (flags & UPL_COMMIT_SET_DIRTY)
4677 m->dirty = TRUE;
4678 else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4679 m->dirty = FALSE;
4680
4681 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4682 m->cs_validated && !m->cs_tainted) {
4683 /*
4684 * CODE SIGNING:
4685 * This page is no longer dirty
4686 * but could have been modified,
4687 * so it will need to be
4688 * re-validated.
4689 */
4690 m->cs_validated = FALSE;
4691 #if DEVELOPMENT || DEBUG
4692 vm_cs_validated_resets++;
4693 #endif
4694 pmap_disconnect(m->phys_page);
4695 }
4696 clear_refmod |= VM_MEM_MODIFIED;
4697 }
4698 if (flags & UPL_COMMIT_INACTIVATE) {
4699 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4700 clear_refmod |= VM_MEM_REFERENCED;
4701 }
4702 if (upl->flags & UPL_ACCESS_BLOCKED) {
4703 /*
4704 * We blocked access to the pages in this UPL.
4705 * Clear the "busy" bit and wake up any waiter
4706 * for this page.
4707 */
4708 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4709 }
4710 if (m->absent) {
4711 if (flags & UPL_COMMIT_FREE_ABSENT)
4712 dwp->dw_mask |= DW_vm_page_free;
4713 else
4714 m->absent = FALSE;
4715 }
4716 goto commit_next_page;
4717 }
4718 /*
4719 * make sure to clear the hardware
4720 * modify or reference bits before
4721 * releasing the BUSY bit on this page
4722 * otherwise we risk losing a legitimate
4723 * change of state
4724 */
4725 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
4726 m->dirty = FALSE;
4727
4728 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4729 m->cs_validated && !m->cs_tainted) {
4730 /*
4731 * CODE SIGNING:
4732 * This page is no longer dirty
4733 * but could have been modified,
4734 * so it will need to be
4735 * re-validated.
4736 */
4737 m->cs_validated = FALSE;
4738 #if DEVELOPMENT || DEBUG
4739 vm_cs_validated_resets++;
4740 #endif
4741 pmap_disconnect(m->phys_page);
4742 }
4743 clear_refmod |= VM_MEM_MODIFIED;
4744 }
4745 if (page_list) {
4746 upl_page_info_t *p;
4747
4748 p = &(page_list[entry]);
4749
4750 if (p->phys_addr && p->pageout && !m->pageout) {
4751 m->busy = TRUE;
4752 m->pageout = TRUE;
4753
4754 dwp->dw_mask |= DW_vm_page_wire;
4755
4756 } else if (p->phys_addr &&
4757 !p->pageout && m->pageout &&
4758 !m->dump_cleaning) {
4759 m->pageout = FALSE;
4760 m->absent = FALSE;
4761 m->overwriting = FALSE;
4762
4763 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4764 }
4765 page_list[entry].phys_addr = 0;
4766 }
4767 m->dump_cleaning = FALSE;
4768
4769 if (m->laundry)
4770 dwp->dw_mask |= DW_vm_pageout_throttle_up;
4771
4772 if (m->pageout) {
4773 m->cleaning = FALSE;
4774 m->encrypted_cleaning = FALSE;
4775 m->pageout = FALSE;
4776 #if MACH_CLUSTER_STATS
4777 if (m->wanted) vm_pageout_target_collisions++;
4778 #endif
4779 m->dirty = FALSE;
4780
4781 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4782 m->cs_validated && !m->cs_tainted) {
4783 /*
4784 * CODE SIGNING:
4785 * This page is no longer dirty
4786 * but could have been modified,
4787 * so it will need to be
4788 * re-validated.
4789 */
4790 m->cs_validated = FALSE;
4791 #if DEVELOPMENT || DEBUG
4792 vm_cs_validated_resets++;
4793 #endif
4794 pmap_disconnect(m->phys_page);
4795 }
4796
4797 if ((flags & UPL_COMMIT_SET_DIRTY) ||
4798 (m->pmapped && (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)))
4799 m->dirty = TRUE;
4800
4801 if (m->dirty) {
4802 /*
4803 * page was re-dirtied after we started
4804 * the pageout... reactivate it since
4805 * we don't know whether the on-disk
4806 * copy matches what is now in memory
4807 */
4808 dwp->dw_mask |= (DW_vm_page_unwire | DW_clear_busy | DW_PAGE_WAKEUP);
4809
4810 if (upl->flags & UPL_PAGEOUT) {
4811 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4812 VM_STAT_INCR(reactivations);
4813 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
4814 }
4815 } else {
4816 /*
4817 * page has been successfully cleaned
4818 * go ahead and free it for other use
4819 */
4820
4821 if (m->object->internal) {
4822 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
4823 } else {
4824 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
4825 }
4826 dwp->dw_mask |= DW_vm_page_free;
4827
4828 if (upl->flags & UPL_PAGEOUT) {
4829 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4830
4831 if (page_list[entry].dirty) {
4832 VM_STAT_INCR(pageouts);
4833 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
4834 pgpgout_count++;
4835 }
4836 }
4837 }
4838 goto commit_next_page;
4839 }
4840 #if MACH_CLUSTER_STATS
4841 if (m->wpmapped)
4842 m->dirty = pmap_is_modified(m->phys_page);
4843
4844 if (m->dirty) vm_pageout_cluster_dirtied++;
4845 else vm_pageout_cluster_cleaned++;
4846 if (m->wanted) vm_pageout_cluster_collisions++;
4847 #endif
4848 m->dirty = FALSE;
4849
4850 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
4851 m->cs_validated && !m->cs_tainted) {
4852 /*
4853 * CODE SIGNING:
4854 * This page is no longer dirty
4855 * but could have been modified,
4856 * so it will need to be
4857 * re-validated.
4858 */
4859 m->cs_validated = FALSE;
4860 #if DEVELOPMENT || DEBUG
4861 vm_cs_validated_resets++;
4862 #endif
4863 pmap_disconnect(m->phys_page);
4864 }
4865
4866 if ((m->busy) && (m->cleaning)) {
4867 /*
4868 * the request_page_list case
4869 */
4870 m->absent = FALSE;
4871 m->overwriting = FALSE;
4872
4873 dwp->dw_mask |= DW_clear_busy;
4874
4875 } else if (m->overwriting) {
4876 /*
4877 * alternate request page list, write to
4878 * page_list case. Occurs when the original
4879 * page was wired at the time of the list
4880 * request
4881 */
4882 assert(VM_PAGE_WIRED(m));
4883 m->overwriting = FALSE;
4884
4885 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
4886 }
4887 m->cleaning = FALSE;
4888 m->encrypted_cleaning = FALSE;
4889
4890 /*
4891 * It is part of the semantics of COPYOUT_FROM
4892 * UPLs that a commit implies cache sync
4893 * between the vm page and the backing store;
4894 * this can be used to strip the precious bit
4895 * as well as clean
4896 */
4897 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
4898 m->precious = FALSE;
4899
4900 if (flags & UPL_COMMIT_SET_DIRTY)
4901 m->dirty = TRUE;
4902
4903 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && !m->speculative) {
4904 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4905 clear_refmod |= VM_MEM_REFERENCED;
4906
4907 } else if (!m->active && !m->inactive && !m->speculative) {
4908
4909 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
4910 dwp->dw_mask |= DW_vm_page_speculate;
4911 else if (m->reference)
4912 dwp->dw_mask |= DW_vm_page_activate;
4913 else {
4914 dwp->dw_mask |= DW_vm_page_deactivate_internal;
4915 clear_refmod |= VM_MEM_REFERENCED;
4916 }
4917 }
4918 if (upl->flags & UPL_ACCESS_BLOCKED) {
4919 /*
4920 * We blocked access to the pages in this UPL.
4921 * Clear the "busy" bit on this page before we
4922 * wake up any waiter.
4923 */
4924 dwp->dw_mask |= DW_clear_busy;
4925 }
4926 /*
4927 * Wakeup any thread waiting for the page to be un-cleaning.
4928 */
4929 dwp->dw_mask |= DW_PAGE_WAKEUP;
4930
4931 commit_next_page:
4932 if (clear_refmod)
4933 pmap_clear_refmod(m->phys_page, clear_refmod);
4934
4935 target_offset += PAGE_SIZE_64;
4936 xfer_size -= PAGE_SIZE;
4937 entry++;
4938
4939 if (dwp->dw_mask) {
4940 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
4941 if (m->busy == FALSE) {
4942 /*
4943 * dw_do_work may need to drop the object lock
4944 * if it does, we need the pages it's looking at to
4945 * be held stable via the busy bit.
4946 */
4947 m->busy = TRUE;
4948 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
4949 }
4950 dwp->dw_m = m;
4951 dwp++;
4952 dw_count++;
4953
4954 if (dw_count >= DELAYED_WORK_LIMIT) {
4955 dw_do_work(shadow_object, &dw_array[0], dw_count);
4956
4957 dwp = &dw_array[0];
4958 dw_count = 0;
4959 }
4960 } else {
4961 if (dwp->dw_mask & DW_clear_busy)
4962 m->busy = FALSE;
4963
4964 if (dwp->dw_mask & DW_PAGE_WAKEUP)
4965 PAGE_WAKEUP(m);
4966 }
4967 }
4968 }
4969 if (dw_count)
4970 dw_do_work(shadow_object, &dw_array[0], dw_count);
4971
4972 occupied = 1;
4973
4974 if (upl->flags & UPL_DEVICE_MEMORY) {
4975 occupied = 0;
4976 } else if (upl->flags & UPL_LITE) {
4977 int pg_num;
4978 int i;
4979
4980 pg_num = upl->size/PAGE_SIZE;
4981 pg_num = (pg_num + 31) >> 5;
4982 occupied = 0;
4983
4984 for (i = 0; i < pg_num; i++) {
4985 if (lite_list[i] != 0) {
4986 occupied = 1;
4987 break;
4988 }
4989 }
4990 } else {
4991 if (queue_empty(&upl->map_object->memq))
4992 occupied = 0;
4993 }
4994 if (occupied == 0) {
4995 /*
4996 * If this UPL element belongs to a Vector UPL and is
4997 * empty, then this is the right function to deallocate
4998 * it. So go ahead and set the *empty variable. The flag
4999 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view,
5000 * should be considered relevant for the Vector UPL and not
5001 * the internal UPLs.
5002 */
5003 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5004 *empty = TRUE;
5005
5006 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5007 /*
5008 * this is not a paging object
5009 * so we need to drop the paging reference
5010 * that was taken when we created the UPL
5011 * against this object
5012 */
5013 vm_object_activity_end(shadow_object);
5014 } else {
5015 /*
5016 * we donated the paging reference to
5017 * the map object... vm_pageout_object_terminate
5018 * will drop this reference
5019 */
5020 }
5021 }
5022 vm_object_unlock(shadow_object);
5023 if (object != shadow_object)
5024 vm_object_unlock(object);
5025
5026 if(!isVectorUPL)
5027 upl_unlock(upl);
5028 else {
5029 /*
5030 * If we completed our operations on an UPL that is
5031 * part of a Vectored UPL and if empty is TRUE, then
5032 * we should go ahead and deallocate this UPL element.
5033 * Then we check if this was the last of the UPL elements
5034 * within that Vectored UPL. If so, set empty to TRUE
5035 * so that in ubc_upl_commit_range or ubc_upl_commit, we
5036 * can go ahead and deallocate the Vector UPL too.
5037 */
5038 if(*empty==TRUE) {
5039 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
5040 upl_deallocate(upl);
5041 }
5042 goto process_upl_to_commit;
5043 }
5044
5045 if (pgpgout_count) {
5046 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
5047 }
5048
5049 return KERN_SUCCESS;
5050 }
5051
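/*
 * Routine:	upl_abort_range
 * Purpose:
 *	Abort a sub-range of a UPL.  Absent pages are freed or
 *	marked restartable, unavailable or in error depending on
 *	the error bits, pages that had been targeted for pageout
 *	are unwired, and any waiters are woken.  For a vector UPL,
 *	each sub-UPL covering the range is processed in turn.
 */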
5052 kern_return_t
5053 upl_abort_range(
5054 upl_t upl,
5055 upl_offset_t offset,
5056 upl_size_t size,
5057 int error,
5058 boolean_t *empty)
5059 {
5060 upl_size_t xfer_size, subupl_size = size;
5061 vm_object_t shadow_object;
5062 vm_object_t object;
5063 vm_object_offset_t target_offset;
5064 upl_offset_t subupl_offset = offset;
5065 int entry;
5066 wpl_array_t lite_list;
5067 int occupied;
5068 struct dw dw_array[DELAYED_WORK_LIMIT];
5069 struct dw *dwp;
5070 int dw_count, isVectorUPL = 0;
5071 upl_t vector_upl = NULL;
5072
5073 *empty = FALSE;
5074
5075 if (upl == UPL_NULL)
5076 return KERN_INVALID_ARGUMENT;
5077
5078 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
5079 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
5080
5081 if((isVectorUPL = vector_upl_is_valid(upl))) {
5082 vector_upl = upl;
5083 upl_lock(vector_upl);
5084 }
5085 else
5086 upl_lock(upl);
5087
5088 process_upl_to_abort:
5089 if(isVectorUPL) {
5090 size = subupl_size;
5091 offset = subupl_offset;
5092 if(size == 0) {
5093 upl_unlock(vector_upl);
5094 return KERN_SUCCESS;
5095 }
5096 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
5097 if(upl == NULL) {
5098 upl_unlock(vector_upl);
5099 return KERN_FAILURE;
5100 }
5101 subupl_size -= size;
5102 subupl_offset += size;
5103 }
5104
5105 *empty = FALSE;
5106
5107 #if UPL_DEBUG
5108 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
5109 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
5110
5111 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
5112 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
5113 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
5114
5115 upl->upl_commit_index++;
5116 }
5117 #endif
5118 if (upl->flags & UPL_DEVICE_MEMORY)
5119 xfer_size = 0;
5120 else if ((offset + size) <= upl->size)
5121 xfer_size = size;
5122 else {
5123 if(!isVectorUPL)
5124 upl_unlock(upl);
5125 else {
5126 upl_unlock(vector_upl);
5127 }
5128
5129 return KERN_FAILURE;
5130 }
5131 if (upl->flags & UPL_INTERNAL) {
5132 lite_list = (wpl_array_t)
5133 ((((uintptr_t)upl) + sizeof(struct upl))
5134 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
5135 } else {
5136 lite_list = (wpl_array_t)
5137 (((uintptr_t)upl) + sizeof(struct upl));
5138 }
5139 object = upl->map_object;
5140
5141 if (upl->flags & UPL_SHADOWED) {
5142 vm_object_lock(object);
5143 shadow_object = object->shadow;
5144 } else
5145 shadow_object = object;
5146
5147 entry = offset/PAGE_SIZE;
5148 target_offset = (vm_object_offset_t)offset;
5149
5150 if (upl->flags & UPL_KERNEL_OBJECT)
5151 vm_object_lock_shared(shadow_object);
5152 else
5153 vm_object_lock(shadow_object);
5154
5155 if (upl->flags & UPL_ACCESS_BLOCKED) {
5156 assert(shadow_object->blocked_access);
5157 shadow_object->blocked_access = FALSE;
5158 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
5159 }
5160
5161 dwp = &dw_array[0];
5162 dw_count = 0;
5163
5164 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
5165 panic("upl_abort_range: kernel_object being DUMPED");
5166
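/*
 * walk the aborted range a page at a time, batching queue
 * manipulations into dw_array just as upl_commit_range does
 */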
5167 while (xfer_size) {
5168 vm_page_t t, m;
5169
5170 dwp->dw_mask = 0;
5171
5172 m = VM_PAGE_NULL;
5173
5174 if (upl->flags & UPL_LITE) {
5175 unsigned int pg_num;
5176
5177 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
5178 assert(pg_num == target_offset/PAGE_SIZE);
5179
5180
5181 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
5182 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
5183
5184 if ( !(upl->flags & UPL_KERNEL_OBJECT))
5185 m = vm_page_lookup(shadow_object, target_offset +
5186 (upl->offset - shadow_object->paging_offset));
5187 }
5188 }
5189 if (upl->flags & UPL_SHADOWED) {
5190 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
5191 t->pageout = FALSE;
5192
5193 VM_PAGE_FREE(t);
5194
5195 if (m == VM_PAGE_NULL)
5196 m = vm_page_lookup(shadow_object, target_offset + object->shadow_offset);
5197 }
5198 }
5199 if ((upl->flags & UPL_KERNEL_OBJECT))
5200 goto abort_next_page;
5201
5202 if (m != VM_PAGE_NULL) {
5203
5204 if (m->absent) {
5205 boolean_t must_free = TRUE;
5206
5207 m->clustered = FALSE;
5208 /*
5209 * COPYOUT = FALSE case
5210 * check for error conditions which must
5211 * be passed back to the page's customer
5212 */
5213 if (error & UPL_ABORT_RESTART) {
5214 m->restart = TRUE;
5215 m->absent = FALSE;
5216 m->unusual = TRUE;
5217 must_free = FALSE;
5218 } else if (error & UPL_ABORT_UNAVAILABLE) {
5219 m->restart = FALSE;
5220 m->unusual = TRUE;
5221 must_free = FALSE;
5222 } else if (error & UPL_ABORT_ERROR) {
5223 m->restart = FALSE;
5224 m->absent = FALSE;
5225 m->error = TRUE;
5226 m->unusual = TRUE;
5227 must_free = FALSE;
5228 }
5229
5230 /*
5231 * ENCRYPTED SWAP:
5232 * If the page was already encrypted,
5233 * we don't really need to decrypt it
5234 * now. It will get decrypted later,
5235 * on demand, as soon as someone needs
5236 * to access its contents.
5237 */
5238
5239 m->cleaning = FALSE;
5240 m->encrypted_cleaning = FALSE;
5241 m->overwriting = FALSE;
5242
5243 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5244
5245 if (must_free == TRUE)
5246 dwp->dw_mask |= DW_vm_page_free;
5247 else
5248 dwp->dw_mask |= DW_vm_page_activate;
5249 } else {
5250 /*
5251 * Handle the trusted pager throttle.
5252 */
5253 if (m->laundry)
5254 dwp->dw_mask |= DW_vm_pageout_throttle_up;
5255
5256 if (m->pageout) {
5257 assert(m->busy);
5258 assert(m->wire_count == 1);
5259 m->pageout = FALSE;
5260
5261 dwp->dw_mask |= DW_vm_page_unwire;
5262 }
5263 m->dump_cleaning = FALSE;
5264 m->cleaning = FALSE;
5265 m->encrypted_cleaning = FALSE;
5266 m->overwriting = FALSE;
5267 #if MACH_PAGEMAP
5268 vm_external_state_clr(m->object->existence_map, m->offset);
5269 #endif /* MACH_PAGEMAP */
5270 if (error & UPL_ABORT_DUMP_PAGES) {
5271 pmap_disconnect(m->phys_page);
5272
5273 dwp->dw_mask |= DW_vm_page_free;
5274 } else {
5275 if (error & UPL_ABORT_REFERENCE) {
5276 /*
5277 * we've been told to explicitly
5278 * reference this page... for
5279 * file I/O, this is done by
5280 * implementing an LRU on the inactive q
5281 */
5282 dwp->dw_mask |= DW_vm_page_lru;
5283 }
5284 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5285 }
5286 }
5287 }
5288 abort_next_page:
5289 target_offset += PAGE_SIZE_64;
5290 xfer_size -= PAGE_SIZE;
5291 entry++;
5292
5293 if (dwp->dw_mask) {
5294 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
5295 if (m->busy == FALSE) {
5296 /*
5297 * dw_do_work may need to drop the object lock
5298 * if it does, we need the pages it's looking at to
5299 * be held stable via the busy bit.
5300 */
5301 m->busy = TRUE;
5302 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5303 }
5304 dwp->dw_m = m;
5305 dwp++;
5306 dw_count++;
5307
5308 if (dw_count >= DELAYED_WORK_LIMIT) {
5309 dw_do_work(shadow_object, &dw_array[0], dw_count);
5310
5311 dwp = &dw_array[0];
5312 dw_count = 0;
5313 }
5314 } else {
5315 if (dwp->dw_mask & DW_clear_busy)
5316 m->busy = FALSE;
5317
5318 if (dwp->dw_mask & DW_PAGE_WAKEUP)
5319 PAGE_WAKEUP(m);
5320 }
5321 }
5322 }
5323 if (dw_count)
5324 dw_do_work(shadow_object, &dw_array[0], dw_count);
5325
5326 occupied = 1;
5327
5328 if (upl->flags & UPL_DEVICE_MEMORY) {
5329 occupied = 0;
5330 } else if (upl->flags & UPL_LITE) {
5331 int pg_num;
5332 int i;
5333
5334 pg_num = upl->size/PAGE_SIZE;
5335 pg_num = (pg_num + 31) >> 5;
5336 occupied = 0;
5337
5338 for (i = 0; i < pg_num; i++) {
5339 if (lite_list[i] != 0) {
5340 occupied = 1;
5341 break;
5342 }
5343 }
5344 } else {
5345 if (queue_empty(&upl->map_object->memq))
5346 occupied = 0;
5347 }
5348 if (occupied == 0) {
5349 /*
5350 * If this UPL element belongs to a Vector UPL and is
5351 * empty, then this is the right function to deallocate
5352 * it. So go ahead and set the *empty variable. The flag
5353 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view,
5354 * should be considered relevant for the Vector UPL and
5355 * not the internal UPLs.
5356 */
5357 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
5358 *empty = TRUE;
5359
5360 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
5361 /*
5362 * this is not a paging object
5363 * so we need to drop the paging reference
5364 * that was taken when we created the UPL
5365 * against this object
5366 */
5367 vm_object_activity_end(shadow_object);
5368 } else {
5369 /*
5370 * we donated the paging reference to
5371 * the map object... vm_pageout_object_terminate
5372 * will drop this reference
5373 */
5374 }
5375 }
5376 vm_object_unlock(shadow_object);
5377 if (object != shadow_object)
5378 vm_object_unlock(object);
5379
5380 if(!isVectorUPL)
5381 upl_unlock(upl);
5382 else {
5383 /*
5384 * If we completed our operations on an UPL that is
5385 * part of a Vectored UPL and if empty is TRUE, then
5386 * we should go ahead and deallocate this UPL element.
5387 * Then we check if this was the last of the UPL elements
5388 * within that Vectored UPL. If so, set empty to TRUE
5389 * so that in ubc_upl_abort_range or ubc_upl_abort, we
5390 * can go ahead and deallocate the Vector UPL too.
5391 */
5392 if(*empty == TRUE) {
5393 *empty = vector_upl_set_subupl(vector_upl, upl,0);
5394 upl_deallocate(upl);
5395 }
5396 goto process_upl_to_abort;
5397 }
5398
5399 return KERN_SUCCESS;
5400 }
5401
5402
5403 kern_return_t
5404 upl_abort(
5405 upl_t upl,
5406 int error)
5407 {
5408 boolean_t empty;
5409
5410 return upl_abort_range(upl, 0, upl->size, error, &empty);
5411 }
5412
5413
5414 /* an option on commit should be wire */
5415 kern_return_t
5416 upl_commit(
5417 upl_t upl,
5418 upl_page_info_t *page_list,
5419 mach_msg_type_number_t count)
5420 {
5421 boolean_t empty;
5422
5423 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
5424 }
5425
5426
5427 unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
5428
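/*
 * Routine:	vm_object_iopl_request
 * Purpose:
 *	Create an I/O UPL directly against a VM object: the pages
 *	covering [offset, offset+size) are faulted in if necessary,
 *	wired, and recorded in the UPL's lite list (and optional
 *	page list) so a driver can perform I/O against them until
 *	the UPL is committed or aborted.
 */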
5429 kern_return_t
5430 vm_object_iopl_request(
5431 vm_object_t object,
5432 vm_object_offset_t offset,
5433 upl_size_t size,
5434 upl_t *upl_ptr,
5435 upl_page_info_array_t user_page_list,
5436 unsigned int *page_list_count,
5437 int cntrl_flags)
5438 {
5439 vm_page_t dst_page;
5440 vm_object_offset_t dst_offset;
5441 upl_size_t xfer_size;
5442 upl_t upl = NULL;
5443 unsigned int entry;
5444 wpl_array_t lite_list = NULL;
5445 int no_zero_fill = FALSE;
5446 u_int32_t psize;
5447 kern_return_t ret;
5448 vm_prot_t prot;
5449 struct vm_object_fault_info fault_info;
5450 struct dw dw_array[DELAYED_WORK_LIMIT];
5451 struct dw *dwp;
5452 int dw_count;
5453 int dw_index;
5454
5455 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5456 /*
5457 * For forward compatibility's sake,
5458 * reject any unknown flag.
5459 */
5460 return KERN_INVALID_VALUE;
5461 }
5462 if (vm_lopage_needed == FALSE)
5463 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
5464
5465 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
5466 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
5467 return KERN_INVALID_VALUE;
5468
5469 if (object->phys_contiguous) {
5470 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
5471 return KERN_INVALID_ADDRESS;
5472
5473 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
5474 return KERN_INVALID_ADDRESS;
5475 }
5476 }
5477
5478 if (cntrl_flags & UPL_ENCRYPT) {
5479 /*
5480 * ENCRYPTED SWAP:
5481 * The paging path doesn't use this interface,
5482 * so we don't support the UPL_ENCRYPT flag
5483 * here. We won't encrypt the pages.
5484 */
5485 assert(! (cntrl_flags & UPL_ENCRYPT));
5486 }
5487 if (cntrl_flags & UPL_NOZEROFILL)
5488 no_zero_fill = TRUE;
5489
5490 if (cntrl_flags & UPL_COPYOUT_FROM)
5491 prot = VM_PROT_READ;
5492 else
5493 prot = VM_PROT_READ | VM_PROT_WRITE;
5494
5495 if (((size/PAGE_SIZE) > MAX_UPL_SIZE) && !object->phys_contiguous)
5496 size = MAX_UPL_SIZE * PAGE_SIZE;
5497
5498 if (cntrl_flags & UPL_SET_INTERNAL) {
5499 if (page_list_count != NULL)
5500 *page_list_count = MAX_UPL_SIZE;
5501 }
5502 if (((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
5503 ((page_list_count != NULL) && (*page_list_count != 0) && *page_list_count < (size/page_size)))
5504 return KERN_INVALID_ARGUMENT;
5505
5506 if ((!object->internal) && (object->paging_offset != 0))
5507 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
5508
5509
5510 if (object->phys_contiguous)
5511 psize = PAGE_SIZE;
5512 else
5513 psize = size;
5514
5515 if (cntrl_flags & UPL_SET_INTERNAL) {
5516 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5517
5518 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5519 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
5520 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
5521 if (size == 0) {
5522 user_page_list = NULL;
5523 lite_list = NULL;
5524 }
5525 } else {
5526 upl = upl_create(UPL_CREATE_LITE, UPL_IO_WIRE, psize);
5527
5528 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
5529 if (size == 0) {
5530 lite_list = NULL;
5531 }
5532 }
5533 if (user_page_list)
5534 user_page_list[0].device = FALSE;
5535 *upl_ptr = upl;
5536
5537 upl->map_object = object;
5538 upl->size = size;
5539
5540 if (object == kernel_object &&
5541 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
5542 upl->flags |= UPL_KERNEL_OBJECT;
5543 #if UPL_DEBUG
5544 vm_object_lock(object);
5545 #else
5546 vm_object_lock_shared(object);
5547 #endif
5548 } else {
5549 vm_object_lock(object);
5550 vm_object_activity_begin(object);
5551 }
5552 /*
5553 * paging in progress also protects the paging_offset
5554 */
5555 upl->offset = offset + object->paging_offset;
5556
5557 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5558 /*
5559 * The user requested that access to the pages in this UPL
5560 * be blocked until the UPL is committed or aborted.
5561 */
5562 upl->flags |= UPL_ACCESS_BLOCKED;
5563 }
5564
5565 if (object->phys_contiguous) {
5566 #if UPL_DEBUG
5567 queue_enter(&object->uplq, upl, upl_t, uplq);
5568 #endif /* UPL_DEBUG */
5569
5570 if (upl->flags & UPL_ACCESS_BLOCKED) {
5571 assert(!object->blocked_access);
5572 object->blocked_access = TRUE;
5573 }
5574
5575 vm_object_unlock(object);
5576
5577 /*
5578 * don't need any shadow mappings for this one
5579 * since it is already I/O memory
5580 */
5581 upl->flags |= UPL_DEVICE_MEMORY;
5582
5583 upl->highest_page = (ppnum_t) ((offset + object->shadow_offset + size - 1)>>PAGE_SHIFT);
5584
5585 if (user_page_list) {
5586 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->shadow_offset)>>PAGE_SHIFT);
5587 user_page_list[0].device = TRUE;
5588 }
5589 if (page_list_count != NULL) {
5590 if (upl->flags & UPL_INTERNAL)
5591 *page_list_count = 0;
5592 else
5593 *page_list_count = 1;
5594 }
5595 return KERN_SUCCESS;
5596 }
5597 if (object != kernel_object) {
5598 /*
5599 * Protect user space from future COW operations
5600 */
5601 object->true_share = TRUE;
5602
5603 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
5604 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
5605 }
5606
5607 #if UPL_DEBUG
5608 queue_enter(&object->uplq, upl, upl_t, uplq);
5609 #endif /* UPL_DEBUG */
5610
5611 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
5612 object->copy != VM_OBJECT_NULL) {
5613 /*
5614 * Honor copy-on-write obligations
5615 *
5616 * The caller is gathering these pages and
5617 * might modify their contents. We need to
5618 * make sure that the copy object has its own
5619 * private copies of these pages before we let
5620 * the caller modify them.
5621 *
5622 * NOTE: someone else could map the original object
5623 * after we've done this copy-on-write here, and they
5624 * could then see an inconsistent picture of the memory
5625 * while it's being modified via the UPL. To prevent this,
5626 * we would have to block access to these pages until the
5627 * UPL is released. We could use the UPL_BLOCK_ACCESS
5628 * code path for that...
5629 */
5630 vm_object_update(object,
5631 offset,
5632 size,
5633 NULL,
5634 NULL,
5635 FALSE, /* should_return */
5636 MEMORY_OBJECT_COPY_SYNC,
5637 VM_PROT_NO_CHANGE);
5638 #if DEVELOPMENT || DEBUG
5639 iopl_cow++;
5640 iopl_cow_pages += size >> PAGE_SHIFT;
5641 #endif
5642 }
5643
5644
5645 entry = 0;
5646
5647 xfer_size = size;
5648 dst_offset = offset;
5649
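/*
 * set up the fault descriptor used if any of the pages have to
 * be brought in via vm_fault_page... treat the access pattern as
 * sequential and bound it to the range being wired
 */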
5650 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
5651 fault_info.user_tag = 0;
5652 fault_info.lo_offset = offset;
5653 fault_info.hi_offset = offset + xfer_size;
5654 fault_info.no_cache = FALSE;
5655 fault_info.stealth = FALSE;
5656 fault_info.mark_zf_absent = TRUE;
5657
5658 dwp = &dw_array[0];
5659 dw_count = 0;
5660
5661 while (xfer_size) {
5662 vm_fault_return_t result;
5663 unsigned int pg_num;
5664
5665 dwp->dw_mask = 0;
5666
5667 dst_page = vm_page_lookup(object, dst_offset);
5668
5669 /*
5670 * ENCRYPTED SWAP:
5671 * If the page is encrypted, we need to decrypt it,
5672 * so force a soft page fault.
5673 */
5674 if (dst_page == VM_PAGE_NULL ||
5675 dst_page->busy ||
5676 dst_page->encrypted ||
5677 dst_page->error ||
5678 dst_page->restart ||
5679 dst_page->absent ||
5680 dst_page->fictitious) {
5681
5682 if (object == kernel_object)
5683 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
5684
5685 do {
5686 vm_page_t top_page;
5687 kern_return_t error_code;
5688 int interruptible;
5689
5690 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
5691 interruptible = THREAD_ABORTSAFE;
5692 else
5693 interruptible = THREAD_UNINT;
5694
5695 fault_info.interruptible = interruptible;
5696 fault_info.cluster_size = xfer_size;
5697
5698 vm_object_paging_begin(object);
5699
5700 result = vm_fault_page(object, dst_offset,
5701 prot | VM_PROT_WRITE, FALSE,
5702 &prot, &dst_page, &top_page,
5703 (int *)0,
5704 &error_code, no_zero_fill,
5705 FALSE, &fault_info);
5706
5707 switch (result) {
5708
5709 case VM_FAULT_SUCCESS:
5710
5711 PAGE_WAKEUP_DONE(dst_page);
5712 /*
5713 * Release paging references and
5714 * top-level placeholder page, if any.
5715 */
5716 if (top_page != VM_PAGE_NULL) {
5717 vm_object_t local_object;
5718
5719 local_object = top_page->object;
5720
5721 if (top_page->object != dst_page->object) {
5722 vm_object_lock(local_object);
5723 VM_PAGE_FREE(top_page);
5724 vm_object_paging_end(local_object);
5725 vm_object_unlock(local_object);
5726 } else {
5727 VM_PAGE_FREE(top_page);
5728 vm_object_paging_end(local_object);
5729 }
5730 }
5731 vm_object_paging_end(object);
5732 break;
5733
5734 case VM_FAULT_RETRY:
5735 vm_object_lock(object);
5736 break;
5737
5738 case VM_FAULT_FICTITIOUS_SHORTAGE:
5739 vm_page_more_fictitious();
5740
5741 vm_object_lock(object);
5742 break;
5743
5744 case VM_FAULT_MEMORY_SHORTAGE:
5745 if (vm_page_wait(interruptible)) {
5746 vm_object_lock(object);
5747 break;
5748 }
5749 /* fall thru */
5750
5751 case VM_FAULT_INTERRUPTED:
5752 error_code = MACH_SEND_INTERRUPTED;
5753 case VM_FAULT_MEMORY_ERROR:
5754 memory_error:
5755 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
5756
5757 vm_object_lock(object);
5758 goto return_err;
5759
5760 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5761 /* success but no page: fail */
5762 vm_object_paging_end(object);
5763 vm_object_unlock(object);
5764 goto memory_error;
5765
5766 default:
5767 panic("vm_object_iopl_request: unexpected error"
5768 " 0x%x from vm_fault_page()\n", result);
5769 }
5770 } while (result != VM_FAULT_SUCCESS);
5771
5772 }
5773
5774 if (upl->flags & UPL_KERNEL_OBJECT)
5775 goto record_phys_addr;
5776
5777 if (dst_page->cleaning) {
5778 /*
5779 * Someone else is cleaning this page in place.
5780 * In theory, we should be able to proceed and use this
5781 * page, but they'll probably end up clearing the "busy"
5782 * bit on it in upl_commit_range() even though they didn't
5783 * set it, which would clear our "busy" bit and open
5784 * us to race conditions.
5785 * We'd better wait for the cleaning to complete and
5786 * then try again.
5787 */
5788 vm_object_iopl_request_sleep_for_cleaning++;
5789 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
5790 continue;
5791 }
5792 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5793 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5794 vm_page_t low_page;
5795 int refmod;
5796
5797 /*
5798 * support devices that can't DMA above 32 bits
5799 * by substituting pages from a pool of low address
5800 * memory for any pages we find above the 4G mark.
5801 * We can't substitute if the page is already wired because
5802 * we don't know whether that physical address has been
5803 * handed out to some other 64 bit capable DMA device to use
5804 */
5805 if (VM_PAGE_WIRED(dst_page)) {
5806 ret = KERN_PROTECTION_FAILURE;
5807 goto return_err;
5808 }
5809 low_page = vm_page_grablo();
5810
5811 if (low_page == VM_PAGE_NULL) {
5812 ret = KERN_RESOURCE_SHORTAGE;
5813 goto return_err;
5814 }
5815 /*
5816 * from here until the vm_page_replace completes
5817 * we mustn't drop the object lock... we don't
5818 * want anyone refaulting this page in and using
5819 * it after we disconnect it... we want the fault
5820 * to find the new page being substituted.
5821 */
5822 if (dst_page->pmapped)
5823 refmod = pmap_disconnect(dst_page->phys_page);
5824 else
5825 refmod = 0;
5826 vm_page_copy(dst_page, low_page);
5827
5828 low_page->reference = dst_page->reference;
5829 low_page->dirty = dst_page->dirty;
5830
5831 if (refmod & VM_MEM_REFERENCED)
5832 low_page->reference = TRUE;
5833 if (refmod & VM_MEM_MODIFIED)
5834 low_page->dirty = TRUE;
5835
5836 vm_page_replace(low_page, object, dst_offset);
5837
5838 dst_page = low_page;
5839 /*
5840 * vm_page_grablo returned the page marked
5841 * BUSY... we don't need a PAGE_WAKEUP_DONE
5842 * here, because we've never dropped the object lock
5843 */
5844 dst_page->busy = FALSE;
5845 }
5846 dwp->dw_mask |= DW_vm_page_wire;
5847
5848 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5849 /*
5850 * Mark the page "busy" to block any future page fault
5851 * on this page. We'll also remove the mapping
5852 * of all these pages before leaving this routine.
5853 */
5854 assert(!dst_page->fictitious);
5855 dst_page->busy = TRUE;
5856 }
5857 /*
5858 * expect the page to be used
5859 * page queues lock must be held to set 'reference'
5860 */
5861 dwp->dw_mask |= DW_set_reference;
5862
5863 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5864 dst_page->dirty = TRUE;
5865 record_phys_addr:
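/*
 * Record this page in the "lite" bitmap: pg_num >> 5 picks the
 * 32-bit word and pg_num & 31 picks the bit within it, so each
 * bit marks one page as being part of this UPL.
 */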
5866 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5867 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
5868 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5869
5870 if (dst_page->phys_page > upl->highest_page)
5871 upl->highest_page = dst_page->phys_page;
5872
5873 if (user_page_list) {
5874 user_page_list[entry].phys_addr = dst_page->phys_page;
5875 user_page_list[entry].pageout = dst_page->pageout;
5876 user_page_list[entry].absent = dst_page->absent;
5877 user_page_list[entry].dirty = dst_page->dirty;
5878 user_page_list[entry].precious = dst_page->precious;
5879 user_page_list[entry].device = FALSE;
5880 if (dst_page->clustered == TRUE)
5881 user_page_list[entry].speculative = dst_page->speculative;
5882 else
5883 user_page_list[entry].speculative = FALSE;
5884 user_page_list[entry].cs_validated = dst_page->cs_validated;
5885 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
5886 }
5887 if (object != kernel_object) {
5888 /*
5889 * someone is explicitly grabbing this page...
5890 * update clustered and speculative state
5891 *
5892 */
5893 VM_PAGE_CONSUME_CLUSTERED(dst_page);
5894 }
5895 entry++;
5896 dst_offset += PAGE_SIZE_64;
5897 xfer_size -= PAGE_SIZE;
5898
5899 if (dwp->dw_mask) {
5900 if (dst_page->busy == FALSE) {
5901 /*
5902 * dw_do_work may need to drop the object lock
5903 * if it does, we need the pages it's looking at to
5904 * be held stable via the busy bit.
5905 */
5906 dst_page->busy = TRUE;
5907 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
5908 }
5909 dwp->dw_m = dst_page;
5910 dwp++;
5911 dw_count++;
5912
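/*
 * Batch the deferred page work: once the array fills up to
 * DELAYED_WORK_LIMIT it is flushed via dw_do_work(), and any
 * remainder is flushed just after this loop ends.
 */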
5913 if (dw_count >= DELAYED_WORK_LIMIT) {
5914 dw_do_work(object, &dw_array[0], dw_count);
5915
5916 dwp = &dw_array[0];
5917 dw_count = 0;
5918 }
5919 }
5920 }
5921 if (dw_count)
5922 dw_do_work(object, &dw_array[0], dw_count);
5923
5924 if (page_list_count != NULL) {
5925 if (upl->flags & UPL_INTERNAL)
5926 *page_list_count = 0;
5927 else if (*page_list_count > entry)
5928 *page_list_count = entry;
5929 }
5930 vm_object_unlock(object);
5931
5932 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5933 /*
5934 * We've marked all the pages "busy" so that future
5935 * page faults will block.
5936 * Now remove the mapping for these pages, so that they
5937 * can't be accessed without causing a page fault.
5938 */
5939 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5940 PMAP_NULL, 0, VM_PROT_NONE);
5941 assert(!object->blocked_access);
5942 object->blocked_access = TRUE;
5943 }
5944 return KERN_SUCCESS;
5945
5946 return_err:
5947 dw_index = 0;
5948
5949 for (; offset < dst_offset; offset += PAGE_SIZE) {
5950 boolean_t need_unwire;
5951
5952 dst_page = vm_page_lookup(object, offset);
5953
5954 if (dst_page == VM_PAGE_NULL)
5955 panic("vm_object_iopl_request: Wired pages missing.\n");
5956
5957 /*
5958 * if we've already processed this page in an earlier
5959 * dw_do_work, we need to undo the wiring... we will
5960 * leave the dirty and reference bits on if they
5961 * were set, since we don't have a good way of knowing
5962 * what the previous state was and we won't get here
5963 * under any normal circumstances... we will always
5964 * clear BUSY and wakeup any waiters via vm_page_free
5965 * or PAGE_WAKEUP_DONE
5966 */
5967 need_unwire = TRUE;
5968
5969 if (dw_count) {
5970 if (dw_array[dw_index].dw_m == dst_page) {
5971 /*
5972 * still in the deferred work list
5973 * which means we haven't yet called
5974 * vm_page_wire on this page
5975 */
5976 need_unwire = FALSE;
5977 }
5978 dw_index++;
5979 dw_count--;
5980 }
5981 vm_page_lock_queues();
5982
5983 if (need_unwire == TRUE) {
5984 boolean_t queueit;
5985
5986 queueit = (dst_page->absent) ? FALSE : TRUE;
5987
5988 vm_page_unwire(dst_page, queueit);
5989 }
5990 if (dst_page->absent)
5991 vm_page_free(dst_page);
5992 else
5993 PAGE_WAKEUP_DONE(dst_page);
5994
5995 vm_page_unlock_queues();
5996
5997 if (need_unwire == TRUE)
5998 VM_STAT_INCR(reactivations);
5999 }
6000 #if UPL_DEBUG
6001 upl->upl_state = 2;
6002 #endif
6003 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
6004 vm_object_activity_end(object);
6005 }
6006 vm_object_unlock(object);
6007 upl_destroy(upl);
6008
6009 return ret;
6010 }
6011
6012 kern_return_t
6013 upl_transpose(
6014 upl_t upl1,
6015 upl_t upl2)
6016 {
6017 kern_return_t retval;
6018 boolean_t upls_locked;
6019 vm_object_t object1, object2;
6020
6021 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
6022 return KERN_INVALID_ARGUMENT;
6023 }
6024
6025 upls_locked = FALSE;
6026
6027 /*
6028 * Since we need to lock both UPLs at the same time,
6029 * avoid deadlocks by always taking locks in the same order.
6030 */
6031 if (upl1 < upl2) {
6032 upl_lock(upl1);
6033 upl_lock(upl2);
6034 } else {
6035 upl_lock(upl2);
6036 upl_lock(upl1);
6037 }
6038 upls_locked = TRUE; /* the UPLs will need to be unlocked */
6039
6040 object1 = upl1->map_object;
6041 object2 = upl2->map_object;
6042
6043 if (upl1->offset != 0 || upl2->offset != 0 ||
6044 upl1->size != upl2->size) {
6045 /*
6046 * We deal only with full objects, not subsets.
6047 * That's because we exchange the entire backing store info
6048 * for the objects: pager, resident pages, etc... We can't do
6049 * only part of it.
6050 */
6051 retval = KERN_INVALID_VALUE;
6052 goto done;
6053 }
6054
6055 /*
6056 * Transpose the VM objects' backing store.
6057 */
6058 retval = vm_object_transpose(object1, object2,
6059 (vm_object_size_t) upl1->size);
6060
6061 if (retval == KERN_SUCCESS) {
6062 /*
6063 * Make each UPL point to the correct VM object, i.e. the
6064 * object holding the pages that the UPL refers to...
6065 */
6066 #if UPL_DEBUG
6067 queue_remove(&object1->uplq, upl1, upl_t, uplq);
6068 queue_remove(&object2->uplq, upl2, upl_t, uplq);
6069 #endif
6070 upl1->map_object = object2;
6071 upl2->map_object = object1;
6072 #if UPL_DEBUG
6073 queue_enter(&object1->uplq, upl2, upl_t, uplq);
6074 queue_enter(&object2->uplq, upl1, upl_t, uplq);
6075 #endif
6076 }
6077
6078 done:
6079 /*
6080 * Cleanup.
6081 */
6082 if (upls_locked) {
6083 upl_unlock(upl1);
6084 upl_unlock(upl2);
6085 upls_locked = FALSE;
6086 }
6087
6088 return retval;
6089 }
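/*
 * A minimal user-space sketch of the address-ordered locking idiom used
 * in upl_transpose() above: when two locks must be held at once, always
 * take them in a fixed (here, address) order so two threads can never
 * lock in opposite orders and deadlock. Assumes pthreads rather than
 * the kernel's upl locks; not compiled into this file.
 */
#if 0
#include <pthread.h>

static void
lock_pair_in_address_order(pthread_mutex_t *a, pthread_mutex_t *b)
{
	/* lower address first */
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void
unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}
#endif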
6090
6091 /*
6092 * ENCRYPTED SWAP:
6093 *
6094 * Rationale: the user might have some encrypted data on disk (via
6095 * FileVault or any other mechanism). That data is then decrypted in
6096 * memory, which is safe as long as the machine is secure. But that
6097 * decrypted data in memory could be paged out to disk by the default
6098 * pager. The data would then be stored on disk in clear (not encrypted)
6099 * and it could be accessed by anyone who gets physical access to the
6100 * disk (if the laptop or the disk gets stolen for example). This weakens
6101 * the security offered by FileVault.
6102 *
6103 * Solution: the default pager will optionally request that all the
6104 * pages it gathers for pageout be encrypted, via the UPL interfaces,
6105 * before it sends this UPL to disk via the vnode_pageout() path.
6106 *
6107 * Notes:
6108 *
6109 * To avoid disrupting the VM LRU algorithms, we want to keep the
6110 * clean-in-place mechanisms, which allow us to send some extra pages to
6111 * swap (clustering) without actually removing them from the user's
6112 * address space. We don't want the user to unknowingly access encrypted
6113 * data, so we have to actually remove the encrypted pages from the page
6114 * table. When the user accesses the data, the hardware will fail to
6115 * locate the virtual page in its page table and will trigger a page
6116 * fault. We can then decrypt the page and enter it in the page table
6117 * again. Whenever we allow the user to access the contents of a page,
6118 * we have to make sure it's not encrypted.
6119 *
6120 *
6121 */
6122 /*
6123 * ENCRYPTED SWAP:
6124 * Reserve of virtual addresses in the kernel address space.
6125 * We need to map the physical pages in the kernel, so that we
6126 * can call the encryption/decryption routines with a kernel
6127 * virtual address. We keep this pool of pre-allocated kernel
6128 * virtual addresses so that we don't have to scan the kernel's
6129 * virtual address space each time we need to encrypt or decrypt
6130 * a physical page.
6131 * It would be nice to be able to encrypt and decrypt in physical
6132 * mode but that might not always be more efficient...
6133 */
6134 decl_simple_lock_data(,vm_paging_lock)
6135 #define VM_PAGING_NUM_PAGES 64
6136 vm_map_offset_t vm_paging_base_address = 0;
6137 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
6138 int vm_paging_max_index = 0;
6139 int vm_paging_page_waiter = 0;
6140 int vm_paging_page_waiter_total = 0;
6141 unsigned long vm_paging_no_kernel_page = 0;
6142 unsigned long vm_paging_objects_mapped = 0;
6143 unsigned long vm_paging_pages_mapped = 0;
6144 unsigned long vm_paging_objects_mapped_slow = 0;
6145 unsigned long vm_paging_pages_mapped_slow = 0;
6146
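/*
 * A simplified user-space sketch of the slot pool above: slot i maps to
 * the virtual address base + i * PAGE_SIZE, and a boolean array records
 * which slots are in use. Locking and the waiter/wakeup handshake are
 * omitted, and the demo_* names are illustrative only, not part of the
 * kernel; not compiled into this file.
 */
#if 0
#include <stdbool.h>

#define DEMO_NUM_PAGES	64
#define DEMO_PAGE_SIZE	4096UL

static bool demo_page_inuse[DEMO_NUM_PAGES];

/* Grab a free slot and return its mapping address, or 0 if none is free. */
static unsigned long
demo_slot_alloc(unsigned long base)
{
	int i;

	for (i = 0; i < DEMO_NUM_PAGES; i++) {
		if (!demo_page_inuse[i]) {
			demo_page_inuse[i] = true;
			return base + (unsigned long)i * DEMO_PAGE_SIZE;
		}
	}
	return 0;
}

/* Return a slot to the pool, recovering its index from the address. */
static void
demo_slot_free(unsigned long base, unsigned long addr)
{
	demo_page_inuse[(addr - base) / DEMO_PAGE_SIZE] = false;
}
#endif
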
6147 void
6148 vm_paging_map_init(void)
6149 {
6150 kern_return_t kr;
6151 vm_map_offset_t page_map_offset;
6152 vm_map_entry_t map_entry;
6153
6154 assert(vm_paging_base_address == 0);
6155
6156 /*
6157 * Initialize our pool of pre-allocated kernel
6158 * virtual addresses.
6159 */
6160 page_map_offset = 0;
6161 kr = vm_map_find_space(kernel_map,
6162 &page_map_offset,
6163 VM_PAGING_NUM_PAGES * PAGE_SIZE,
6164 0,
6165 0,
6166 &map_entry);
6167 if (kr != KERN_SUCCESS) {
6168 panic("vm_paging_map_init: kernel_map full\n");
6169 }
6170 map_entry->object.vm_object = kernel_object;
6171 map_entry->offset = page_map_offset;
6172 vm_object_reference(kernel_object);
6173 vm_map_unlock(kernel_map);
6174
6175 assert(vm_paging_base_address == 0);
6176 vm_paging_base_address = page_map_offset;
6177 }
6178
6179 /*
6180 * ENCRYPTED SWAP:
6181 * vm_paging_map_object:
6182 * Maps part of a VM object's pages in the kernel
6183 * virtual address space, using the pre-allocated
6184 * kernel virtual addresses, if possible.
6185 * Context:
6186 * The VM object is locked. This lock will get
6187 * dropped and re-acquired though, so the caller
6188 * must make sure the VM object is kept alive
6189 * (by holding a VM map that has a reference
6190 * on it, for example, or taking an extra reference).
6191 * The page should also be kept busy to prevent
6192 * it from being reclaimed.
6193 */
6194 kern_return_t
6195 vm_paging_map_object(
6196 vm_map_offset_t *address,
6197 vm_page_t page,
6198 vm_object_t object,
6199 vm_object_offset_t offset,
6200 vm_map_size_t *size,
6201 vm_prot_t protection,
6202 boolean_t can_unlock_object)
6203 {
6204 kern_return_t kr;
6205 vm_map_offset_t page_map_offset;
6206 vm_map_size_t map_size;
6207 vm_object_offset_t object_offset;
6208 int i;
6209
6210
6211 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
6212 assert(page->busy);
6213 /*
6214 * Use one of the pre-allocated kernel virtual addresses
6215 * and just enter the VM page in the kernel address space
6216 * at that virtual address.
6217 */
6218 simple_lock(&vm_paging_lock);
6219
6220 /*
6221 * Try and find an available kernel virtual address
6222 * from our pre-allocated pool.
6223 */
6224 page_map_offset = 0;
6225 for (;;) {
6226 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
6227 if (vm_paging_page_inuse[i] == FALSE) {
6228 page_map_offset =
6229 vm_paging_base_address +
6230 (i * PAGE_SIZE);
6231 break;
6232 }
6233 }
6234 if (page_map_offset != 0) {
6235 /* found a space to map our page ! */
6236 break;
6237 }
6238
6239 if (can_unlock_object) {
6240 /*
6241 * If we can afford to unlock the VM object,
6242 * let's take the slow path now...
6243 */
6244 break;
6245 }
6246 /*
6247 * We can't afford to unlock the VM object, so
6248 * let's wait for a space to become available...
6249 */
6250 vm_paging_page_waiter_total++;
6251 vm_paging_page_waiter++;
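/*
 * The address of vm_paging_page_waiter doubles as the wait event:
 * vm_paging_unmap_object() issues the matching thread_wakeup()
 * when it returns a slot to the pool.
 */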
6252 thread_sleep_fast_usimple_lock(&vm_paging_page_waiter,
6253 &vm_paging_lock,
6254 THREAD_UNINT);
6255 vm_paging_page_waiter--;
6256 /* ... and try again */
6257 }
6258
6259 if (page_map_offset != 0) {
6260 /*
6261 * We found a kernel virtual address;
6262 * map the physical page to that virtual address.
6263 */
6264 if (i > vm_paging_max_index) {
6265 vm_paging_max_index = i;
6266 }
6267 vm_paging_page_inuse[i] = TRUE;
6268 simple_unlock(&vm_paging_lock);
6269
6270 if (page->pmapped == FALSE) {
6271 pmap_sync_page_data_phys(page->phys_page);
6272 }
6273 page->pmapped = TRUE;
6274
6275 /*
6276 * Keep the VM object locked over the PMAP_ENTER
6277 * and the actual use of the page by the kernel,
6278 * or this pmap mapping might get undone by a
6279 * vm_object_pmap_protect() call...
6280 */
6281 PMAP_ENTER(kernel_pmap,
6282 page_map_offset,
6283 page,
6284 protection,
6285 ((int) page->object->wimg_bits &
6286 VM_WIMG_MASK),
6287 TRUE);
6288 vm_paging_objects_mapped++;
6289 vm_paging_pages_mapped++;
6290 *address = page_map_offset;
6291
6292 /* all done and mapped, ready to use ! */
6293 return KERN_SUCCESS;
6294 }
6295
6296 /*
6297 * We ran out of pre-allocated kernel virtual
6298 * addresses. Just map the page in the kernel
6299 * the slow and regular way.
6300 */
6301 vm_paging_no_kernel_page++;
6302 simple_unlock(&vm_paging_lock);
6303 }
6304
6305 if (! can_unlock_object) {
6306 return KERN_NOT_SUPPORTED;
6307 }
6308
6309 object_offset = vm_object_trunc_page(offset);
6310 map_size = vm_map_round_page(*size);
6311
6312 /*
6313 * Try and map the required range of the object
6314 * in the kernel_map
6315 */
6316
6317 vm_object_reference_locked(object); /* for the map entry */
6318 vm_object_unlock(object);
6319
6320 kr = vm_map_enter(kernel_map,
6321 address,
6322 map_size,
6323 0,
6324 VM_FLAGS_ANYWHERE,
6325 object,
6326 object_offset,
6327 FALSE,
6328 protection,
6329 VM_PROT_ALL,
6330 VM_INHERIT_NONE);
6331 if (kr != KERN_SUCCESS) {
6332 *address = 0;
6333 *size = 0;
6334 vm_object_deallocate(object); /* for the map entry */
6335 vm_object_lock(object);
6336 return kr;
6337 }
6338
6339 *size = map_size;
6340
6341 /*
6342 * Enter the mapped pages in the page table now.
6343 */
6344 vm_object_lock(object);
6345 /*
6346 * VM object must be kept locked from before PMAP_ENTER()
6347 * until after the kernel is done accessing the page(s).
6348 * Otherwise, the pmap mappings in the kernel could be
6349 * undone by a call to vm_object_pmap_protect().
6350 */
6351
6352 for (page_map_offset = 0;
6353 map_size != 0;
6354 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
6355 unsigned int cache_attr;
6356
6357 page = vm_page_lookup(object, offset + page_map_offset);
6358 if (page == VM_PAGE_NULL) {
6359 printf("vm_paging_map_object: no page !?\n");
6360 vm_object_unlock(object);
6361 kr = vm_map_remove(kernel_map, *address, *size,
6362 VM_MAP_NO_FLAGS);
6363 assert(kr == KERN_SUCCESS);
6364 *address = 0;
6365 *size = 0;
6366 vm_object_lock(object);
6367 return KERN_MEMORY_ERROR;
6368 }
6369 if (page->pmapped == FALSE) {
6370 pmap_sync_page_data_phys(page->phys_page);
6371 }
6372 page->pmapped = TRUE;
6373 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
6374
6375 //assert(pmap_verify_free(page->phys_page));
6376 PMAP_ENTER(kernel_pmap,
6377 *address + page_map_offset,
6378 page,
6379 protection,
6380 cache_attr,
6381 TRUE);
6382 }
6383
6384 vm_paging_objects_mapped_slow++;
6385 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
6386
6387 return KERN_SUCCESS;
6388 }
6389
6390 /*
6391 * ENCRYPTED SWAP:
6392 * vm_paging_unmap_object:
6393 * Unmaps part of a VM object's pages from the kernel
6394 * virtual address space.
6395 * Context:
6396 * The VM object is locked. This lock will get
6397 * dropped and re-acquired though.
6398 */
6399 void
6400 vm_paging_unmap_object(
6401 vm_object_t object,
6402 vm_map_offset_t start,
6403 vm_map_offset_t end)
6404 {
6405 kern_return_t kr;
6406 int i;
6407
6408 if ((vm_paging_base_address == 0) ||
6409 (start < vm_paging_base_address) ||
6410 (end > (vm_paging_base_address
6411 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
6412 /*
6413 * We didn't use our pre-allocated pool of
6414 * kernel virtual addresses. Deallocate the
6415 * virtual memory.
6416 */
6417 if (object != VM_OBJECT_NULL) {
6418 vm_object_unlock(object);
6419 }
6420 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
6421 if (object != VM_OBJECT_NULL) {
6422 vm_object_lock(object);
6423 }
6424 assert(kr == KERN_SUCCESS);
6425 } else {
6426 /*
6427 * We used a kernel virtual address from our
6428 * pre-allocated pool. Put it back in the pool
6429 * for next time.
6430 */
6431 assert(end - start == PAGE_SIZE);
6432 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
6433 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
6434
6435 /* undo the pmap mapping */
6436 pmap_remove(kernel_pmap, start, end);
6437
6438 simple_lock(&vm_paging_lock);
6439 vm_paging_page_inuse[i] = FALSE;
6440 if (vm_paging_page_waiter) {
6441 thread_wakeup(&vm_paging_page_waiter);
6442 }
6443 simple_unlock(&vm_paging_lock);
6444 }
6445 }
6446
6447 #if CRYPTO
6448 /*
6449 * Encryption data.
6450 * "iv" is the "initial vector". Ideally, we want to
6451 * have a different one for each page we encrypt, so that
6452 * crackers can't find encryption patterns too easily.
6453 */
6454 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
6455 boolean_t swap_crypt_ctx_initialized = FALSE;
6456 aes_32t swap_crypt_key[8]; /* big enough for a 256-bit key */
6457 aes_ctx swap_crypt_ctx;
6458 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
6459
6460 #if DEBUG
6461 boolean_t swap_crypt_ctx_tested = FALSE;
6462 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
6463 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
6464 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
6465 #endif /* DEBUG */
6466
6467 /*
6468 * Initialize the encryption context: key and key size.
6469 */
6470 void swap_crypt_ctx_initialize(void); /* forward */
6471 void
6472 swap_crypt_ctx_initialize(void)
6473 {
6474 unsigned int i;
6475
6476 /*
6477 * No need for locking to protect swap_crypt_ctx_initialized
6478 * because the first use of encryption will come from the
6479 * pageout thread (we won't pagein before there's been a pageout)
6480 * and there's only one pageout thread.
6481 */
6482 if (swap_crypt_ctx_initialized == FALSE) {
6483 for (i = 0;
6484 i < (sizeof (swap_crypt_key) /
6485 sizeof (swap_crypt_key[0]));
6486 i++) {
6487 swap_crypt_key[i] = random();
6488 }
6489 aes_encrypt_key((const unsigned char *) swap_crypt_key,
6490 SWAP_CRYPT_AES_KEY_SIZE,
6491 &swap_crypt_ctx.encrypt);
6492 aes_decrypt_key((const unsigned char *) swap_crypt_key,
6493 SWAP_CRYPT_AES_KEY_SIZE,
6494 &swap_crypt_ctx.decrypt);
6495 swap_crypt_ctx_initialized = TRUE;
6496 }
6497
6498 #if DEBUG
6499 /*
6500 * Validate the encryption algorithms.
6501 */
6502 if (swap_crypt_ctx_tested == FALSE) {
6503 /* initialize */
6504 for (i = 0; i < 4096; i++) {
6505 swap_crypt_test_page_ref[i] = (char) i;
6506 }
6507 /* encrypt */
6508 aes_encrypt_cbc(swap_crypt_test_page_ref,
6509 swap_crypt_null_iv,
6510 PAGE_SIZE / AES_BLOCK_SIZE,
6511 swap_crypt_test_page_encrypt,
6512 &swap_crypt_ctx.encrypt);
6513 /* decrypt */
6514 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
6515 swap_crypt_null_iv,
6516 PAGE_SIZE / AES_BLOCK_SIZE,
6517 swap_crypt_test_page_decrypt,
6518 &swap_crypt_ctx.decrypt);
6519 /* compare result with original */
6520 for (i = 0; i < 4096; i ++) {
6521 if (swap_crypt_test_page_decrypt[i] !=
6522 swap_crypt_test_page_ref[i]) {
6523 panic("encryption test failed");
6524 }
6525 }
6526
6527 /* encrypt again */
6528 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
6529 swap_crypt_null_iv,
6530 PAGE_SIZE / AES_BLOCK_SIZE,
6531 swap_crypt_test_page_decrypt,
6532 &swap_crypt_ctx.encrypt);
6533 /* decrypt in place */
6534 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
6535 swap_crypt_null_iv,
6536 PAGE_SIZE / AES_BLOCK_SIZE,
6537 swap_crypt_test_page_decrypt,
6538 &swap_crypt_ctx.decrypt);
6539 for (i = 0; i < 4096; i ++) {
6540 if (swap_crypt_test_page_decrypt[i] !=
6541 swap_crypt_test_page_ref[i]) {
6542 panic("in place encryption test failed");
6543 }
6544 }
6545
6546 swap_crypt_ctx_tested = TRUE;
6547 }
6548 #endif /* DEBUG */
6549 }
6550
6551 /*
6552 * ENCRYPTED SWAP:
6553 * vm_page_encrypt:
6554 * Encrypt the given page, for secure paging.
6555 * The page might already be mapped at kernel virtual
6556 * address "kernel_mapping_offset". Otherwise, we need
6557 * to map it.
6558 *
6559 * Context:
6560 * The page's object is locked, but this lock will be released
6561 * and re-acquired.
6562 * The page is busy and not accessible by users (not entered in any pmap).
6563 */
6564 void
6565 vm_page_encrypt(
6566 vm_page_t page,
6567 vm_map_offset_t kernel_mapping_offset)
6568 {
6569 kern_return_t kr;
6570 vm_map_size_t kernel_mapping_size;
6571 vm_offset_t kernel_vaddr;
6572 union {
6573 unsigned char aes_iv[AES_BLOCK_SIZE];
6574 struct {
6575 memory_object_t pager_object;
6576 vm_object_offset_t paging_offset;
6577 } vm;
6578 } encrypt_iv;
6579
6580 if (! vm_pages_encrypted) {
6581 vm_pages_encrypted = TRUE;
6582 }
6583
6584 assert(page->busy);
6585 assert(page->dirty || page->precious);
6586
6587 if (page->encrypted) {
6588 /*
6589 * Already encrypted: no need to do it again.
6590 */
6591 vm_page_encrypt_already_encrypted_counter++;
6592 return;
6593 }
6594 ASSERT_PAGE_DECRYPTED(page);
6595
6596 /*
6597 * Take a paging-in-progress reference to keep the object
6598 * alive even if we have to unlock it (in vm_paging_map_object()
6599 * for example)...
6600 */
6601 vm_object_paging_begin(page->object);
6602
6603 if (kernel_mapping_offset == 0) {
6604 /*
6605 * The page hasn't already been mapped in kernel space
6606 * by the caller. Map it now, so that we can access
6607 * its contents and encrypt them.
6608 */
6609 kernel_mapping_size = PAGE_SIZE;
6610 kr = vm_paging_map_object(&kernel_mapping_offset,
6611 page,
6612 page->object,
6613 page->offset,
6614 &kernel_mapping_size,
6615 VM_PROT_READ | VM_PROT_WRITE,
6616 FALSE);
6617 if (kr != KERN_SUCCESS) {
6618 panic("vm_page_encrypt: "
6619 "could not map page in kernel: 0x%x\n",
6620 kr);
6621 }
6622 } else {
6623 kernel_mapping_size = 0;
6624 }
6625 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6626
6627 if (swap_crypt_ctx_initialized == FALSE) {
6628 swap_crypt_ctx_initialize();
6629 }
6630 assert(swap_crypt_ctx_initialized);
6631
6632 /*
6633 * Prepare an "initial vector" for the encryption.
6634 * We use the "pager" and the "paging_offset" for that
6635 * page to obfuscate the encrypted data a bit more and
6636 * prevent crackers from finding patterns that they could
6637 * use to break the key.
6638 */
6639 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
6640 encrypt_iv.vm.pager_object = page->object->pager;
6641 encrypt_iv.vm.paging_offset =
6642 page->object->paging_offset + page->offset;
6643
6644 /* encrypt the "initial vector" */
6645 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
6646 swap_crypt_null_iv,
6647 1,
6648 &encrypt_iv.aes_iv[0],
6649 &swap_crypt_ctx.encrypt);
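/*
 * The (pager, paging_offset) tuple was run through AES above so that
 * the effective IV depends on the swap key, making identical plaintext
 * pages at different paging offsets encrypt to different ciphertext.
 * Since the pager and paging_offset are stable, vm_page_decrypt() can
 * re-derive exactly the same IV later.
 */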
6650
6651 /*
6652 * Encrypt the page.
6653 */
6654 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
6655 &encrypt_iv.aes_iv[0],
6656 PAGE_SIZE / AES_BLOCK_SIZE,
6657 (unsigned char *) kernel_vaddr,
6658 &swap_crypt_ctx.encrypt);
6659
6660 vm_page_encrypt_counter++;
6661
6662 /*
6663 * Unmap the page from the kernel's address space,
6664 * if we had to map it ourselves. Otherwise, let
6665 * the caller undo the mapping if needed.
6666 */
6667 if (kernel_mapping_size != 0) {
6668 vm_paging_unmap_object(page->object,
6669 kernel_mapping_offset,
6670 kernel_mapping_offset + kernel_mapping_size);
6671 }
6672
6673 /*
6674 * Clear the "reference" and "modified" bits.
6675 * This should clean up any impact the encryption had
6676 * on them.
6677 * The page was kept busy and disconnected from all pmaps,
6678 * so it can't have been referenced or modified from user
6679 * space.
6680 * The software bits will be reset later after the I/O
6681 * has completed (in upl_commit_range()).
6682 */
6683 pmap_clear_refmod(page->phys_page, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
6684
6685 page->encrypted = TRUE;
6686
6687 vm_object_paging_end(page->object);
6688 }
6689
6690 /*
6691 * ENCRYPTED SWAP:
6692 * vm_page_decrypt:
6693 * Decrypt the given page.
6694 * The page might already be mapped at kernel virtual
6695 * address "kernel_mapping_offset". Otherwise, we need
6696 * to map it.
6697 *
6698 * Context:
6699 * The page's VM object is locked but will be unlocked and relocked.
6700 * The page is busy and not accessible by users (not entered in any pmap).
6701 */
6702 void
6703 vm_page_decrypt(
6704 vm_page_t page,
6705 vm_map_offset_t kernel_mapping_offset)
6706 {
6707 kern_return_t kr;
6708 vm_map_size_t kernel_mapping_size;
6709 vm_offset_t kernel_vaddr;
6710 union {
6711 unsigned char aes_iv[AES_BLOCK_SIZE];
6712 struct {
6713 memory_object_t pager_object;
6714 vm_object_offset_t paging_offset;
6715 } vm;
6716 } decrypt_iv;
6717
6718 assert(page->busy);
6719 assert(page->encrypted);
6720
6721 /*
6722 * Take a paging-in-progress reference to keep the object
6723 * alive even if we have to unlock it (in vm_paging_map_object()
6724 * for example)...
6725 */
6726 vm_object_paging_begin(page->object);
6727
6728 if (kernel_mapping_offset == 0) {
6729 /*
6730 * The page hasn't already been mapped in kernel space
6731 * by the caller. Map it now, so that we can access
6732 * its contents and decrypt them.
6733 */
6734 kernel_mapping_size = PAGE_SIZE;
6735 kr = vm_paging_map_object(&kernel_mapping_offset,
6736 page,
6737 page->object,
6738 page->offset,
6739 &kernel_mapping_size,
6740 VM_PROT_READ | VM_PROT_WRITE,
6741 FALSE);
6742 if (kr != KERN_SUCCESS) {
6743 panic("vm_page_decrypt: "
6744 "could not map page in kernel: 0x%x\n",
6745 kr);
6746 }
6747 } else {
6748 kernel_mapping_size = 0;
6749 }
6750 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
6751
6752 assert(swap_crypt_ctx_initialized);
6753
6754 /*
6755 * Prepare an "initial vector" for the decryption.
6756 * It has to be the same as the "initial vector" we
6757 * used to encrypt that page.
6758 */
6759 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
6760 decrypt_iv.vm.pager_object = page->object->pager;
6761 decrypt_iv.vm.paging_offset =
6762 page->object->paging_offset + page->offset;
6763
6764 /* encrypt the "initial vector" */
6765 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
6766 swap_crypt_null_iv,
6767 1,
6768 &decrypt_iv.aes_iv[0],
6769 &swap_crypt_ctx.encrypt);
6770
6771 /*
6772 * Decrypt the page.
6773 */
6774 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
6775 &decrypt_iv.aes_iv[0],
6776 PAGE_SIZE / AES_BLOCK_SIZE,
6777 (unsigned char *) kernel_vaddr,
6778 &swap_crypt_ctx.decrypt);
6779 vm_page_decrypt_counter++;
6780
6781 /*
6782 * Unmap the page from the kernel's address space,
6783 * if we had to map it ourselves. Otherwise, let
6784 * the caller undo the mapping if needed.
6785 */
6786 if (kernel_mapping_size != 0) {
6787 vm_paging_unmap_object(page->object,
6788 kernel_vaddr,
6789 kernel_vaddr + PAGE_SIZE);
6790 }
6791
6792 /*
6793 * After decryption, the page is actually clean.
6794 * It was encrypted as part of paging, which "cleans"
6795 * the "dirty" pages.
6796 * No one could have accessed it after it was encrypted,
6797 * and the decryption itself doesn't count as dirtying it.
6798 */
6799 page->dirty = FALSE;
6800 assert (page->cs_validated == FALSE);
6801 pmap_clear_refmod(page->phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
6802 page->encrypted = FALSE;
6803
6804 /*
6805 * We've just modified the page's contents via the data cache and part
6806 * of the new contents might still be in the cache and not yet in RAM.
6807 * Since the page is now available and might get gathered in a UPL to
6808 * be part of a DMA transfer from a driver that expects the memory to
6809 * be coherent at this point, we have to flush the data cache.
6810 */
6811 pmap_sync_page_attributes_phys(page->phys_page);
6812 /*
6813 * Since the page is not mapped yet, some code might assume that it
6814 * doesn't need to invalidate the instruction cache when writing to
6815 * that page. That code relies on "pmapped" being FALSE, so that the
6816 * caches get synchronized when the page is first mapped.
6817 */
6818 assert(pmap_verify_free(page->phys_page));
6819 page->pmapped = FALSE;
6820 page->wpmapped = FALSE;
6821
6822 vm_object_paging_end(page->object);
6823 }
6824
6825 #if DEVELOPMENT || DEBUG
6826 unsigned long upl_encrypt_upls = 0;
6827 unsigned long upl_encrypt_pages = 0;
6828 #endif
6829
6830 /*
6831 * ENCRYPTED SWAP:
6832 *
6833 * upl_encrypt:
6834 * Encrypts all the pages in the UPL, within the specified range.
6835 *
6836 */
6837 void
6838 upl_encrypt(
6839 upl_t upl,
6840 upl_offset_t crypt_offset,
6841 upl_size_t crypt_size)
6842 {
6843 upl_size_t upl_size, subupl_size=crypt_size;
6844 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
6845 vm_object_t upl_object;
6846 vm_object_offset_t upl_offset;
6847 vm_page_t page;
6848 vm_object_t shadow_object;
6849 vm_object_offset_t shadow_offset;
6850 vm_object_offset_t paging_offset;
6851 vm_object_offset_t base_offset;
6852 int isVectorUPL = 0;
6853 upl_t vector_upl = NULL;
6854
6855 if((isVectorUPL = vector_upl_is_valid(upl)))
6856 vector_upl = upl;
6857
6858 process_upl_to_encrypt:
6859 if(isVectorUPL) {
6860 crypt_size = subupl_size;
6861 crypt_offset = subupl_offset;
6862 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
6863 if(upl == NULL)
6864 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
6865 subupl_size -= crypt_size;
6866 subupl_offset += crypt_size;
6867 }
6868
6869 #if DEVELOPMENT || DEBUG
6870 upl_encrypt_upls++;
6871 upl_encrypt_pages += crypt_size / PAGE_SIZE;
6872 #endif
6873 upl_object = upl->map_object;
6874 upl_offset = upl->offset;
6875 upl_size = upl->size;
6876
6877 vm_object_lock(upl_object);
6878
6879 /*
6880 * Find the VM object that contains the actual pages.
6881 */
6882 if (upl_object->pageout) {
6883 shadow_object = upl_object->shadow;
6884 /*
6885 * The offset in the shadow object is actually also
6886 * accounted for in upl->offset. It possibly shouldn't be
6887 * this way, but for now don't account for it twice.
6888 */
6889 shadow_offset = 0;
6890 assert(upl_object->paging_offset == 0); /* XXX ? */
6891 vm_object_lock(shadow_object);
6892 } else {
6893 shadow_object = upl_object;
6894 shadow_offset = 0;
6895 }
6896
6897 paging_offset = shadow_object->paging_offset;
6898 vm_object_paging_begin(shadow_object);
6899
6900 if (shadow_object != upl_object)
6901 vm_object_unlock(upl_object);
6902
6903
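/*
 * Convert the caller's UPL-relative crypt_offset into an offset in the
 * shadow object: upl->offset already includes the pager's
 * paging_offset, so that paging_offset is subtracted back out below.
 */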
6904 base_offset = shadow_offset;
6905 base_offset += upl_offset;
6906 base_offset += crypt_offset;
6907 base_offset -= paging_offset;
6908
6909 assert(crypt_offset + crypt_size <= upl_size);
6910
6911 for (offset_in_upl = 0;
6912 offset_in_upl < crypt_size;
6913 offset_in_upl += PAGE_SIZE) {
6914 page = vm_page_lookup(shadow_object,
6915 base_offset + offset_in_upl);
6916 if (page == VM_PAGE_NULL) {
6917 panic("upl_encrypt: "
6918 "no page for (obj=%p,off=%lld+%d)!\n",
6919 shadow_object,
6920 base_offset,
6921 offset_in_upl);
6922 }
6923 /*
6924 * Disconnect the page from all pmaps, so that nobody can
6925 * access it while it's encrypted. After that point, all
6926 * accesses to this page will cause a page fault and block
6927 * while the page is busy being encrypted. After the
6928 * encryption completes, any access will cause a
6929 * page fault and the page gets decrypted at that time.
6930 */
6931 pmap_disconnect(page->phys_page);
6932 vm_page_encrypt(page, 0);
6933
6934 if (vm_object_lock_avoid(shadow_object)) {
6935 /*
6936 * Give vm_pageout_scan() a chance to convert more
6937 * pages from "clean-in-place" to "clean-and-free",
6938 * if it's interested in the same pages we selected
6939 * in this cluster.
6940 */
6941 vm_object_unlock(shadow_object);
6942 mutex_pause(2);
6943 vm_object_lock(shadow_object);
6944 }
6945 }
6946
6947 vm_object_paging_end(shadow_object);
6948 vm_object_unlock(shadow_object);
6949
6950 if(isVectorUPL && subupl_size)
6951 goto process_upl_to_encrypt;
6952 }
6953
6954 #else /* CRYPTO */
6955 void
6956 upl_encrypt(
6957 __unused upl_t upl,
6958 __unused upl_offset_t crypt_offset,
6959 __unused upl_size_t crypt_size)
6960 {
6961 }
6962
6963 void
6964 vm_page_encrypt(
6965 __unused vm_page_t page,
6966 __unused vm_map_offset_t kernel_mapping_offset)
6967 {
6968 }
6969
6970 void
6971 vm_page_decrypt(
6972 __unused vm_page_t page,
6973 __unused vm_map_offset_t kernel_mapping_offset)
6974 {
6975 }
6976
6977 #endif /* CRYPTO */
6978
6979 void
6980 vm_pageout_queue_steal(vm_page_t page, boolean_t queues_locked)
6981 {
6982 boolean_t pageout;
6983
6984 pageout = page->pageout;
6985
6986 page->list_req_pending = FALSE;
6987 page->cleaning = FALSE;
6988 page->pageout = FALSE;
6989
6990 if (!queues_locked) {
6991 vm_page_lockspin_queues();
6992 }
6993
6994 /*
6995 * need to drop the laundry count...
6996 * we may also need to remove it
6997 * from the I/O paging queue...
6998 * vm_pageout_throttle_up handles both cases
6999 *
7000 * the laundry and pageout_queue flags are cleared...
7001 */
7002 vm_pageout_throttle_up(page);
7003
7004 if (pageout == TRUE) {
7005 /*
7006 * toss the wire count we picked up
7007 * when we initially set this page up
7008 * to be cleaned...
7009 */
7010 vm_page_unwire(page, TRUE);
7011 }
7012 vm_page_steal_pageout_page++;
7013
7014 if (!queues_locked) {
7015 vm_page_unlock_queues();
7016 }
7017 }
7018
7019 upl_t
7020 vector_upl_create(vm_offset_t upl_offset)
7021 {
7022 int vector_upl_size = sizeof(struct _vector_upl);
7023 int i=0;
7024 upl_t upl;
7025 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
7026
7027 upl = upl_create(0,UPL_VECTOR,0);
7028 upl->vector_upl = vector_upl;
7029 upl->offset = upl_offset;
7030 vector_upl->size = 0;
7031 vector_upl->offset = upl_offset;
7032 vector_upl->invalid_upls=0;
7033 vector_upl->num_upls=0;
7034 vector_upl->pagelist = NULL;
7035
7036 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
7037 vector_upl->upl_iostates[i].size = 0;
7038 vector_upl->upl_iostates[i].offset = 0;
7039
7040 }
7041 return upl;
7042 }
7043
7044 void
7045 vector_upl_deallocate(upl_t upl)
7046 {
7047 if(upl) {
7048 vector_upl_t vector_upl = upl->vector_upl;
7049 if(vector_upl) {
7050 if(vector_upl->invalid_upls != vector_upl->num_upls)
7051 panic("Deallocating non-empty Vectored UPL\n");
7052 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
7053 vector_upl->invalid_upls=0;
7054 vector_upl->num_upls = 0;
7055 vector_upl->pagelist = NULL;
7056 vector_upl->size = 0;
7057 vector_upl->offset = 0;
7058 kfree(vector_upl, sizeof(struct _vector_upl));
7059 vector_upl = (vector_upl_t)0xdeadbeef;
7060 }
7061 else
7062 panic("vector_upl_deallocate was passed a non-vectored upl\n");
7063 }
7064 else
7065 panic("vector_upl_deallocate was passed a NULL upl\n");
7066 }
7067
7068 boolean_t
7069 vector_upl_is_valid(upl_t upl)
7070 {
7071 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
7072 vector_upl_t vector_upl = upl->vector_upl;
7073 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xdeadbeef || vector_upl == (vector_upl_t)0xfeedbeef)
7074 return FALSE;
7075 else
7076 return TRUE;
7077 }
7078 return FALSE;
7079 }
7080
7081 boolean_t
7082 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
7083 {
7084 if(vector_upl_is_valid(upl)) {
7085 vector_upl_t vector_upl = upl->vector_upl;
7086
7087 if(vector_upl) {
7088 if(subupl) {
7089 if(io_size) {
7090 if(io_size < PAGE_SIZE)
7091 io_size = PAGE_SIZE;
7092 subupl->vector_upl = (void*)vector_upl;
7093 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
7094 vector_upl->size += io_size;
7095 upl->size += io_size;
7096 }
7097 else {
7098 uint32_t i=0,invalid_upls=0;
7099 for(i = 0; i < vector_upl->num_upls; i++) {
7100 if(vector_upl->upl_elems[i] == subupl)
7101 break;
7102 }
7103 if(i == vector_upl->num_upls)
7104 panic("Trying to remove sub-upl when none exists");
7105
7106 vector_upl->upl_elems[i] = NULL;
7107 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
7108 if(invalid_upls == vector_upl->num_upls)
7109 return TRUE;
7110 else
7111 return FALSE;
7112 }
7113 }
7114 else
7115 panic("vector_upl_set_subupl was passed a NULL upl element\n");
7116 }
7117 else
7118 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
7119 }
7120 else
7121 panic("vector_upl_set_subupl was passed a NULL upl\n");
7122
7123 return FALSE;
7124 }
7125
7126 void
7127 vector_upl_set_pagelist(upl_t upl)
7128 {
7129 if(vector_upl_is_valid(upl)) {
7130 uint32_t i=0;
7131 vector_upl_t vector_upl = upl->vector_upl;
7132
7133 if(vector_upl) {
7134 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
7135
7136 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
7137
7138 for(i=0; i < vector_upl->num_upls; i++) {
7139 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
7140 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
7141 pagelist_size += cur_upl_pagelist_size;
7142 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
7143 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
7144 }
7145 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
7146 }
7147 else
7148 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
7149 }
7150 else
7151 panic("vector_upl_set_pagelist was passed a NULL upl\n");
7152
7153 }
7154
7155 upl_t
7156 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
7157 {
7158 if(vector_upl_is_valid(upl)) {
7159 vector_upl_t vector_upl = upl->vector_upl;
7160 if(vector_upl) {
7161 if(index < vector_upl->num_upls)
7162 return vector_upl->upl_elems[index];
7163 }
7164 else
7165 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
7166 }
7167 return NULL;
7168 }
7169
7170 upl_t
7171 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
7172 {
7173 if(vector_upl_is_valid(upl)) {
7174 uint32_t i=0;
7175 vector_upl_t vector_upl = upl->vector_upl;
7176
7177 if(vector_upl) {
7178 upl_t subupl = NULL;
7179 vector_upl_iostates_t subupl_state;
7180
7181 for(i=0; i < vector_upl->num_upls; i++) {
7182 subupl = vector_upl->upl_elems[i];
7183 subupl_state = vector_upl->upl_iostates[i];
7184 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
7185 /* We could have been passed an offset/size pair that belongs
7186 * to a UPL element that has already been committed/aborted.
7187 * If so, return NULL.
7188 */
7189 if(subupl == NULL)
7190 return NULL;
7191 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
7192 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
7193 if(*upl_size > subupl_state.size)
7194 *upl_size = subupl_state.size;
7195 }
7196 if(*upl_offset >= subupl_state.offset)
7197 *upl_offset -= subupl_state.offset;
7198 else if(i)
7199 panic("Vector UPL offset miscalculation\n");
7200 return subupl;
7201 }
7202 }
7203 }
7204 else
7205 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
7206 }
7207 return NULL;
7208 }
7209
7210 void
7211 vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
7212 {
7213 *v_upl_submap = NULL;
7214
7215 if(vector_upl_is_valid(upl)) {
7216 vector_upl_t vector_upl = upl->vector_upl;
7217 if(vector_upl) {
7218 *v_upl_submap = vector_upl->submap;
7219 *submap_dst_addr = vector_upl->submap_dst_addr;
7220 }
7221 else
7222 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7223 }
7224 else
7225 panic("vector_upl_get_submap was passed a null UPL\n");
7226 }
7227
7228 void
7229 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
7230 {
7231 if(vector_upl_is_valid(upl)) {
7232 vector_upl_t vector_upl = upl->vector_upl;
7233 if(vector_upl) {
7234 vector_upl->submap = submap;
7235 vector_upl->submap_dst_addr = submap_dst_addr;
7236 }
7237 else
7238 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
7239 }
7240 else
7241 panic("vector_upl_get_submap was passed a NULL UPL\n");
7242 }
7243
7244 void
7245 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
7246 {
7247 if(vector_upl_is_valid(upl)) {
7248 uint32_t i = 0;
7249 vector_upl_t vector_upl = upl->vector_upl;
7250
7251 if(vector_upl) {
7252 for(i = 0; i < vector_upl->num_upls; i++) {
7253 if(vector_upl->upl_elems[i] == subupl)
7254 break;
7255 }
7256
7257 if(i == vector_upl->num_upls)
7258 panic("setting sub-upl iostate when none exists");
7259
7260 vector_upl->upl_iostates[i].offset = offset;
7261 if(size < PAGE_SIZE)
7262 size = PAGE_SIZE;
7263 vector_upl->upl_iostates[i].size = size;
7264 }
7265 else
7266 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
7267 }
7268 else
7269 panic("vector_upl_set_iostate was passed a NULL UPL\n");
7270 }
7271
7272 void
7273 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
7274 {
7275 if(vector_upl_is_valid(upl)) {
7276 uint32_t i = 0;
7277 vector_upl_t vector_upl = upl->vector_upl;
7278
7279 if(vector_upl) {
7280 for(i = 0; i < vector_upl->num_upls; i++) {
7281 if(vector_upl->upl_elems[i] == subupl)
7282 break;
7283 }
7284
7285 if(i == vector_upl->num_upls)
7286 panic("getting sub-upl iostate when none exists");
7287
7288 *offset = vector_upl->upl_iostates[i].offset;
7289 *size = vector_upl->upl_iostates[i].size;
7290 }
7291 else
7292 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
7293 }
7294 else
7295 panic("vector_upl_get_iostate was passed a NULL UPL\n");
7296 }
7297
7298 void
7299 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
7300 {
7301 if(vector_upl_is_valid(upl)) {
7302 vector_upl_t vector_upl = upl->vector_upl;
7303 if(vector_upl) {
7304 if(index < vector_upl->num_upls) {
7305 *offset = vector_upl->upl_iostates[index].offset;
7306 *size = vector_upl->upl_iostates[index].size;
7307 }
7308 else
7309 *offset = *size = 0;
7310 }
7311 else
7312 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
7313 }
7314 else
7315 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
7316 }
7317
7318 upl_page_info_t *
7319 upl_get_internal_vectorupl_pagelist(upl_t upl)
7320 {
7321 return ((vector_upl_t)(upl->vector_upl))->pagelist;
7322 }
7323
7324 void *
7325 upl_get_internal_vectorupl(upl_t upl)
7326 {
7327 return upl->vector_upl;
7328 }
7329
7330 vm_size_t
7331 upl_get_internal_pagelist_offset(void)
7332 {
7333 return sizeof(struct upl);
7334 }
7335
7336 void
7337 upl_clear_dirty(
7338 upl_t upl,
7339 boolean_t value)
7340 {
7341 if (value) {
7342 upl->flags |= UPL_CLEAR_DIRTY;
7343 } else {
7344 upl->flags &= ~UPL_CLEAR_DIRTY;
7345 }
7346 }
7347
7348
7349 #ifdef MACH_BSD
7350
7351 boolean_t upl_device_page(upl_page_info_t *upl)
7352 {
7353 return(UPL_DEVICE_PAGE(upl));
7354 }
7355 boolean_t upl_page_present(upl_page_info_t *upl, int index)
7356 {
7357 return(UPL_PAGE_PRESENT(upl, index));
7358 }
7359 boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
7360 {
7361 return(UPL_SPECULATIVE_PAGE(upl, index));
7362 }
7363 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
7364 {
7365 return(UPL_DIRTY_PAGE(upl, index));
7366 }
7367 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
7368 {
7369 return(UPL_VALID_PAGE(upl, index));
7370 }
7371 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
7372 {
7373 return(UPL_PHYS_PAGE(upl, index));
7374 }
7375
7376
7377 void
7378 vm_countdirtypages(void)
7379 {
7380 vm_page_t m;
7381 int dpages;
7382 int pgopages;
7383 int precpages;
7384
7385
7386 dpages=0;
7387 pgopages=0;
7388 precpages=0;
7389
7390 vm_page_lock_queues();
7391 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
7392 do {
7393 if (m ==(vm_page_t )0) break;
7394
7395 if(m->dirty) dpages++;
7396 if(m->pageout) pgopages++;
7397 if(m->precious) precpages++;
7398
7399 assert(m->object != kernel_object);
7400 m = (vm_page_t) queue_next(&m->pageq);
7401 if (m ==(vm_page_t )0) break;
7402
7403 } while (!queue_end(&vm_page_queue_inactive,(queue_entry_t) m));
7404 vm_page_unlock_queues();
7405
7406 vm_page_lock_queues();
7407 m = (vm_page_t) queue_first(&vm_page_queue_throttled);
7408 do {
7409 if (m ==(vm_page_t )0) break;
7410
7411 dpages++;
7412 assert(m->dirty);
7413 assert(!m->pageout);
7414 assert(m->object != kernel_object);
7415 m = (vm_page_t) queue_next(&m->pageq);
7416 if (m ==(vm_page_t )0) break;
7417
7418 } while (!queue_end(&vm_page_queue_throttled,(queue_entry_t) m));
7419 vm_page_unlock_queues();
7420
7421 vm_page_lock_queues();
7422 m = (vm_page_t) queue_first(&vm_page_queue_zf);
7423 do {
7424 if (m ==(vm_page_t )0) break;
7425
7426 if(m->dirty) dpages++;
7427 if(m->pageout) pgopages++;
7428 if(m->precious) precpages++;
7429
7430 assert(m->object != kernel_object);
7431 m = (vm_page_t) queue_next(&m->pageq);
7432 if (m ==(vm_page_t )0) break;
7433
7434 } while (!queue_end(&vm_page_queue_zf,(queue_entry_t) m));
7435 vm_page_unlock_queues();
7436
7437 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
7438
7439 dpages=0;
7440 pgopages=0;
7441 precpages=0;
7442
7443 vm_page_lock_queues();
7444 m = (vm_page_t) queue_first(&vm_page_queue_active);
7445
7446 do {
7447 if(m == (vm_page_t )0) break;
7448 if(m->dirty) dpages++;
7449 if(m->pageout) pgopages++;
7450 if(m->precious) precpages++;
7451
7452 assert(m->object != kernel_object);
7453 m = (vm_page_t) queue_next(&m->pageq);
7454 if(m == (vm_page_t )0) break;
7455
7456 } while (!queue_end(&vm_page_queue_active,(queue_entry_t) m));
7457 vm_page_unlock_queues();
7458
7459 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
7460
7461 }
7462 #endif /* MACH_BSD */
7463
7464 ppnum_t upl_get_highest_page(
7465 upl_t upl)
7466 {
7467 return upl->highest_page;
7468 }
7469
7470 upl_size_t upl_get_size(
7471 upl_t upl)
7472 {
7473 return upl->size;
7474 }
7475
7476 #if UPL_DEBUG
7477 kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
7478 {
7479 upl->ubc_alias1 = alias1;
7480 upl->ubc_alias2 = alias2;
7481 return KERN_SUCCESS;
7482 }
7483 int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
7484 {
7485 if(al)
7486 *al = upl->ubc_alias1;
7487 if(al2)
7488 *al2 = upl->ubc_alias2;
7489 return KERN_SUCCESS;
7490 }
7491 #endif /* UPL_DEBUG */
7492
7493
7494
7495 #if MACH_KDB
7496 #include <ddb/db_output.h>
7497 #include <ddb/db_print.h>
7498 #include <vm/vm_print.h>
7499
7500 #define printf kdbprintf
7501 void db_pageout(void);
7502
7503 void
7504 db_vm(void)
7505 {
7506
7507 iprintf("VM Statistics:\n");
7508 db_indent += 2;
7509 iprintf("pages:\n");
7510 db_indent += 2;
7511 iprintf("activ %5d inact %5d free %5d",
7512 vm_page_active_count, vm_page_inactive_count,
7513 vm_page_free_count);
7514 printf(" wire %5d gobbl %5d\n",
7515 vm_page_wire_count, vm_page_gobble_count);
7516 db_indent -= 2;
7517 iprintf("target:\n");
7518 db_indent += 2;
7519 iprintf("min %5d inact %5d free %5d",
7520 vm_page_free_min, vm_page_inactive_target,
7521 vm_page_free_target);
7522 printf(" resrv %5d\n", vm_page_free_reserved);
7523 db_indent -= 2;
7524 iprintf("pause:\n");
7525 db_pageout();
7526 db_indent -= 2;
7527 }
7528
7529 #if MACH_COUNTERS
7530 extern int c_laundry_pages_freed;
7531 #endif /* MACH_COUNTERS */
7532
7533 void
7534 db_pageout(void)
7535 {
7536 iprintf("Pageout Statistics:\n");
7537 db_indent += 2;
7538 iprintf("active %5d inactv %5d\n",
7539 vm_pageout_active, vm_pageout_inactive);
7540 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
7541 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
7542 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
7543 iprintf("used %5d clean %5d dirty %5d\n",
7544 vm_pageout_inactive_used, vm_pageout_inactive_clean,
7545 vm_pageout_inactive_dirty);
7546 #if MACH_COUNTERS
7547 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
7548 #endif /* MACH_COUNTERS */
7549 #if MACH_CLUSTER_STATS
7550 iprintf("Cluster Statistics:\n");
7551 db_indent += 2;
7552 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
7553 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
7554 vm_pageout_cluster_collisions);
7555 iprintf("clusters %5d conversions %5d\n",
7556 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
7557 db_indent -= 2;
7558 iprintf("Target Statistics:\n");
7559 db_indent += 2;
7560 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
7561 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
7562 vm_pageout_target_page_freed);
7563 db_indent -= 2;
7564 #endif /* MACH_CLUSTER_STATS */
7565 db_indent -= 2;
7566 }
7567
7568 #endif /* MACH_KDB */