1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66 #include <stdint.h>
67
68 #include <debug.h>
69 #include <mach_pagemap.h>
70 #include <mach_cluster_stats.h>
71 #include <mach_kdb.h>
72 #include <advisory_pageout.h>
73
74 #include <mach/mach_types.h>
75 #include <mach/memory_object.h>
76 #include <mach/memory_object_default.h>
77 #include <mach/memory_object_control_server.h>
78 #include <mach/mach_host_server.h>
79 #include <mach/upl.h>
80 #include <mach/vm_map.h>
81 #include <mach/vm_param.h>
82 #include <mach/vm_statistics.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/counters.h>
86 #include <kern/host_statistics.h>
87 #include <kern/machine.h>
88 #include <kern/misc_protos.h>
89 #include <kern/thread.h>
90 #include <kern/xpr.h>
91 #include <kern/kalloc.h>
92
93 #include <machine/vm_tuning.h>
94
95 #include <vm/pmap.h>
96 #include <vm/vm_fault.h>
97 #include <vm/vm_map.h>
98 #include <vm/vm_object.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/vm_protos.h> /* must be last */
102
103 /*
104 * ENCRYPTED SWAP:
105 */
106 #include <../bsd/crypto/aes/aes.h>
107
108 extern ipc_port_t memory_manager_default;
109
110
111 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
112 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
113 #endif
114
115 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
116 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
117 #endif
118
119 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
120 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
121 #endif
122
123 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
124 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
125 #endif
126
127 #ifndef VM_PAGE_LAUNDRY_MAX
128 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
129 #endif /* VM_PAGE_LAUNDRY_MAX */
130
131 #ifndef VM_PAGEOUT_BURST_WAIT
132 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
133 #endif /* VM_PAGEOUT_BURST_WAIT */
134
135 #ifndef VM_PAGEOUT_EMPTY_WAIT
136 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
137 #endif /* VM_PAGEOUT_EMPTY_WAIT */
138
139 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
140 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
141 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
142
143 #ifndef VM_PAGEOUT_IDLE_WAIT
144 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
145 #endif /* VM_PAGEOUT_IDLE_WAIT */
146
147
148 /*
149 * To obtain a reasonable LRU approximation, the inactive queue
150 * needs to be large enough to give pages on it a chance to be
151 * referenced a second time. This macro defines the fraction
152 * of active+inactive pages that should be inactive.
153 * The pageout daemon uses it to update vm_page_inactive_target.
154 *
155 * If vm_page_free_count falls below vm_page_free_target and
156 * vm_page_inactive_count is below vm_page_inactive_target,
157 * then the pageout daemon starts running.
158 */
159
160 #ifndef VM_PAGE_INACTIVE_TARGET
161 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
162 #endif /* VM_PAGE_INACTIVE_TARGET */
163
164 /*
165 * Once the pageout daemon starts running, it keeps going
166 * until vm_page_free_count meets or exceeds vm_page_free_target.
167 */
168
169 #ifndef VM_PAGE_FREE_TARGET
170 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
171 #endif /* VM_PAGE_FREE_TARGET */
172
173 /*
174 * The pageout daemon always starts running once vm_page_free_count
175 * falls below vm_page_free_min.
176 */
177
178 #ifndef VM_PAGE_FREE_MIN
179 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
180 #endif /* VM_PAGE_FREE_MIN */
181
182 /*
183 * When vm_page_free_count falls below vm_page_free_reserved,
184 * only vm-privileged threads can allocate pages. vm-privilege
185 * allows the pageout daemon and default pager (and any other
186 * associated threads needed for default pageout) to continue
187 * operation by dipping into the reserved pool of pages.
188 */
189
190 #ifndef VM_PAGE_FREE_RESERVED
191 #define VM_PAGE_FREE_RESERVED(n) \
192 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
193 #endif /* VM_PAGE_FREE_RESERVED */
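/*
 * Illustrative example only (not normative): with VM_PAGE_LAUNDRY_MAX
 * at its default of 16, a hypothetical reservation argument n == 100,
 * and roughly 100000 free pages at startup:
 *
 *	VM_PAGE_FREE_RESERVED(100)	= (6 * 16) + 100	= 196
 *	free_after_reserve		= 100000 - 196		= 99804
 *	VM_PAGE_FREE_MIN(99804)		= 10 + 99804 / 100	= 1008
 *	VM_PAGE_FREE_TARGET(99804)	= 15 + 99804 / 80	= 1262
 *
 * vm_page_free_reserve() below adds the reserved count back into the
 * min/target values so the reserved pool stays available to
 * vm-privileged threads.
 */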
194
195
196 /*
197 * must hold the page queues lock to
198 * manipulate this structure
199 */
200 struct vm_pageout_queue {
201 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
202 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
203 unsigned int pgo_maxlaundry;
204
205 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
206 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
207 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
208 :0;
209 };
210
211 #define VM_PAGE_Q_THROTTLED(q) \
212 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
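/*
 * Usage sketch (for exposition; see vm_pageout_scan() for the real
 * checks): a dirty page is only handed to vm_pageout_cluster() when the
 * queue it targets is not throttled, e.g.
 *
 *	if (object->internal ? VM_PAGE_Q_THROTTLED(iq)
 *			     : VM_PAGE_Q_THROTTLED(eq))
 *		... requeue the page and try another one ...
 *
 * pgo_laundry is raised in vm_pageout_cluster() and lowered in
 * vm_pageout_throttle_up(), both under the page queues lock.
 */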
213
214
215 /*
216 * Exported variable used to broadcast the activation of the pageout scan.
217 * The Working Set code uses this to throttle its use of pmap removes. In this
218 * way, code which runs within memory in an uncontested context does
219 * not keep encountering soft faults.
220 */
221
222 unsigned int vm_pageout_scan_event_counter = 0;
223
224 /*
225 * Forward declarations for internal routines.
226 */
227
228 static void vm_pageout_garbage_collect(int);
229 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
230 static void vm_pageout_iothread_external(void);
231 static void vm_pageout_iothread_internal(void);
232 static void vm_pageout_queue_steal(vm_page_t);
233
234 extern void vm_pageout_continue(void);
235 extern void vm_pageout_scan(void);
236
237 unsigned int vm_pageout_reserved_internal = 0;
238 unsigned int vm_pageout_reserved_really = 0;
239
240 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
241 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
242 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
243 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
244 unsigned int vm_pageout_deadlock_relief = 0;
245 unsigned int vm_pageout_inactive_relief = 0;
246 unsigned int vm_pageout_burst_active_throttle = 0;
247 unsigned int vm_pageout_burst_inactive_throttle = 0;
248
249 /*
250 * Protection against zero fill flushing live working sets derived
251 * from existing backing store and files
252 */
253 unsigned int vm_accellerate_zf_pageout_trigger = 400;
254 unsigned int vm_zf_iterator;
255 unsigned int vm_zf_iterator_count = 40;
256 unsigned int last_page_zf;
257 unsigned int vm_zf_count = 0;
258
259 /*
260 * These variables record the pageout daemon's actions:
261 * how many pages it looks at and what happens to those pages.
262 * No locking needed because only one thread modifies the variables.
263 */
264
265 unsigned int vm_pageout_active = 0; /* debugging */
266 unsigned int vm_pageout_inactive = 0; /* debugging */
267 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
268 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
269 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
270 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
271 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
272 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
273 unsigned int vm_pageout_inactive_used = 0; /* debugging */
274 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
275 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
276 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
277 unsigned int vm_pageout_purged_objects = 0; /* debugging */
278 unsigned int vm_stat_discard = 0; /* debugging */
279 unsigned int vm_stat_discard_sent = 0; /* debugging */
280 unsigned int vm_stat_discard_failure = 0; /* debugging */
281 unsigned int vm_stat_discard_throttle = 0; /* debugging */
282
283 unsigned int vm_pageout_scan_active_throttled = 0;
284 unsigned int vm_pageout_scan_inactive_throttled = 0;
285 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
286 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
287 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
288 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
289 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
290 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
291 /*
292 * Backing store throttle when BS is exhausted
293 */
294 unsigned int vm_backing_store_low = 0;
295
296 unsigned int vm_pageout_out_of_line = 0;
297 unsigned int vm_pageout_in_place = 0;
298
299 /*
300 * ENCRYPTED SWAP:
301 * counters and statistics...
302 */
303 unsigned long vm_page_decrypt_counter = 0;
304 unsigned long vm_page_decrypt_for_upl_counter = 0;
305 unsigned long vm_page_encrypt_counter = 0;
306 unsigned long vm_page_encrypt_abort_counter = 0;
307 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
308 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
309
310
311 struct vm_pageout_queue vm_pageout_queue_internal;
312 struct vm_pageout_queue vm_pageout_queue_external;
313
314
315 /*
316 * Routine: vm_backing_store_disable
317 * Purpose:
318 * Suspend non-privileged threads wishing to extend
319 * backing store when we are low on backing store
320 * (Synchronized by caller)
321 */
322 void
323 vm_backing_store_disable(
324 boolean_t disable)
325 {
326 if(disable) {
327 vm_backing_store_low = 1;
328 } else {
329 if(vm_backing_store_low) {
330 vm_backing_store_low = 0;
331 thread_wakeup((event_t) &vm_backing_store_low);
332 }
333 }
334 }
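/*
 * Note: the wakeup above pairs with waits taken elsewhere in the VM
 * system by non-privileged threads that observed vm_backing_store_low
 * set and blocked on &vm_backing_store_low; those call sites live
 * outside this file.
 */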
335
336
337 /*
338 * Routine: vm_pageout_object_allocate
339 * Purpose:
340 * Allocate an object for use as out-of-line memory in a
341 * data_return/data_initialize message.
342 * The page must be in an unlocked object.
343 *
344 * If the page belongs to a trusted pager, cleaning in place
345 * will be used, which utilizes a special "pageout object"
346 * containing private alias pages for the real page frames.
347 * Untrusted pagers use normal out-of-line memory.
348 */
349 vm_object_t
350 vm_pageout_object_allocate(
351 vm_page_t m,
352 vm_size_t size,
353 vm_object_offset_t offset)
354 {
355 vm_object_t object = m->object;
356 vm_object_t new_object;
357
358 assert(object->pager_ready);
359
360 new_object = vm_object_allocate(size);
361
362 if (object->pager_trusted) {
363 assert (offset < object->size);
364
365 vm_object_lock(new_object);
366 new_object->pageout = TRUE;
367 new_object->shadow = object;
368 new_object->can_persist = FALSE;
369 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
370 new_object->shadow_offset = offset;
371 vm_object_unlock(new_object);
372
373 /*
374 * Take a paging reference on the object. This will be dropped
375 * in vm_pageout_object_terminate()
376 */
377 vm_object_lock(object);
378 vm_object_paging_begin(object);
379 vm_page_lock_queues();
380 vm_page_unlock_queues();
381 vm_object_unlock(object);
382
383 vm_pageout_in_place++;
384 } else
385 vm_pageout_out_of_line++;
386 return(new_object);
387 }
388
389 #if MACH_CLUSTER_STATS
390 unsigned long vm_pageout_cluster_dirtied = 0;
391 unsigned long vm_pageout_cluster_cleaned = 0;
392 unsigned long vm_pageout_cluster_collisions = 0;
393 unsigned long vm_pageout_cluster_clusters = 0;
394 unsigned long vm_pageout_cluster_conversions = 0;
395 unsigned long vm_pageout_target_collisions = 0;
396 unsigned long vm_pageout_target_page_dirtied = 0;
397 unsigned long vm_pageout_target_page_freed = 0;
398 #define CLUSTER_STAT(clause) clause
399 #else /* MACH_CLUSTER_STATS */
400 #define CLUSTER_STAT(clause)
401 #endif /* MACH_CLUSTER_STATS */
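/*
 * CLUSTER_STAT(x) expands to 'x' only when MACH_CLUSTER_STATS is
 * configured, so statements such as
 *	CLUSTER_STAT(vm_pageout_cluster_dirtied++;)
 * disappear entirely from non-stats builds.
 */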
402
403 /*
404 * Routine: vm_pageout_object_terminate
405 * Purpose:
406 * Destroy the pageout_object allocated by
407 * vm_pageout_object_allocate(), and perform all of the
408 * required cleanup actions.
409 *
410 * In/Out conditions:
411 * The object must be locked, and will be returned locked.
412 */
413 void
414 vm_pageout_object_terminate(
415 vm_object_t object)
416 {
417 vm_object_t shadow_object;
418 boolean_t shadow_internal;
419
420 /*
421 * Deal with the deallocation (last reference) of a pageout object
422 * (used for cleaning-in-place) by dropping the paging references/
423 * freeing pages in the original object.
424 */
425
426 assert(object->pageout);
427 shadow_object = object->shadow;
428 vm_object_lock(shadow_object);
429 shadow_internal = shadow_object->internal;
430
431 while (!queue_empty(&object->memq)) {
432 vm_page_t p, m;
433 vm_object_offset_t offset;
434
435 p = (vm_page_t) queue_first(&object->memq);
436
437 assert(p->private);
438 assert(p->pageout);
439 p->pageout = FALSE;
440 assert(!p->cleaning);
441
442 offset = p->offset;
443 VM_PAGE_FREE(p);
444 p = VM_PAGE_NULL;
445
446 m = vm_page_lookup(shadow_object,
447 offset + object->shadow_offset);
448
449 if(m == VM_PAGE_NULL)
450 continue;
451 assert(m->cleaning);
452 /* used as a trigger on upl_commit etc to recognize the */
453 /* pageout daemon's subsequent desire to pageout a cleaning */
454 /* page. When the bit is set, the upl commit code will */
455 /* respect the pageout bit in the target page over the */
456 /* caller's page list indication */
457 m->dump_cleaning = FALSE;
458
459 /*
460 * Account for the paging reference taken when
461 * m->cleaning was set on this page.
462 */
463 vm_object_paging_end(shadow_object);
464 assert((m->dirty) || (m->precious) ||
465 (m->busy && m->cleaning));
466
467 /*
468 * Handle the trusted pager throttle.
469 * Also decrement the burst throttle (if external).
470 */
471 vm_page_lock_queues();
472 if (m->laundry) {
473 vm_pageout_throttle_up(m);
474 }
475
476 /*
477 * Handle the "target" page(s). These pages are to be freed if
478 * successfully cleaned. Target pages are always busy, and are
479 * wired exactly once. The initial target pages are not mapped,
480 * (so cannot be referenced or modified) but converted target
481 * pages may have been modified between the selection as an
482 * adjacent page and conversion to a target.
483 */
484 if (m->pageout) {
485 assert(m->busy);
486 assert(m->wire_count == 1);
487 m->cleaning = FALSE;
488 m->pageout = FALSE;
489 #if MACH_CLUSTER_STATS
490 if (m->wanted) vm_pageout_target_collisions++;
491 #endif
492 /*
493 * Revoke all access to the page. Since the object is
494 * locked, and the page is busy, this prevents the page
495 * from being dirtied after the pmap_disconnect() call
496 * returns.
497 *
498 * Since the page is left "dirty" but "not modified", we
499 * can detect whether the page was redirtied during
500 * pageout by checking the modify state.
501 */
502 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
503 m->dirty = TRUE;
504 else
505 m->dirty = FALSE;
506
507 if (m->dirty) {
508 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
509 vm_page_unwire(m);/* reactivates */
510 VM_STAT(reactivations++);
511 PAGE_WAKEUP_DONE(m);
512 } else {
513 CLUSTER_STAT(vm_pageout_target_page_freed++;)
514 vm_page_free(m);/* clears busy, etc. */
515 }
516 vm_page_unlock_queues();
517 continue;
518 }
519 /*
520 * Handle the "adjacent" pages. These pages were cleaned in
521 * place, and should be left alone.
522 * If the page was referenced, make it active; otherwise
523 * deactivate it.
524 */
525 if (!m->active && !m->inactive && !m->private) {
526 if (m->reference)
527 vm_page_activate(m);
528 else
529 vm_page_deactivate(m);
530 }
531 if((m->busy) && (m->cleaning)) {
532
533 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
534 m->busy = FALSE;
535
536 /* We do not re-set m->dirty ! */
537 /* The page was busy so no extraneous activity */
538 /* could have occurred. COPY_INTO is a read into the */
539 /* new pages. CLEAN_IN_PLACE does actually write */
540 /* out the pages but handling outside of this code */
541 /* will take care of resetting dirty. We clear the */
542 /* modify bit, however, for the Programmed I/O case. */
543 pmap_clear_modify(m->phys_page);
544 if(m->absent) {
545 m->absent = FALSE;
546 if(shadow_object->absent_count == 1)
547 vm_object_absent_release(shadow_object);
548 else
549 shadow_object->absent_count--;
550 }
551 m->overwriting = FALSE;
552 } else if (m->overwriting) {
553 /* alternate request page list, write to page_list */
554 /* case. Occurs when the original page was wired */
555 /* at the time of the list request */
556 assert(m->wire_count != 0);
557 vm_page_unwire(m);/* reactivates */
558 m->overwriting = FALSE;
559 } else {
560 /*
561 * Set the dirty state according to whether or not the page was
562 * modified during the pageout. Note that we purposefully do
563 * NOT call pmap_clear_modify since the page is still mapped.
564 * If the page were to be dirtied between the 2 calls, this
565 * fact would be lost. This code is only necessary to
566 * maintain statistics, since the pmap module is always
567 * consulted if m->dirty is false.
568 */
569 #if MACH_CLUSTER_STATS
570 m->dirty = pmap_is_modified(m->phys_page);
571
572 if (m->dirty) vm_pageout_cluster_dirtied++;
573 else vm_pageout_cluster_cleaned++;
574 if (m->wanted) vm_pageout_cluster_collisions++;
575 #else
576 m->dirty = 0;
577 #endif
578 }
579 m->cleaning = FALSE;
580
581 /*
582 * Wakeup any thread waiting for the page to be un-cleaning.
583 */
584 PAGE_WAKEUP(m);
585 vm_page_unlock_queues();
586 }
587 /*
588 * Account for the paging reference taken in vm_paging_object_allocate.
589 */
590 vm_object_paging_end(shadow_object);
591 vm_object_unlock(shadow_object);
592
593 assert(object->ref_count == 0);
594 assert(object->paging_in_progress == 0);
595 assert(object->resident_page_count == 0);
596 return;
597 }
598
599 /*
600 * Routine: vm_pageout_setup
601 * Purpose:
602 * Set up a page for pageout (clean & flush).
603 *
604 * Move the page to a new object, as part of which it will be
605 * sent to its memory manager in a memory_object_data_write or
606 * memory_object_initialize message.
607 *
608 * The "new_object" and "new_offset" arguments
609 * indicate where the page should be moved.
610 *
611 * In/Out conditions:
612 * The page in question must not be on any pageout queues,
613 * and must be busy. The object to which it belongs
614 * must be unlocked, and the caller must hold a paging
615 * reference to it. The new_object must not be locked.
616 *
617 * This routine returns a pointer to a place-holder page,
618 * inserted at the same offset, to block out-of-order
619 * requests for the page. The place-holder page must
620 * be freed after the data_write or initialize message
621 * has been sent.
622 *
623 * The original page is put on a paging queue and marked
624 * not busy on exit.
625 */
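/*
 * Caller protocol, in outline (a sketch only; the authoritative callers
 * live in the memory-object paths outside this routine):
 *
 *	holding_page = vm_pageout_setup(m, new_object, new_offset);
 *	... send memory_object_data_write/initialize naming new_object ...
 *	if (holding_page != VM_PAGE_NULL)
 *		VM_PAGE_FREE(holding_page);
 *
 * A trusted pager cleans in place, so VM_PAGE_NULL is returned and no
 * place-holder needs to be freed.
 */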
626 vm_page_t
627 vm_pageout_setup(
628 register vm_page_t m,
629 register vm_object_t new_object,
630 vm_object_offset_t new_offset)
631 {
632 register vm_object_t old_object = m->object;
633 vm_object_offset_t paging_offset;
634 vm_object_offset_t offset;
635 register vm_page_t holding_page;
636 register vm_page_t new_m;
637 boolean_t need_to_wire = FALSE;
638
639
640 XPR(XPR_VM_PAGEOUT,
641 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
642 (integer_t)m->object, (integer_t)m->offset,
643 (integer_t)m, (integer_t)new_object,
644 (integer_t)new_offset);
645 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
646 !m->restart);
647
648 assert(m->dirty || m->precious);
649
650 /*
651 * Create a place-holder page where the old one was, to prevent
652 * attempted pageins of this page while we're unlocked.
653 */
654 VM_PAGE_GRAB_FICTITIOUS(holding_page);
655
656 vm_object_lock(old_object);
657
658 offset = m->offset;
659 paging_offset = offset + old_object->paging_offset;
660
661 if (old_object->pager_trusted) {
662 /*
663 * This pager is trusted, so we can clean this page
664 * in place. Leave it in the old object, and mark it
665 * cleaning & pageout.
666 */
667 new_m = holding_page;
668 holding_page = VM_PAGE_NULL;
669
670 /*
671 * Set up new page to be private shadow of real page.
672 */
673 new_m->phys_page = m->phys_page;
674 new_m->fictitious = FALSE;
675 new_m->pageout = TRUE;
676
677 /*
678 * Mark real page as cleaning (indicating that we hold a
679 * paging reference to be released via m_o_d_r_c) and
680 * pageout (indicating that the page should be freed
681 * when the pageout completes).
682 */
683 pmap_clear_modify(m->phys_page);
684 vm_page_lock_queues();
685 new_m->private = TRUE;
686 vm_page_wire(new_m);
687 m->cleaning = TRUE;
688 m->pageout = TRUE;
689
690 vm_page_wire(m);
691 assert(m->wire_count == 1);
692 vm_page_unlock_queues();
693
694 m->dirty = TRUE;
695 m->precious = FALSE;
696 m->page_lock = VM_PROT_NONE;
697 m->unusual = FALSE;
698 m->unlock_request = VM_PROT_NONE;
699 } else {
700 /*
701 * Cannot clean in place, so rip the old page out of the
702 * object, and stick the holding page in. Set new_m to the
703 * page in the new object.
704 */
705 vm_page_lock_queues();
706 VM_PAGE_QUEUES_REMOVE(m);
707 vm_page_remove(m);
708
709 vm_page_insert(holding_page, old_object, offset);
710 vm_page_unlock_queues();
711
712 m->dirty = TRUE;
713 m->precious = FALSE;
714 new_m = m;
715 new_m->page_lock = VM_PROT_NONE;
716 new_m->unlock_request = VM_PROT_NONE;
717
718 if (old_object->internal)
719 need_to_wire = TRUE;
720 }
721 /*
722 * Record that this page has been written out
723 */
724 #if MACH_PAGEMAP
725 vm_external_state_set(old_object->existence_map, offset);
726 #endif /* MACH_PAGEMAP */
727
728 vm_object_unlock(old_object);
729
730 vm_object_lock(new_object);
731
732 /*
733 * Put the page into the new object. If it is not wired
734 * (i.e., it's the real page) it will be activated.
735 */
736
737 vm_page_lock_queues();
738 vm_page_insert(new_m, new_object, new_offset);
739 if (need_to_wire)
740 vm_page_wire(new_m);
741 else
742 vm_page_activate(new_m);
743 PAGE_WAKEUP_DONE(new_m);
744 vm_page_unlock_queues();
745
746 vm_object_unlock(new_object);
747
748 /*
749 * Return the placeholder page to simplify cleanup.
750 */
751 return (holding_page);
752 }
753
754 /*
755 * Routine: vm_pageclean_setup
756 *
757 * Purpose: setup a page to be cleaned (made non-dirty), but not
758 * necessarily flushed from the VM page cache.
759 * This is accomplished by cleaning in place.
760 *
761 * The page must not be busy, and the object and page
762 * queues must be locked.
763 *
764 */
765 void
766 vm_pageclean_setup(
767 vm_page_t m,
768 vm_page_t new_m,
769 vm_object_t new_object,
770 vm_object_offset_t new_offset)
771 {
772 vm_object_t old_object = m->object;
773 assert(!m->busy);
774 assert(!m->cleaning);
775
776 XPR(XPR_VM_PAGEOUT,
777 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
778 (integer_t)old_object, m->offset, (integer_t)m,
779 (integer_t)new_m, new_offset);
780
781 pmap_clear_modify(m->phys_page);
782 vm_object_paging_begin(old_object);
783
784 /*
785 * Record that this page has been written out
786 */
787 #if MACH_PAGEMAP
788 vm_external_state_set(old_object->existence_map, m->offset);
789 #endif /*MACH_PAGEMAP*/
790
791 /*
792 * Mark original page as cleaning in place.
793 */
794 m->cleaning = TRUE;
795 m->dirty = TRUE;
796 m->precious = FALSE;
797
798 /*
799 * Convert the fictitious page to a private shadow of
800 * the real page.
801 */
802 assert(new_m->fictitious);
803 new_m->fictitious = FALSE;
804 new_m->private = TRUE;
805 new_m->pageout = TRUE;
806 new_m->phys_page = m->phys_page;
807 vm_page_wire(new_m);
808
809 vm_page_insert(new_m, new_object, new_offset);
810 assert(!new_m->wanted);
811 new_m->busy = FALSE;
812 }
813
814 void
815 vm_pageclean_copy(
816 vm_page_t m,
817 vm_page_t new_m,
818 vm_object_t new_object,
819 vm_object_offset_t new_offset)
820 {
821 XPR(XPR_VM_PAGEOUT,
822 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
823 m, new_m, new_object, new_offset, 0);
824
825 assert((!m->busy) && (!m->cleaning));
826
827 assert(!new_m->private && !new_m->fictitious);
828
829 pmap_clear_modify(m->phys_page);
830
831 m->busy = TRUE;
832 vm_object_paging_begin(m->object);
833 vm_page_unlock_queues();
834 vm_object_unlock(m->object);
835
836 /*
837 * Copy the original page to the new page.
838 */
839 vm_page_copy(m, new_m);
840
841 /*
842 * Mark the old page as clean. A request to pmap_is_modified
843 * will get the right answer.
844 */
845 vm_object_lock(m->object);
846 m->dirty = FALSE;
847
848 vm_object_paging_end(m->object);
849
850 vm_page_lock_queues();
851 if (!m->active && !m->inactive)
852 vm_page_activate(m);
853 PAGE_WAKEUP_DONE(m);
854
855 vm_page_insert(new_m, new_object, new_offset);
856 vm_page_activate(new_m);
857 new_m->busy = FALSE; /* No other thread can be waiting */
858 }
859
860
861 /*
862 * Routine: vm_pageout_initialize_page
863 * Purpose:
864 * Causes the specified page to be initialized in
865 * the appropriate memory object. This routine is used to push
866 * pages into a copy-object when they are modified in the
867 * permanent object.
868 *
869 * The page is moved to a temporary object and paged out.
870 *
871 * In/out conditions:
872 * The page in question must not be on any pageout queues.
873 * The object to which it belongs must be locked.
874 * The page must be busy, but not hold a paging reference.
875 *
876 * Implementation:
877 * Move this page to a completely new object.
878 */
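/*
 * Rough sequence (mirrors the body below): take a paging reference on
 * the object, mark the page busy/cleaning/pageout and wire it, drop the
 * object lock, then push the data to the object's pager via
 * memory_object_data_initialize().
 */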
879 void
880 vm_pageout_initialize_page(
881 vm_page_t m)
882 {
883 vm_object_t object;
884 vm_object_offset_t paging_offset;
885 vm_page_t holding_page;
886
887
888 XPR(XPR_VM_PAGEOUT,
889 "vm_pageout_initialize_page, page 0x%X\n",
890 (integer_t)m, 0, 0, 0, 0);
891 assert(m->busy);
892
893 /*
894 * Verify that we really want to clean this page
895 */
896 assert(!m->absent);
897 assert(!m->error);
898 assert(m->dirty);
899
900 /*
901 * Create a paging reference to let us play with the object.
902 */
903 object = m->object;
904 paging_offset = m->offset + object->paging_offset;
905 vm_object_paging_begin(object);
906 if (m->absent || m->error || m->restart ||
907 (!m->dirty && !m->precious)) {
908 VM_PAGE_FREE(m);
909 panic("reservation without pageout?"); /* alan */
910 vm_object_unlock(object);
911 return;
912 }
913
914 /* set the page for future call to vm_fault_list_request */
915 holding_page = NULL;
916 vm_page_lock_queues();
917 pmap_clear_modify(m->phys_page);
918 m->dirty = TRUE;
919 m->busy = TRUE;
920 m->list_req_pending = TRUE;
921 m->cleaning = TRUE;
922 m->pageout = TRUE;
923 vm_page_wire(m);
924 vm_page_unlock_queues();
925 vm_object_unlock(object);
926
927 /*
928 * Write the data to its pager.
929 * Note that the data is passed by naming the new object,
930 * not a virtual address; the pager interface has been
931 * manipulated to use the "internal memory" data type.
932 * [The object reference from its allocation is donated
933 * to the eventual recipient.]
934 */
935 memory_object_data_initialize(object->pager,
936 paging_offset,
937 PAGE_SIZE);
938
939 vm_object_lock(object);
940 }
941
942 #if MACH_CLUSTER_STATS
943 #define MAXCLUSTERPAGES 16
944 struct {
945 unsigned long pages_in_cluster;
946 unsigned long pages_at_higher_offsets;
947 unsigned long pages_at_lower_offsets;
948 } cluster_stats[MAXCLUSTERPAGES];
949 #endif /* MACH_CLUSTER_STATS */
950
951 boolean_t allow_clustered_pageouts = FALSE;
952
953 /*
954 * vm_pageout_cluster:
955 *
956 * Given a page, queue it to the appropriate I/O thread,
957 * which will page it out and attempt to clean adjacent pages
958 * in the same operation.
959 *
960 * The page must be busy, and the object and queues locked. We will take a
961 * paging reference to prevent deallocation or collapse when we
962 * release the object lock back at the call site. The I/O thread
963 * is responsible for consuming this reference
964 *
965 * The page must not be on any pageout queue.
966 */
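/*
 * Hand-off summary (derived from the body below): the already-busy page
 * is wired, marked cleaning/pageout/laundry, appended to either the
 * internal or external vm_pageout_queue, and the matching iothread is
 * woken if it was idle.  vm_pageout_iothread_continue() later dequeues
 * the page and calls memory_object_data_return() on the owning object's
 * pager.
 */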
967
968 void
969 vm_pageout_cluster(vm_page_t m)
970 {
971 vm_object_t object = m->object;
972 struct vm_pageout_queue *q;
973
974
975 XPR(XPR_VM_PAGEOUT,
976 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
977 (integer_t)object, m->offset, (integer_t)m, 0, 0);
978
979 /*
980 * Only a certain kind of page is appreciated here.
981 */
982 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
983 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
984
985 /*
986 * protect the object from collapse -
987 * locking in the object's paging_offset.
988 */
989 vm_object_paging_begin(object);
990
991 /*
992 * set the page for future call to vm_fault_list_request
993 * page should already be marked busy
994 */
995 vm_page_wire(m);
996 m->list_req_pending = TRUE;
997 m->cleaning = TRUE;
998 m->pageout = TRUE;
999 m->laundry = TRUE;
1000
1001 if (object->internal == TRUE)
1002 q = &vm_pageout_queue_internal;
1003 else
1004 q = &vm_pageout_queue_external;
1005 q->pgo_laundry++;
1006
1007 m->pageout_queue = TRUE;
1008 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1009
1010 if (q->pgo_idle == TRUE) {
1011 q->pgo_idle = FALSE;
1012 thread_wakeup((event_t) &q->pgo_pending);
1013 }
1014 }
1015
1016
1017 unsigned long vm_pageout_throttle_up_count = 0;
1018
1019 /*
1020 * A page is back from laundry. See if there are some pages waiting to
1021 * go to laundry and if we can let some of them go now.
1022 *
1023 * Object and page queues must be locked.
1024 */
1025 void
1026 vm_pageout_throttle_up(
1027 vm_page_t m)
1028 {
1029 struct vm_pageout_queue *q;
1030
1031 vm_pageout_throttle_up_count++;
1032
1033 assert(m->laundry);
1034 assert(m->object != VM_OBJECT_NULL);
1035 assert(m->object != kernel_object);
1036
1037 if (m->object->internal == TRUE)
1038 q = &vm_pageout_queue_internal;
1039 else
1040 q = &vm_pageout_queue_external;
1041
1042 m->laundry = FALSE;
1043 q->pgo_laundry--;
1044
1045 if (q->pgo_throttled == TRUE) {
1046 q->pgo_throttled = FALSE;
1047 thread_wakeup((event_t) &q->pgo_laundry);
1048 }
1049 }
1050
1051
1052 /*
1053 * vm_pageout_scan does the dirty work for the pageout daemon.
1054 * It returns with vm_page_queue_free_lock held and
1055 * vm_page_free_wanted == 0.
1056 */
1057
1058 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1059
1060 #define FCS_IDLE 0
1061 #define FCS_DELAYED 1
1062 #define FCS_DEADLOCK_DETECTED 2
1063
1064 struct flow_control {
1065 int state;
1066 mach_timespec_t ts;
1067 };
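/*
 * Flow-control summary (a restatement of the state machine coded in
 * vm_pageout_scan() below), used when the default-pager queue stays
 * throttled:
 *
 *	FCS_IDLE		arm a timer of vm_pageout_deadlock_wait
 *				milliseconds and move to FCS_DELAYED
 *	FCS_DELAYED		if the timer expires with the queue still
 *				throttled, assume a deadlock and move to
 *				FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED	force vm_pageout_deadlock_target pages
 *				through, then re-arm the timer
 */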
1068
1069 void
1070 vm_pageout_scan(void)
1071 {
1072 unsigned int loop_count = 0;
1073 unsigned int inactive_burst_count = 0;
1074 unsigned int active_burst_count = 0;
1075 vm_page_t local_freeq = 0;
1076 int local_freed = 0;
1077 int delayed_unlock = 0;
1078 int need_internal_inactive = 0;
1079 int refmod_state = 0;
1080 int vm_pageout_deadlock_target = 0;
1081 struct vm_pageout_queue *iq;
1082 struct vm_pageout_queue *eq;
1083 struct flow_control flow_control;
1084 boolean_t active_throttled = FALSE;
1085 boolean_t inactive_throttled = FALSE;
1086 mach_timespec_t ts;
1087 unsigned int msecs = 0;
1088 vm_object_t object;
1089
1090
1091 flow_control.state = FCS_IDLE;
1092 iq = &vm_pageout_queue_internal;
1093 eq = &vm_pageout_queue_external;
1094
1095 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1096
1097 /*???*/ /*
1098 * We want to gradually dribble pages from the active queue
1099 * to the inactive queue. If we let the inactive queue get
1100 * very small, and then suddenly dump many pages into it,
1101 * those pages won't get a sufficient chance to be referenced
1102 * before we start taking them from the inactive queue.
1103 *
1104 * We must limit the rate at which we send pages to the pagers.
1105 * data_write messages consume memory, for message buffers and
1106 * for map-copy objects. If we get too far ahead of the pagers,
1107 * we can potentially run out of memory.
1108 *
1109 * We can use the laundry count to limit directly the number
1110 * of pages outstanding to the default pager. A similar
1111 * strategy for external pagers doesn't work, because
1112 * external pagers don't have to deallocate the pages sent them,
1113 * and because we might have to send pages to external pagers
1114 * even if they aren't processing writes. So we also
1115 * use a burst count to limit writes to external pagers.
1116 *
1117 * When memory is very tight, we can't rely on external pagers to
1118 * clean pages. They probably aren't running, because they
1119 * aren't vm-privileged. If we kept sending dirty pages to them,
1120 * we could exhaust the free list.
1121 */
1122 vm_page_lock_queues();
1123 delayed_unlock = 1;
1124
1125
1126 Restart:
1127 /*
1128 * Recalculate vm_page_inactivate_target.
1129 */
1130 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1131 vm_page_inactive_count);
1132 object = NULL;
1133
1134 for (;;) {
1135 vm_page_t m;
1136
1137 if (delayed_unlock == 0)
1138 vm_page_lock_queues();
1139
1140 active_burst_count = vm_page_active_count;
1141
1142 if (active_burst_count > vm_pageout_burst_active_throttle)
1143 active_burst_count = vm_pageout_burst_active_throttle;
1144
1145 /*
1146 * Move pages from active to inactive.
1147 */
1148 while ((need_internal_inactive ||
1149 vm_page_inactive_count < vm_page_inactive_target) &&
1150 !queue_empty(&vm_page_queue_active) &&
1151 ((active_burst_count--) > 0)) {
1152
1153 vm_pageout_active++;
1154
1155 m = (vm_page_t) queue_first(&vm_page_queue_active);
1156
1157 assert(m->active && !m->inactive);
1158 assert(!m->laundry);
1159 assert(m->object != kernel_object);
1160
1161 /*
1162 * Try to lock object; since we've already got the
1163 * page queues lock, we can only 'try' for this one.
1164 * if the 'try' fails, we need to do a mutex_pause
1165 * to allow the owner of the object lock a chance to
1166 * run... otherwise, we're likely to trip over this
1167 * object in the same state as we work our way through
1168 * the queue... clumps of pages associated with the same
1169 * object are fairly typical on the inactive and active queues
1170 */
1171 if (m->object != object) {
1172 if (object != NULL) {
1173 vm_object_unlock(object);
1174 object = NULL;
1175 }
1176 if (!vm_object_lock_try(m->object)) {
1177 /*
1178 * move page to end of active queue and continue
1179 */
1180 queue_remove(&vm_page_queue_active, m,
1181 vm_page_t, pageq);
1182 queue_enter(&vm_page_queue_active, m,
1183 vm_page_t, pageq);
1184
1185 goto done_with_activepage;
1186 }
1187 object = m->object;
1188 }
1189 /*
1190 * if the page is BUSY, then we pull it
1191 * off the active queue and leave it alone.
1192 * when BUSY is cleared, it will get stuck
1193 * back on the appropriate queue
1194 */
1195 if (m->busy) {
1196 queue_remove(&vm_page_queue_active, m,
1197 vm_page_t, pageq);
1198 m->pageq.next = NULL;
1199 m->pageq.prev = NULL;
1200
1201 if (!m->fictitious)
1202 vm_page_active_count--;
1203 m->active = FALSE;
1204
1205 goto done_with_activepage;
1206 }
1207 if (need_internal_inactive) {
1208 /*
1209 * If we're unable to make forward progress
1210 * with the current set of pages on the
1211 * inactive queue due to busy objects or
1212 * throttled pageout queues, then
1213 * move a page that is already clean
1214 * or belongs to a pageout queue that
1215 * isn't currently throttled
1216 */
1217 active_throttled = FALSE;
1218
1219 if (object->internal) {
1220 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1221 active_throttled = TRUE;
1222 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1223 active_throttled = TRUE;
1224 }
1225 if (active_throttled == TRUE) {
1226 if (!m->dirty) {
1227 refmod_state = pmap_get_refmod(m->phys_page);
1228
1229 if (refmod_state & VM_MEM_REFERENCED)
1230 m->reference = TRUE;
1231 if (refmod_state & VM_MEM_MODIFIED)
1232 m->dirty = TRUE;
1233 }
1234 if (m->dirty || m->precious) {
1235 /*
1236 * page is dirty and targets a THROTTLED queue
1237 * so all we can do is move it back to the
1238 * end of the active queue to get it out
1239 * of the way
1240 */
1241 queue_remove(&vm_page_queue_active, m,
1242 vm_page_t, pageq);
1243 queue_enter(&vm_page_queue_active, m,
1244 vm_page_t, pageq);
1245
1246 vm_pageout_scan_active_throttled++;
1247
1248 goto done_with_activepage;
1249 }
1250 }
1251 vm_pageout_scan_active_throttle_success++;
1252 need_internal_inactive--;
1253 }
1254 /*
1255 * Deactivate the page while holding the object
1256 * locked, so we know the page is still not busy.
1257 * This should prevent races between pmap_enter
1258 * and pmap_clear_reference. The page might be
1259 * absent or fictitious, but vm_page_deactivate
1260 * can handle that.
1261 */
1262 vm_page_deactivate(m);
1263 done_with_activepage:
1264 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1265
1266 if (object != NULL) {
1267 vm_object_unlock(object);
1268 object = NULL;
1269 }
1270 if (local_freeq) {
1271 vm_page_free_list(local_freeq);
1272
1273 local_freeq = 0;
1274 local_freed = 0;
1275 }
1276 delayed_unlock = 0;
1277 vm_page_unlock_queues();
1278
1279 mutex_pause();
1280 vm_page_lock_queues();
1281 /*
1282 * continue the while loop processing
1283 * the active queue... need to hold
1284 * the page queues lock
1285 */
1286 continue;
1287 }
1288 }
1289
1290
1291
1292 /**********************************************************************
1293 * above this point we're playing with the active queue
1294 * below this point we're playing with the throttling mechanisms
1295 * and the inactive queue
1296 **********************************************************************/
1297
1298
1299
1300 /*
1301 * We are done if we have met our target *and*
1302 * nobody is still waiting for a page.
1303 */
1304 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1305 if (object != NULL) {
1306 vm_object_unlock(object);
1307 object = NULL;
1308 }
1309 if (local_freeq) {
1310 vm_page_free_list(local_freeq);
1311
1312 local_freeq = 0;
1313 local_freed = 0;
1314 }
1315 mutex_lock(&vm_page_queue_free_lock);
1316
1317 if ((vm_page_free_count >= vm_page_free_target) &&
1318 (vm_page_free_wanted == 0)) {
1319
1320 vm_page_unlock_queues();
1321
1322 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1323 return;
1324 }
1325 mutex_unlock(&vm_page_queue_free_lock);
1326 }
1327
1328
1329 /*
1330 * Sometimes we have to pause:
1331 * 1) No inactive pages - nothing to do.
1332 * 2) Flow control - default pageout queue is full
1333 * 3) Loop control - no acceptable pages found on the inactive queue
1334 * within the last vm_pageout_burst_inactive_throttle iterations
1335 */
1336 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1337 vm_pageout_scan_empty_throttle++;
1338 msecs = vm_pageout_empty_wait;
1339 goto vm_pageout_scan_delay;
1340
1341 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1342 vm_pageout_scan_burst_throttle++;
1343 msecs = vm_pageout_burst_wait;
1344 goto vm_pageout_scan_delay;
1345
1346 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1347
1348 switch (flow_control.state) {
1349
1350 case FCS_IDLE:
1351 reset_deadlock_timer:
1352 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1353 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1354 clock_get_system_nanotime(
1355 &flow_control.ts.tv_sec,
1356 (uint32_t *) &flow_control.ts.tv_nsec);
1357 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1358
1359 flow_control.state = FCS_DELAYED;
1360 msecs = vm_pageout_deadlock_wait;
1361
1362 break;
1363
1364 case FCS_DELAYED:
1365 clock_get_system_nanotime(
1366 &ts.tv_sec,
1367 (uint32_t *) &ts.tv_nsec);
1368
1369 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1370 /*
1371 * the pageout thread for the default pager is potentially
1372 * deadlocked since the
1373 * default pager queue has been throttled for more than the
1374 * allowable time... we need to move some clean pages or dirty
1375 * pages belonging to the external pagers if they aren't throttled
1376 * vm_page_free_wanted represents the number of threads currently
1377 * blocked waiting for pages... we'll move one page for each of
1378 * these plus a fixed amount to break the logjam... once we're done
1379 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1380 * with a new timeout target since we have no way of knowing
1381 * whether we've broken the deadlock except through observation
1382 * of the queue associated with the default pager... we need to
1383 * stop moving pages and allow the system to run to see what
1384 * state it settles into.
1385 */
1386 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1387 vm_pageout_scan_deadlock_detected++;
1388 flow_control.state = FCS_DEADLOCK_DETECTED;
1389
1390 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1391 goto consider_inactive;
1392 }
1393 /*
1394 * just resniff instead of trying
1395 * to compute a new delay time... we're going to be
1396 * awakened immediately upon a laundry completion,
1397 * so we won't wait any longer than necessary
1398 */
1399 msecs = vm_pageout_idle_wait;
1400 break;
1401
1402 case FCS_DEADLOCK_DETECTED:
1403 if (vm_pageout_deadlock_target)
1404 goto consider_inactive;
1405 goto reset_deadlock_timer;
1406
1407 }
1408 vm_pageout_scan_throttle++;
1409 iq->pgo_throttled = TRUE;
1410 vm_pageout_scan_delay:
1411 if (object != NULL) {
1412 vm_object_unlock(object);
1413 object = NULL;
1414 }
1415 if (local_freeq) {
1416 vm_page_free_list(local_freeq);
1417
1418 local_freeq = 0;
1419 local_freed = 0;
1420 }
1421 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1422
1423 counter(c_vm_pageout_scan_block++);
1424
1425 vm_page_unlock_queues();
1426
1427 thread_block(THREAD_CONTINUE_NULL);
1428
1429 vm_page_lock_queues();
1430 delayed_unlock = 1;
1431
1432 iq->pgo_throttled = FALSE;
1433
1434 if (loop_count >= vm_page_inactive_count) {
1435 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1436 /*
1437 * Make sure we move enough "appropriate"
1438 * pages to the inactive queue before trying
1439 * again.
1440 */
1441 need_internal_inactive = vm_pageout_inactive_relief;
1442 }
1443 loop_count = 0;
1444 }
1445 inactive_burst_count = 0;
1446
1447 goto Restart;
1448 /*NOTREACHED*/
1449 }
1450
1451
1452 flow_control.state = FCS_IDLE;
1453 consider_inactive:
1454 loop_count++;
1455 inactive_burst_count++;
1456 vm_pageout_inactive++;
1457
1458 if (!queue_empty(&vm_page_queue_inactive)) {
1459 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1460
1461 if (m->clustered && (m->no_isync == TRUE)) {
1462 goto use_this_page;
1463 }
1464 }
1465 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1466 vm_zf_iterator = 0;
1467 } else {
1468 last_page_zf = 0;
1469 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1470 vm_zf_iterator = 0;
1471 }
1472 }
1473 if (queue_empty(&vm_page_queue_zf) ||
1474 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1475 !queue_empty(&vm_page_queue_inactive))) {
1476 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1477 last_page_zf = 0;
1478 } else {
1479 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1480 last_page_zf = 1;
1481 }
1482 use_this_page:
1483 assert(!m->active && m->inactive);
1484 assert(!m->laundry);
1485 assert(m->object != kernel_object);
1486
1487 /*
1488 * Try to lock object; since we've already got the
1489 * page queues lock, we can only 'try' for this one.
1490 * if the 'try' fails, we need to do a mutex_pause
1491 * to allow the owner of the object lock a chance to
1492 * run... otherwise, we're likely to trip over this
1493 * object in the same state as we work our way through
1494 * the queue... clumps of pages associated with the same
1495 * object are fairly typical on the inactive and active queues
1496 */
1497 if (m->object != object) {
1498 if (object != NULL) {
1499 vm_object_unlock(object);
1500 object = NULL;
1501 }
1502 if (!vm_object_lock_try(m->object)) {
1503 /*
1504 * Move page to end and continue.
1505 * Don't re-issue ticket
1506 */
1507 if (m->zero_fill) {
1508 queue_remove(&vm_page_queue_zf, m,
1509 vm_page_t, pageq);
1510 queue_enter(&vm_page_queue_zf, m,
1511 vm_page_t, pageq);
1512 } else {
1513 queue_remove(&vm_page_queue_inactive, m,
1514 vm_page_t, pageq);
1515 queue_enter(&vm_page_queue_inactive, m,
1516 vm_page_t, pageq);
1517 }
1518 vm_pageout_inactive_nolock++;
1519
1520 /*
1521 * force us to dump any collected free pages
1522 * and to pause before moving on
1523 */
1524 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1525
1526 goto done_with_inactivepage;
1527 }
1528 object = m->object;
1529 }
1530 /*
1531 * If the page belongs to a purgable object with no pending copies
1532 * against it, then we reap all of the pages in the object
1533 * and note that the object has been "emptied". It'll be up to the
1534 * application to discover this and recreate its contents if desired.
1535 */
1536 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1537 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1538 object->copy == VM_OBJECT_NULL) {
1539
1540 (void) vm_object_purge(object);
1541 vm_pageout_purged_objects++;
1542 /*
1543 * we've just taken all of the pages from this object,
1544 * so drop the lock now since we're not going to find
1545 * any more pages belonging to it anytime soon
1546 */
1547 vm_object_unlock(object);
1548 object = NULL;
1549
1550 inactive_burst_count = 0;
1551
1552 goto done_with_inactivepage;
1553 }
1554
1555 /*
1556 * Paging out pages of external objects which
1557 * are currently being created must be avoided.
1558 * The pager may itself claim memory, leading to a
1559 * possible deadlock between it and the pageout thread,
1560 * if such pages are finally chosen. The remaining assumption
1561 * is that there will finally be enough available pages in the
1562 * inactive pool to page out in order to satisfy all memory
1563 * claimed by the thread which concurrently creates the pager.
1564 */
1565 if (!object->pager_initialized && object->pager_created) {
1566 /*
1567 * Move page to end and continue, hoping that
1568 * there will be enough other inactive pages to
1569 * page out so that the thread which currently
1570 * initializes the pager will succeed.
1571 * Don't re-grant the ticket; the page should
1572 * be pulled from the queue and paged out whenever
1573 * one of its logically adjacent fellows is
1574 * targeted.
1575 */
1576 if (m->zero_fill) {
1577 queue_remove(&vm_page_queue_zf, m,
1578 vm_page_t, pageq);
1579 queue_enter(&vm_page_queue_zf, m,
1580 vm_page_t, pageq);
1581 last_page_zf = 1;
1582 vm_zf_iterator = vm_zf_iterator_count - 1;
1583 } else {
1584 queue_remove(&vm_page_queue_inactive, m,
1585 vm_page_t, pageq);
1586 queue_enter(&vm_page_queue_inactive, m,
1587 vm_page_t, pageq);
1588 last_page_zf = 0;
1589 vm_zf_iterator = 1;
1590 }
1591 vm_pageout_inactive_avoid++;
1592
1593 goto done_with_inactivepage;
1594 }
1595 /*
1596 * Remove the page from the inactive list.
1597 */
1598 if (m->zero_fill) {
1599 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1600 } else {
1601 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1602 }
1603 m->pageq.next = NULL;
1604 m->pageq.prev = NULL;
1605 m->inactive = FALSE;
1606 if (!m->fictitious)
1607 vm_page_inactive_count--;
1608
1609 if (m->busy || !object->alive) {
1610 /*
1611 * Somebody is already playing with this page.
1612 * Leave it off the pageout queues.
1613 */
1614 vm_pageout_inactive_busy++;
1615
1616 goto done_with_inactivepage;
1617 }
1618
1619 /*
1620 * If it's absent or in error, we can reclaim the page.
1621 */
1622
1623 if (m->absent || m->error) {
1624 vm_pageout_inactive_absent++;
1625 reclaim_page:
1626 if (vm_pageout_deadlock_target) {
1627 vm_pageout_scan_inactive_throttle_success++;
1628 vm_pageout_deadlock_target--;
1629 }
1630 if (m->tabled)
1631 vm_page_remove(m); /* clears tabled, object, offset */
1632 if (m->absent)
1633 vm_object_absent_release(object);
1634
1635 assert(m->pageq.next == NULL &&
1636 m->pageq.prev == NULL);
1637 m->pageq.next = (queue_entry_t)local_freeq;
1638 local_freeq = m;
1639 local_freed++;
1640
1641 inactive_burst_count = 0;
1642
1643 goto done_with_inactivepage;
1644 }
1645
1646 assert(!m->private);
1647 assert(!m->fictitious);
1648
1649 /*
1650 * If already cleaning this page in place, convert from
1651 * "adjacent" to "target". We can leave the page mapped,
1652 * and vm_pageout_object_terminate will determine whether
1653 * to free or reactivate.
1654 */
1655
1656 if (m->cleaning) {
1657 m->busy = TRUE;
1658 m->pageout = TRUE;
1659 m->dump_cleaning = TRUE;
1660 vm_page_wire(m);
1661
1662 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1663
1664 inactive_burst_count = 0;
1665
1666 goto done_with_inactivepage;
1667 }
1668
1669 /*
1670 * If it's being used, reactivate.
1671 * (Fictitious pages are either busy or absent.)
1672 */
1673 if ( (!m->reference) ) {
1674 refmod_state = pmap_get_refmod(m->phys_page);
1675
1676 if (refmod_state & VM_MEM_REFERENCED)
1677 m->reference = TRUE;
1678 if (refmod_state & VM_MEM_MODIFIED)
1679 m->dirty = TRUE;
1680 }
1681 if (m->reference) {
1682 was_referenced:
1683 vm_page_activate(m);
1684 VM_STAT(reactivations++);
1685
1686 vm_pageout_inactive_used++;
1687 last_page_zf = 0;
1688 inactive_burst_count = 0;
1689
1690 goto done_with_inactivepage;
1691 }
1692
1693 XPR(XPR_VM_PAGEOUT,
1694 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1695 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1696
1697 /*
1698 * we've got a candidate page to steal...
1699 *
1700 * m->dirty is up to date courtesy of the
1701 * preceding check for m->reference... if
1702 * we get here, then m->reference had to be
1703 * FALSE which means we did a pmap_get_refmod
1704 * and updated both m->reference and m->dirty
1705 *
1706 * if it's dirty or precious we need to
1707 * see if the target queue is throttled...
1708 * if it is, we need to skip over it by moving it back
1709 * to the end of the inactive queue
1710 */
1711 inactive_throttled = FALSE;
1712
1713 if (m->dirty || m->precious) {
1714 if (object->internal) {
1715 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1716 inactive_throttled = TRUE;
1717 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1718 inactive_throttled = TRUE;
1719 }
1720 }
1721 if (inactive_throttled == TRUE) {
1722 if (m->zero_fill) {
1723 queue_enter(&vm_page_queue_zf, m,
1724 vm_page_t, pageq);
1725 } else {
1726 queue_enter(&vm_page_queue_inactive, m,
1727 vm_page_t, pageq);
1728 }
1729 if (!m->fictitious)
1730 vm_page_inactive_count++;
1731 m->inactive = TRUE;
1732
1733 vm_pageout_scan_inactive_throttled++;
1734
1735 goto done_with_inactivepage;
1736 }
1737 /*
1738 * we've got a page that we can steal...
1739 * eliminate all mappings and make sure
1740 * we have the up-to-date modified state
1741 * first take the page BUSY, so that no new
1742 * mappings can be made
1743 */
1744 m->busy = TRUE;
1745
1746 /*
1747 * if we need to do a pmap_disconnect then we
1748 * need to re-evaluate m->dirty since the pmap_disconnect
1749 * provides the true state atomically... the
1750 * page was still mapped up to the pmap_disconnect
1751 * and may have been dirtied at the last microsecond
1752 *
1753 * we also check for the page being referenced 'late'
1754 * if it was, we first need to do a WAKEUP_DONE on it
1755 * since we already set m->busy = TRUE, before
1756 * going off to reactivate it
1757 *
1758 * if we don't need the pmap_disconnect, then
1759 * m->dirty is up to date courtesy of the
1760 * earlier check for m->reference... if
1761 * we get here, then m->reference had to be
1762 * FALSE which means we did a pmap_get_refmod
1763 * and updated both m->reference and m->dirty...
1764 */
1765 if (m->no_isync == FALSE) {
1766 refmod_state = pmap_disconnect(m->phys_page);
1767
1768 if (refmod_state & VM_MEM_MODIFIED)
1769 m->dirty = TRUE;
1770 if (refmod_state & VM_MEM_REFERENCED) {
1771 m->reference = TRUE;
1772
1773 PAGE_WAKEUP_DONE(m);
1774 goto was_referenced;
1775 }
1776 }
1777 /*
1778 * If it's clean and not precious, we can free the page.
1779 */
1780 if (!m->dirty && !m->precious) {
1781 vm_pageout_inactive_clean++;
1782 goto reclaim_page;
1783 }
1784 vm_pageout_cluster(m);
1785
1786 vm_pageout_inactive_dirty++;
1787
1788 inactive_burst_count = 0;
1789
1790 done_with_inactivepage:
1791 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1792
1793 if (object != NULL) {
1794 vm_object_unlock(object);
1795 object = NULL;
1796 }
1797 if (local_freeq) {
1798 vm_page_free_list(local_freeq);
1799
1800 local_freeq = 0;
1801 local_freed = 0;
1802 }
1803 delayed_unlock = 0;
1804 vm_page_unlock_queues();
1805 mutex_pause();
1806 }
1807 /*
1808 * back to top of pageout scan loop
1809 */
1810 }
1811 }
1812
1813
1814 int vm_page_free_count_init;
1815
1816 void
1817 vm_page_free_reserve(
1818 int pages)
1819 {
1820 int free_after_reserve;
1821
1822 vm_page_free_reserved += pages;
1823
1824 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1825
1826 vm_page_free_min = vm_page_free_reserved +
1827 VM_PAGE_FREE_MIN(free_after_reserve);
1828
1829 vm_page_free_target = vm_page_free_reserved +
1830 VM_PAGE_FREE_TARGET(free_after_reserve);
1831
1832 if (vm_page_free_target < vm_page_free_min + 5)
1833 vm_page_free_target = vm_page_free_min + 5;
1834 }
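/*
 * Continuing the numeric illustration from the macro definitions above:
 * with vm_page_free_reserved raised to 196 pages and 99804 pages left
 * after the reserve, vm_page_free_min becomes 196 + 1008 = 1204 and
 * vm_page_free_target becomes 196 + 1262 = 1458, comfortably above
 * min + 5, so the final clamp does not fire.  Example numbers only.
 */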
1835
1836 /*
1837 * vm_pageout is the high level pageout daemon.
1838 */
1839
1840 void
1841 vm_pageout_continue(void)
1842 {
1843 vm_pageout_scan_event_counter++;
1844 vm_pageout_scan();
1845 /* we hold vm_page_queue_free_lock now */
1846 assert(vm_page_free_wanted == 0);
1847 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1848 mutex_unlock(&vm_page_queue_free_lock);
1849
1850 counter(c_vm_pageout_block++);
1851 thread_block((thread_continue_t)vm_pageout_continue);
1852 /*NOTREACHED*/
1853 }
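/*
 * Note the continuation style: vm_pageout_continue() never returns.  It
 * waits on &vm_page_free_wanted and names itself as the continuation,
 * so every wakeup re-runs vm_pageout_scan() on a fresh stack.
 */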
1854
1855
1856 /*
1857 * must be called with the
1858 * queues and object locks held
1859 */
1860 static void
1861 vm_pageout_queue_steal(vm_page_t m)
1862 {
1863 struct vm_pageout_queue *q;
1864
1865 if (m->object->internal == TRUE)
1866 q = &vm_pageout_queue_internal;
1867 else
1868 q = &vm_pageout_queue_external;
1869
1870 m->laundry = FALSE;
1871 m->pageout_queue = FALSE;
1872 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1873
1874 m->pageq.next = NULL;
1875 m->pageq.prev = NULL;
1876
1877 vm_object_paging_end(m->object);
1878
1879 q->pgo_laundry--;
1880 }
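/*
 * Caller-side sketch of the contract stated above: the page queues lock
 * and the page's object lock are already held, and the page is known to
 * be sitting on one of the two pageout queues.  This mirrors the
 * UPL_FOR_PAGEOUT path in vm_object_upl_request() later in this file;
 * the snippet is illustrative only and is not compiled.
 */
#if 0
vm_page_t dst_page;	/* a page buddied into a pageout UPL (hypothetical) */

vm_page_lock_queues();
if (dst_page->pageout_queue == TRUE)
	/* pageout_scan already queued it; pull it back off the queue */
	vm_pageout_queue_steal(dst_page);
vm_page_unlock_queues();
#endif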
1881
1882
1883 #ifdef FAKE_DEADLOCK
1884
1885 #define FAKE_COUNT 5000
1886
1887 int internal_count = 0;
1888 int fake_deadlock = 0;
1889
1890 #endif
1891
1892 static void
1893 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1894 {
1895 vm_page_t m = NULL;
1896 vm_object_t object;
1897 boolean_t need_wakeup;
1898
1899 vm_page_lock_queues();
1900
1901 while ( !queue_empty(&q->pgo_pending) ) {
1902
1903 q->pgo_busy = TRUE;
1904 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1905 m->pageout_queue = FALSE;
1906 vm_page_unlock_queues();
1907
1908 m->pageq.next = NULL;
1909 m->pageq.prev = NULL;
1910 #ifdef FAKE_DEADLOCK
1911 if (q == &vm_pageout_queue_internal) {
1912 vm_offset_t addr;
1913 int pg_count;
1914
1915 internal_count++;
1916
1917 if (internal_count == FAKE_COUNT) {
1918
1919 pg_count = vm_page_free_count + vm_page_free_reserved;
1920
1921 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1922 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1923 }
1924 internal_count = 0;
1925 fake_deadlock++;
1926 }
1927 }
1928 #endif
1929 object = m->object;
1930
1931 if (!object->pager_initialized) {
1932 vm_object_lock(object);
1933
1934 /*
1935 * If there is no memory object for the page, create
1936 * one and hand it to the default pager.
1937 */
1938
1939 if (!object->pager_initialized)
1940 vm_object_collapse(object,
1941 (vm_object_offset_t) 0,
1942 TRUE);
1943 if (!object->pager_initialized)
1944 vm_object_pager_create(object);
1945 if (!object->pager_initialized) {
1946 /*
1947 * Still no pager for the object.
1948 * Reactivate the page.
1949 *
1950 * Should only happen if there is no
1951 * default pager.
1952 */
1953 m->list_req_pending = FALSE;
1954 m->cleaning = FALSE;
1955 m->pageout = FALSE;
1956 vm_page_unwire(m);
1957
1958 vm_pageout_throttle_up(m);
1959
1960 vm_page_lock_queues();
1961 vm_pageout_dirty_no_pager++;
1962 vm_page_activate(m);
1963 vm_page_unlock_queues();
1964
1965 /*
1966 * And we are done with it.
1967 */
1968 PAGE_WAKEUP_DONE(m);
1969
1970 vm_object_paging_end(object);
1971 vm_object_unlock(object);
1972
1973 vm_page_lock_queues();
1974 continue;
1975 } else if (object->pager == MEMORY_OBJECT_NULL) {
1976 /*
1977 * This pager has been destroyed by either
1978 * memory_object_destroy or vm_object_destroy, and
1979 * so there is nowhere for the page to go.
1980 * Just free the page... VM_PAGE_FREE takes
1981 * care of cleaning up all the state...
1982 * including doing the vm_pageout_throttle_up
1983 */
1984 VM_PAGE_FREE(m);
1985
1986 vm_object_paging_end(object);
1987 vm_object_unlock(object);
1988
1989 vm_page_lock_queues();
1990 continue;
1991 }
1992 vm_object_unlock(object);
1993 }
1994 /*
1995 * we expect the paging_in_progress reference to have
1996 * already been taken on the object before it was added
1997 * to the appropriate pageout I/O queue... this will
1998 * keep the object from being terminated and/or the
1999 * paging_offset from changing until the I/O has
2000 * completed... therefore no need to lock the object to
2001 * pull the paging_offset from it.
2002 *
2003 * Send the data to the pager.
2004 * any pageout clustering happens there
2005 */
2006 memory_object_data_return(object->pager,
2007 m->offset + object->paging_offset,
2008 PAGE_SIZE,
2009 NULL,
2010 NULL,
2011 FALSE,
2012 FALSE,
2013 0);
2014
2015 vm_object_lock(object);
2016 vm_object_paging_end(object);
2017 vm_object_unlock(object);
2018
2019 vm_page_lock_queues();
2020 }
2021 assert_wait((event_t) q, THREAD_UNINT);
2022
2023
2024 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2025 q->pgo_throttled = FALSE;
2026 need_wakeup = TRUE;
2027 } else
2028 need_wakeup = FALSE;
2029
2030 q->pgo_busy = FALSE;
2031 q->pgo_idle = TRUE;
2032 vm_page_unlock_queues();
2033
2034 if (need_wakeup == TRUE)
2035 thread_wakeup((event_t) &q->pgo_laundry);
2036
2037 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2038 /*NOTREACHED*/
2039 }
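/*
 * Enqueue-side sketch of the paging_in_progress contract described in
 * the comment above memory_object_data_return(): whoever queues a page
 * onto q->pgo_pending must already hold a paging reference on its
 * object, which the iothread balances with vm_object_paging_end() once
 * the data has been handed to the pager.  The real enqueue path is
 * vm_pageout_cluster(), which is outside this excerpt; 'm' and 'q'
 * below stand for the page and its target queue, and the snippet is
 * illustrative only and is not compiled.
 */
#if 0
vm_object_t object = m->object;

vm_object_lock(object);
vm_object_paging_begin(object);	/* keeps the object and paging_offset stable */
vm_object_unlock(object);

vm_page_lock_queues();
/* ... set m->pageout_queue, add m to q->pgo_pending, bump q->pgo_laundry ... */
vm_page_unlock_queues();
#endif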
2040
2041
2042 static void
2043 vm_pageout_iothread_external(void)
2044 {
2045
2046 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2047 /*NOTREACHED*/
2048 }
2049
2050
2051 static void
2052 vm_pageout_iothread_internal(void)
2053 {
2054 thread_t self = current_thread();
2055
2056 self->options |= TH_OPT_VMPRIV;
2057
2058 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2059 /*NOTREACHED*/
2060 }
2061
2062 static void
2063 vm_pageout_garbage_collect(int collect)
2064 {
2065 if (collect) {
2066 stack_collect();
2067
2068 /*
2069 * consider_zone_gc should be last, because the other operations
2070 * might return memory to zones.
2071 */
2072 consider_machine_collect();
2073 consider_zone_gc();
2074
2075 consider_machine_adjust();
2076 }
2077
2078 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2079
2080 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2081 /*NOTREACHED*/
2082 }
2083
2084
2085
2086 void
2087 vm_pageout(void)
2088 {
2089 thread_t self = current_thread();
2090 thread_t thread;
2091 kern_return_t result;
2092 spl_t s;
2093
2094 /*
2095 * Set thread privileges.
2096 */
2097 s = splsched();
2098 thread_lock(self);
2099 self->priority = BASEPRI_PREEMPT - 1;
2100 set_sched_pri(self, self->priority);
2101 thread_unlock(self);
2102 splx(s);
2103
2104 /*
2105 * Initialize some paging parameters.
2106 */
2107
2108 if (vm_pageout_idle_wait == 0)
2109 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2110
2111 if (vm_pageout_burst_wait == 0)
2112 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2113
2114 if (vm_pageout_empty_wait == 0)
2115 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2116
2117 if (vm_pageout_deadlock_wait == 0)
2118 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2119
2120 if (vm_pageout_deadlock_relief == 0)
2121 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2122
2123 if (vm_pageout_inactive_relief == 0)
2124 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2125
2126 if (vm_pageout_burst_active_throttle == 0)
2127 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2128
2129 if (vm_pageout_burst_inactive_throttle == 0)
2130 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2131
2132 /*
2133 * Set kernel task to low backing store privileged
2134 * status
2135 */
2136 task_lock(kernel_task);
2137 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2138 task_unlock(kernel_task);
2139
2140 vm_page_free_count_init = vm_page_free_count;
2141 vm_zf_iterator = 0;
2142 /*
2143 * even if we've already called vm_page_free_reserve,
2144 * call it again here to ensure that the targets are
2145 * accurately calculated (it uses vm_page_free_count_init)
2146 * calling it with an arg of 0 will not change the reserve
2147 * but will re-calculate free_min and free_target
2148 */
2149 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2150 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2151 } else
2152 vm_page_free_reserve(0);
2153
2154
2155 queue_init(&vm_pageout_queue_external.pgo_pending);
2156 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2157 vm_pageout_queue_external.pgo_laundry = 0;
2158 vm_pageout_queue_external.pgo_idle = FALSE;
2159 vm_pageout_queue_external.pgo_busy = FALSE;
2160 vm_pageout_queue_external.pgo_throttled = FALSE;
2161
2162 queue_init(&vm_pageout_queue_internal.pgo_pending);
2163 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2164 vm_pageout_queue_internal.pgo_laundry = 0;
2165 vm_pageout_queue_internal.pgo_idle = FALSE;
2166 vm_pageout_queue_internal.pgo_busy = FALSE;
2167 vm_pageout_queue_internal.pgo_throttled = FALSE;
2168
2169
2170 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2171 if (result != KERN_SUCCESS)
2172 panic("vm_pageout_iothread_internal: create failed");
2173
2174 thread_deallocate(thread);
2175
2176
2177 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2178 if (result != KERN_SUCCESS)
2179 panic("vm_pageout_iothread_external: create failed");
2180
2181 thread_deallocate(thread);
2182
2183
2184 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2185 if (result != KERN_SUCCESS)
2186 panic("vm_pageout_garbage_collect: create failed");
2187
2188 thread_deallocate(thread);
2189
2190 vm_object_reaper_init();
2191
2192 vm_pageout_continue();
2193 /*NOTREACHED*/
2194 }
2195
2196
2197 static upl_t
2198 upl_create(
2199 int flags,
2200 upl_size_t size)
2201 {
2202 upl_t upl;
2203 int page_field_size; /* bit field in word size buf */
2204
2205 page_field_size = 0;
2206 if (flags & UPL_CREATE_LITE) {
2207 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2208 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2209 }
2210 if(flags & UPL_CREATE_INTERNAL) {
2211 upl = (upl_t)kalloc(sizeof(struct upl)
2212 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2213 + page_field_size);
2214 } else {
2215 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2216 }
2217 upl->flags = 0;
2218 upl->src_object = NULL;
2219 upl->kaddr = (vm_offset_t)0;
2220 upl->size = 0;
2221 upl->map_object = NULL;
2222 upl->ref_count = 1;
2223 upl->highest_page = 0;
2224 upl_lock_init(upl);
2225 #ifdef UPL_DEBUG
2226 upl->ubc_alias1 = 0;
2227 upl->ubc_alias2 = 0;
2228 #endif /* UPL_DEBUG */
2229 return(upl);
2230 }
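/*
 * Layout of the single kalloc() above for an INTERNAL | LITE upl; the
 * rest of this file recovers the trailing regions with the pointer
 * arithmetic shown below (see vm_object_upl_request() and
 * vm_map_enter_upl()).  Sketch only -- not compiled.
 *
 *	+------------------------+  <- (uintptr_t)upl
 *	| struct upl             |
 *	+------------------------+  <- user_page_list  (UPL_INTERNAL)
 *	| upl_page_info_t[n]     |     n == size / PAGE_SIZE
 *	+------------------------+  <- lite_list       (UPL_LITE)
 *	| 1 bit per page, padded |     to a 4-byte multiple
 *	+------------------------+
 */
#if 0
upl_page_info_t	*user_page_list;
wpl_array_t	lite_list;
int		pg_num = 0;	/* example page index within the upl */

user_page_list = (upl_page_info_t *)(((uintptr_t)upl) + sizeof(struct upl));
lite_list = (wpl_array_t)(((uintptr_t)user_page_list) +
		((size / PAGE_SIZE) * sizeof(upl_page_info_t)));

/* page pg_num is marked present in the lite bitmap with: */
lite_list[pg_num >> 5] |= 1 << (pg_num & 31);
#endif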
2231
2232 static void
2233 upl_destroy(
2234 upl_t upl)
2235 {
2236 int page_field_size; /* bit field in word size buf */
2237
2238 #ifdef UPL_DEBUG
2239 {
2240 upl_t upl_ele;
2241 vm_object_t object;
2242 if (upl->map_object->pageout) {
2243 object = upl->map_object->shadow;
2244 } else {
2245 object = upl->map_object;
2246 }
2247 vm_object_lock(object);
2248 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2249 if(upl_ele == upl) {
2250 queue_remove(&object->uplq,
2251 upl_ele, upl_t, uplq);
2252 break;
2253 }
2254 }
2255 vm_object_unlock(object);
2256 }
2257 #endif /* UPL_DEBUG */
2258 /* drop a reference on the map_object whether or */
2259 /* not a pageout object is inserted */
2260 if(upl->map_object->pageout)
2261 vm_object_deallocate(upl->map_object);
2262
2263 page_field_size = 0;
2264 if (upl->flags & UPL_LITE) {
2265 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2266 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2267 }
2268 if(upl->flags & UPL_INTERNAL) {
2269 kfree(upl,
2270 sizeof(struct upl) +
2271 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2272 + page_field_size);
2273 } else {
2274 kfree(upl, sizeof(struct upl) + page_field_size);
2275 }
2276 }
2277
2278 void uc_upl_dealloc(upl_t upl);
2279 __private_extern__ void
2280 uc_upl_dealloc(
2281 upl_t upl)
2282 {
2283 upl->ref_count -= 1;
2284 if(upl->ref_count == 0) {
2285 upl_destroy(upl);
2286 }
2287 }
2288
2289 void
2290 upl_deallocate(
2291 upl_t upl)
2292 {
2293
2294 upl->ref_count -= 1;
2295 if(upl->ref_count == 0) {
2296 upl_destroy(upl);
2297 }
2298 }
2299
2300 /*
2301 * Statistics about UPL enforcement of copy-on-write obligations.
2302 */
2303 unsigned long upl_cow = 0;
2304 unsigned long upl_cow_again = 0;
2305 unsigned long upl_cow_contiguous = 0;
2306 unsigned long upl_cow_pages = 0;
2307 unsigned long upl_cow_again_pages = 0;
2308 unsigned long upl_cow_contiguous_pages = 0;
2309
2310 /*
2311 * Routine: vm_object_upl_request
2312 * Purpose:
2313 * Cause the population of a portion of a vm_object.
2314 * Depending on the nature of the request, the pages
2315 * returned may contain valid data or be uninitialized.
2316 * A page list structure, listing the physical pages,
2317 * will be returned upon request.
2318 * This function is called by the file system or any other
2319 * supplier of backing store to a pager.
2320 * IMPORTANT NOTE: The caller must still respect the relationship
2321 * between the vm_object and its backing memory object. The
2322 * caller MUST NOT substitute changes in the backing file
2323 * without first doing a memory_object_lock_request on the
2324 * target range unless it is known that the pages are not
2325 * shared with another entity at the pager level.
2326 * Copy_in_to:
2327 * if a page list structure is present,
2328 * return the mapped physical pages; where a
2329 * page is not present, return a non-initialized
2330 * one. If the no_sync bit is turned on, don't
2331 * call the pager unlock to synchronize with other
2332 * possible copies of the page. Leave pages busy
2333 * in the original object, if a page list structure
2334 * was specified. When a commit of the page list
2335 * pages is done, the dirty bit will be set for each one.
2336 * Copy_out_from:
2337 * If a page list structure is present, return
2338 * all mapped pages. Where a page does not exist,
2339 * map a zero-filled one. Leave pages busy in
2340 * the original object. If a page list structure
2341 * is not specified, this call is a no-op.
2342 *
2343 * Note: access of default pager objects has a rather interesting
2344 * twist. The caller of this routine, presumably the file system
2345 * page cache handling code, will never actually make a request
2346 * against a default pager backed object. Only the default
2347 * pager will make requests on backing store related vm_objects.
2348 * In this way the default pager can maintain the relationship
2349 * between backing store files (abstract memory objects) and
2350 * the vm_objects (cache objects) they support.
2351 *
2352 */
2353
2354 __private_extern__ kern_return_t
2355 vm_object_upl_request(
2356 vm_object_t object,
2357 vm_object_offset_t offset,
2358 upl_size_t size,
2359 upl_t *upl_ptr,
2360 upl_page_info_array_t user_page_list,
2361 unsigned int *page_list_count,
2362 int cntrl_flags)
2363 {
2364 vm_page_t dst_page = VM_PAGE_NULL;
2365 vm_object_offset_t dst_offset = offset;
2366 upl_size_t xfer_size = size;
2367 boolean_t do_m_lock = FALSE;
2368 boolean_t dirty;
2369 boolean_t hw_dirty;
2370 upl_t upl = NULL;
2371 unsigned int entry;
2372 #if MACH_CLUSTER_STATS
2373 boolean_t encountered_lrp = FALSE;
2374 #endif
2375 vm_page_t alias_page = NULL;
2376 int page_ticket;
2377 int refmod_state;
2378 wpl_array_t lite_list = NULL;
2379 vm_object_t last_copy_object;
2380
2381
2382 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2383 /*
2384 * For forward compatibility's sake,
2385 * reject any unknown flag.
2386 */
2387 return KERN_INVALID_VALUE;
2388 }
2389
2390 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2391 >> UPL_PAGE_TICKET_SHIFT;
2392
2393 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2394 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2395 }
2396
2397 if(cntrl_flags & UPL_SET_INTERNAL)
2398 if(page_list_count != NULL)
2399 *page_list_count = MAX_UPL_TRANSFER;
2400
2401 if((!object->internal) && (object->paging_offset != 0))
2402 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2403
2404 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2405 return KERN_SUCCESS;
2406 }
2407
2408 vm_object_lock(object);
2409 vm_object_paging_begin(object);
2410 vm_object_unlock(object);
2411
2412 if(upl_ptr) {
2413 if(cntrl_flags & UPL_SET_INTERNAL) {
2414 if(cntrl_flags & UPL_SET_LITE) {
2415 uintptr_t page_field_size;
2416 upl = upl_create(
2417 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2418 size);
2419 user_page_list = (upl_page_info_t *)
2420 (((uintptr_t)upl) + sizeof(struct upl));
2421 lite_list = (wpl_array_t)
2422 (((uintptr_t)user_page_list) +
2423 ((size/PAGE_SIZE) *
2424 sizeof(upl_page_info_t)));
2425 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2426 page_field_size =
2427 (page_field_size + 3) & 0xFFFFFFFC;
2428 bzero((char *)lite_list, page_field_size);
2429 upl->flags =
2430 UPL_LITE | UPL_INTERNAL;
2431 } else {
2432 upl = upl_create(UPL_CREATE_INTERNAL, size);
2433 user_page_list = (upl_page_info_t *)
2434 (((uintptr_t)upl) + sizeof(struct upl));
2435 upl->flags = UPL_INTERNAL;
2436 }
2437 } else {
2438 if(cntrl_flags & UPL_SET_LITE) {
2439 uintptr_t page_field_size;
2440 upl = upl_create(UPL_CREATE_LITE, size);
2441 lite_list = (wpl_array_t)
2442 (((uintptr_t)upl) + sizeof(struct upl));
2443 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2444 page_field_size =
2445 (page_field_size + 3) & 0xFFFFFFFC;
2446 bzero((char *)lite_list, page_field_size);
2447 upl->flags = UPL_LITE;
2448 } else {
2449 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2450 upl->flags = 0;
2451 }
2452 }
2453
2454 if (object->phys_contiguous) {
2455 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2456 object->copy != VM_OBJECT_NULL) {
2457 /* Honor copy-on-write obligations */
2458
2459 /*
2460 * XXX FBDP
2461 * We could still have a race...
2462 * A is here building the UPL for a write().
2463 * A pushes the pages to the current copy
2464 * object.
2465 * A returns the UPL to the caller.
2466 * B comes along and establishes another
2467 * private mapping on this object, inserting
2468 * a new copy object between the original
2469 * object and the old copy object.
2470 * B reads a page and gets the original contents
2471 * from the original object.
2472 * A modifies the page in the original object.
2473 * B reads the page again and sees A's changes,
2474 * which is wrong...
2475 *
2476 * The problem is that the pages are not
2477 * marked "busy" in the original object, so
2478 * nothing prevents B from reading it
2479 * before A's changes are completed.
2480 *
2481 * The "paging_in_progress" might protect us
2482 * from the insertion of a new copy object
2483 * though... To be verified.
2484 */
2485 vm_object_lock_request(object,
2486 offset,
2487 size,
2488 FALSE,
2489 MEMORY_OBJECT_COPY_SYNC,
2490 VM_PROT_NO_CHANGE);
2491 upl_cow_contiguous++;
2492 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2493 }
2494
2495 upl->map_object = object;
2496 /* don't need any shadow mappings for this one */
2497 /* since it is already I/O memory */
2498 upl->flags |= UPL_DEVICE_MEMORY;
2499
2500
2501 /* paging_in_progress protects paging_offset */
2502 upl->offset = offset + object->paging_offset;
2503 upl->size = size;
2504 *upl_ptr = upl;
2505 if(user_page_list) {
2506 user_page_list[0].phys_addr =
2507 (offset + object->shadow_offset)>>PAGE_SHIFT;
2508 user_page_list[0].device = TRUE;
2509 }
2510 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
2511
2512 if(page_list_count != NULL) {
2513 if (upl->flags & UPL_INTERNAL) {
2514 *page_list_count = 0;
2515 } else {
2516 *page_list_count = 1;
2517 }
2518 }
2519
2520 return KERN_SUCCESS;
2521 }
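/*
 * Note: the phys_contiguous path above returns before the per-page
 * loops below are ever reached -- a UPL_DEVICE_MEMORY upl describes at
 * most one page_list entry, its lite bitmap (if allocated) is never
 * filled in, and upl_commit_range() later treats it as a zero-length
 * transfer.
 */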
2522
2523 if(user_page_list)
2524 user_page_list[0].device = FALSE;
2525
2526 if(cntrl_flags & UPL_SET_LITE) {
2527 upl->map_object = object;
2528 } else {
2529 upl->map_object = vm_object_allocate(size);
2530 /*
2531 * No need to lock the new object: nobody else knows
2532 * about it yet, so it's all ours so far.
2533 */
2534 upl->map_object->shadow = object;
2535 upl->map_object->pageout = TRUE;
2536 upl->map_object->can_persist = FALSE;
2537 upl->map_object->copy_strategy =
2538 MEMORY_OBJECT_COPY_NONE;
2539 upl->map_object->shadow_offset = offset;
2540 upl->map_object->wimg_bits = object->wimg_bits;
2541 }
2542
2543 }
2544 if (!(cntrl_flags & UPL_SET_LITE)) {
2545 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2546 }
2547
2548 /*
2549 * ENCRYPTED SWAP:
2550 * Just mark the UPL as "encrypted" here.
2551 * We'll actually encrypt the pages later,
2552 * in upl_encrypt(), when the caller has
2553 * selected which pages need to go to swap.
2554 */
2555 if (cntrl_flags & UPL_ENCRYPT) {
2556 upl->flags |= UPL_ENCRYPTED;
2557 }
2558 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2559 upl->flags |= UPL_PAGEOUT;
2560 }
2561 vm_object_lock(object);
2562
2563 /* we can lock in the paging_offset once paging_in_progress is set */
2564 if(upl_ptr) {
2565 upl->size = size;
2566 upl->offset = offset + object->paging_offset;
2567 *upl_ptr = upl;
2568 #ifdef UPL_DEBUG
2569 queue_enter(&object->uplq, upl, upl_t, uplq);
2570 #endif /* UPL_DEBUG */
2571 }
2572
2573 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2574 object->copy != VM_OBJECT_NULL) {
2575 /* Honor copy-on-write obligations */
2576
2577 /*
2578 * The caller is gathering these pages and
2579 * might modify their contents. We need to
2580 * make sure that the copy object has its own
2581 * private copies of these pages before we let
2582 * the caller modify them.
2583 */
2584 vm_object_update(object,
2585 offset,
2586 size,
2587 NULL,
2588 NULL,
2589 FALSE, /* should_return */
2590 MEMORY_OBJECT_COPY_SYNC,
2591 VM_PROT_NO_CHANGE);
2592 upl_cow++;
2593 upl_cow_pages += size >> PAGE_SHIFT;
2594
2595 }
2596 /* remember which copy object we synchronized with */
2597 last_copy_object = object->copy;
2598
2599 entry = 0;
2600 if(cntrl_flags & UPL_COPYOUT_FROM) {
2601 upl->flags |= UPL_PAGE_SYNC_DONE;
2602
2603 while (xfer_size) {
2604 if((alias_page == NULL) &&
2605 !(cntrl_flags & UPL_SET_LITE)) {
2606 vm_object_unlock(object);
2607 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2608 vm_object_lock(object);
2609 }
2610 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2611 dst_page->fictitious ||
2612 dst_page->absent ||
2613 dst_page->error ||
2614 (dst_page->wire_count && !dst_page->pageout) ||
2615
2616 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2617 (dst_page->page_ticket != page_ticket) &&
2618 ((dst_page->page_ticket+1) != page_ticket)) ) {
2619
2620 if (user_page_list)
2621 user_page_list[entry].phys_addr = 0;
2622 } else {
2623 /*
2624 * grab this up front...
2625 * a high percentage of the time we're going to
2626 * need the hardware modification state a bit later
2627 * anyway... so we can eliminate an extra call into
2628 * the pmap layer by grabbing it here and recording it
2629 */
2630 refmod_state = pmap_get_refmod(dst_page->phys_page);
2631
2632 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2633 /*
2634 * we're only asking for DIRTY pages to be returned
2635 */
2636
2637 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2638 /*
2639 * if we were the page stolen by vm_pageout_scan to be
2640 * cleaned (as opposed to a buddy being clustered in),
2641 * or this request is not being driven by a PAGEOUT cluster,
2642 * then we only need to check for the page being dirty or
2643 * precious to decide whether to return it
2644 */
2645 if (dst_page->dirty || dst_page->precious ||
2646 (refmod_state & VM_MEM_MODIFIED)) {
2647 goto check_busy;
2648 }
2649 }
2650 /*
2651 * this is a request for a PAGEOUT cluster and this page
2652 * is merely along for the ride as a 'buddy'... not only
2653 * does it have to be dirty to be returned, but it also
2654 * can't have been referenced recently... note that we've
2655 * already filtered above based on whether this page is
2656 * currently on the inactive queue or it meets the page
2657 * ticket (generation count) check
2658 */
2659 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2660 ((refmod_state & VM_MEM_MODIFIED) ||
2661 dst_page->dirty || dst_page->precious) ) {
2662 goto check_busy;
2663 }
2664 /*
2665 * if we reach here, we're not to return
2666 * the page... go on to the next one
2667 */
2668 if (user_page_list)
2669 user_page_list[entry].phys_addr = 0;
2670 entry++;
2671 dst_offset += PAGE_SIZE_64;
2672 xfer_size -= PAGE_SIZE;
2673 continue;
2674 }
2675 check_busy:
2676 if(dst_page->busy &&
2677 (!(dst_page->list_req_pending &&
2678 dst_page->pageout))) {
2679 if(cntrl_flags & UPL_NOBLOCK) {
2680 if(user_page_list) {
2681 user_page_list[entry].phys_addr = 0;
2682 }
2683 entry++;
2684 dst_offset += PAGE_SIZE_64;
2685 xfer_size -= PAGE_SIZE;
2686 continue;
2687 }
2688 /*
2689 * someone else is playing with the
2690 * page. We will have to wait.
2691 */
2692 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2693 continue;
2694 }
2695 /* Someone else already cleaning the page? */
2696 if((dst_page->cleaning || dst_page->absent ||
2697 dst_page->wire_count != 0) &&
2698 !dst_page->list_req_pending) {
2699 if(user_page_list) {
2700 user_page_list[entry].phys_addr = 0;
2701 }
2702 entry++;
2703 dst_offset += PAGE_SIZE_64;
2704 xfer_size -= PAGE_SIZE;
2705 continue;
2706 }
2707 /* eliminate all mappings from the */
2708 /* original object and its progeny */
2709
2710 vm_page_lock_queues();
2711
2712 if (dst_page->pageout_queue == TRUE)
2713 /*
2714 * we've buddied up a page for a clustered pageout
2715 * that has already been moved to the pageout
2716 * queue by pageout_scan... we need to remove
2717 * it from the queue and drop the laundry count
2718 * on that queue
2719 */
2720 vm_pageout_queue_steal(dst_page);
2721 #if MACH_CLUSTER_STATS
2722 /* pageout statistics gathering. count */
2723 /* all the pages we will page out that */
2724 /* were not counted in the initial */
2725 /* vm_pageout_scan work */
2726 if(dst_page->list_req_pending)
2727 encountered_lrp = TRUE;
2728 if((dst_page->dirty ||
2729 (dst_page->object->internal &&
2730 dst_page->precious)) &&
2731 (dst_page->list_req_pending
2732 == FALSE)) {
2733 if(encountered_lrp) {
2734 CLUSTER_STAT
2735 (pages_at_higher_offsets++;)
2736 } else {
2737 CLUSTER_STAT
2738 (pages_at_lower_offsets++;)
2739 }
2740 }
2741 #endif
2742 /* Turn off busy indication on pending */
2743 /* pageout. Note: we can only get here */
2744 /* in the request pending case. */
2745 dst_page->list_req_pending = FALSE;
2746 dst_page->busy = FALSE;
2747 dst_page->cleaning = FALSE;
2748
2749 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2750 dirty = hw_dirty ? TRUE : dst_page->dirty;
2751
2752 if(cntrl_flags & UPL_SET_LITE) {
2753 int pg_num;
2754 pg_num = (dst_offset-offset)/PAGE_SIZE;
2755 lite_list[pg_num>>5] |=
2756 1 << (pg_num & 31);
2757 if (hw_dirty)
2758 pmap_clear_modify(dst_page->phys_page);
2759 /*
2760 * Record that this page has been
2761 * written out
2762 */
2763 #if MACH_PAGEMAP
2764 vm_external_state_set(
2765 object->existence_map,
2766 dst_page->offset);
2767 #endif /*MACH_PAGEMAP*/
2768
2769 /*
2770 * Mark original page as cleaning
2771 * in place.
2772 */
2773 dst_page->cleaning = TRUE;
2774 dst_page->dirty = TRUE;
2775 dst_page->precious = FALSE;
2776 } else {
2777 /* use pageclean setup, it is more */
2778 /* convenient even for the pageout */
2779 /* cases here */
2780
2781 vm_object_lock(upl->map_object);
2782 vm_pageclean_setup(dst_page,
2783 alias_page, upl->map_object,
2784 size - xfer_size);
2785 vm_object_unlock(upl->map_object);
2786
2787 alias_page->absent = FALSE;
2788 alias_page = NULL;
2789 }
2790
2791 if(!dirty) {
2792 dst_page->dirty = FALSE;
2793 dst_page->precious = TRUE;
2794 }
2795
2796 if(dst_page->pageout)
2797 dst_page->busy = TRUE;
2798
2799 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2800 /*
2801 * ENCRYPTED SWAP:
2802 * We want to deny access to the target page
2803 * because its contents are about to be
2804 * encrypted and the user would be very
2805 * confused to see encrypted data instead
2806 * of their data.
2807 */
2808 dst_page->busy = TRUE;
2809 }
2810 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2811 /*
2812 * deny access to the target page
2813 * while it is being worked on
2814 */
2815 if ((!dst_page->pageout) &&
2816 (dst_page->wire_count == 0)) {
2817 dst_page->busy = TRUE;
2818 dst_page->pageout = TRUE;
2819 vm_page_wire(dst_page);
2820 }
2821 }
2822
2823 if (dst_page->phys_page > upl->highest_page)
2824 upl->highest_page = dst_page->phys_page;
2825
2826 if(user_page_list) {
2827 user_page_list[entry].phys_addr
2828 = dst_page->phys_page;
2829 user_page_list[entry].dirty =
2830 dst_page->dirty;
2831 user_page_list[entry].pageout =
2832 dst_page->pageout;
2833 user_page_list[entry].absent =
2834 dst_page->absent;
2835 user_page_list[entry].precious =
2836 dst_page->precious;
2837 }
2838 vm_page_unlock_queues();
2839
2840 /*
2841 * ENCRYPTED SWAP:
2842 * The caller is gathering this page and might
2843 * access its contents later on. Decrypt the
2844 * page before adding it to the UPL, so that
2845 * the caller never sees encrypted data.
2846 */
2847 if (! (cntrl_flags & UPL_ENCRYPT) &&
2848 dst_page->encrypted) {
2849 assert(dst_page->busy);
2850
2851 vm_page_decrypt(dst_page, 0);
2852 vm_page_decrypt_for_upl_counter++;
2853
2854 /*
2855 * Retry this page, since anything
2856 * could have changed while we were
2857 * decrypting.
2858 */
2859 continue;
2860 }
2861 }
2862 entry++;
2863 dst_offset += PAGE_SIZE_64;
2864 xfer_size -= PAGE_SIZE;
2865 }
2866 } else {
2867 while (xfer_size) {
2868 if((alias_page == NULL) &&
2869 !(cntrl_flags & UPL_SET_LITE)) {
2870 vm_object_unlock(object);
2871 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2872 vm_object_lock(object);
2873 }
2874
2875 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2876 object->copy != last_copy_object) {
2877 /* Honor copy-on-write obligations */
2878
2879 /*
2880 * The copy object has changed since we
2881 * last synchronized for copy-on-write.
2882 * Another copy object might have been
2883 * inserted while we released the object's
2884 * lock. Since someone could have seen the
2885 * original contents of the remaining pages
2886 * through that new object, we have to
2887 * synchronize with it again for the remaining
2888 * pages only. The previous pages are "busy"
2889 * so they can not be seen through the new
2890 * mapping. The new mapping will see our
2891 * upcoming changes for those previous pages,
2892 * but that's OK since they couldn't see what
2893 * was there before. It's just a race anyway
2894 * and there's no guarantee of consistency or
2895 * atomicity. We just don't want new mappings
2896 * to see both the *before* and *after* pages.
2897 */
2898 if (object->copy != VM_OBJECT_NULL) {
2899 vm_object_update(
2900 object,
2901 dst_offset,/* current offset */
2902 xfer_size, /* remaining size */
2903 NULL,
2904 NULL,
2905 FALSE, /* should_return */
2906 MEMORY_OBJECT_COPY_SYNC,
2907 VM_PROT_NO_CHANGE);
2908 upl_cow_again++;
2909 upl_cow_again_pages +=
2910 xfer_size >> PAGE_SHIFT;
2911 }
2912 /* remember the copy object we synced with */
2913 last_copy_object = object->copy;
2914 }
2915
2916 dst_page = vm_page_lookup(object, dst_offset);
2917
2918 if(dst_page != VM_PAGE_NULL) {
2919 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2920 !((dst_page->list_req_pending)
2921 && (dst_page->absent))) {
2922 /* we are doing extended range */
2923 /* requests. we want to grab */
2924 /* pages around some which are */
2925 /* already present. */
2926 if(user_page_list) {
2927 user_page_list[entry].phys_addr = 0;
2928 }
2929 entry++;
2930 dst_offset += PAGE_SIZE_64;
2931 xfer_size -= PAGE_SIZE;
2932 continue;
2933 }
2934 if((dst_page->cleaning) &&
2935 !(dst_page->list_req_pending)) {
2936 /*someone else is writing to the */
2937 /* page. We will have to wait. */
2938 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2939 continue;
2940 }
2941 if ((dst_page->fictitious &&
2942 dst_page->list_req_pending)) {
2943 /* dump the fictitious page */
2944 dst_page->list_req_pending = FALSE;
2945 dst_page->clustered = FALSE;
2946
2947 vm_page_lock_queues();
2948 vm_page_free(dst_page);
2949 vm_page_unlock_queues();
2950
2951 dst_page = NULL;
2952 } else if ((dst_page->absent &&
2953 dst_page->list_req_pending)) {
2954 /* the default_pager case */
2955 dst_page->list_req_pending = FALSE;
2956 dst_page->busy = FALSE;
2957 }
2958 }
2959 if(dst_page == VM_PAGE_NULL) {
2960 if(object->private) {
2961 /*
2962 * This is a nasty wrinkle for users
2963 * of upl who encounter device or
2964 * private memory however, it is
2965 * unavoidable, only a fault can
2966 * reslove the actual backing
2967 * physical page by asking the
2968 * backing device.
2969 */
2970 if(user_page_list) {
2971 user_page_list[entry].phys_addr = 0;
2972 }
2973 entry++;
2974 dst_offset += PAGE_SIZE_64;
2975 xfer_size -= PAGE_SIZE;
2976 continue;
2977 }
2978 /* need to allocate a page */
2979 dst_page = vm_page_alloc(object, dst_offset);
2980 if (dst_page == VM_PAGE_NULL) {
2981 vm_object_unlock(object);
2982 VM_PAGE_WAIT();
2983 vm_object_lock(object);
2984 continue;
2985 }
2986 dst_page->busy = FALSE;
2987 #if 0
2988 if(cntrl_flags & UPL_NO_SYNC) {
2989 dst_page->page_lock = 0;
2990 dst_page->unlock_request = 0;
2991 }
2992 #endif
2993 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2994 /*
2995 * if UPL_RET_ONLY_ABSENT was specified,
2996 * then we're definitely setting up a
2997 * upl for a clustered read/pagein
2998 * operation... mark the pages as clustered
2999 * so vm_fault can correctly attribute them
3000 * to the 'pagein' bucket the first time
3001 * a fault happens on them
3002 */
3003 dst_page->clustered = TRUE;
3004 }
3005 dst_page->absent = TRUE;
3006 object->absent_count++;
3007 }
3008 #if 1
3009 if(cntrl_flags & UPL_NO_SYNC) {
3010 dst_page->page_lock = 0;
3011 dst_page->unlock_request = 0;
3012 }
3013 #endif /* 1 */
3014
3015 /*
3016 * ENCRYPTED SWAP:
3017 */
3018 if (cntrl_flags & UPL_ENCRYPT) {
3019 /*
3020 * The page is going to be encrypted when we
3021 * get it from the pager, so mark it so.
3022 */
3023 dst_page->encrypted = TRUE;
3024 } else {
3025 /*
3026 * Otherwise, the page will not contain
3027 * encrypted data.
3028 */
3029 dst_page->encrypted = FALSE;
3030 }
3031
3032 dst_page->overwriting = TRUE;
3033 if(dst_page->fictitious) {
3034 panic("need corner case for fictitious page");
3035 }
3036 if(dst_page->page_lock) {
3037 do_m_lock = TRUE;
3038 }
3039 if(upl_ptr) {
3040
3041 /* eliminate all mappings from the */
3042 /* original object and its progeny */
3043
3044 if(dst_page->busy) {
3045 /*someone else is playing with the */
3046 /* page. We will have to wait. */
3047 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3048 continue;
3049 }
3050 vm_page_lock_queues();
3051
3052 if( !(cntrl_flags & UPL_FILE_IO))
3053 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3054 else
3055 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3056 dirty = hw_dirty ? TRUE : dst_page->dirty;
3057
3058 if(cntrl_flags & UPL_SET_LITE) {
3059 int pg_num;
3060 pg_num = (dst_offset-offset)/PAGE_SIZE;
3061 lite_list[pg_num>>5] |=
3062 1 << (pg_num & 31);
3063 if (hw_dirty)
3064 pmap_clear_modify(dst_page->phys_page);
3065 /*
3066 * Record that this page has been
3067 * written out
3068 */
3069 #if MACH_PAGEMAP
3070 vm_external_state_set(
3071 object->existence_map,
3072 dst_page->offset);
3073 #endif /*MACH_PAGEMAP*/
3074
3075 /*
3076 * Mark original page as cleaning
3077 * in place.
3078 */
3079 dst_page->cleaning = TRUE;
3080 dst_page->dirty = TRUE;
3081 dst_page->precious = FALSE;
3082 } else {
3083 /* use pageclean setup, it is more */
3084 /* convenient even for the pageout */
3085 /* cases here */
3086 vm_object_lock(upl->map_object);
3087 vm_pageclean_setup(dst_page,
3088 alias_page, upl->map_object,
3089 size - xfer_size);
3090 vm_object_unlock(upl->map_object);
3091
3092 alias_page->absent = FALSE;
3093 alias_page = NULL;
3094 }
3095
3096 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3097 /* clean in place for read implies */
3098 /* that a write will be done on all */
3099 /* the pages that are dirty before */
3100 /* a upl commit is done. The caller */
3101 /* is obligated to preserve the */
3102 /* contents of all pages marked */
3103 /* dirty. */
3104 upl->flags |= UPL_CLEAR_DIRTY;
3105 }
3106
3107 if(!dirty) {
3108 dst_page->dirty = FALSE;
3109 dst_page->precious = TRUE;
3110 }
3111
3112 if (dst_page->wire_count == 0) {
3113 /* deny access to the target page while */
3114 /* it is being worked on */
3115 dst_page->busy = TRUE;
3116 } else {
3117 vm_page_wire(dst_page);
3118 }
3119 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3120 /*
3121 * expect the page not to be used
3122 * since it's coming in as part
3123 * of a cluster and could be
3124 * speculative... pages that
3125 * are 'consumed' will get a
3126 * hardware reference
3127 */
3128 dst_page->reference = FALSE;
3129 } else {
3130 /*
3131 * expect the page to be used
3132 */
3133 dst_page->reference = TRUE;
3134 }
3135 dst_page->precious =
3136 (cntrl_flags & UPL_PRECIOUS)
3137 ? TRUE : FALSE;
3138
3139 if (dst_page->phys_page > upl->highest_page)
3140 upl->highest_page = dst_page->phys_page;
3141
3142 if(user_page_list) {
3143 user_page_list[entry].phys_addr
3144 = dst_page->phys_page;
3145 user_page_list[entry].dirty =
3146 dst_page->dirty;
3147 user_page_list[entry].pageout =
3148 dst_page->pageout;
3149 user_page_list[entry].absent =
3150 dst_page->absent;
3151 user_page_list[entry].precious =
3152 dst_page->precious;
3153 }
3154 vm_page_unlock_queues();
3155 }
3156 entry++;
3157 dst_offset += PAGE_SIZE_64;
3158 xfer_size -= PAGE_SIZE;
3159 }
3160 }
3161
3162 if (upl->flags & UPL_INTERNAL) {
3163 if(page_list_count != NULL)
3164 *page_list_count = 0;
3165 } else if ((page_list_count != NULL) &&
3166 (*page_list_count > entry)) {
3167 *page_list_count = entry;
3168 }
3169
3170 if(alias_page != NULL) {
3171 vm_page_lock_queues();
3172 vm_page_free(alias_page);
3173 vm_page_unlock_queues();
3174 }
3175
3176 if(do_m_lock) {
3177 vm_prot_t access_required;
3178 /* call back all associated pages from other users of the pager */
3179 /* all future updates will be on data which is based on the */
3180 /* changes we are going to make here. Note: it is assumed that */
3181 /* we already hold copies of the data so we will not be seeing */
3182 /* an avalanche of incoming data from the pager */
3183 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3184 ? VM_PROT_READ : VM_PROT_WRITE;
3185 while (TRUE) {
3186 kern_return_t rc;
3187
3188 if(!object->pager_ready) {
3189 wait_result_t wait_result;
3190
3191 wait_result = vm_object_sleep(object,
3192 VM_OBJECT_EVENT_PAGER_READY,
3193 THREAD_UNINT);
3194 if (wait_result != THREAD_AWAKENED) {
3195 vm_object_unlock(object);
3196 return KERN_FAILURE;
3197 }
3198 continue;
3199 }
3200
3201 vm_object_unlock(object);
3202 rc = memory_object_data_unlock(
3203 object->pager,
3204 dst_offset + object->paging_offset,
3205 size,
3206 access_required);
3207 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3208 return KERN_FAILURE;
3209 vm_object_lock(object);
3210
3211 if (rc == KERN_SUCCESS)
3212 break;
3213 }
3214
3215 /* let's wait on the last page requested */
3216 /* NOTE: we will have to update lock completed routine to signal */
3217 if(dst_page != VM_PAGE_NULL &&
3218 (access_required & dst_page->page_lock) != access_required) {
3219 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3220 vm_object_unlock(object);
3221 thread_block(THREAD_CONTINUE_NULL);
3222 return KERN_SUCCESS;
3223 }
3224 }
3225
3226 vm_object_unlock(object);
3227 return KERN_SUCCESS;
3228 }
3229
3230 /* JMM - Backward compatibility for now */
3231 kern_return_t
3232 vm_fault_list_request( /* forward */
3233 memory_object_control_t control,
3234 vm_object_offset_t offset,
3235 upl_size_t size,
3236 upl_t *upl_ptr,
3237 upl_page_info_t **user_page_list_ptr,
3238 int page_list_count,
3239 int cntrl_flags);
3240 kern_return_t
3241 vm_fault_list_request(
3242 memory_object_control_t control,
3243 vm_object_offset_t offset,
3244 upl_size_t size,
3245 upl_t *upl_ptr,
3246 upl_page_info_t **user_page_list_ptr,
3247 int page_list_count,
3248 int cntrl_flags)
3249 {
3250 unsigned int local_list_count;
3251 upl_page_info_t *user_page_list;
3252 kern_return_t kr;
3253
3254 if (user_page_list_ptr != NULL) {
3255 local_list_count = page_list_count;
3256 user_page_list = *user_page_list_ptr;
3257 } else {
3258 local_list_count = 0;
3259 user_page_list = NULL;
3260 }
3261 kr = memory_object_upl_request(control,
3262 offset,
3263 size,
3264 upl_ptr,
3265 user_page_list,
3266 &local_list_count,
3267 cntrl_flags);
3268
3269 if(kr != KERN_SUCCESS)
3270 return kr;
3271
3272 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3273 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3274 }
3275
3276 return KERN_SUCCESS;
3277 }
3278
3279
3280
3281 /*
3282 * Routine: vm_object_super_upl_request
3283 * Purpose:
3284 * Cause the population of a portion of a vm_object
3285 * in much the same way as memory_object_upl_request.
3286 * Depending on the nature of the request, the pages
3287 * returned may contain valid data or be uninitialized.
3288 * However, the region may be expanded up to the super
3289 * cluster size provided.
3290 */
3291
3292 __private_extern__ kern_return_t
3293 vm_object_super_upl_request(
3294 vm_object_t object,
3295 vm_object_offset_t offset,
3296 upl_size_t size,
3297 upl_size_t super_cluster,
3298 upl_t *upl,
3299 upl_page_info_t *user_page_list,
3300 unsigned int *page_list_count,
3301 int cntrl_flags)
3302 {
3303 vm_page_t target_page;
3304 int ticket;
3305
3306
3307 if(object->paging_offset > offset)
3308 return KERN_FAILURE;
3309
3310 assert(object->paging_in_progress);
3311 offset = offset - object->paging_offset;
3312
3313 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3314
3315 vm_object_lock(object);
3316
3317 if((target_page = vm_page_lookup(object, offset))
3318 != VM_PAGE_NULL) {
3319 ticket = target_page->page_ticket;
3320 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3321 cntrl_flags = cntrl_flags |
3322 ((ticket << UPL_PAGE_TICKET_SHIFT)
3323 & UPL_PAGE_TICKET_MASK);
3324 }
3325 vm_object_unlock(object);
3326 }
3327
3328 if (super_cluster > size) {
3329
3330 vm_object_offset_t base_offset;
3331 upl_size_t super_size;
3332
3333 base_offset = (offset &
3334 ~((vm_object_offset_t) super_cluster - 1));
3335 super_size = (offset+size) > (base_offset + super_cluster) ?
3336 super_cluster<<1 : super_cluster;
3337 super_size = ((base_offset + super_size) > object->size) ?
3338 (object->size - base_offset) : super_size;
3339 if(offset > (base_offset + super_size))
3340 panic("vm_object_super_upl_request: Missed target pageout"
3341 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3342 offset, base_offset, super_size, super_cluster,
3343 size, object->paging_offset);
3344 /*
3345 * apparently there is a case where the vm requests a
3346 * page to be written out whose offset is beyond the
3347 * object size
3348 */
3349 if((offset + size) > (base_offset + super_size))
3350 super_size = (offset + size) - base_offset;
3351
3352 offset = base_offset;
3353 size = super_size;
3354 }
3355 return vm_object_upl_request(object, offset, size,
3356 upl, user_page_list, page_list_count,
3357 cntrl_flags);
3358 }
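/*
 * Worked example of the super-cluster expansion above (illustrative
 * numbers only; assumes the object is large enough that the
 * object->size clamp does not apply):
 *
 *	super_cluster = 0x10000, size = 0x1000, offset = 0x1f000
 *	  base_offset = 0x1f000 & ~0xffff          = 0x10000
 *	  offset+size = 0x20000, not > 0x20000     -> super_size = 0x10000
 *	  request becomes offset 0x10000, size 0x10000
 *
 *	same, but size = 0x2000
 *	  offset+size = 0x21000 > 0x20000          -> super_size = 0x20000
 *	  request becomes offset 0x10000, size 0x20000
 */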
3359
3360
3361 kern_return_t
3362 vm_map_create_upl(
3363 vm_map_t map,
3364 vm_map_address_t offset,
3365 upl_size_t *upl_size,
3366 upl_t *upl,
3367 upl_page_info_array_t page_list,
3368 unsigned int *count,
3369 int *flags)
3370 {
3371 vm_map_entry_t entry;
3372 int caller_flags;
3373 int force_data_sync;
3374 int sync_cow_data;
3375 vm_object_t local_object;
3376 vm_map_offset_t local_offset;
3377 vm_map_offset_t local_start;
3378 kern_return_t ret;
3379
3380 caller_flags = *flags;
3381
3382 if (caller_flags & ~UPL_VALID_FLAGS) {
3383 /*
3384 * For forward compatibility's sake,
3385 * reject any unknown flag.
3386 */
3387 return KERN_INVALID_VALUE;
3388 }
3389
3390 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3391 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3392
3393 if(upl == NULL)
3394 return KERN_INVALID_ARGUMENT;
3395
3396
3397 REDISCOVER_ENTRY:
3398 vm_map_lock(map);
3399 if (vm_map_lookup_entry(map, offset, &entry)) {
3400 if (entry->object.vm_object == VM_OBJECT_NULL ||
3401 !entry->object.vm_object->phys_contiguous) {
3402 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3403 *upl_size = MAX_UPL_TRANSFER * page_size;
3404 }
3405 }
3406 if((entry->vme_end - offset) < *upl_size) {
3407 *upl_size = entry->vme_end - offset;
3408 }
3409 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3410 if (entry->object.vm_object == VM_OBJECT_NULL) {
3411 *flags = 0;
3412 } else if (entry->object.vm_object->private) {
3413 *flags = UPL_DEV_MEMORY;
3414 if (entry->object.vm_object->phys_contiguous) {
3415 *flags |= UPL_PHYS_CONTIG;
3416 }
3417 } else {
3418 *flags = 0;
3419 }
3420 vm_map_unlock(map);
3421 return KERN_SUCCESS;
3422 }
3423 /*
3424 * Create an object if necessary.
3425 */
3426 if (entry->object.vm_object == VM_OBJECT_NULL) {
3427 entry->object.vm_object = vm_object_allocate(
3428 (vm_size_t)(entry->vme_end - entry->vme_start));
3429 entry->offset = 0;
3430 }
3431 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3432 if (!(entry->protection & VM_PROT_WRITE)) {
3433 vm_map_unlock(map);
3434 return KERN_PROTECTION_FAILURE;
3435 }
3436 if (entry->needs_copy) {
3437 vm_map_t local_map;
3438 vm_object_t object;
3439 vm_map_offset_t offset_hi;
3440 vm_map_offset_t offset_lo;
3441 vm_object_offset_t new_offset;
3442 vm_prot_t prot;
3443 boolean_t wired;
3444 vm_behavior_t behavior;
3445 vm_map_version_t version;
3446 vm_map_t real_map;
3447
3448 local_map = map;
3449 vm_map_lock_write_to_read(map);
3450 if(vm_map_lookup_locked(&local_map,
3451 offset, VM_PROT_WRITE,
3452 &version, &object,
3453 &new_offset, &prot, &wired,
3454 &behavior, &offset_lo,
3455 &offset_hi, &real_map)) {
3456 vm_map_unlock(local_map);
3457 return KERN_FAILURE;
3458 }
3459 if (real_map != map) {
3460 vm_map_unlock(real_map);
3461 }
3462 vm_object_unlock(object);
3463 vm_map_unlock(local_map);
3464
3465 goto REDISCOVER_ENTRY;
3466 }
3467 }
3468 if (entry->is_sub_map) {
3469 vm_map_t submap;
3470
3471 submap = entry->object.sub_map;
3472 local_start = entry->vme_start;
3473 local_offset = entry->offset;
3474 vm_map_reference(submap);
3475 vm_map_unlock(map);
3476
3477 ret = (vm_map_create_upl(submap,
3478 local_offset + (offset - local_start),
3479 upl_size, upl, page_list, count,
3480 flags));
3481
3482 vm_map_deallocate(submap);
3483 return ret;
3484 }
3485
3486 if (sync_cow_data) {
3487 if (entry->object.vm_object->shadow
3488 || entry->object.vm_object->copy) {
3489
3490 local_object = entry->object.vm_object;
3491 local_start = entry->vme_start;
3492 local_offset = entry->offset;
3493 vm_object_reference(local_object);
3494 vm_map_unlock(map);
3495
3496 if (entry->object.vm_object->shadow &&
3497 entry->object.vm_object->copy) {
3498 vm_object_lock_request(
3499 local_object->shadow,
3500 (vm_object_offset_t)
3501 ((offset - local_start) +
3502 local_offset) +
3503 local_object->shadow_offset,
3504 *upl_size, FALSE,
3505 MEMORY_OBJECT_DATA_SYNC,
3506 VM_PROT_NO_CHANGE);
3507 }
3508 sync_cow_data = FALSE;
3509 vm_object_deallocate(local_object);
3510 goto REDISCOVER_ENTRY;
3511 }
3512 }
3513
3514 if (force_data_sync) {
3515
3516 local_object = entry->object.vm_object;
3517 local_start = entry->vme_start;
3518 local_offset = entry->offset;
3519 vm_object_reference(local_object);
3520 vm_map_unlock(map);
3521
3522 vm_object_lock_request(
3523 local_object,
3524 (vm_object_offset_t)
3525 ((offset - local_start) + local_offset),
3526 (vm_object_size_t)*upl_size, FALSE,
3527 MEMORY_OBJECT_DATA_SYNC,
3528 VM_PROT_NO_CHANGE);
3529 force_data_sync = FALSE;
3530 vm_object_deallocate(local_object);
3531 goto REDISCOVER_ENTRY;
3532 }
3533
3534 if(!(entry->object.vm_object->private)) {
3535 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3536 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3537 if(entry->object.vm_object->phys_contiguous) {
3538 *flags = UPL_PHYS_CONTIG;
3539 } else {
3540 *flags = 0;
3541 }
3542 } else {
3543 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3544 }
3545 local_object = entry->object.vm_object;
3546 local_offset = entry->offset;
3547 local_start = entry->vme_start;
3548 vm_object_reference(local_object);
3549 vm_map_unlock(map);
3550 if(caller_flags & UPL_SET_IO_WIRE) {
3551 ret = (vm_object_iopl_request(local_object,
3552 (vm_object_offset_t)
3553 ((offset - local_start)
3554 + local_offset),
3555 *upl_size,
3556 upl,
3557 page_list,
3558 count,
3559 caller_flags));
3560 } else {
3561 ret = (vm_object_upl_request(local_object,
3562 (vm_object_offset_t)
3563 ((offset - local_start)
3564 + local_offset),
3565 *upl_size,
3566 upl,
3567 page_list,
3568 count,
3569 caller_flags));
3570 }
3571 vm_object_deallocate(local_object);
3572 return(ret);
3573 }
3574
3575 vm_map_unlock(map);
3576 return(KERN_FAILURE);
3577
3578 }
3579
3580 /*
3581 * Internal routine to enter a UPL into a VM map.
3582 *
3583 * JMM - This should just be doable through the standard
3584 * vm_map_enter() API.
3585 */
3586 kern_return_t
3587 vm_map_enter_upl(
3588 vm_map_t map,
3589 upl_t upl,
3590 vm_map_offset_t *dst_addr)
3591 {
3592 vm_map_size_t size;
3593 vm_object_offset_t offset;
3594 vm_map_offset_t addr;
3595 vm_page_t m;
3596 kern_return_t kr;
3597
3598 if (upl == UPL_NULL)
3599 return KERN_INVALID_ARGUMENT;
3600
3601 upl_lock(upl);
3602
3603 /* check to see if already mapped */
3604 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3605 upl_unlock(upl);
3606 return KERN_FAILURE;
3607 }
3608
3609 if((!(upl->map_object->pageout)) &&
3610 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3611 (upl->map_object->phys_contiguous))) {
3612 vm_object_t object;
3613 vm_page_t alias_page;
3614 vm_object_offset_t new_offset;
3615 int pg_num;
3616 wpl_array_t lite_list;
3617
3618 if(upl->flags & UPL_INTERNAL) {
3619 lite_list = (wpl_array_t)
3620 ((((uintptr_t)upl) + sizeof(struct upl))
3621 + ((upl->size/PAGE_SIZE)
3622 * sizeof(upl_page_info_t)));
3623 } else {
3624 lite_list = (wpl_array_t)
3625 (((uintptr_t)upl) + sizeof(struct upl));
3626 }
3627 object = upl->map_object;
3628 upl->map_object = vm_object_allocate(upl->size);
3629 vm_object_lock(upl->map_object);
3630 upl->map_object->shadow = object;
3631 upl->map_object->pageout = TRUE;
3632 upl->map_object->can_persist = FALSE;
3633 upl->map_object->copy_strategy =
3634 MEMORY_OBJECT_COPY_NONE;
3635 upl->map_object->shadow_offset =
3636 upl->offset - object->paging_offset;
3637 upl->map_object->wimg_bits = object->wimg_bits;
3638 offset = upl->map_object->shadow_offset;
3639 new_offset = 0;
3640 size = upl->size;
3641
3642 vm_object_lock(object);
3643
3644 while(size) {
3645 pg_num = (new_offset)/PAGE_SIZE;
3646 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3647 vm_object_unlock(object);
3648 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3649 vm_object_lock(object);
3650 m = vm_page_lookup(object, offset);
3651 if (m == VM_PAGE_NULL) {
3652 panic("vm_upl_map: page missing\n");
3653 }
3654
3655 vm_object_paging_begin(object);
3656
3657 /*
3658 * Convert the fictitious page to a private
3659 * shadow of the real page.
3660 */
3661 assert(alias_page->fictitious);
3662 alias_page->fictitious = FALSE;
3663 alias_page->private = TRUE;
3664 alias_page->pageout = TRUE;
3665 alias_page->phys_page = m->phys_page;
3666
3667 vm_page_lock_queues();
3668 vm_page_wire(alias_page);
3669 vm_page_unlock_queues();
3670
3671 /*
3672 * ENCRYPTED SWAP:
3673 * The virtual page ("m") has to be wired in some way
3674 * here or its physical page ("m->phys_page") could
3675 * be recycled at any time.
3676 * Assuming this is enforced by the caller, we can't
3677 * get an encrypted page here. Since the encryption
3678 * key depends on the VM page's "pager" object and
3679 * the "paging_offset", we couldn't handle 2 pageable
3680 * VM pages (with different pagers and paging_offsets)
3681 * sharing the same physical page: we could end up
3682 * encrypting with one key (via one VM page) and
3683 * decrypting with another key (via the alias VM page).
3684 */
3685 ASSERT_PAGE_DECRYPTED(m);
3686
3687 vm_page_insert(alias_page,
3688 upl->map_object, new_offset);
3689 assert(!alias_page->wanted);
3690 alias_page->busy = FALSE;
3691 alias_page->absent = FALSE;
3692 }
3693
3694 size -= PAGE_SIZE;
3695 offset += PAGE_SIZE_64;
3696 new_offset += PAGE_SIZE_64;
3697 }
3698 vm_object_unlock(object);
3699 vm_object_unlock(upl->map_object);
3700 }
3701 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3702 offset = upl->offset - upl->map_object->paging_offset;
3703 else
3704 offset = 0;
3705
3706 size = upl->size;
3707
3708 vm_object_lock(upl->map_object);
3709 upl->map_object->ref_count++;
3710 vm_object_res_reference(upl->map_object);
3711 vm_object_unlock(upl->map_object);
3712
3713 *dst_addr = 0;
3714
3715
3716 /* NEED A UPL_MAP ALIAS */
3717 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3718 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3719 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3720
3721 if (kr != KERN_SUCCESS) {
3722 upl_unlock(upl);
3723 return(kr);
3724 }
3725
3726 vm_object_lock(upl->map_object);
3727
3728 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3729 m = vm_page_lookup(upl->map_object, offset);
3730 if(m) {
3731 unsigned int cache_attr;
3732 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3733
3734 PMAP_ENTER(map->pmap, addr,
3735 m, VM_PROT_ALL,
3736 cache_attr, TRUE);
3737 }
3738 offset+=PAGE_SIZE_64;
3739 }
3740 vm_object_unlock(upl->map_object);
3741
3742 upl->ref_count++; /* hold a reference for the mapping */
3743 upl->flags |= UPL_PAGE_LIST_MAPPED;
3744 upl->kaddr = *dst_addr;
3745 upl_unlock(upl);
3746 return KERN_SUCCESS;
3747 }
3748
3749 /*
3750 * Internal routine to remove a UPL mapping from a VM map.
3751 *
3752 * XXX - This should just be doable through a standard
3753 * vm_map_remove() operation. Otherwise, implicit clean-up
3754 * of the target map won't be able to correctly remove
3755 * these (and release the reference on the UPL). Having
3756 * to do this means we can't map these into user-space
3757 * maps yet.
3758 */
3759 kern_return_t
3760 vm_map_remove_upl(
3761 vm_map_t map,
3762 upl_t upl)
3763 {
3764 vm_address_t addr;
3765 upl_size_t size;
3766
3767 if (upl == UPL_NULL)
3768 return KERN_INVALID_ARGUMENT;
3769
3770 upl_lock(upl);
3771 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3772 addr = upl->kaddr;
3773 size = upl->size;
3774 assert(upl->ref_count > 1);
3775 upl->ref_count--; /* removing mapping ref */
3776 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3777 upl->kaddr = (vm_offset_t) 0;
3778 upl_unlock(upl);
3779
3780 vm_map_remove( map,
3781 vm_map_trunc_page(addr),
3782 vm_map_round_page(addr + size),
3783 VM_MAP_NO_FLAGS);
3784 return KERN_SUCCESS;
3785 }
3786 upl_unlock(upl);
3787 return KERN_FAILURE;
3788 }
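/*
 * Hedged usage sketch of the UPL lifecycle implemented by the routines
 * above and by upl_commit_range()/upl_deallocate() below: create a UPL
 * over part of a map, optionally window it into the kernel map, then
 * commit the pages and drop the creation reference.  Error handling,
 * the UPL_SET_* options and the commit flags all depend on the caller
 * and are elided; 'map' and 'user_addr' are hypothetical.  Not compiled.
 */
#if 0
upl_t			upl;
upl_size_t		upl_size = 8 * PAGE_SIZE;
upl_page_info_t		page_list[8];
unsigned int		count = 8;
int			flags = UPL_COPYOUT_FROM;	/* plus UPL_SET_* as needed */
vm_map_offset_t		kaddr;
boolean_t		empty;

if (vm_map_create_upl(map, user_addr, &upl_size, &upl,
		      page_list, &count, &flags) != KERN_SUCCESS)
	return;

if (vm_map_enter_upl(kernel_map, upl, &kaddr) == KERN_SUCCESS) {
	/* ... access the pages through kaddr ... */
	vm_map_remove_upl(kernel_map, upl);	/* drops the mapping reference */
}
upl_commit_range(upl, 0, upl_size, 0, page_list, count, &empty);
upl_deallocate(upl);				/* drops the upl_create() reference */
#endif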
3789
3790 kern_return_t
3791 upl_commit_range(
3792 upl_t upl,
3793 upl_offset_t offset,
3794 upl_size_t size,
3795 int flags,
3796 upl_page_info_t *page_list,
3797 mach_msg_type_number_t count,
3798 boolean_t *empty)
3799 {
3800 upl_size_t xfer_size = size;
3801 vm_object_t shadow_object;
3802 vm_object_t object = upl->map_object;
3803 vm_object_offset_t target_offset;
3804 int entry;
3805 wpl_array_t lite_list;
3806 int occupied;
3807 int delayed_unlock = 0;
3808 int clear_refmod = 0;
3809 boolean_t shadow_internal;
3810
3811 *empty = FALSE;
3812
3813 if (upl == UPL_NULL)
3814 return KERN_INVALID_ARGUMENT;
3815
3816
3817 if (count == 0)
3818 page_list = NULL;
3819
3820 if (object->pageout) {
3821 shadow_object = object->shadow;
3822 } else {
3823 shadow_object = object;
3824 }
3825
3826 upl_lock(upl);
3827
3828 if (upl->flags & UPL_ACCESS_BLOCKED) {
3829 /*
3830 * We used this UPL to block access to the pages by marking
3831 * them "busy". Now we need to clear the "busy" bit to allow
3832 * access to these pages again.
3833 */
3834 flags |= UPL_COMMIT_ALLOW_ACCESS;
3835 }
3836
3837 if (upl->flags & UPL_CLEAR_DIRTY)
3838 flags |= UPL_COMMIT_CLEAR_DIRTY;
3839
3840 if (upl->flags & UPL_DEVICE_MEMORY) {
3841 xfer_size = 0;
3842 } else if ((offset + size) > upl->size) {
3843 upl_unlock(upl);
3844 return KERN_FAILURE;
3845 }
3846
3847 if (upl->flags & UPL_INTERNAL) {
3848 lite_list = (wpl_array_t)
3849 ((((uintptr_t)upl) + sizeof(struct upl))
3850 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3851 } else {
3852 lite_list = (wpl_array_t)
3853 (((uintptr_t)upl) + sizeof(struct upl));
3854 }
3855 if (object != shadow_object)
3856 vm_object_lock(object);
3857 vm_object_lock(shadow_object);
3858
3859 shadow_internal = shadow_object->internal;
3860
3861 entry = offset/PAGE_SIZE;
3862 target_offset = (vm_object_offset_t)offset;
3863
3864 while (xfer_size) {
3865 vm_page_t t,m;
3866 upl_page_info_t *p;
3867
3868 m = VM_PAGE_NULL;
3869
3870 if (upl->flags & UPL_LITE) {
3871 int pg_num;
3872
3873 pg_num = target_offset/PAGE_SIZE;
3874
3875 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3876 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3877 m = vm_page_lookup(shadow_object,
3878 target_offset + (upl->offset -
3879 shadow_object->paging_offset));
3880 }
3881 }
3882 if (object->pageout) {
3883 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3884 t->pageout = FALSE;
3885
3886 if (delayed_unlock) {
3887 delayed_unlock = 0;
3888 vm_page_unlock_queues();
3889 }
3890 VM_PAGE_FREE(t);
3891
3892 if (m == NULL) {
3893 m = vm_page_lookup(
3894 shadow_object,
3895 target_offset +
3896 object->shadow_offset);
3897 }
3898 if (m != VM_PAGE_NULL)
3899 vm_object_paging_end(m->object);
3900 }
3901 }
3902 if (m != VM_PAGE_NULL) {
3903
3904 clear_refmod = 0;
3905
3906 if (upl->flags & UPL_IO_WIRE) {
3907
3908 if (delayed_unlock == 0)
3909 vm_page_lock_queues();
3910
3911 vm_page_unwire(m);
3912
3913 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3914 delayed_unlock = 0;
3915 vm_page_unlock_queues();
3916 }
3917 if (page_list) {
3918 page_list[entry].phys_addr = 0;
3919 }
3920 if (flags & UPL_COMMIT_SET_DIRTY) {
3921 m->dirty = TRUE;
3922 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3923 m->dirty = FALSE;
3924 clear_refmod |= VM_MEM_MODIFIED;
3925 }
3926 if (flags & UPL_COMMIT_INACTIVATE) {
3927 m->reference = FALSE;
3928 clear_refmod |= VM_MEM_REFERENCED;
3929 vm_page_deactivate(m);
3930 }
3931 if (clear_refmod)
3932 pmap_clear_refmod(m->phys_page, clear_refmod);
3933
3934 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3935 /*
3936 * We blocked access to the pages in this UPL.
3937 * Clear the "busy" bit and wake up any waiter
3938 * for this page.
3939 */
3940 PAGE_WAKEUP_DONE(m);
3941 }
3942
3943 target_offset += PAGE_SIZE_64;
3944 xfer_size -= PAGE_SIZE;
3945 entry++;
3946 continue;
3947 }
3948 if (delayed_unlock == 0)
3949 vm_page_lock_queues();
3950 /*
3951 * make sure to clear the hardware
3952 * modify or reference bits before
3953 * releasing the BUSY bit on this page
3954 * otherwise we risk losing a legitimate
3955 * change of state
3956 */
3957 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3958 m->dirty = FALSE;
3959 clear_refmod |= VM_MEM_MODIFIED;
3960 }
3961 if (flags & UPL_COMMIT_INACTIVATE)
3962 clear_refmod |= VM_MEM_REFERENCED;
3963
3964 if (clear_refmod)
3965 pmap_clear_refmod(m->phys_page, clear_refmod);
3966
3967 if (page_list) {
3968 p = &(page_list[entry]);
3969 if(p->phys_addr && p->pageout && !m->pageout) {
3970 m->busy = TRUE;
3971 m->pageout = TRUE;
3972 vm_page_wire(m);
3973 } else if (page_list[entry].phys_addr &&
3974 !p->pageout && m->pageout &&
3975 !m->dump_cleaning) {
3976 m->pageout = FALSE;
3977 m->absent = FALSE;
3978 m->overwriting = FALSE;
3979 vm_page_unwire(m);
3980 PAGE_WAKEUP_DONE(m);
3981 }
3982 page_list[entry].phys_addr = 0;
3983 }
3984 m->dump_cleaning = FALSE;
3985 if(m->laundry) {
3986 vm_pageout_throttle_up(m);
3987 }
3988 if(m->pageout) {
3989 m->cleaning = FALSE;
3990 m->pageout = FALSE;
3991 #if MACH_CLUSTER_STATS
3992 if (m->wanted) vm_pageout_target_collisions++;
3993 #endif
3994 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3995 m->dirty = TRUE;
3996 else
3997 m->dirty = FALSE;
3998
3999 if(m->dirty) {
4000 vm_page_unwire(m);/* reactivates */
4001
4002 if (upl->flags & UPL_PAGEOUT) {
4003 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4004 VM_STAT(reactivations++);
4005 }
4006 PAGE_WAKEUP_DONE(m);
4007 } else {
4008 vm_page_free(m);/* clears busy, etc. */
4009
4010 if (upl->flags & UPL_PAGEOUT) {
4011 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4012
4013 if (page_list[entry].dirty)
4014 VM_STAT(pageouts++);
4015 }
4016 }
4017 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4018 delayed_unlock = 0;
4019 vm_page_unlock_queues();
4020 }
4021 target_offset += PAGE_SIZE_64;
4022 xfer_size -= PAGE_SIZE;
4023 entry++;
4024 continue;
4025 }
4026 #if MACH_CLUSTER_STATS
4027 m->dirty = pmap_is_modified(m->phys_page);
4028
4029 if (m->dirty) vm_pageout_cluster_dirtied++;
4030 else vm_pageout_cluster_cleaned++;
4031 if (m->wanted) vm_pageout_cluster_collisions++;
4032 #else
4033 m->dirty = 0;
4034 #endif
4035
4036 if((m->busy) && (m->cleaning)) {
4037 /* the request_page_list case */
4038 if(m->absent) {
4039 m->absent = FALSE;
4040 if(shadow_object->absent_count == 1)
4041 vm_object_absent_release(shadow_object);
4042 else
4043 shadow_object->absent_count--;
4044 }
4045 m->overwriting = FALSE;
4046 m->busy = FALSE;
4047 m->dirty = FALSE;
4048 } else if (m->overwriting) {
4049 /* alternate request page list, write to
4050 * page_list case. Occurs when the original
4051 * page was wired at the time of the list
4052 * request */
4053 assert(m->wire_count != 0);
4054 vm_page_unwire(m);/* reactivates */
4055 m->overwriting = FALSE;
4056 }
4057 m->cleaning = FALSE;
4058
4059 /*
4060 * It is part of the semantics of COPYOUT_FROM UPLs that a commit
4061 * implies a cache sync between the vm page and the backing store;
4062 * this can be used to strip the precious bit as well as clean.
4063 */
4064 if (upl->flags & UPL_PAGE_SYNC_DONE)
4065 m->precious = FALSE;
4066
4067 if (flags & UPL_COMMIT_SET_DIRTY)
4068 m->dirty = TRUE;
4069
4070 if (flags & UPL_COMMIT_INACTIVATE) {
4071 m->reference = FALSE;
4072 vm_page_deactivate(m);
4073 } else if (!m->active && !m->inactive) {
4074 if (m->reference)
4075 vm_page_activate(m);
4076 else
4077 vm_page_deactivate(m);
4078 }
4079
4080 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4081 /*
4082 * We blocked access to the pages in this UPL.
4083 * Clear the "busy" bit on this page before we
4084 * wake up any waiter.
4085 */
4086 m->busy = FALSE;
4087 }
4088
4089 /*
4090 * Wakeup any thread waiting for the page to be un-cleaning.
4091 */
4092 PAGE_WAKEUP(m);
4093
4094 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4095 delayed_unlock = 0;
4096 vm_page_unlock_queues();
4097 }
4098 }
4099 target_offset += PAGE_SIZE_64;
4100 xfer_size -= PAGE_SIZE;
4101 entry++;
4102 }
4103 if (delayed_unlock)
4104 vm_page_unlock_queues();
4105
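/*
 * Decide whether the UPL still has any pages under its control:
 * device-memory UPLs never do, lite UPLs do if any bit is still set
 * in the lite list, and otherwise the map object's resident page
 * queue tells us.
 */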
4106 occupied = 1;
4107
4108 if (upl->flags & UPL_DEVICE_MEMORY) {
4109 occupied = 0;
4110 } else if (upl->flags & UPL_LITE) {
4111 int pg_num;
4112 int i;
4113 pg_num = upl->size/PAGE_SIZE;
4114 pg_num = (pg_num + 31) >> 5;
4115 occupied = 0;
4116 for(i= 0; i<pg_num; i++) {
4117 if(lite_list[i] != 0) {
4118 occupied = 1;
4119 break;
4120 }
4121 }
4122 } else {
4123 if(queue_empty(&upl->map_object->memq)) {
4124 occupied = 0;
4125 }
4126 }
4127
4128 if(occupied == 0) {
4129 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4130 *empty = TRUE;
4131 }
4132 if(object == shadow_object)
4133 vm_object_paging_end(shadow_object);
4134 }
4135 vm_object_unlock(shadow_object);
4136 if (object != shadow_object)
4137 vm_object_unlock(object);
4138 upl_unlock(upl);
4139
4140 return KERN_SUCCESS;
4141 }
4142
4143 kern_return_t
4144 upl_abort_range(
4145 upl_t upl,
4146 upl_offset_t offset,
4147 upl_size_t size,
4148 int error,
4149 boolean_t *empty)
4150 {
4151 upl_size_t xfer_size = size;
4152 vm_object_t shadow_object;
4153 vm_object_t object = upl->map_object;
4154 vm_object_offset_t target_offset;
4155 int entry;
4156 wpl_array_t lite_list;
4157 int occupied;
4158 boolean_t shadow_internal;
4159
4160 *empty = FALSE;
4161
4162 if (upl == UPL_NULL)
4163 return KERN_INVALID_ARGUMENT;
4164
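/*
 * For I/O-wired UPLs an abort of a range is handled as a plain
 * commit of that range: upl_commit_range() simply unwires and
 * releases the pages.
 */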
4165 if (upl->flags & UPL_IO_WIRE) {
4166 return upl_commit_range(upl,
4167 offset, size, 0,
4168 NULL, 0, empty);
4169 }
4170
4171 if(object->pageout) {
4172 shadow_object = object->shadow;
4173 } else {
4174 shadow_object = object;
4175 }
4176
4177 upl_lock(upl);
4178 if(upl->flags & UPL_DEVICE_MEMORY) {
4179 xfer_size = 0;
4180 } else if ((offset + size) > upl->size) {
4181 upl_unlock(upl);
4182 return KERN_FAILURE;
4183 }
4184 if (object != shadow_object)
4185 vm_object_lock(object);
4186 vm_object_lock(shadow_object);
4187
4188 shadow_internal = shadow_object->internal;
4189
4190 if(upl->flags & UPL_INTERNAL) {
4191 lite_list = (wpl_array_t)
4192 ((((uintptr_t)upl) + sizeof(struct upl))
4193 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4194 } else {
4195 lite_list = (wpl_array_t)
4196 (((uintptr_t)upl) + sizeof(struct upl));
4197 }
4198
4199 entry = offset/PAGE_SIZE;
4200 target_offset = (vm_object_offset_t)offset;
4201 while(xfer_size) {
4202 vm_page_t t,m;
4203
4204 m = VM_PAGE_NULL;
4205 if(upl->flags & UPL_LITE) {
4206 int pg_num;
4207 pg_num = target_offset/PAGE_SIZE;
4208 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4209 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4210 m = vm_page_lookup(shadow_object,
4211 target_offset + (upl->offset -
4212 shadow_object->paging_offset));
4213 }
4214 }
4215 if(object->pageout) {
4216 if ((t = vm_page_lookup(object, target_offset))
4217 != NULL) {
4218 t->pageout = FALSE;
4219 VM_PAGE_FREE(t);
4220 if(m == NULL) {
4221 m = vm_page_lookup(
4222 shadow_object,
4223 target_offset +
4224 object->shadow_offset);
4225 }
4226 if(m != VM_PAGE_NULL)
4227 vm_object_paging_end(m->object);
4228 }
4229 }
4230 if(m != VM_PAGE_NULL) {
4231 vm_page_lock_queues();
4232 if(m->absent) {
4233 boolean_t must_free = TRUE;
4234
4235 /* COPYOUT = FALSE case */
4236 /* check for error conditions which must */
4237 /* be passed back to the page's customer */
4238 if(error & UPL_ABORT_RESTART) {
4239 m->restart = TRUE;
4240 m->absent = FALSE;
4241 vm_object_absent_release(m->object);
4242 m->page_error = KERN_MEMORY_ERROR;
4243 m->error = TRUE;
4244 must_free = FALSE;
4245 } else if(error & UPL_ABORT_UNAVAILABLE) {
4246 m->restart = FALSE;
4247 m->unusual = TRUE;
4248 must_free = FALSE;
4249 } else if(error & UPL_ABORT_ERROR) {
4250 m->restart = FALSE;
4251 m->absent = FALSE;
4252 vm_object_absent_release(m->object);
4253 m->page_error = KERN_MEMORY_ERROR;
4254 m->error = TRUE;
4255 must_free = FALSE;
4256 }
4257
4258 /*
4259 * ENCRYPTED SWAP:
4260 * If the page was already encrypted,
4261 * we don't really need to decrypt it
4262 * now. It will get decrypted later,
4263 * on demand, as soon as someone needs
4264 * to access its contents.
4265 */
4266
4267 m->cleaning = FALSE;
4268 m->overwriting = FALSE;
4269 PAGE_WAKEUP_DONE(m);
4270
4271 if (must_free == TRUE) {
4272 vm_page_free(m);
4273 } else {
4274 vm_page_activate(m);
4275 }
4276 vm_page_unlock_queues();
4277
4278 target_offset += PAGE_SIZE_64;
4279 xfer_size -= PAGE_SIZE;
4280 entry++;
4281 continue;
4282 }
4283 /*
4284 * Handle the trusted pager throttle.
4285 */
4286 if (m->laundry) {
4287 vm_pageout_throttle_up(m);
4288 }
4289 if(m->pageout) {
4290 assert(m->busy);
4291 assert(m->wire_count == 1);
4292 m->pageout = FALSE;
4293 vm_page_unwire(m);
4294 }
4295 m->dump_cleaning = FALSE;
4296 m->cleaning = FALSE;
4297 m->overwriting = FALSE;
4298 #if MACH_PAGEMAP
4299 vm_external_state_clr(
4300 m->object->existence_map, m->offset);
4301 #endif /* MACH_PAGEMAP */
4302 if(error & UPL_ABORT_DUMP_PAGES) {
4303 vm_page_free(m);
4304 pmap_disconnect(m->phys_page);
4305 } else {
4306 PAGE_WAKEUP_DONE(m);
4307 }
4308 vm_page_unlock_queues();
4309 }
4310 target_offset += PAGE_SIZE_64;
4311 xfer_size -= PAGE_SIZE;
4312 entry++;
4313 }
4314 occupied = 1;
4315 if (upl->flags & UPL_DEVICE_MEMORY) {
4316 occupied = 0;
4317 } else if (upl->flags & UPL_LITE) {
4318 int pg_num;
4319 int i;
4320 pg_num = upl->size/PAGE_SIZE;
4321 pg_num = (pg_num + 31) >> 5;
4322 occupied = 0;
4323 for(i= 0; i<pg_num; i++) {
4324 if(lite_list[i] != 0) {
4325 occupied = 1;
4326 break;
4327 }
4328 }
4329 } else {
4330 if(queue_empty(&upl->map_object->memq)) {
4331 occupied = 0;
4332 }
4333 }
4334
4335 if(occupied == 0) {
4336 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4337 *empty = TRUE;
4338 }
4339 if(object == shadow_object)
4340 vm_object_paging_end(shadow_object);
4341 }
4342 vm_object_unlock(shadow_object);
4343 if (object != shadow_object)
4344 vm_object_unlock(object);
4345
4346 upl_unlock(upl);
4347
4348 return KERN_SUCCESS;
4349 }
4350
4351 kern_return_t
4352 upl_abort(
4353 upl_t upl,
4354 int error)
4355 {
4356 vm_object_t object = NULL;
4357 vm_object_t shadow_object = NULL;
4358 vm_object_offset_t offset;
4359 vm_object_offset_t shadow_offset;
4360 vm_object_offset_t target_offset;
4361 upl_size_t i;
4362 wpl_array_t lite_list;
4363 vm_page_t t,m;
4364 int occupied;
4365 boolean_t shadow_internal;
4366
4367 if (upl == UPL_NULL)
4368 return KERN_INVALID_ARGUMENT;
4369
4370 if (upl->flags & UPL_IO_WIRE) {
4371 boolean_t empty;
4372 return upl_commit_range(upl,
4373 0, upl->size, 0,
4374 NULL, 0, &empty);
4375 }
4376
4377 upl_lock(upl);
4378 if(upl->flags & UPL_DEVICE_MEMORY) {
4379 upl_unlock(upl);
4380 return KERN_SUCCESS;
4381 }
4382
4383 object = upl->map_object;
4384
4385 if (object == NULL) {
4386 panic("upl_abort: upl object is not backed by an object");
4387 upl_unlock(upl);
4388 return KERN_INVALID_ARGUMENT;
4389 }
4390
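/*
 * Pick the object that really holds the pages and the offset that
 * translates UPL-relative offsets into that object: for a pageout
 * (map) object that is its shadow and shadow_offset, otherwise the
 * object itself with the UPL's offset adjusted by its paging_offset.
 */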
4391 if(object->pageout) {
4392 shadow_object = object->shadow;
4393 shadow_offset = object->shadow_offset;
4394 } else {
4395 shadow_object = object;
4396 shadow_offset = upl->offset - object->paging_offset;
4397 }
4398
4399 if(upl->flags & UPL_INTERNAL) {
4400 lite_list = (wpl_array_t)
4401 ((((uintptr_t)upl) + sizeof(struct upl))
4402 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4403 } else {
4404 lite_list = (wpl_array_t)
4405 (((uintptr_t)upl) + sizeof(struct upl));
4406 }
4407 offset = 0;
4408
4409 if (object != shadow_object)
4410 vm_object_lock(object);
4411 vm_object_lock(shadow_object);
4412
4413 shadow_internal = shadow_object->internal;
4414
4415 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4416 m = VM_PAGE_NULL;
4417 target_offset = offset + shadow_offset;
4418 if(upl->flags & UPL_LITE) {
4419 int pg_num;
4420 pg_num = offset/PAGE_SIZE;
4421 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4422 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4423 m = vm_page_lookup(
4424 shadow_object, target_offset);
4425 }
4426 }
4427 if(object->pageout) {
4428 if ((t = vm_page_lookup(object, offset)) != NULL) {
4429 t->pageout = FALSE;
4430 VM_PAGE_FREE(t);
4431 if(m == NULL) {
4432 m = vm_page_lookup(
4433 shadow_object, target_offset);
4434 }
4435 if(m != VM_PAGE_NULL)
4436 vm_object_paging_end(m->object);
4437 }
4438 }
4439 if(m != VM_PAGE_NULL) {
4440 vm_page_lock_queues();
4441 if(m->absent) {
4442 boolean_t must_free = TRUE;
4443
4444 /* COPYOUT = FALSE case */
4445 /* check for error conditions which must */
4446 /* be passed back to the page's customer */
4447 if(error & UPL_ABORT_RESTART) {
4448 m->restart = TRUE;
4449 m->absent = FALSE;
4450 vm_object_absent_release(m->object);
4451 m->page_error = KERN_MEMORY_ERROR;
4452 m->error = TRUE;
4453 must_free = FALSE;
4454 } else if(error & UPL_ABORT_UNAVAILABLE) {
4455 m->restart = FALSE;
4456 m->unusual = TRUE;
4457 must_free = FALSE;
4458 } else if(error & UPL_ABORT_ERROR) {
4459 m->restart = FALSE;
4460 m->absent = FALSE;
4461 vm_object_absent_release(m->object);
4462 m->page_error = KERN_MEMORY_ERROR;
4463 m->error = TRUE;
4464 must_free = FALSE;
4465 }
4466
4467 /*
4468 * ENCRYPTED SWAP:
4469 * If the page was already encrypted,
4470 * we don't really need to decrypt it
4471 * now. It will get decrypted later,
4472 * on demand, as soon as someone needs
4473 * to access its contents.
4474 */
4475
4476 m->cleaning = FALSE;
4477 m->overwriting = FALSE;
4478 PAGE_WAKEUP_DONE(m);
4479
4480 if (must_free == TRUE) {
4481 vm_page_free(m);
4482 } else {
4483 vm_page_activate(m);
4484 }
4485 vm_page_unlock_queues();
4486 continue;
4487 }
4488 /*
4489 * Handle the trusted pager throttle.
4490 */
4491 if (m->laundry) {
4492 vm_pageout_throttle_up(m);
4493 }
4494 if(m->pageout) {
4495 assert(m->busy);
4496 assert(m->wire_count == 1);
4497 m->pageout = FALSE;
4498 vm_page_unwire(m);
4499 }
4500 m->dump_cleaning = FALSE;
4501 m->cleaning = FALSE;
4502 m->overwriting = FALSE;
4503 #if MACH_PAGEMAP
4504 vm_external_state_clr(
4505 m->object->existence_map, m->offset);
4506 #endif /* MACH_PAGEMAP */
4507 if(error & UPL_ABORT_DUMP_PAGES) {
4508 vm_page_free(m);
4509 pmap_disconnect(m->phys_page);
4510 } else {
4511 PAGE_WAKEUP_DONE(m);
4512 }
4513 vm_page_unlock_queues();
4514 }
4515 }
4516 occupied = 1;
4517 if (upl->flags & UPL_DEVICE_MEMORY) {
4518 occupied = 0;
4519 } else if (upl->flags & UPL_LITE) {
4520 int pg_num;
4521 int j;
4522 pg_num = upl->size/PAGE_SIZE;
4523 pg_num = (pg_num + 31) >> 5;
4524 occupied = 0;
4525 for(j= 0; j<pg_num; j++) {
4526 if(lite_list[j] != 0) {
4527 occupied = 1;
4528 break;
4529 }
4530 }
4531 } else {
4532 if(queue_empty(&upl->map_object->memq)) {
4533 occupied = 0;
4534 }
4535 }
4536
4537 if(occupied == 0) {
4538 if(object == shadow_object)
4539 vm_object_paging_end(shadow_object);
4540 }
4541 vm_object_unlock(shadow_object);
4542 if (object != shadow_object)
4543 vm_object_unlock(object);
4544
4545 upl_unlock(upl);
4546 return KERN_SUCCESS;
4547 }
4548
4549 /* an option on commit should be wire */
4550 kern_return_t
4551 upl_commit(
4552 upl_t upl,
4553 upl_page_info_t *page_list,
4554 mach_msg_type_number_t count)
4555 {
4556 if (upl == UPL_NULL)
4557 return KERN_INVALID_ARGUMENT;
4558
4559 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4560 boolean_t empty;
4561 return upl_commit_range(upl, 0, upl->size, 0,
4562 page_list, count, &empty);
4563 }
4564
4565 if (count == 0)
4566 page_list = NULL;
4567
4568 upl_lock(upl);
4569 if (upl->flags & UPL_DEVICE_MEMORY)
4570 page_list = NULL;
4571
4572 if (upl->flags & UPL_ENCRYPTED) {
4573 /*
4574 * ENCRYPTED SWAP:
4575 * This UPL was encrypted, but we don't need
4576 * to decrypt here. We'll decrypt each page
4577 * later, on demand, as soon as someone needs
4578 * to access the page's contents.
4579 */
4580 }
4581
4582 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4583 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4584 vm_object_t shadow_object = upl->map_object->shadow;
4585 vm_object_t object = upl->map_object;
4586 vm_object_offset_t target_offset;
4587 upl_size_t xfer_end;
4588 int entry;
4589
4590 vm_page_t t, m;
4591 upl_page_info_t *p;
4592
4593 if (object != shadow_object)
4594 vm_object_lock(object);
4595 vm_object_lock(shadow_object);
4596
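/*
 * Walk the whole UPL in shadow-object coordinates, from shadow_offset
 * to shadow_offset + upl->size.  Pages are looked up in the map
 * object first to see whether they belong to this UPL.
 */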
4597 entry = 0;
4598 target_offset = object->shadow_offset;
4599 xfer_end = upl->size + object->shadow_offset;
4600
4601 while(target_offset < xfer_end) {
4602
4603 if ((t = vm_page_lookup(object,
4604 target_offset - object->shadow_offset))
4605 == NULL) {
4606 target_offset += PAGE_SIZE_64;
4607 entry++;
4608 continue;
4609 }
4610
4611 m = vm_page_lookup(shadow_object, target_offset);
4612 if(m != VM_PAGE_NULL) {
4613 /*
4614 * ENCRYPTED SWAP:
4615 * If this page was encrypted, we
4616 * don't need to decrypt it here.
4617 * We'll decrypt it later, on demand,
4618 * as soon as someone needs to access
4619 * its contents.
4620 */
4621
4622 if (upl->flags & UPL_CLEAR_DIRTY) {
4623 pmap_clear_modify(m->phys_page);
4624 m->dirty = FALSE;
4625 }
4626 /*
4627 * It is part of the semantics of COPYOUT_FROM UPLs that a
4628 * commit implies a cache sync between the vm page and the
4629 * backing store; this can be used to strip the precious bit
4630 * as well as clean.
4631 */
4632 if (upl->flags & UPL_PAGE_SYNC_DONE)
4633 m->precious = FALSE;
4634
4635 if(page_list) {
4636 p = &(page_list[entry]);
4637 if(page_list[entry].phys_addr &&
4638 p->pageout && !m->pageout) {
4639 vm_page_lock_queues();
4640 m->busy = TRUE;
4641 m->pageout = TRUE;
4642 vm_page_wire(m);
4643 vm_page_unlock_queues();
4644 } else if (page_list[entry].phys_addr &&
4645 !p->pageout && m->pageout &&
4646 !m->dump_cleaning) {
4647 vm_page_lock_queues();
4648 m->pageout = FALSE;
4649 m->absent = FALSE;
4650 m->overwriting = FALSE;
4651 vm_page_unwire(m);
4652 PAGE_WAKEUP_DONE(m);
4653 vm_page_unlock_queues();
4654 }
4655 page_list[entry].phys_addr = 0;
4656 }
4657 }
4658 target_offset += PAGE_SIZE_64;
4659 entry++;
4660 }
4661 vm_object_unlock(shadow_object);
4662 if (object != shadow_object)
4663 vm_object_unlock(object);
4664
4665 }
4666 if (upl->flags & UPL_DEVICE_MEMORY) {
4667 vm_object_lock(upl->map_object->shadow);
4668 if(upl->map_object == upl->map_object->shadow)
4669 vm_object_paging_end(upl->map_object->shadow);
4670 vm_object_unlock(upl->map_object->shadow);
4671 }
4672 upl_unlock(upl);
4673 return KERN_SUCCESS;
4674 }
4675
4676
4677
4678 kern_return_t
4679 vm_object_iopl_request(
4680 vm_object_t object,
4681 vm_object_offset_t offset,
4682 upl_size_t size,
4683 upl_t *upl_ptr,
4684 upl_page_info_array_t user_page_list,
4685 unsigned int *page_list_count,
4686 int cntrl_flags)
4687 {
4688 vm_page_t dst_page;
4689 vm_object_offset_t dst_offset = offset;
4690 upl_size_t xfer_size = size;
4691 upl_t upl = NULL;
4692 unsigned int entry;
4693 wpl_array_t lite_list = NULL;
4694 int page_field_size;
4695 int delayed_unlock = 0;
4696 int no_zero_fill = FALSE;
4697 vm_page_t alias_page = NULL;
4698 kern_return_t ret;
4699 vm_prot_t prot;
4700
4701
4702 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4703 /*
4704 * For forward compatibility's sake,
4705 * reject any unknown flag.
4706 */
4707 return KERN_INVALID_VALUE;
4708 }
4709 if (vm_lopage_poolsize == 0)
4710 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4711
4712 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4713 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4714 return KERN_INVALID_VALUE;
4715
4716 if (object->phys_contiguous) {
4717 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4718 return KERN_INVALID_ADDRESS;
4719
4720 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4721 return KERN_INVALID_ADDRESS;
4722 }
4723 }
4724
4725 if (cntrl_flags & UPL_ENCRYPT) {
4726 /*
4727 * ENCRYPTED SWAP:
4728 * The paging path doesn't use this interface,
4729 * so we don't support the UPL_ENCRYPT flag
4730 * here. We won't encrypt the pages.
4731 */
4732 assert(! (cntrl_flags & UPL_ENCRYPT));
4733 }
4734
4735 if (cntrl_flags & UPL_NOZEROFILL)
4736 no_zero_fill = TRUE;
4737
4738 if (cntrl_flags & UPL_COPYOUT_FROM)
4739 prot = VM_PROT_READ;
4740 else
4741 prot = VM_PROT_READ | VM_PROT_WRITE;
4742
4743 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4744 size = MAX_UPL_TRANSFER * page_size;
4745 }
4746
4747 if(cntrl_flags & UPL_SET_INTERNAL)
4748 if(page_list_count != NULL)
4749 *page_list_count = MAX_UPL_TRANSFER;
4750 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4751 ((page_list_count != NULL) && (*page_list_count != 0)
4752 && *page_list_count < (size/page_size)))
4753 return KERN_INVALID_ARGUMENT;
4754
4755 if((!object->internal) && (object->paging_offset != 0))
4756 panic("vm_object_upl_request: external object with non-zero paging offset\n");
4757
4758 if(object->phys_contiguous) {
4759 /* No paging operations are possible against this memory */
4760 /* and so no need for map object, ever */
4761 cntrl_flags |= UPL_SET_LITE;
4762 }
4763
4764 if(upl_ptr) {
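/*
 * Create the UPL with the layout the caller asked for: internal UPLs
 * embed the upl_page_info array right after struct upl, and lite
 * UPLs carry a per-page bitmap (the lite list) after whatever else
 * is embedded.
 */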
4765 if(cntrl_flags & UPL_SET_INTERNAL) {
4766 if(cntrl_flags & UPL_SET_LITE) {
4767 upl = upl_create(
4768 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4769 size);
4770 user_page_list = (upl_page_info_t *)
4771 (((uintptr_t)upl) + sizeof(struct upl));
4772 lite_list = (wpl_array_t)
4773 (((uintptr_t)user_page_list) +
4774 ((size/PAGE_SIZE) *
4775 sizeof(upl_page_info_t)));
4776 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4777 page_field_size =
4778 (page_field_size + 3) & 0xFFFFFFFC;
4779 bzero((char *)lite_list, page_field_size);
4780 upl->flags =
4781 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4782 } else {
4783 upl = upl_create(UPL_CREATE_INTERNAL, size);
4784 user_page_list = (upl_page_info_t *)
4785 (((uintptr_t)upl)
4786 + sizeof(struct upl));
4787 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4788 }
4789 } else {
4790 if(cntrl_flags & UPL_SET_LITE) {
4791 upl = upl_create(UPL_CREATE_LITE, size);
4792 lite_list = (wpl_array_t)
4793 (((uintptr_t)upl) + sizeof(struct upl));
4794 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4795 page_field_size =
4796 (page_field_size + 3) & 0xFFFFFFFC;
4797 bzero((char *)lite_list, page_field_size);
4798 upl->flags = UPL_LITE | UPL_IO_WIRE;
4799 } else {
4800 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4801 upl->flags = UPL_IO_WIRE;
4802 }
4803 }
4804
4805 if(object->phys_contiguous) {
4806 upl->map_object = object;
4807 /* don't need any shadow mappings for this one */
4808 /* since it is already I/O memory */
4809 upl->flags |= UPL_DEVICE_MEMORY;
4810
4811 vm_object_lock(object);
4812 vm_object_paging_begin(object);
4813 vm_object_unlock(object);
4814
4815 /* paging in progress also protects the paging_offset */
4816 upl->offset = offset + object->paging_offset;
4817 upl->size = size;
4818 *upl_ptr = upl;
4819 if(user_page_list) {
4820 user_page_list[0].phys_addr =
4821 (offset + object->shadow_offset)>>PAGE_SHIFT;
4822 user_page_list[0].device = TRUE;
4823 }
4824 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4825
4826 if(page_list_count != NULL) {
4827 if (upl->flags & UPL_INTERNAL) {
4828 *page_list_count = 0;
4829 } else {
4830 *page_list_count = 1;
4831 }
4832 }
4833 return KERN_SUCCESS;
4834 }
4835 if(user_page_list)
4836 user_page_list[0].device = FALSE;
4837
4838 if(cntrl_flags & UPL_SET_LITE) {
4839 upl->map_object = object;
4840 } else {
4841 upl->map_object = vm_object_allocate(size);
4842 vm_object_lock(upl->map_object);
4843 upl->map_object->shadow = object;
4844 upl->map_object->pageout = TRUE;
4845 upl->map_object->can_persist = FALSE;
4846 upl->map_object->copy_strategy =
4847 MEMORY_OBJECT_COPY_NONE;
4848 upl->map_object->shadow_offset = offset;
4849 upl->map_object->wimg_bits = object->wimg_bits;
4850 vm_object_unlock(upl->map_object);
4851 }
4852 }
4853 vm_object_lock(object);
4854 vm_object_paging_begin(object);
4855
4856 if (!object->phys_contiguous) {
4857 /* Protect user space from future COW operations */
4858 object->true_share = TRUE;
4859 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4860 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4861 }
4862
4863 /* we can lock the upl offset now that paging_in_progress is set */
4864 if(upl_ptr) {
4865 upl->size = size;
4866 upl->offset = offset + object->paging_offset;
4867 *upl_ptr = upl;
4868 #ifdef UPL_DEBUG
4869 queue_enter(&object->uplq, upl, upl_t, uplq);
4870 #endif /* UPL_DEBUG */
4871 }
4872
4873 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4874 /*
4875 * The user requested that access to the pages in this UPL
4876 * be blocked until the UPL is committed or aborted.
4877 */
4878 upl->flags |= UPL_ACCESS_BLOCKED;
4879 }
4880
4881 entry = 0;
4882 while (xfer_size) {
4883 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4884 if (delayed_unlock) {
4885 delayed_unlock = 0;
4886 vm_page_unlock_queues();
4887 }
4888 vm_object_unlock(object);
4889 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4890 vm_object_lock(object);
4891 }
4892 dst_page = vm_page_lookup(object, dst_offset);
4893
4894 /*
4895 * ENCRYPTED SWAP:
4896 * If the page is encrypted, we need to decrypt it,
4897 * so force a soft page fault.
4898 */
4899 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4900 (dst_page->encrypted) ||
4901 (dst_page->unusual && (dst_page->error ||
4902 dst_page->restart ||
4903 dst_page->absent ||
4904 dst_page->fictitious ||
4905 (prot & dst_page->page_lock)))) {
4906 vm_fault_return_t result;
4907 do {
4908 vm_page_t top_page;
4909 kern_return_t error_code;
4910 int interruptible;
4911
4912 vm_object_offset_t lo_offset = offset;
4913 vm_object_offset_t hi_offset = offset + size;
4914
4915
4916 if (delayed_unlock) {
4917 delayed_unlock = 0;
4918 vm_page_unlock_queues();
4919 }
4920
4921 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4922 interruptible = THREAD_ABORTSAFE;
4923 } else {
4924 interruptible = THREAD_UNINT;
4925 }
4926
4927 result = vm_fault_page(object, dst_offset,
4928 prot | VM_PROT_WRITE, FALSE,
4929 interruptible,
4930 lo_offset, hi_offset,
4931 VM_BEHAVIOR_SEQUENTIAL,
4932 &prot, &dst_page, &top_page,
4933 (int *)0,
4934 &error_code, no_zero_fill, FALSE, NULL, 0);
4935
4936 switch(result) {
4937 case VM_FAULT_SUCCESS:
4938
4939 PAGE_WAKEUP_DONE(dst_page);
4940
4941 /*
4942 * Release paging references and
4943 * top-level placeholder page, if any.
4944 */
4945
4946 if(top_page != VM_PAGE_NULL) {
4947 vm_object_t local_object;
4948 local_object =
4949 top_page->object;
4950 if(top_page->object
4951 != dst_page->object) {
4952 vm_object_lock(
4953 local_object);
4954 VM_PAGE_FREE(top_page);
4955 vm_object_paging_end(
4956 local_object);
4957 vm_object_unlock(
4958 local_object);
4959 } else {
4960 VM_PAGE_FREE(top_page);
4961 vm_object_paging_end(
4962 local_object);
4963 }
4964 }
4965
4966 break;
4967
4968
4969 case VM_FAULT_RETRY:
4970 vm_object_lock(object);
4971 vm_object_paging_begin(object);
4972 break;
4973
4974 case VM_FAULT_FICTITIOUS_SHORTAGE:
4975 vm_page_more_fictitious();
4976 vm_object_lock(object);
4977 vm_object_paging_begin(object);
4978 break;
4979
4980 case VM_FAULT_MEMORY_SHORTAGE:
4981 if (vm_page_wait(interruptible)) {
4982 vm_object_lock(object);
4983 vm_object_paging_begin(object);
4984 break;
4985 }
4986 /* fall thru */
4987
4988 case VM_FAULT_INTERRUPTED:
4989 error_code = MACH_SEND_INTERRUPTED;
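/* fall through to the memory-error case */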
4990 case VM_FAULT_MEMORY_ERROR:
4991 ret = (error_code ? error_code:
4992 KERN_MEMORY_ERROR);
4993 vm_object_lock(object);
4994
4995 goto return_err;
4996 }
4997 } while ((result != VM_FAULT_SUCCESS)
4998 || (result == VM_FAULT_INTERRUPTED));
4999 }
5000
5001 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5002 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5003 vm_page_t low_page;
5004 int refmod;
5005
5006 /*
5007 * support devices that can't DMA above 32 bits
5008 * by substituting pages from a pool of low address
5009 * memory for any pages we find above the 4G mark
5010 * can't substitute if the page is already wired because
5011 * we don't know whether that physical address has been
5012 * handed out to some other 64 bit capable DMA device to use
5013 */
5014 if (dst_page->wire_count) {
5015 ret = KERN_PROTECTION_FAILURE;
5016 goto return_err;
5017 }
5018 if (delayed_unlock) {
5019 delayed_unlock = 0;
5020 vm_page_unlock_queues();
5021 }
5022 low_page = vm_page_grablo();
5023
5024 if (low_page == VM_PAGE_NULL) {
5025 ret = KERN_RESOURCE_SHORTAGE;
5026 goto return_err;
5027 }
5028 /*
5029 * from here until the vm_page_replace completes
5030 * we mustn't drop the object lock... we don't
5031 * want anyone refaulting this page in and using
5032 * it after we disconnect it... we want the fault
5033 * to find the new page being substituted.
5034 */
5035 refmod = pmap_disconnect(dst_page->phys_page);
5036
5037 vm_page_copy(dst_page, low_page);
5038
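/*
 * Carry the reference/modified state over to the replacement page,
 * both from the page fields and from the pmap bits gathered by the
 * disconnect, so the page-replacement state is preserved.
 */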
5039 low_page->reference = dst_page->reference;
5040 low_page->dirty = dst_page->dirty;
5041
5042 if (refmod & VM_MEM_REFERENCED)
5043 low_page->reference = TRUE;
5044 if (refmod & VM_MEM_MODIFIED)
5045 low_page->dirty = TRUE;
5046
5047 vm_page_lock_queues();
5048 vm_page_replace(low_page, object, dst_offset);
5049 /*
5050 * keep the queue lock since we're going to
5051 * need it immediately
5052 */
5053 delayed_unlock = 1;
5054
5055 dst_page = low_page;
5056 /*
5057 * vm_page_grablo returned the page marked
5058 * BUSY... we don't need a PAGE_WAKEUP_DONE
5059 * here, because we've never dropped the object lock
5060 */
5061 dst_page->busy = FALSE;
5062 }
5063 if (delayed_unlock == 0)
5064 vm_page_lock_queues();
5065 vm_page_wire(dst_page);
5066
5067 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5068 /*
5069 * Mark the page "busy" to block any future page fault
5070 * on this page. We'll also remove the mapping
5071 * of all these pages before leaving this routine.
5072 */
5073 assert(!dst_page->fictitious);
5074 dst_page->busy = TRUE;
5075 }
5076
5077 if (upl_ptr) {
5078 if (cntrl_flags & UPL_SET_LITE) {
5079 int pg_num;
5080 pg_num = (dst_offset-offset)/PAGE_SIZE;
5081 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5082 } else {
5083 /*
5084 * Convert the fictitious page to a
5085 * private shadow of the real page.
5086 */
5087 assert(alias_page->fictitious);
5088 alias_page->fictitious = FALSE;
5089 alias_page->private = TRUE;
5090 alias_page->pageout = TRUE;
5091 alias_page->phys_page = dst_page->phys_page;
5092 vm_page_wire(alias_page);
5093
5094 vm_page_insert(alias_page,
5095 upl->map_object, size - xfer_size);
5096 assert(!alias_page->wanted);
5097 alias_page->busy = FALSE;
5098 alias_page->absent = FALSE;
5099 }
5100
5101 /* expect the page to be used */
5102 dst_page->reference = TRUE;
5103
5104 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5105 dst_page->dirty = TRUE;
5106 alias_page = NULL;
5107
5108 if (dst_page->phys_page > upl->highest_page)
5109 upl->highest_page = dst_page->phys_page;
5110
5111 if (user_page_list) {
5112 user_page_list[entry].phys_addr
5113 = dst_page->phys_page;
5114 user_page_list[entry].dirty =
5115 dst_page->dirty;
5116 user_page_list[entry].pageout =
5117 dst_page->pageout;
5118 user_page_list[entry].absent =
5119 dst_page->absent;
5120 user_page_list[entry].precious =
5121 dst_page->precious;
5122 }
5123 }
5124 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5125 delayed_unlock = 0;
5126 vm_page_unlock_queues();
5127 }
5128 entry++;
5129 dst_offset += PAGE_SIZE_64;
5130 xfer_size -= PAGE_SIZE;
5131 }
5132 if (delayed_unlock)
5133 vm_page_unlock_queues();
5134
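/*
 * Report how many page-list entries were filled in: internal UPLs
 * carry their own embedded list, so the caller's count is zeroed;
 * otherwise the caller's count is trimmed down to the number of
 * entries actually populated.
 */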
5135 if (upl->flags & UPL_INTERNAL) {
5136 if(page_list_count != NULL)
5137 *page_list_count = 0;
5138 } else if (page_list_count != NULL &&
5139 *page_list_count > entry) {
5140 *page_list_count = entry;
5141 }
5142
5143 if (alias_page != NULL) {
5144 vm_page_lock_queues();
5145 vm_page_free(alias_page);
5146 vm_page_unlock_queues();
5147 }
5148
5149 vm_object_unlock(object);
5150
5151 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5152 /*
5153 * We've marked all the pages "busy" so that future
5154 * page faults will block.
5155 * Now remove the mapping for these pages, so that they
5156 * can't be accessed without causing a page fault.
5157 */
5158 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5159 PMAP_NULL, 0, VM_PROT_NONE);
5160 }
5161
5162 return KERN_SUCCESS;
5163
5164
5165 return_err:
5166 if (delayed_unlock)
5167 vm_page_unlock_queues();
5168
5169 for (; offset < dst_offset; offset += PAGE_SIZE) {
5170 dst_page = vm_page_lookup(object, offset);
5171
5172 if (dst_page == VM_PAGE_NULL)
5173 panic("vm_object_iopl_request: Wired pages missing. \n");
5174 vm_page_lock_queues();
5175 vm_page_unwire(dst_page);
5176 vm_page_unlock_queues();
5177 VM_STAT(reactivations++);
5178 }
5179 vm_object_paging_end(object);
5180 vm_object_unlock(object);
5181 upl_destroy(upl);
5182
5183 return ret;
5184 }
5185
5186
5187 kern_return_t
5188 upl_transpose(
5189 upl_t upl1,
5190 upl_t upl2)
5191 {
5192 kern_return_t retval;
5193 boolean_t upls_locked;
5194 vm_object_t object1, object2;
5195
5196 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5197 return KERN_INVALID_ARGUMENT;
5198 }
5199
5200 upls_locked = FALSE;
5201
5202 /*
5203 * Since we need to lock both UPLs at the same time,
5204 * avoid deadlocks by always taking locks in the same order.
5205 */
5206 if (upl1 < upl2) {
5207 upl_lock(upl1);
5208 upl_lock(upl2);
5209 } else {
5210 upl_lock(upl2);
5211 upl_lock(upl1);
5212 }
5213 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5214
5215 object1 = upl1->map_object;
5216 object2 = upl2->map_object;
5217
5218 if (upl1->offset != 0 || upl2->offset != 0 ||
5219 upl1->size != upl2->size) {
5220 /*
5221 * We deal only with full objects, not subsets.
5222 * That's because we exchange the entire backing store info
5223 * for the objects: pager, resident pages, etc... We can't do
5224 * only part of it.
5225 */
5226 retval = KERN_INVALID_VALUE;
5227 goto done;
5228 }
5229
5230 /*
5231 * Transpose the VM objects' backing store.
5232 */
5233 retval = vm_object_transpose(object1, object2,
5234 (vm_object_size_t) upl1->size);
5235
5236 if (retval == KERN_SUCCESS) {
5237 /*
5238 * Make each UPL point to the correct VM object, i.e. the
5239 * object holding the pages that the UPL refers to...
5240 */
5241 upl1->map_object = object2;
5242 upl2->map_object = object1;
5243 }
5244
5245 done:
5246 /*
5247 * Cleanup.
5248 */
5249 if (upls_locked) {
5250 upl_unlock(upl1);
5251 upl_unlock(upl2);
5252 upls_locked = FALSE;
5253 }
5254
5255 return retval;
5256 }
5257
5258 /*
5259 * ENCRYPTED SWAP:
5260 *
5261 * Rationale: the user might have some encrypted data on disk (via
5262 * FileVault or any other mechanism). That data is then decrypted in
5263 * memory, which is safe as long as the machine is secure. But that
5264 * decrypted data in memory could be paged out to disk by the default
5265 * pager. The data would then be stored on disk in clear (not encrypted)
5266 * and it could be accessed by anyone who gets physical access to the
5267 * disk (if the laptop or the disk gets stolen for example). This weakens
5268 * the security offered by FileVault.
5269 *
5270 * Solution: the default pager will optionally request that all the
5271 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5272 * before it sends this UPL to disk via the vnode_pageout() path.
5273 *
5274 * Notes:
5275 *
5276 * To avoid disrupting the VM LRU algorithms, we want to keep the
5277 * clean-in-place mechanisms, which allow us to send some extra pages to
5278 * swap (clustering) without actually removing them from the user's
5279 * address space. We don't want the user to unknowingly access encrypted
5280 * data, so we have to actually remove the encrypted pages from the page
5281 * table. When the user accesses the data, the hardware will fail to
5282 * locate the virtual page in its page table and will trigger a page
5283 * fault. We can then decrypt the page and enter it in the page table
5284 * again. Whenever we allow the user to access the contents of a page,
5285 * we have to make sure it's not encrypted.
5286 *
5287 *
5288 */
5289 /*
5290 * ENCRYPTED SWAP:
5291 * Reserve of virtual addresses in the kernel address space.
5292 * We need to map the physical pages in the kernel, so that we
5293 * can call the encryption/decryption routines with a kernel
5294 * virtual address. We keep this pool of pre-allocated kernel
5295 * virtual addresses so that we don't have to scan the kernel's
5296 * virtual address space each time we need to encrypt or decrypt
5297 * a physical page.
5298 * It would be nice to be able to encrypt and decrypt in physical
5299 * mode but that might not always be more efficient...
5300 */
5301 decl_simple_lock_data(,vm_paging_lock)
5302 #define VM_PAGING_NUM_PAGES 64
5303 vm_map_offset_t vm_paging_base_address = 0;
5304 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5305 int vm_paging_max_index = 0;
5306 unsigned long vm_paging_no_kernel_page = 0;
5307 unsigned long vm_paging_objects_mapped = 0;
5308 unsigned long vm_paging_pages_mapped = 0;
5309 unsigned long vm_paging_objects_mapped_slow = 0;
5310 unsigned long vm_paging_pages_mapped_slow = 0;
5311
5312 /*
5313 * ENCRYPTED SWAP:
5314 * vm_paging_map_object:
5315 * Maps part of a VM object's pages in the kernel
5316 * virtual address space, using the pre-allocated
5317 * kernel virtual addresses, if possible.
5318 * Context:
5319 * The VM object is locked. This lock will get
5320 * dropped and re-acquired though.
5321 */
5322 kern_return_t
5323 vm_paging_map_object(
5324 vm_map_offset_t *address,
5325 vm_page_t page,
5326 vm_object_t object,
5327 vm_object_offset_t offset,
5328 vm_map_size_t *size)
5329 {
5330 kern_return_t kr;
5331 vm_map_offset_t page_map_offset;
5332 vm_map_size_t map_size;
5333 vm_object_offset_t object_offset;
5334 int i;
5335 vm_map_entry_t map_entry;
5336
5337
5338 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5339 /*
5340 * Use one of the pre-allocated kernel virtual addresses
5341 * and just enter the VM page in the kernel address space
5342 * at that virtual address.
5343 */
5344 vm_object_unlock(object);
5345 simple_lock(&vm_paging_lock);
5346
5347 if (vm_paging_base_address == 0) {
5348 /*
5349 * Initialize our pool of pre-allocated kernel
5350 * virtual addresses.
5351 */
5352 simple_unlock(&vm_paging_lock);
5353 page_map_offset = 0;
5354 kr = vm_map_find_space(kernel_map,
5355 &page_map_offset,
5356 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5357 0,
5358 0,
5359 &map_entry);
5360 if (kr != KERN_SUCCESS) {
5361 panic("vm_paging_map_object: "
5362 "kernel_map full\n");
5363 }
5364 map_entry->object.vm_object = kernel_object;
5365 map_entry->offset =
5366 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5367 vm_object_reference(kernel_object);
5368 vm_map_unlock(kernel_map);
5369
5370 simple_lock(&vm_paging_lock);
5371 if (vm_paging_base_address != 0) {
5372 /* someone raced us and won: undo */
5373 simple_unlock(&vm_paging_lock);
5374 kr = vm_map_remove(kernel_map,
5375 page_map_offset,
5376 page_map_offset +
5377 (VM_PAGING_NUM_PAGES
5378 * PAGE_SIZE),
5379 VM_MAP_NO_FLAGS);
5380 assert(kr == KERN_SUCCESS);
5381 simple_lock(&vm_paging_lock);
5382 } else {
5383 vm_paging_base_address = page_map_offset;
5384 }
5385 }
5386
5387 /*
5388 * Try and find an available kernel virtual address
5389 * from our pre-allocated pool.
5390 */
5391 page_map_offset = 0;
5392 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5393 if (vm_paging_page_inuse[i] == FALSE) {
5394 page_map_offset = vm_paging_base_address +
5395 (i * PAGE_SIZE);
5396 break;
5397 }
5398 }
5399
5400 if (page_map_offset != 0) {
5401 /*
5402 * We found a kernel virtual address;
5403 * map the physical page to that virtual address.
5404 */
5405 if (i > vm_paging_max_index) {
5406 vm_paging_max_index = i;
5407 }
5408 vm_paging_page_inuse[i] = TRUE;
5409 simple_unlock(&vm_paging_lock);
5410 if (page->no_isync == TRUE) {
5411 pmap_sync_page_data_phys(page->phys_page);
5412 }
5413 assert(pmap_verify_free(page->phys_page));
5414 PMAP_ENTER(kernel_pmap,
5415 page_map_offset,
5416 page,
5417 VM_PROT_DEFAULT,
5418 ((int) page->object->wimg_bits &
5419 VM_WIMG_MASK),
5420 TRUE);
5421 vm_paging_objects_mapped++;
5422 vm_paging_pages_mapped++;
5423 *address = page_map_offset;
5424 vm_object_lock(object);
5425
5426 /* all done and mapped, ready to use ! */
5427 return KERN_SUCCESS;
5428 }
5429
5430 /*
5431 * We ran out of pre-allocated kernel virtual
5432 * addresses. Just map the page in the kernel
5433 * the slow and regular way.
5434 */
5435 vm_paging_no_kernel_page++;
5436 simple_unlock(&vm_paging_lock);
5437 vm_object_lock(object);
5438 }
5439
5440 object_offset = vm_object_trunc_page(offset);
5441 map_size = vm_map_round_page(*size);
5442
5443 /*
5444 * Try and map the required range of the object
5445 * in the kernel_map
5446 */
5447
5448 /* don't go beyond the object's end... */
5449 if (object_offset >= object->size) {
5450 map_size = 0;
5451 } else if (map_size > object->size - offset) {
5452 map_size = object->size - offset;
5453 }
5454
5455 vm_object_reference_locked(object); /* for the map entry */
5456 vm_object_unlock(object);
5457
5458 kr = vm_map_enter(kernel_map,
5459 address,
5460 map_size,
5461 0,
5462 VM_FLAGS_ANYWHERE,
5463 object,
5464 object_offset,
5465 FALSE,
5466 VM_PROT_DEFAULT,
5467 VM_PROT_ALL,
5468 VM_INHERIT_NONE);
5469 if (kr != KERN_SUCCESS) {
5470 *address = 0;
5471 *size = 0;
5472 vm_object_deallocate(object); /* for the map entry */
5473 return kr;
5474 }
5475
5476 *size = map_size;
5477
5478 /*
5479 * Enter the mapped pages in the page table now.
5480 */
5481 vm_object_lock(object);
5482 for (page_map_offset = 0;
5483 map_size != 0;
5484 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5485 unsigned int cache_attr;
5486
5487 page = vm_page_lookup(object, offset + page_map_offset);
5488 if (page == VM_PAGE_NULL) {
5489 panic("vm_paging_map_object: no page !?");
5490 }
5491 if (page->no_isync == TRUE) {
5492 pmap_sync_page_data_phys(page->phys_page);
5493 }
5494 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5495
5496 assert(pmap_verify_free(page->phys_page));
5497 PMAP_ENTER(kernel_pmap,
5498 *address + page_map_offset,
5499 page,
5500 VM_PROT_DEFAULT,
5501 cache_attr,
5502 TRUE);
5503 }
5504
5505 vm_paging_objects_mapped_slow++;
5506 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5507
5508 return KERN_SUCCESS;
5509 }
5510
5511 /*
5512 * ENCRYPTED SWAP:
5513 * vm_paging_unmap_object:
5514 * Unmaps part of a VM object's pages from the kernel
5515 * virtual address space.
5516 * Context:
5517 * The VM object is locked. This lock will get
5518 * dropped and re-acquired though.
5519 */
5520 void
5521 vm_paging_unmap_object(
5522 vm_object_t object,
5523 vm_map_offset_t start,
5524 vm_map_offset_t end)
5525 {
5526 kern_return_t kr;
5527 int i;
5528
5529 if ((vm_paging_base_address == 0) ||
5530 (start < vm_paging_base_address) ||
5531 (end > (vm_paging_base_address
5532 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
5533 /*
5534 * We didn't use our pre-allocated pool of
5535 * kernel virtual address. Deallocate the
5536 * virtual memory.
5537 */
5538 if (object != VM_OBJECT_NULL) {
5539 vm_object_unlock(object);
5540 }
5541 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5542 if (object != VM_OBJECT_NULL) {
5543 vm_object_lock(object);
5544 }
5545 assert(kr == KERN_SUCCESS);
5546 } else {
5547 /*
5548 * We used a kernel virtual address from our
5549 * pre-allocated pool. Put it back in the pool
5550 * for next time.
5551 */
5552 assert(end - start == PAGE_SIZE);
5553 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5554
5555 /* undo the pmap mapping */
5556 pmap_remove(kernel_pmap, start, end);
5557
5558 simple_lock(&vm_paging_lock);
5559 vm_paging_page_inuse[i] = FALSE;
5560 simple_unlock(&vm_paging_lock);
5561 }
5562 }
5563
5564 /*
5565 * Encryption data.
5566 * "iv" is the "initial vector". Ideally, we want to
5567 * have a different one for each page we encrypt, so that
5568 * crackers can't find encryption patterns too easily.
5569 */
5570 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5571 boolean_t swap_crypt_ctx_initialized = FALSE;
5572 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5573 aes_ctx swap_crypt_ctx;
5574 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5575
5576 #if DEBUG
5577 boolean_t swap_crypt_ctx_tested = FALSE;
5578 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5579 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5580 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5581 #endif /* DEBUG */
5582
5583 extern u_long random(void);
5584
5585 /*
5586 * Initialize the encryption context: key and key size.
5587 */
5588 void swap_crypt_ctx_initialize(void); /* forward */
5589 void
5590 swap_crypt_ctx_initialize(void)
5591 {
5592 unsigned int i;
5593
5594 /*
5595 * No need for locking to protect swap_crypt_ctx_initialized
5596 * because the first use of encryption will come from the
5597 * pageout thread (we won't pagein before there's been a pageout)
5598 * and there's only one pageout thread.
5599 */
5600 if (swap_crypt_ctx_initialized == FALSE) {
5601 for (i = 0;
5602 i < (sizeof (swap_crypt_key) /
5603 sizeof (swap_crypt_key[0]));
5604 i++) {
5605 swap_crypt_key[i] = random();
5606 }
5607 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5608 SWAP_CRYPT_AES_KEY_SIZE,
5609 &swap_crypt_ctx.encrypt);
5610 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5611 SWAP_CRYPT_AES_KEY_SIZE,
5612 &swap_crypt_ctx.decrypt);
5613 swap_crypt_ctx_initialized = TRUE;
5614 }
5615
5616 #if DEBUG
5617 /*
5618 * Validate the encryption algorithms.
5619 */
5620 if (swap_crypt_ctx_tested == FALSE) {
5621 /* initialize */
5622 for (i = 0; i < 4096; i++) {
5623 swap_crypt_test_page_ref[i] = (char) i;
5624 }
5625 /* encrypt */
5626 aes_encrypt_cbc(swap_crypt_test_page_ref,
5627 swap_crypt_null_iv,
5628 PAGE_SIZE / AES_BLOCK_SIZE,
5629 swap_crypt_test_page_encrypt,
5630 &swap_crypt_ctx.encrypt);
5631 /* decrypt */
5632 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5633 swap_crypt_null_iv,
5634 PAGE_SIZE / AES_BLOCK_SIZE,
5635 swap_crypt_test_page_decrypt,
5636 &swap_crypt_ctx.decrypt);
5637 /* compare result with original */
5638 for (i = 0; i < 4096; i ++) {
5639 if (swap_crypt_test_page_decrypt[i] !=
5640 swap_crypt_test_page_ref[i]) {
5641 panic("encryption test failed");
5642 }
5643 }
5644
5645 /* encrypt again */
5646 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5647 swap_crypt_null_iv,
5648 PAGE_SIZE / AES_BLOCK_SIZE,
5649 swap_crypt_test_page_decrypt,
5650 &swap_crypt_ctx.encrypt);
5651 /* decrypt in place */
5652 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5653 swap_crypt_null_iv,
5654 PAGE_SIZE / AES_BLOCK_SIZE,
5655 swap_crypt_test_page_decrypt,
5656 &swap_crypt_ctx.decrypt);
5657 for (i = 0; i < 4096; i ++) {
5658 if (swap_crypt_test_page_decrypt[i] !=
5659 swap_crypt_test_page_ref[i]) {
5660 panic("in place encryption test failed");
5661 }
5662 }
5663
5664 swap_crypt_ctx_tested = TRUE;
5665 }
5666 #endif /* DEBUG */
5667 }
5668
5669 /*
5670 * ENCRYPTED SWAP:
5671 * vm_page_encrypt:
5672 * Encrypt the given page, for secure paging.
5673 * The page might already be mapped at kernel virtual
5674 * address "kernel_mapping_offset". Otherwise, we need
5675 * to map it.
5676 *
5677 * Context:
5678 * The page's object is locked, but this lock will be released
5679 * and re-acquired.
5680 * The page is busy and not accessible by users (not entered in any pmap).
5681 */
5682 void
5683 vm_page_encrypt(
5684 vm_page_t page,
5685 vm_map_offset_t kernel_mapping_offset)
5686 {
5687 int clear_refmod = 0;
5688 kern_return_t kr;
5689 boolean_t page_was_referenced;
5690 boolean_t page_was_modified;
5691 vm_map_size_t kernel_mapping_size;
5692 vm_offset_t kernel_vaddr;
5693 union {
5694 unsigned char aes_iv[AES_BLOCK_SIZE];
5695 struct {
5696 memory_object_t pager_object;
5697 vm_object_offset_t paging_offset;
5698 } vm;
5699 } encrypt_iv;
5700
5701 if (! vm_pages_encrypted) {
5702 vm_pages_encrypted = TRUE;
5703 }
5704
5705 assert(page->busy);
5706 assert(page->dirty || page->precious);
5707
5708 if (page->encrypted) {
5709 /*
5710 * Already encrypted: no need to do it again.
5711 */
5712 vm_page_encrypt_already_encrypted_counter++;
5713 return;
5714 }
5715 ASSERT_PAGE_DECRYPTED(page);
5716
5717 /*
5718 * Gather the "reference" and "modified" status of the page.
5719 * We'll restore these values after the encryption, so that
5720 * the encryption is transparent to the rest of the system
5721 * and doesn't impact the VM's LRU logic.
5722 */
5723 page_was_referenced =
5724 (page->reference || pmap_is_referenced(page->phys_page));
5725 page_was_modified =
5726 (page->dirty || pmap_is_modified(page->phys_page));
5727
5728 if (kernel_mapping_offset == 0) {
5729 /*
5730 * The page hasn't already been mapped in kernel space
5731 * by the caller. Map it now, so that we can access
5732 * its contents and encrypt them.
5733 */
5734 kernel_mapping_size = PAGE_SIZE;
5735 kr = vm_paging_map_object(&kernel_mapping_offset,
5736 page,
5737 page->object,
5738 page->offset,
5739 &kernel_mapping_size);
5740 if (kr != KERN_SUCCESS) {
5741 panic("vm_page_encrypt: "
5742 "could not map page in kernel: 0x%x\n",
5743 kr);
5744 }
5745 } else {
5746 kernel_mapping_size = 0;
5747 }
5748 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5749
5750 if (swap_crypt_ctx_initialized == FALSE) {
5751 swap_crypt_ctx_initialize();
5752 }
5753 assert(swap_crypt_ctx_initialized);
5754
5755 /*
5756 * Prepare an "initial vector" for the encryption.
5757 * We use the "pager" and the "paging_offset" for that
5758 * page to obfuscate the encrypted data a bit more and
5759 * prevent crackers from finding patterns that they could
5760 * use to break the key.
5761 */
5762 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5763 encrypt_iv.vm.pager_object = page->object->pager;
5764 encrypt_iv.vm.paging_offset =
5765 page->object->paging_offset + page->offset;
5766
5767 vm_object_unlock(page->object);
5768
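/*
 * The pager/offset block assembled above is itself run through
 * AES-CBC (with the fixed null IV) so that the per-page IV used for
 * the actual page encryption is not predictable plaintext.
 */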
5769 /* encrypt the "initial vector" */
5770 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5771 swap_crypt_null_iv,
5772 1,
5773 &encrypt_iv.aes_iv[0],
5774 &swap_crypt_ctx.encrypt);
5775
5776 /*
5777 * Encrypt the page.
5778 */
5779 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5780 &encrypt_iv.aes_iv[0],
5781 PAGE_SIZE / AES_BLOCK_SIZE,
5782 (unsigned char *) kernel_vaddr,
5783 &swap_crypt_ctx.encrypt);
5784
5785 vm_page_encrypt_counter++;
5786
5787 vm_object_lock(page->object);
5788
5789 /*
5790 * Unmap the page from the kernel's address space,
5791 * if we had to map it ourselves. Otherwise, let
5792 * the caller undo the mapping if needed.
5793 */
5794 if (kernel_mapping_size != 0) {
5795 vm_paging_unmap_object(page->object,
5796 kernel_mapping_offset,
5797 kernel_mapping_offset + kernel_mapping_size);
5798 }
5799
5800 /*
5801 * Restore the "reference" and "modified" bits.
5802 * This should clean up any impact the encryption had
5803 * on them.
5804 */
5805 if (! page_was_referenced) {
5806 clear_refmod |= VM_MEM_REFERENCED;
5807 page->reference = FALSE;
5808 }
5809 if (! page_was_modified) {
5810 clear_refmod |= VM_MEM_MODIFIED;
5811 page->dirty = FALSE;
5812 }
5813 if (clear_refmod)
5814 pmap_clear_refmod(page->phys_page, clear_refmod);
5815
5816 page->encrypted = TRUE;
5817 }
5818
5819 /*
5820 * ENCRYPTED SWAP:
5821 * vm_page_decrypt:
5822 * Decrypt the given page.
5823 * The page might already be mapped at kernel virtual
5824 * address "kernel_mapping_offset". Otherwise, we need
5825 * to map it.
5826 *
5827 * Context:
5828 * The page's VM object is locked but will be unlocked and relocked.
5829 * The page is busy and not accessible by users (not entered in any pmap).
5830 */
5831 void
5832 vm_page_decrypt(
5833 vm_page_t page,
5834 vm_map_offset_t kernel_mapping_offset)
5835 {
5836 int clear_refmod = 0;
5837 kern_return_t kr;
5838 vm_map_size_t kernel_mapping_size;
5839 vm_offset_t kernel_vaddr;
5840 boolean_t page_was_referenced;
5841 union {
5842 unsigned char aes_iv[AES_BLOCK_SIZE];
5843 struct {
5844 memory_object_t pager_object;
5845 vm_object_offset_t paging_offset;
5846 } vm;
5847 } decrypt_iv;
5848
5849 assert(page->busy);
5850 assert(page->encrypted);
5851
5852 /*
5853 * Gather the "reference" status of the page.
5854 * We'll restore its value after the decryption, so that
5855 * the decryption is transparent to the rest of the system
5856 * and doesn't impact the VM's LRU logic.
5857 */
5858 page_was_referenced =
5859 (page->reference || pmap_is_referenced(page->phys_page));
5860
5861 if (kernel_mapping_offset == 0) {
5862 /*
5863 * The page hasn't already been mapped in kernel space
5864 * by the caller. Map it now, so that we can access
5865 * its contents and decrypt them.
5866 */
5867 kernel_mapping_size = PAGE_SIZE;
5868 kr = vm_paging_map_object(&kernel_mapping_offset,
5869 page,
5870 page->object,
5871 page->offset,
5872 &kernel_mapping_size);
5873 if (kr != KERN_SUCCESS) {
5874 panic("vm_page_decrypt: "
5875 "could not map page in kernel: 0x%x\n");
5876 }
5877 } else {
5878 kernel_mapping_size = 0;
5879 }
5880 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5881
5882 assert(swap_crypt_ctx_initialized);
5883
5884 /*
5885 * Prepare an "initial vector" for the decryption.
5886 * It has to be the same as the "initial vector" we
5887 * used to encrypt that page.
5888 */
5889 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5890 decrypt_iv.vm.pager_object = page->object->pager;
5891 decrypt_iv.vm.paging_offset =
5892 page->object->paging_offset + page->offset;
5893
5894 vm_object_unlock(page->object);
5895
5896 /* encrypt the "initial vector" */
5897 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5898 swap_crypt_null_iv,
5899 1,
5900 &decrypt_iv.aes_iv[0],
5901 &swap_crypt_ctx.encrypt);
5902
5903 /*
5904 * Decrypt the page.
5905 */
5906 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5907 &decrypt_iv.aes_iv[0],
5908 PAGE_SIZE / AES_BLOCK_SIZE,
5909 (unsigned char *) kernel_vaddr,
5910 &swap_crypt_ctx.decrypt);
5911 vm_page_decrypt_counter++;
5912
5913 vm_object_lock(page->object);
5914
5915 /*
5916 * Unmap the page from the kernel's address space,
5917 * if we had to map it ourselves. Otherwise, let
5918 * the caller undo the mapping if needed.
5919 */
5920 if (kernel_mapping_size != 0) {
5921 vm_paging_unmap_object(page->object,
5922 kernel_vaddr,
5923 kernel_vaddr + PAGE_SIZE);
5924 }
5925
5926 /*
5927 * After decryption, the page is actually clean.
5928 * It was encrypted as part of paging, which "cleans"
5929 * the "dirty" pages.
5930 * No one could access it after it was encrypted
5931 * and the decryption doesn't count.
5932 */
5933 page->dirty = FALSE;
5934 clear_refmod = VM_MEM_MODIFIED;
5935
5936 /* restore the "reference" bit */
5937 if (! page_was_referenced) {
5938 page->reference = FALSE;
5939 clear_refmod |= VM_MEM_REFERENCED;
5940 }
5941 pmap_clear_refmod(page->phys_page, clear_refmod);
5942
5943 page->encrypted = FALSE;
5944
5945 /*
5946 * We've just modified the page's contents via the data cache and part
5947 * of the new contents might still be in the cache and not yet in RAM.
5948 * Since the page is now available and might get gathered in a UPL to
5949 * be part of a DMA transfer from a driver that expects the memory to
5950 * be coherent at this point, we have to flush the data cache.
5951 */
5952 pmap_sync_page_attributes_phys(page->phys_page);
5953 /*
5954 * Since the page is not mapped yet, some code might assume that it
5955 * doesn't need to invalidate the instruction cache when writing to
5956 * that page. That code relies on "no_isync" being set, so that the
5957 * caches get synchronized when the page is first mapped. So we need
5958 * to set "no_isync" here too, despite the fact that we just
5959 * synchronized the caches above...
5960 */
5961 page->no_isync = TRUE;
5962 }
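/*
 * ENCRYPTED SWAP:
 * Illustrative sketch (editorial addition, not compiled): how a fault
 * path might use vm_page_decrypt() when it encounters a page that was
 * encrypted by the pageout path.  The calling convention is assumed
 * from the body above: the caller holds the object lock, has marked
 * the page "busy", and passes 0 as the kernel mapping offset to let
 * vm_page_decrypt() map the page itself.
 */
#if 0	/* example only */
static void
example_decrypt_if_needed(vm_page_t m)
{
	assert(m->busy);
	if (m->encrypted) {
		/* maps, decrypts in place, then unmaps the page */
		vm_page_decrypt(m, 0);
	}
	assert(!m->encrypted);
}
#endif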
5963
5964 unsigned long upl_encrypt_upls = 0;
5965 unsigned long upl_encrypt_pages = 0;
5966
5967 /*
5968 * ENCRYPTED SWAP:
5969 *
5970 * upl_encrypt:
5971 * Encrypts all the pages in the UPL, within the specified range.
5972 *
5973 */
5974 void
5975 upl_encrypt(
5976 upl_t upl,
5977 upl_offset_t crypt_offset,
5978 upl_size_t crypt_size)
5979 {
5980 upl_size_t upl_size;
5981 upl_offset_t upl_offset;
5982 vm_object_t upl_object;
5983 vm_page_t page;
5984 vm_object_t shadow_object;
5985 vm_object_offset_t shadow_offset;
5986 vm_object_offset_t paging_offset;
5987 vm_object_offset_t base_offset;
5988
5989 upl_encrypt_upls++;
5990 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5991
5992 upl_lock(upl);
5993
5994 upl_object = upl->map_object;
5995 upl_offset = upl->offset;
5996 upl_size = upl->size;
5997
5998 upl_unlock(upl);
5999
6000 vm_object_lock(upl_object);
6001
6002 /*
6003 * Find the VM object that contains the actual pages.
6004 */
6005 if (upl_object->pageout) {
6006 shadow_object = upl_object->shadow;
6007 /*
6008 * The offset in the shadow object is actually also
6009 * accounted for in upl->offset. It possibly shouldn't be
6010 * this way, but for now don't account for it twice.
6011 */
6012 shadow_offset = 0;
6013 assert(upl_object->paging_offset == 0); /* XXX ? */
6014 vm_object_lock(shadow_object);
6015 } else {
6016 shadow_object = upl_object;
6017 shadow_offset = 0;
6018 }
6019
6020 paging_offset = shadow_object->paging_offset;
6021 vm_object_paging_begin(shadow_object);
6022
6023 if (shadow_object != upl_object) {
6024 vm_object_unlock(shadow_object);
6025 }
6026 vm_object_unlock(upl_object);
6027
6028 base_offset = shadow_offset;
6029 base_offset += upl_offset;
6030 base_offset += crypt_offset;
6031 base_offset -= paging_offset;
6032 /*
6033 * Unmap the pages, so that nobody can continue accessing them while
6034 * they're encrypted. After that point, all accesses to these pages
6035 * will cause a page fault and block while the page is being encrypted
6036 * (busy). After the encryption completes, any access will cause a
6037 * page fault and the page gets decrypted at that time.
6038 */
6039 assert(crypt_offset + crypt_size <= upl_size);
6040 vm_object_pmap_protect(shadow_object,
6041 base_offset,
6042 (vm_object_size_t)crypt_size,
6043 PMAP_NULL,
6044 0,
6045 VM_PROT_NONE);
6046
6047 /* XXX FBDP could the object have changed significantly here ? */
6048 vm_object_lock(shadow_object);
6049
6050 for (upl_offset = 0;
6051 upl_offset < crypt_size;
6052 upl_offset += PAGE_SIZE) {
6053 page = vm_page_lookup(shadow_object,
6054 base_offset + upl_offset);
6055 if (page == VM_PAGE_NULL) {
6056 panic("upl_encrypt: "
6057 "no page for (obj=%p,off=%lld+%d)!\n",
6058 shadow_object,
6059 base_offset,
6060 upl_offset);
6061 }
6062 vm_page_encrypt(page, 0);
6063 }
6064
6065 vm_object_paging_end(shadow_object);
6066 vm_object_unlock(shadow_object);
6067 }
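/*
 * Illustrative sketch (editorial addition, not compiled): a pageout
 * path that wants its outgoing pages encrypted would call
 * upl_encrypt() on the UPL before handing it to the pager.  The UPL
 * below is assumed to have been created and populated by the caller;
 * encrypting the whole UPL is just one possible choice of range.
 */
#if 0	/* example only */
static void
example_encrypt_outgoing_upl(upl_t upl)
{
	/* encrypt every page covered by the UPL before it goes to disk */
	upl_encrypt(upl, (upl_offset_t) 0, upl->size);
}
#endif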
6068
6069 vm_size_t
6070 upl_get_internal_pagelist_offset(void)
6071 {
6072 return sizeof(struct upl);
6073 }
6074
6075 void
6076 upl_clear_dirty(
6077 upl_t upl,
6078 boolean_t value)
6079 {
6080 if (value) {
6081 upl->flags |= UPL_CLEAR_DIRTY;
6082 } else {
6083 upl->flags &= ~UPL_CLEAR_DIRTY;
6084 }
6085 }
6086
6087
6088 #ifdef MACH_BSD
6089
6090 boolean_t upl_page_present(upl_page_info_t *upl, int index)
6091 {
6092 return(UPL_PAGE_PRESENT(upl, index));
6093 }
6094 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6095 {
6096 return(UPL_DIRTY_PAGE(upl, index));
6097 }
6098 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6099 {
6100 return(UPL_VALID_PAGE(upl, index));
6101 }
6102 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6103 {
6104 return(UPL_PHYS_PAGE(upl, index));
6105 }
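/*
 * Illustrative sketch (editorial addition, not compiled): the
 * accessors above are typically used to walk the upl_page_info_t
 * array that describes a UPL and act on each valid page.  "pl",
 * "page_count" and example_process_page() are hypothetical names
 * supplied here for illustration.
 */
#if 0	/* example only */
static void
example_walk_upl_pages(upl_page_info_t *pl, int page_count)
{
	int i;

	for (i = 0; i < page_count; i++) {
		if (!upl_valid_page(pl, i))
			continue;	/* nothing resident at this index */
		if (upl_dirty_page(pl, i))
			example_process_page(upl_phys_page(pl, i));	/* hypothetical helper */
	}
}
#endif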
6106
6107 void
6108 vm_countdirtypages(void)
6109 {
6110 vm_page_t m;
6111 int dpages;
6112 int pgopages;
6113 int precpages;
6114
6115
6116 dpages = 0;
6117 pgopages = 0;
6118 precpages = 0;
6119
6120 vm_page_lock_queues();
6121 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6122 do {
6123 if (m == (vm_page_t) 0) break;
6124
6125 if (m->dirty) dpages++;
6126 if (m->pageout) pgopages++;
6127 if (m->precious) precpages++;
6128
6129 assert(m->object != kernel_object);
6130 m = (vm_page_t) queue_next(&m->pageq);
6131 if (m == (vm_page_t) 0) break;
6132
6133 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6134 vm_page_unlock_queues();
6135
6136 vm_page_lock_queues();
6137 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6138 do {
6139 if (m == (vm_page_t) 0) break;
6140
6141 if (m->dirty) dpages++;
6142 if (m->pageout) pgopages++;
6143 if (m->precious) precpages++;
6144
6145 assert(m->object != kernel_object);
6146 m = (vm_page_t) queue_next(&m->pageq);
6147 if (m == (vm_page_t) 0) break;
6148
6149 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6150 vm_page_unlock_queues();
6151
6152 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6153
6154 dpages = 0;
6155 pgopages = 0;
6156 precpages = 0;
6157
6158 vm_page_lock_queues();
6159 m = (vm_page_t) queue_first(&vm_page_queue_active);
6160
6161 do {
6162 if (m == (vm_page_t) 0) break;
6163 if (m->dirty) dpages++;
6164 if (m->pageout) pgopages++;
6165 if (m->precious) precpages++;
6166
6167 assert(m->object != kernel_object);
6168 m = (vm_page_t) queue_next(&m->pageq);
6169 if (m == (vm_page_t) 0) break;
6170
6171 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6172 vm_page_unlock_queues();
6173
6174 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6175
6176 }
6177 #endif /* MACH_BSD */
6178
6179 ppnum_t upl_get_highest_page(
6180 upl_t upl)
6181 {
6182 return upl->highest_page;
6183 }
6184
6185 #ifdef UPL_DEBUG
6186 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6187 {
6188 upl->ubc_alias1 = alias1;
6189 upl->ubc_alias2 = alias2;
6190 return KERN_SUCCESS;
6191 }
6192 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6193 {
6194 if(al)
6195 *al = upl->ubc_alias1;
6196 if(al2)
6197 *al2 = upl->ubc_alias2;
6198 return KERN_SUCCESS;
6199 }
6200 #endif /* UPL_DEBUG */
6201
6202
6203
6204 #if MACH_KDB
6205 #include <ddb/db_output.h>
6206 #include <ddb/db_print.h>
6207 #include <vm/vm_print.h>
6208
6209 #define printf kdbprintf
6210 void db_pageout(void);
6211
6212 void
6213 db_vm(void)
6214 {
6215
6216 iprintf("VM Statistics:\n");
6217 db_indent += 2;
6218 iprintf("pages:\n");
6219 db_indent += 2;
6220 iprintf("activ %5d inact %5d free %5d",
6221 vm_page_active_count, vm_page_inactive_count,
6222 vm_page_free_count);
6223 printf(" wire %5d gobbl %5d\n",
6224 vm_page_wire_count, vm_page_gobble_count);
6225 db_indent -= 2;
6226 iprintf("target:\n");
6227 db_indent += 2;
6228 iprintf("min %5d inact %5d free %5d",
6229 vm_page_free_min, vm_page_inactive_target,
6230 vm_page_free_target);
6231 printf(" resrv %5d\n", vm_page_free_reserved);
6232 db_indent -= 2;
6233 iprintf("pause:\n");
6234 db_pageout();
6235 db_indent -= 2;
6236 }
6237
6238 #if MACH_COUNTERS
6239 extern int c_laundry_pages_freed;
6240 #endif /* MACH_COUNTERS */
6241
6242 void
6243 db_pageout(void)
6244 {
6245 iprintf("Pageout Statistics:\n");
6246 db_indent += 2;
6247 iprintf("active %5d inactv %5d\n",
6248 vm_pageout_active, vm_pageout_inactive);
6249 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6250 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6251 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6252 iprintf("used %5d clean %5d dirty %5d\n",
6253 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6254 vm_pageout_inactive_dirty);
6255 #if MACH_COUNTERS
6256 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6257 #endif /* MACH_COUNTERS */
6258 #if MACH_CLUSTER_STATS
6259 iprintf("Cluster Statistics:\n");
6260 db_indent += 2;
6261 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6262 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6263 vm_pageout_cluster_collisions);
6264 iprintf("clusters %5d conversions %5d\n",
6265 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6266 db_indent -= 2;
6267 iprintf("Target Statistics:\n");
6268 db_indent += 2;
6269 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6270 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6271 vm_pageout_target_page_freed);
6272 db_indent -= 2;
6273 #endif /* MACH_CLUSTER_STATS */
6274 db_indent -= 2;
6275 }
6276
6277 #endif /* MACH_KDB */