1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /*
31 * @OSF_COPYRIGHT@
32 */
33 /*
34 * Mach Operating System
35 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
36 * All Rights Reserved.
37 *
38 * Permission to use, copy, modify and distribute this software and its
39 * documentation is hereby granted, provided that both the copyright
40 * notice and this permission notice appear in all copies of the
41 * software, derivative works or modified versions, and any portions
42 * thereof, and that both notices appear in supporting documentation.
43 *
44 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
46 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47 *
48 * Carnegie Mellon requests users of this software to return to
49 *
50 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
51 * School of Computer Science
52 * Carnegie Mellon University
53 * Pittsburgh PA 15213-3890
54 *
55 * any improvements or extensions that they make and grant Carnegie Mellon
56 * the rights to redistribute these changes.
57 */
58 /*
59 */
60 /*
61 * File: vm/vm_pageout.c
62 * Author: Avadis Tevanian, Jr., Michael Wayne Young
63 * Date: 1985
64 *
65 * The proverbial page-out daemon.
66 */
67
68 #include <stdint.h>
69
70 #include <debug.h>
71 #include <mach_pagemap.h>
72 #include <mach_cluster_stats.h>
73 #include <mach_kdb.h>
74 #include <advisory_pageout.h>
75
76 #include <mach/mach_types.h>
77 #include <mach/memory_object.h>
78 #include <mach/memory_object_default.h>
79 #include <mach/memory_object_control_server.h>
80 #include <mach/mach_host_server.h>
81 #include <mach/upl.h>
82 #include <mach/vm_map.h>
83 #include <mach/vm_param.h>
84 #include <mach/vm_statistics.h>
85
86 #include <kern/kern_types.h>
87 #include <kern/counters.h>
88 #include <kern/host_statistics.h>
89 #include <kern/machine.h>
90 #include <kern/misc_protos.h>
91 #include <kern/thread.h>
92 #include <kern/xpr.h>
93 #include <kern/kalloc.h>
94
95 #include <machine/vm_tuning.h>
96
97 #include <vm/pmap.h>
98 #include <vm/vm_fault.h>
99 #include <vm/vm_map.h>
100 #include <vm/vm_object.h>
101 #include <vm/vm_page.h>
102 #include <vm/vm_pageout.h>
103 #include <vm/vm_protos.h> /* must be last */
104
105 /*
106 * ENCRYPTED SWAP:
107 */
108 #ifdef __ppc__
109 #include <ppc/mappings.h>
110 #endif /* __ppc__ */
111 #include <../bsd/crypto/aes/aes.h>
112
113 extern ipc_port_t memory_manager_default;
114
115
116 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
117 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
118 #endif
119
120 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
121 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
122 #endif
123
124 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
125 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
126 #endif
127
128 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
129 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
130 #endif
131
132 #ifndef VM_PAGE_LAUNDRY_MAX
133 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
134 #endif  /* VM_PAGE_LAUNDRY_MAX */
135
136 #ifndef VM_PAGEOUT_BURST_WAIT
137 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
138 #endif /* VM_PAGEOUT_BURST_WAIT */
139
140 #ifndef VM_PAGEOUT_EMPTY_WAIT
141 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
142 #endif /* VM_PAGEOUT_EMPTY_WAIT */
143
144 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
145 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
146 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
147
148 #ifndef VM_PAGEOUT_IDLE_WAIT
149 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
150 #endif /* VM_PAGEOUT_IDLE_WAIT */
151
152
153 /*
154 * To obtain a reasonable LRU approximation, the inactive queue
155 * needs to be large enough to give pages on it a chance to be
156 * referenced a second time. This macro defines the fraction
157 * of active+inactive pages that should be inactive.
158 * The pageout daemon uses it to update vm_page_inactive_target.
159 *
160 * If vm_page_free_count falls below vm_page_free_target and
161 * vm_page_inactive_count is below vm_page_inactive_target,
162 * then the pageout daemon starts running.
163 */
164
165 #ifndef VM_PAGE_INACTIVE_TARGET
166 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
167 #endif /* VM_PAGE_INACTIVE_TARGET */
168
169 /*
170 * Once the pageout daemon starts running, it keeps going
171 * until vm_page_free_count meets or exceeds vm_page_free_target.
172 */
173
174 #ifndef VM_PAGE_FREE_TARGET
175 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
176 #endif /* VM_PAGE_FREE_TARGET */
177
178 /*
179 * The pageout daemon always starts running once vm_page_free_count
180 * falls below vm_page_free_min.
181 */
182
183 #ifndef VM_PAGE_FREE_MIN
184 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
185 #endif /* VM_PAGE_FREE_MIN */
186
187 /*
188 * When vm_page_free_count falls below vm_page_free_reserved,
189 * only vm-privileged threads can allocate pages. vm-privilege
190 * allows the pageout daemon and default pager (and any other
191 * associated threads needed for default pageout) to continue
192 * operation by dipping into the reserved pool of pages.
193 */
194
195 #ifndef VM_PAGE_FREE_RESERVED
196 #define VM_PAGE_FREE_RESERVED(n) \
197 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
198 #endif /* VM_PAGE_FREE_RESERVED */
199
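/*
 * Illustrative sketch (not compiled in): how the free-page thresholds above
 * relate to one another, mirroring the arithmetic that vm_page_free_reserve()
 * performs later in this file. The page count below is purely hypothetical,
 * and "example_compute_free_thresholds" is not a real routine.
 */
#if 0
static void
example_compute_free_thresholds(void)
{
	unsigned int free_pages = 100000;	/* hypothetical size of the free pool */
	unsigned int reserved, free_after_reserve;
	unsigned int free_min, free_target;

	/* pages held back for vm-privileged threads (extra request n == 0 here) */
	reserved = VM_PAGE_FREE_RESERVED(0);		/* 6 * 16 + 0 = 96 */
	free_after_reserve = free_pages - reserved;

	/* the pageout daemon always starts once we drop below free_min... */
	free_min = reserved + VM_PAGE_FREE_MIN(free_after_reserve);

	/* ...and keeps running until the free count reaches free_target */
	free_target = reserved + VM_PAGE_FREE_TARGET(free_after_reserve);

	assert(free_target > free_min);
}
#endif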
200
201 /*
202 * must hold the page queues lock to
203 * manipulate this structure
204 */
205 struct vm_pageout_queue {
206 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
207 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
208 unsigned int pgo_maxlaundry;
209
210 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
211 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
212 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
213 :0;
214 };
215
216 #define VM_PAGE_Q_THROTTLED(q) \
217 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
218
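/*
 * Hedged usage sketch (not compiled in): how the VM_PAGE_Q_THROTTLED() check
 * interacts with the laundry accounting. vm_pageout_cluster() increments
 * pgo_laundry when it queues a page and vm_pageout_throttle_up() decrements
 * it when the page comes back; vm_pageout_scan() consults this macro to skip
 * dirty pages bound for a throttled queue. The queues referenced here are
 * declared later in this file; "example_queue_throttle_check" is hypothetical.
 */
#if 0
static void
example_queue_throttle_check(vm_page_t m)
{
	struct vm_pageout_queue	*q;

	/* pick the queue the page would be laundered through */
	if (m->object->internal == TRUE)
		q = &vm_pageout_queue_internal;
	else
		q = &vm_pageout_queue_external;

	if (VM_PAGE_Q_THROTTLED(q)) {
		/*
		 * Too many pageouts already in flight to this pager;
		 * the real scan code requeues the page rather than
		 * calling vm_pageout_cluster() right now.
		 */
		return;
	}
	vm_pageout_cluster(m);		/* bumps q->pgo_laundry */
}
#endif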
219
220 /*
221  * Exported variable used to broadcast the activation of the pageout scan.
222 * Working Set uses this to throttle its use of pmap removes. In this
223 * way, code which runs within memory in an uncontested context does
224 * not keep encountering soft faults.
225 */
226
227 unsigned int vm_pageout_scan_event_counter = 0;
228
229 /*
230 * Forward declarations for internal routines.
231 */
232
233 static void vm_pageout_garbage_collect(int);
234 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
235 static void vm_pageout_iothread_external(void);
236 static void vm_pageout_iothread_internal(void);
237 static void vm_pageout_queue_steal(vm_page_t);
238
239 extern void vm_pageout_continue(void);
240 extern void vm_pageout_scan(void);
241
242 unsigned int vm_pageout_reserved_internal = 0;
243 unsigned int vm_pageout_reserved_really = 0;
244
245 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
246 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
247 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
248 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
249 unsigned int vm_pageout_deadlock_relief = 0;
250 unsigned int vm_pageout_inactive_relief = 0;
251 unsigned int vm_pageout_burst_active_throttle = 0;
252 unsigned int vm_pageout_burst_inactive_throttle = 0;
253
254 /*
255 * Protection against zero fill flushing live working sets derived
256 * from existing backing store and files
257 */
258 unsigned int vm_accellerate_zf_pageout_trigger = 400;
259 unsigned int vm_zf_iterator;
260 unsigned int vm_zf_iterator_count = 40;
261 unsigned int last_page_zf;
262 unsigned int vm_zf_count = 0;
263
264 /*
265 * These variables record the pageout daemon's actions:
266 * how many pages it looks at and what happens to those pages.
267 * No locking needed because only one thread modifies the variables.
268 */
269
270 unsigned int vm_pageout_active = 0; /* debugging */
271 unsigned int vm_pageout_inactive = 0; /* debugging */
272 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
273 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
274 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
275 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
276 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
277 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
278 unsigned int vm_pageout_inactive_used = 0; /* debugging */
279 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
280 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
281 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
282 unsigned int vm_pageout_purged_objects = 0; /* debugging */
283 unsigned int vm_stat_discard = 0; /* debugging */
284 unsigned int vm_stat_discard_sent = 0; /* debugging */
285 unsigned int vm_stat_discard_failure = 0; /* debugging */
286 unsigned int vm_stat_discard_throttle = 0; /* debugging */
287
288 unsigned int vm_pageout_scan_active_throttled = 0;
289 unsigned int vm_pageout_scan_inactive_throttled = 0;
290 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
291 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
292 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
293 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
294 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
295 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
296 /*
297  * Backing store throttle, applied when backing store (BS) is exhausted
298 */
299 unsigned int vm_backing_store_low = 0;
300
301 unsigned int vm_pageout_out_of_line = 0;
302 unsigned int vm_pageout_in_place = 0;
303
304 /*
305 * ENCRYPTED SWAP:
306 * counters and statistics...
307 */
308 unsigned long vm_page_decrypt_counter = 0;
309 unsigned long vm_page_decrypt_for_upl_counter = 0;
310 unsigned long vm_page_encrypt_counter = 0;
311 unsigned long vm_page_encrypt_abort_counter = 0;
312 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
313 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
314
315
316 struct vm_pageout_queue vm_pageout_queue_internal;
317 struct vm_pageout_queue vm_pageout_queue_external;
318
319
320 /*
321 * Routine: vm_backing_store_disable
322 * Purpose:
323 * Suspend non-privileged threads wishing to extend
324 * backing store when we are low on backing store
325 * (Synchronized by caller)
326 */
327 void
328 vm_backing_store_disable(
329 boolean_t disable)
330 {
331 if(disable) {
332 vm_backing_store_low = 1;
333 } else {
334 if(vm_backing_store_low) {
335 vm_backing_store_low = 0;
336 thread_wakeup((event_t) &vm_backing_store_low);
337 }
338 }
339 }
340
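/*
 * Hedged sketch (not compiled in): how a non-privileged thread wishing to
 * extend backing store might wait for the condition that
 * vm_backing_store_disable(FALSE) signals above. The actual wait sites live
 * elsewhere in the VM code and hold their own locks before re-checking;
 * "example_wait_for_backing_store" is a hypothetical name.
 */
#if 0
static void
example_wait_for_backing_store(void)
{
	if (vm_backing_store_low) {
		/* sleep until thread_wakeup() is issued on this event */
		assert_wait((event_t) &vm_backing_store_low, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
#endif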
341
342 /*
343 * Routine: vm_pageout_object_allocate
344 * Purpose:
345 * Allocate an object for use as out-of-line memory in a
346 * data_return/data_initialize message.
347 * The page must be in an unlocked object.
348 *
349 * If the page belongs to a trusted pager, cleaning in place
350 * will be used, which utilizes a special "pageout object"
351 * containing private alias pages for the real page frames.
352 * Untrusted pagers use normal out-of-line memory.
353 */
354 vm_object_t
355 vm_pageout_object_allocate(
356 vm_page_t m,
357 vm_size_t size,
358 vm_object_offset_t offset)
359 {
360 vm_object_t object = m->object;
361 vm_object_t new_object;
362
363 assert(object->pager_ready);
364
365 new_object = vm_object_allocate(size);
366
367 if (object->pager_trusted) {
368 assert (offset < object->size);
369
370 vm_object_lock(new_object);
371 new_object->pageout = TRUE;
372 new_object->shadow = object;
373 new_object->can_persist = FALSE;
374 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
375 new_object->shadow_offset = offset;
376 vm_object_unlock(new_object);
377
378 /*
379 * Take a paging reference on the object. This will be dropped
380 * in vm_pageout_object_terminate()
381 */
382 vm_object_lock(object);
383 vm_object_paging_begin(object);
384 vm_page_lock_queues();
385 vm_page_unlock_queues();
386 vm_object_unlock(object);
387
388 vm_pageout_in_place++;
389 } else
390 vm_pageout_out_of_line++;
391 return(new_object);
392 }
393
394 #if MACH_CLUSTER_STATS
395 unsigned long vm_pageout_cluster_dirtied = 0;
396 unsigned long vm_pageout_cluster_cleaned = 0;
397 unsigned long vm_pageout_cluster_collisions = 0;
398 unsigned long vm_pageout_cluster_clusters = 0;
399 unsigned long vm_pageout_cluster_conversions = 0;
400 unsigned long vm_pageout_target_collisions = 0;
401 unsigned long vm_pageout_target_page_dirtied = 0;
402 unsigned long vm_pageout_target_page_freed = 0;
403 #define CLUSTER_STAT(clause) clause
404 #else /* MACH_CLUSTER_STATS */
405 #define CLUSTER_STAT(clause)
406 #endif /* MACH_CLUSTER_STATS */
407
408 /*
409 * Routine: vm_pageout_object_terminate
410 * Purpose:
411 * Destroy the pageout_object allocated by
412 * vm_pageout_object_allocate(), and perform all of the
413 * required cleanup actions.
414 *
415 * In/Out conditions:
416 * The object must be locked, and will be returned locked.
417 */
418 void
419 vm_pageout_object_terminate(
420 vm_object_t object)
421 {
422 vm_object_t shadow_object;
423 boolean_t shadow_internal;
424
425 /*
426 * Deal with the deallocation (last reference) of a pageout object
427 * (used for cleaning-in-place) by dropping the paging references/
428 * freeing pages in the original object.
429 */
430
431 assert(object->pageout);
432 shadow_object = object->shadow;
433 vm_object_lock(shadow_object);
434 shadow_internal = shadow_object->internal;
435
436 while (!queue_empty(&object->memq)) {
437 vm_page_t p, m;
438 vm_object_offset_t offset;
439
440 p = (vm_page_t) queue_first(&object->memq);
441
442 assert(p->private);
443 assert(p->pageout);
444 p->pageout = FALSE;
445 assert(!p->cleaning);
446
447 offset = p->offset;
448 VM_PAGE_FREE(p);
449 p = VM_PAGE_NULL;
450
451 m = vm_page_lookup(shadow_object,
452 offset + object->shadow_offset);
453
454 if(m == VM_PAGE_NULL)
455 continue;
456 assert(m->cleaning);
457 /* used as a trigger on upl_commit etc to recognize the */
458                 /* pageout daemon's subsequent desire to pageout a cleaning */
459 /* page. When the bit is on the upl commit code will */
460 /* respect the pageout bit in the target page over the */
461 /* caller's page list indication */
462 m->dump_cleaning = FALSE;
463
464 /*
465 * Account for the paging reference taken when
466 * m->cleaning was set on this page.
467 */
468 vm_object_paging_end(shadow_object);
469 assert((m->dirty) || (m->precious) ||
470 (m->busy && m->cleaning));
471
472 /*
473 * Handle the trusted pager throttle.
474 * Also decrement the burst throttle (if external).
475 */
476 vm_page_lock_queues();
477 if (m->laundry) {
478 vm_pageout_throttle_up(m);
479 }
480
481 /*
482 * Handle the "target" page(s). These pages are to be freed if
483 * successfully cleaned. Target pages are always busy, and are
484                  * wired exactly once. The initial target pages are not mapped
485                  * (so they cannot be referenced or modified), but converted target
486 * pages may have been modified between the selection as an
487 * adjacent page and conversion to a target.
488 */
489 if (m->pageout) {
490 assert(m->busy);
491 assert(m->wire_count == 1);
492 m->cleaning = FALSE;
493 m->pageout = FALSE;
494 #if MACH_CLUSTER_STATS
495 if (m->wanted) vm_pageout_target_collisions++;
496 #endif
497 /*
498 * Revoke all access to the page. Since the object is
499 * locked, and the page is busy, this prevents the page
500 * from being dirtied after the pmap_disconnect() call
501 * returns.
502 *
503                          * Since the page is left "dirty" but "not modified", we
504 * can detect whether the page was redirtied during
505 * pageout by checking the modify state.
506 */
507 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
508 m->dirty = TRUE;
509 else
510 m->dirty = FALSE;
511
512 if (m->dirty) {
513 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
514 vm_page_unwire(m);/* reactivates */
515 VM_STAT(reactivations++);
516 PAGE_WAKEUP_DONE(m);
517 } else {
518 CLUSTER_STAT(vm_pageout_target_page_freed++;)
519 vm_page_free(m);/* clears busy, etc. */
520 }
521 vm_page_unlock_queues();
522 continue;
523 }
524 /*
525 * Handle the "adjacent" pages. These pages were cleaned in
526 * place, and should be left alone.
527                  * If the page was referenced while it was being cleaned,
528                  * make it active again; otherwise deactivate it.
529 */
530 if (!m->active && !m->inactive && !m->private) {
531 if (m->reference)
532 vm_page_activate(m);
533 else
534 vm_page_deactivate(m);
535 }
536 if((m->busy) && (m->cleaning)) {
537
538 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
539 m->busy = FALSE;
540
541 /* We do not re-set m->dirty ! */
542 /* The page was busy so no extraneous activity */
543 /* could have occurred. COPY_INTO is a read into the */
544 /* new pages. CLEAN_IN_PLACE does actually write */
545 /* out the pages but handling outside of this code */
546 /* will take care of resetting dirty. We clear the */
547                         /* modify bit, however, for the Programmed I/O case. */
548 pmap_clear_modify(m->phys_page);
549 if(m->absent) {
550 m->absent = FALSE;
551 if(shadow_object->absent_count == 1)
552 vm_object_absent_release(shadow_object);
553 else
554 shadow_object->absent_count--;
555 }
556 m->overwriting = FALSE;
557 } else if (m->overwriting) {
558 /* alternate request page list, write to page_list */
559 /* case. Occurs when the original page was wired */
560 /* at the time of the list request */
561 assert(m->wire_count != 0);
562 vm_page_unwire(m);/* reactivates */
563 m->overwriting = FALSE;
564 } else {
565 /*
566 * Set the dirty state according to whether or not the page was
567 * modified during the pageout. Note that we purposefully do
568 * NOT call pmap_clear_modify since the page is still mapped.
569                          * If the page were to be dirtied between the 2 calls,
570 * this fact would be lost. This code is only necessary to
571 * maintain statistics, since the pmap module is always
572 * consulted if m->dirty is false.
573 */
574 #if MACH_CLUSTER_STATS
575 m->dirty = pmap_is_modified(m->phys_page);
576
577 if (m->dirty) vm_pageout_cluster_dirtied++;
578 else vm_pageout_cluster_cleaned++;
579 if (m->wanted) vm_pageout_cluster_collisions++;
580 #else
581 m->dirty = 0;
582 #endif
583 }
584 m->cleaning = FALSE;
585
586 /*
587 * Wakeup any thread waiting for the page to be un-cleaning.
588 */
589 PAGE_WAKEUP(m);
590 vm_page_unlock_queues();
591 }
592 /*
593 * Account for the paging reference taken in vm_paging_object_allocate.
594 */
595 vm_object_paging_end(shadow_object);
596 vm_object_unlock(shadow_object);
597
598 assert(object->ref_count == 0);
599 assert(object->paging_in_progress == 0);
600 assert(object->resident_page_count == 0);
601 return;
602 }
603
604 /*
605 * Routine: vm_pageout_setup
606 * Purpose:
607 * Set up a page for pageout (clean & flush).
608 *
609 * Move the page to a new object, as part of which it will be
610 * sent to its memory manager in a memory_object_data_write or
611 * memory_object_initialize message.
612 *
613 * The "new_object" and "new_offset" arguments
614 * indicate where the page should be moved.
615 *
616 * In/Out conditions:
617 * The page in question must not be on any pageout queues,
618 * and must be busy. The object to which it belongs
619 * must be unlocked, and the caller must hold a paging
620 * reference to it. The new_object must not be locked.
621 *
622 * This routine returns a pointer to a place-holder page,
623 * inserted at the same offset, to block out-of-order
624 * requests for the page. The place-holder page must
625 * be freed after the data_write or initialize message
626 * has been sent.
627 *
628 * The original page is put on a paging queue and marked
629 * not busy on exit.
630 */
631 vm_page_t
632 vm_pageout_setup(
633 register vm_page_t m,
634 register vm_object_t new_object,
635 vm_object_offset_t new_offset)
636 {
637 register vm_object_t old_object = m->object;
638 vm_object_offset_t paging_offset;
639 vm_object_offset_t offset;
640 register vm_page_t holding_page;
641 register vm_page_t new_m;
642 boolean_t need_to_wire = FALSE;
643
644
645 XPR(XPR_VM_PAGEOUT,
646 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
647 (integer_t)m->object, (integer_t)m->offset,
648 (integer_t)m, (integer_t)new_object,
649 (integer_t)new_offset);
650 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
651 !m->restart);
652
653 assert(m->dirty || m->precious);
654
655 /*
656 * Create a place-holder page where the old one was, to prevent
657 * attempted pageins of this page while we're unlocked.
658 */
659 VM_PAGE_GRAB_FICTITIOUS(holding_page);
660
661 vm_object_lock(old_object);
662
663 offset = m->offset;
664 paging_offset = offset + old_object->paging_offset;
665
666 if (old_object->pager_trusted) {
667 /*
668 * This pager is trusted, so we can clean this page
669 * in place. Leave it in the old object, and mark it
670 * cleaning & pageout.
671 */
672 new_m = holding_page;
673 holding_page = VM_PAGE_NULL;
674
675 /*
676 * Set up new page to be private shadow of real page.
677 */
678 new_m->phys_page = m->phys_page;
679 new_m->fictitious = FALSE;
680 new_m->pageout = TRUE;
681
682 /*
683 * Mark real page as cleaning (indicating that we hold a
684 * paging reference to be released via m_o_d_r_c) and
685 * pageout (indicating that the page should be freed
686 * when the pageout completes).
687 */
688 pmap_clear_modify(m->phys_page);
689 vm_page_lock_queues();
690 new_m->private = TRUE;
691 vm_page_wire(new_m);
692 m->cleaning = TRUE;
693 m->pageout = TRUE;
694
695 vm_page_wire(m);
696 assert(m->wire_count == 1);
697 vm_page_unlock_queues();
698
699 m->dirty = TRUE;
700 m->precious = FALSE;
701 m->page_lock = VM_PROT_NONE;
702 m->unusual = FALSE;
703 m->unlock_request = VM_PROT_NONE;
704 } else {
705 /*
706 * Cannot clean in place, so rip the old page out of the
707 * object, and stick the holding page in. Set new_m to the
708 * page in the new object.
709 */
710 vm_page_lock_queues();
711 VM_PAGE_QUEUES_REMOVE(m);
712 vm_page_remove(m);
713
714 vm_page_insert(holding_page, old_object, offset);
715 vm_page_unlock_queues();
716
717 m->dirty = TRUE;
718 m->precious = FALSE;
719 new_m = m;
720 new_m->page_lock = VM_PROT_NONE;
721 new_m->unlock_request = VM_PROT_NONE;
722
723 if (old_object->internal)
724 need_to_wire = TRUE;
725 }
726 /*
727 * Record that this page has been written out
728 */
729 #if MACH_PAGEMAP
730 vm_external_state_set(old_object->existence_map, offset);
731 #endif /* MACH_PAGEMAP */
732
733 vm_object_unlock(old_object);
734
735 vm_object_lock(new_object);
736
737 /*
738          * Put the page into the new object. If it is not wired
739          * (i.e., if it is the real page), it will be activated.
740 */
741
742 vm_page_lock_queues();
743 vm_page_insert(new_m, new_object, new_offset);
744 if (need_to_wire)
745 vm_page_wire(new_m);
746 else
747 vm_page_activate(new_m);
748 PAGE_WAKEUP_DONE(new_m);
749 vm_page_unlock_queues();
750
751 vm_object_unlock(new_object);
752
753 /*
754 * Return the placeholder page to simplify cleanup.
755 */
756 return (holding_page);
757 }
758
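/*
 * Hedged caller sketch (not compiled in): the protocol described in the
 * header comment of vm_pageout_setup() above. The returned place-holder
 * blocks out-of-order requests and must be freed once the data_write (or
 * data_initialize) message has been sent; for a trusted pager the routine
 * returns VM_PAGE_NULL instead. "example_use_pageout_setup" is hypothetical.
 */
#if 0
static void
example_use_pageout_setup(vm_page_t m, vm_object_t new_object)
{
	vm_page_t	holding_page;

	/* the caller holds a paging reference on m's (unlocked) object */
	holding_page = vm_pageout_setup(m, new_object, (vm_object_offset_t) 0);

	/* ... name new_object in a memory_object_data_write message ... */

	if (holding_page != VM_PAGE_NULL)
		VM_PAGE_FREE(holding_page);	/* discard the place-holder */
}
#endif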
759 /*
760 * Routine: vm_pageclean_setup
761 *
762 * Purpose: setup a page to be cleaned (made non-dirty), but not
763 * necessarily flushed from the VM page cache.
764 * This is accomplished by cleaning in place.
765 *
766 * The page must not be busy, and the object and page
767 * queues must be locked.
768 *
769 */
770 void
771 vm_pageclean_setup(
772 vm_page_t m,
773 vm_page_t new_m,
774 vm_object_t new_object,
775 vm_object_offset_t new_offset)
776 {
777 vm_object_t old_object = m->object;
778 assert(!m->busy);
779 assert(!m->cleaning);
780
781 XPR(XPR_VM_PAGEOUT,
782 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
783 (integer_t)old_object, m->offset, (integer_t)m,
784 (integer_t)new_m, new_offset);
785
786 pmap_clear_modify(m->phys_page);
787 vm_object_paging_begin(old_object);
788
789 /*
790 * Record that this page has been written out
791 */
792 #if MACH_PAGEMAP
793 vm_external_state_set(old_object->existence_map, m->offset);
794 #endif /*MACH_PAGEMAP*/
795
796 /*
797 * Mark original page as cleaning in place.
798 */
799 m->cleaning = TRUE;
800 m->dirty = TRUE;
801 m->precious = FALSE;
802
803 /*
804 * Convert the fictitious page to a private shadow of
805 * the real page.
806 */
807 assert(new_m->fictitious);
808 new_m->fictitious = FALSE;
809 new_m->private = TRUE;
810 new_m->pageout = TRUE;
811 new_m->phys_page = m->phys_page;
812 vm_page_wire(new_m);
813
814 vm_page_insert(new_m, new_object, new_offset);
815 assert(!new_m->wanted);
816 new_m->busy = FALSE;
817 }
818
819 void
820 vm_pageclean_copy(
821 vm_page_t m,
822 vm_page_t new_m,
823 vm_object_t new_object,
824 vm_object_offset_t new_offset)
825 {
826 XPR(XPR_VM_PAGEOUT,
827 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
828 m, new_m, new_object, new_offset, 0);
829
830 assert((!m->busy) && (!m->cleaning));
831
832 assert(!new_m->private && !new_m->fictitious);
833
834 pmap_clear_modify(m->phys_page);
835
836 m->busy = TRUE;
837 vm_object_paging_begin(m->object);
838 vm_page_unlock_queues();
839 vm_object_unlock(m->object);
840
841 /*
842 * Copy the original page to the new page.
843 */
844 vm_page_copy(m, new_m);
845
846 /*
847 * Mark the old page as clean. A request to pmap_is_modified
848 * will get the right answer.
849 */
850 vm_object_lock(m->object);
851 m->dirty = FALSE;
852
853 vm_object_paging_end(m->object);
854
855 vm_page_lock_queues();
856 if (!m->active && !m->inactive)
857 vm_page_activate(m);
858 PAGE_WAKEUP_DONE(m);
859
860 vm_page_insert(new_m, new_object, new_offset);
861 vm_page_activate(new_m);
862 new_m->busy = FALSE; /* No other thread can be waiting */
863 }
864
865
866 /*
867 * Routine: vm_pageout_initialize_page
868 * Purpose:
869 * Causes the specified page to be initialized in
870 * the appropriate memory object. This routine is used to push
871 * pages into a copy-object when they are modified in the
872 * permanent object.
873 *
874 * The page is moved to a temporary object and paged out.
875 *
876 * In/out conditions:
877 * The page in question must not be on any pageout queues.
878 * The object to which it belongs must be locked.
879 * The page must be busy, but not hold a paging reference.
880 *
881 * Implementation:
882 * Move this page to a completely new object.
883 */
884 void
885 vm_pageout_initialize_page(
886 vm_page_t m)
887 {
888 vm_object_t object;
889 vm_object_offset_t paging_offset;
890 vm_page_t holding_page;
891
892
893 XPR(XPR_VM_PAGEOUT,
894 "vm_pageout_initialize_page, page 0x%X\n",
895 (integer_t)m, 0, 0, 0, 0);
896 assert(m->busy);
897
898 /*
899 * Verify that we really want to clean this page
900 */
901 assert(!m->absent);
902 assert(!m->error);
903 assert(m->dirty);
904
905 /*
906 * Create a paging reference to let us play with the object.
907 */
908 object = m->object;
909 paging_offset = m->offset + object->paging_offset;
910 vm_object_paging_begin(object);
911 if (m->absent || m->error || m->restart ||
912 (!m->dirty && !m->precious)) {
913 VM_PAGE_FREE(m);
914 panic("reservation without pageout?"); /* alan */
915 vm_object_unlock(object);
916 return;
917 }
918
919 /* set the page for future call to vm_fault_list_request */
920 holding_page = NULL;
921 vm_page_lock_queues();
922 pmap_clear_modify(m->phys_page);
923 m->dirty = TRUE;
924 m->busy = TRUE;
925 m->list_req_pending = TRUE;
926 m->cleaning = TRUE;
927 m->pageout = TRUE;
928 vm_page_wire(m);
929 vm_page_unlock_queues();
930 vm_object_unlock(object);
931
932 /*
933 * Write the data to its pager.
934 * Note that the data is passed by naming the new object,
935 * not a virtual address; the pager interface has been
936 * manipulated to use the "internal memory" data type.
937 * [The object reference from its allocation is donated
938 * to the eventual recipient.]
939 */
940 memory_object_data_initialize(object->pager,
941 paging_offset,
942 PAGE_SIZE);
943
944 vm_object_lock(object);
945 }
946
947 #if MACH_CLUSTER_STATS
948 #define MAXCLUSTERPAGES 16
949 struct {
950 unsigned long pages_in_cluster;
951 unsigned long pages_at_higher_offsets;
952 unsigned long pages_at_lower_offsets;
953 } cluster_stats[MAXCLUSTERPAGES];
954 #endif /* MACH_CLUSTER_STATS */
955
956 boolean_t allow_clustered_pageouts = FALSE;
957
958 /*
959 * vm_pageout_cluster:
960 *
961 * Given a page, queue it to the appropriate I/O thread,
962 * which will page it out and attempt to clean adjacent pages
963 * in the same operation.
964 *
965 * The page must be busy, and the object and queues locked. We will take a
966 * paging reference to prevent deallocation or collapse when we
967 * release the object lock back at the call site. The I/O thread
968  * is responsible for consuming this reference.
969 *
970 * The page must not be on any pageout queue.
971 */
972
973 void
974 vm_pageout_cluster(vm_page_t m)
975 {
976 vm_object_t object = m->object;
977 struct vm_pageout_queue *q;
978
979
980 XPR(XPR_VM_PAGEOUT,
981 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
982 (integer_t)object, m->offset, (integer_t)m, 0, 0);
983
984 /*
985 * Only a certain kind of page is appreciated here.
986 */
987 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
988 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
989
990 /*
991 * protect the object from collapse -
992 * locking in the object's paging_offset.
993 */
994 vm_object_paging_begin(object);
995
996 /*
997          * set the page up for a future call to vm_fault_list_request;
998          * the page should already be marked busy
999 */
1000 vm_page_wire(m);
1001 m->list_req_pending = TRUE;
1002 m->cleaning = TRUE;
1003 m->pageout = TRUE;
1004 m->laundry = TRUE;
1005
1006 if (object->internal == TRUE)
1007 q = &vm_pageout_queue_internal;
1008 else
1009 q = &vm_pageout_queue_external;
1010 q->pgo_laundry++;
1011
1012 m->pageout_queue = TRUE;
1013 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1014
1015 if (q->pgo_idle == TRUE) {
1016 q->pgo_idle = FALSE;
1017 thread_wakeup((event_t) &q->pgo_pending);
1018 }
1019 }
1020
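/*
 * Hedged sketch (not compiled in): the caller-side contract spelled out in
 * the header comment of vm_pageout_cluster() above, roughly the state that
 * vm_pageout_scan() establishes before handing a page over.
 * "example_queue_page_for_pageout" is a hypothetical name.
 */
#if 0
static void
example_queue_page_for_pageout(vm_page_t m)
{
	vm_object_t	object = m->object;

	vm_object_lock(object);
	vm_page_lock_queues();

	assert(m->busy);			/* caller marked the page busy */
	assert(m->dirty || m->precious);
	assert(m->wire_count == 0);
	assert(!m->active && !m->inactive);	/* already off the pageout queues */

	vm_pageout_cluster(m);			/* takes a paging reference and
						 * queues the page for the iothread */
	vm_page_unlock_queues();
	vm_object_unlock(object);		/* the iothread consumes the reference */
}
#endif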
1021
1022 unsigned long vm_pageout_throttle_up_count = 0;
1023
1024 /*
1025 * A page is back from laundry. See if there are some pages waiting to
1026 * go to laundry and if we can let some of them go now.
1027 *
1028 * Object and page queues must be locked.
1029 */
1030 void
1031 vm_pageout_throttle_up(
1032 vm_page_t m)
1033 {
1034 struct vm_pageout_queue *q;
1035
1036 vm_pageout_throttle_up_count++;
1037
1038 assert(m->laundry);
1039 assert(m->object != VM_OBJECT_NULL);
1040 assert(m->object != kernel_object);
1041
1042 if (m->object->internal == TRUE)
1043 q = &vm_pageout_queue_internal;
1044 else
1045 q = &vm_pageout_queue_external;
1046
1047 m->laundry = FALSE;
1048 q->pgo_laundry--;
1049
1050 if (q->pgo_throttled == TRUE) {
1051 q->pgo_throttled = FALSE;
1052 thread_wakeup((event_t) &q->pgo_laundry);
1053 }
1054 }
1055
1056
1057 /*
1058 * vm_pageout_scan does the dirty work for the pageout daemon.
1059 * It returns with vm_page_queue_free_lock held and
1060 * vm_page_free_wanted == 0.
1061 */
1062
1063 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1064
1065 #define FCS_IDLE 0
1066 #define FCS_DELAYED 1
1067 #define FCS_DEADLOCK_DETECTED 2
1068
1069 struct flow_control {
1070 int state;
1071 mach_timespec_t ts;
1072 };
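/*
 * Hedged sketch (not compiled in) of the delayed-unlock batching pattern
 * vm_pageout_scan() uses below: reclaimed pages are chained onto a local
 * list, and only every DELAYED_UNLOCK_LIMIT iterations is the page queues
 * lock dropped, the batch freed, and a mutex_pause() taken so contending
 * threads can run. "example_delayed_unlock_batch" is a hypothetical name.
 */
#if 0
static void
example_delayed_unlock_batch(void)
{
	vm_page_t	local_freeq = VM_PAGE_NULL;
	int		delayed_unlock = 0;

	vm_page_lock_queues();

	for (;;) {
		/*
		 * ... examine a page; if it is reclaimable, chain it onto
		 * local_freeq via its pageq.next link ...
		 */
		if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
			if (local_freeq) {
				vm_page_free_list(local_freeq);
				local_freeq = VM_PAGE_NULL;
			}
			delayed_unlock = 0;
			vm_page_unlock_queues();

			mutex_pause();

			vm_page_lock_queues();
		}
		/* ... loop ends when the scan's own termination tests fire ... */
	}
}
#endif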
1073
1074 void
1075 vm_pageout_scan(void)
1076 {
1077 unsigned int loop_count = 0;
1078 unsigned int inactive_burst_count = 0;
1079 unsigned int active_burst_count = 0;
1080 vm_page_t local_freeq = 0;
1081 int local_freed = 0;
1082 int delayed_unlock = 0;
1083 int need_internal_inactive = 0;
1084 int refmod_state = 0;
1085 int vm_pageout_deadlock_target = 0;
1086 struct vm_pageout_queue *iq;
1087 struct vm_pageout_queue *eq;
1088 struct flow_control flow_control;
1089 boolean_t active_throttled = FALSE;
1090 boolean_t inactive_throttled = FALSE;
1091 mach_timespec_t ts;
1092 unsigned int msecs = 0;
1093 vm_object_t object;
1094
1095
1096 flow_control.state = FCS_IDLE;
1097 iq = &vm_pageout_queue_internal;
1098 eq = &vm_pageout_queue_external;
1099
1100 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1101
1102 /*???*/ /*
1103 * We want to gradually dribble pages from the active queue
1104 * to the inactive queue. If we let the inactive queue get
1105 * very small, and then suddenly dump many pages into it,
1106 * those pages won't get a sufficient chance to be referenced
1107 * before we start taking them from the inactive queue.
1108 *
1109 * We must limit the rate at which we send pages to the pagers.
1110 * data_write messages consume memory, for message buffers and
1111 * for map-copy objects. If we get too far ahead of the pagers,
1112 * we can potentially run out of memory.
1113 *
1114 * We can use the laundry count to limit directly the number
1115 * of pages outstanding to the default pager. A similar
1116 * strategy for external pagers doesn't work, because
1117 * external pagers don't have to deallocate the pages sent them,
1118 * and because we might have to send pages to external pagers
1119 * even if they aren't processing writes. So we also
1120 * use a burst count to limit writes to external pagers.
1121 *
1122 * When memory is very tight, we can't rely on external pagers to
1123 * clean pages. They probably aren't running, because they
1124 * aren't vm-privileged. If we kept sending dirty pages to them,
1125 * we could exhaust the free list.
1126 */
1127 vm_page_lock_queues();
1128 delayed_unlock = 1;
1129
1130
1131 Restart:
1132 /*
1133          * Recalculate vm_page_inactive_target.
1134 */
1135 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1136 vm_page_inactive_count);
1137 object = NULL;
1138
1139 for (;;) {
1140 vm_page_t m;
1141
1142 if (delayed_unlock == 0)
1143 vm_page_lock_queues();
1144
1145 active_burst_count = vm_page_active_count;
1146
1147 if (active_burst_count > vm_pageout_burst_active_throttle)
1148 active_burst_count = vm_pageout_burst_active_throttle;
1149
1150 /*
1151 * Move pages from active to inactive.
1152 */
1153 while ((need_internal_inactive ||
1154 vm_page_inactive_count < vm_page_inactive_target) &&
1155 !queue_empty(&vm_page_queue_active) &&
1156 ((active_burst_count--) > 0)) {
1157
1158 vm_pageout_active++;
1159
1160 m = (vm_page_t) queue_first(&vm_page_queue_active);
1161
1162 assert(m->active && !m->inactive);
1163 assert(!m->laundry);
1164 assert(m->object != kernel_object);
1165
1166 /*
1167 * Try to lock object; since we've already got the
1168 * page queues lock, we can only 'try' for this one.
1169                          * If the 'try' fails, we need to do a mutex_pause
1170 * to allow the owner of the object lock a chance to
1171 * run... otherwise, we're likely to trip over this
1172 * object in the same state as we work our way through
1173 * the queue... clumps of pages associated with the same
1174 * object are fairly typical on the inactive and active queues
1175 */
1176 if (m->object != object) {
1177 if (object != NULL) {
1178 vm_object_unlock(object);
1179 object = NULL;
1180 }
1181 if (!vm_object_lock_try(m->object)) {
1182 /*
1183 * move page to end of active queue and continue
1184 */
1185 queue_remove(&vm_page_queue_active, m,
1186 vm_page_t, pageq);
1187 queue_enter(&vm_page_queue_active, m,
1188 vm_page_t, pageq);
1189
1190 goto done_with_activepage;
1191 }
1192 object = m->object;
1193 }
1194 /*
1195 * if the page is BUSY, then we pull it
1196 * off the active queue and leave it alone.
1197 * when BUSY is cleared, it will get stuck
1198 * back on the appropriate queue
1199 */
1200 if (m->busy) {
1201 queue_remove(&vm_page_queue_active, m,
1202 vm_page_t, pageq);
1203 m->pageq.next = NULL;
1204 m->pageq.prev = NULL;
1205
1206 if (!m->fictitious)
1207 vm_page_active_count--;
1208 m->active = FALSE;
1209
1210 goto done_with_activepage;
1211 }
1212 if (need_internal_inactive) {
1213 /*
1214 * If we're unable to make forward progress
1215 * with the current set of pages on the
1216 * inactive queue due to busy objects or
1217 * throttled pageout queues, then
1218 * move a page that is already clean
1219 * or belongs to a pageout queue that
1220 * isn't currently throttled
1221 */
1222 active_throttled = FALSE;
1223
1224 if (object->internal) {
1225 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1226 active_throttled = TRUE;
1227 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1228 active_throttled = TRUE;
1229 }
1230 if (active_throttled == TRUE) {
1231 if (!m->dirty) {
1232 refmod_state = pmap_get_refmod(m->phys_page);
1233
1234 if (refmod_state & VM_MEM_REFERENCED)
1235 m->reference = TRUE;
1236 if (refmod_state & VM_MEM_MODIFIED)
1237 m->dirty = TRUE;
1238 }
1239 if (m->dirty || m->precious) {
1240 /*
1241 * page is dirty and targets a THROTTLED queue
1242 * so all we can do is move it back to the
1243 * end of the active queue to get it out
1244 * of the way
1245 */
1246 queue_remove(&vm_page_queue_active, m,
1247 vm_page_t, pageq);
1248 queue_enter(&vm_page_queue_active, m,
1249 vm_page_t, pageq);
1250
1251 vm_pageout_scan_active_throttled++;
1252
1253 goto done_with_activepage;
1254 }
1255 }
1256 vm_pageout_scan_active_throttle_success++;
1257 need_internal_inactive--;
1258 }
1259 /*
1260 * Deactivate the page while holding the object
1261 * locked, so we know the page is still not busy.
1262 * This should prevent races between pmap_enter
1263 * and pmap_clear_reference. The page might be
1264 * absent or fictitious, but vm_page_deactivate
1265 * can handle that.
1266 */
1267 vm_page_deactivate(m);
1268 done_with_activepage:
1269 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1270
1271 if (object != NULL) {
1272 vm_object_unlock(object);
1273 object = NULL;
1274 }
1275 if (local_freeq) {
1276 vm_page_free_list(local_freeq);
1277
1278 local_freeq = 0;
1279 local_freed = 0;
1280 }
1281 delayed_unlock = 0;
1282 vm_page_unlock_queues();
1283
1284 mutex_pause();
1285 vm_page_lock_queues();
1286 /*
1287 * continue the while loop processing
1288 * the active queue... need to hold
1289 * the page queues lock
1290 */
1291 continue;
1292 }
1293 }
1294
1295
1296
1297 /**********************************************************************
1298 * above this point we're playing with the active queue
1299 * below this point we're playing with the throttling mechanisms
1300 * and the inactive queue
1301 **********************************************************************/
1302
1303
1304
1305 /*
1306 * We are done if we have met our target *and*
1307 * nobody is still waiting for a page.
1308 */
1309 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1310 if (object != NULL) {
1311 vm_object_unlock(object);
1312 object = NULL;
1313 }
1314 if (local_freeq) {
1315 vm_page_free_list(local_freeq);
1316
1317 local_freeq = 0;
1318 local_freed = 0;
1319 }
1320 mutex_lock(&vm_page_queue_free_lock);
1321
1322 if ((vm_page_free_count >= vm_page_free_target) &&
1323 (vm_page_free_wanted == 0)) {
1324
1325 vm_page_unlock_queues();
1326
1327 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1328 return;
1329 }
1330 mutex_unlock(&vm_page_queue_free_lock);
1331 }
1332
1333
1334 /*
1335 * Sometimes we have to pause:
1336 * 1) No inactive pages - nothing to do.
1337 * 2) Flow control - default pageout queue is full
1338 * 3) Loop control - no acceptable pages found on the inactive queue
1339 * within the last vm_pageout_burst_inactive_throttle iterations
1340 */
1341 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1342 vm_pageout_scan_empty_throttle++;
1343 msecs = vm_pageout_empty_wait;
1344 goto vm_pageout_scan_delay;
1345
1346 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1347 vm_pageout_scan_burst_throttle++;
1348 msecs = vm_pageout_burst_wait;
1349 goto vm_pageout_scan_delay;
1350
1351 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1352
1353 switch (flow_control.state) {
1354
1355 case FCS_IDLE:
1356 reset_deadlock_timer:
1357 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1358 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1359 clock_get_system_nanotime(
1360 &flow_control.ts.tv_sec,
1361 (uint32_t *) &flow_control.ts.tv_nsec);
1362 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1363
1364 flow_control.state = FCS_DELAYED;
1365 msecs = vm_pageout_deadlock_wait;
1366
1367 break;
1368
1369 case FCS_DELAYED:
1370 clock_get_system_nanotime(
1371 &ts.tv_sec,
1372 (uint32_t *) &ts.tv_nsec);
1373
1374 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1375 /*
1376 * the pageout thread for the default pager is potentially
1377 * deadlocked since the
1378 * default pager queue has been throttled for more than the
1379 * allowable time... we need to move some clean pages or dirty
1380 * pages belonging to the external pagers if they aren't throttled
1381 * vm_page_free_wanted represents the number of threads currently
1382 * blocked waiting for pages... we'll move one page for each of
1383 * these plus a fixed amount to break the logjam... once we're done
1384                                  * moving this number of pages, we'll re-enter the FCS_DELAYED state
1385 * with a new timeout target since we have no way of knowing
1386 * whether we've broken the deadlock except through observation
1387 * of the queue associated with the default pager... we need to
1388                                  * stop moving pages and allow the system to run to see what
1389 * state it settles into.
1390 */
1391 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1392 vm_pageout_scan_deadlock_detected++;
1393 flow_control.state = FCS_DEADLOCK_DETECTED;
1394
1395 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1396 goto consider_inactive;
1397 }
1398 /*
1399 * just resniff instead of trying
1400 * to compute a new delay time... we're going to be
1401 * awakened immediately upon a laundry completion,
1402 * so we won't wait any longer than necessary
1403 */
1404 msecs = vm_pageout_idle_wait;
1405 break;
1406
1407 case FCS_DEADLOCK_DETECTED:
1408 if (vm_pageout_deadlock_target)
1409 goto consider_inactive;
1410 goto reset_deadlock_timer;
1411
1412 }
1413 vm_pageout_scan_throttle++;
1414 iq->pgo_throttled = TRUE;
1415 vm_pageout_scan_delay:
1416 if (object != NULL) {
1417 vm_object_unlock(object);
1418 object = NULL;
1419 }
1420 if (local_freeq) {
1421 vm_page_free_list(local_freeq);
1422
1423 local_freeq = 0;
1424 local_freed = 0;
1425 }
1426 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1427
1428 counter(c_vm_pageout_scan_block++);
1429
1430 vm_page_unlock_queues();
1431
1432 thread_block(THREAD_CONTINUE_NULL);
1433
1434 vm_page_lock_queues();
1435 delayed_unlock = 1;
1436
1437 iq->pgo_throttled = FALSE;
1438
1439 if (loop_count >= vm_page_inactive_count) {
1440 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1441 /*
1442 * Make sure we move enough "appropriate"
1443 * pages to the inactive queue before trying
1444 * again.
1445 */
1446 need_internal_inactive = vm_pageout_inactive_relief;
1447 }
1448 loop_count = 0;
1449 }
1450 inactive_burst_count = 0;
1451
1452 goto Restart;
1453 /*NOTREACHED*/
1454 }
1455
1456
1457 flow_control.state = FCS_IDLE;
1458 consider_inactive:
1459 loop_count++;
1460 inactive_burst_count++;
1461 vm_pageout_inactive++;
1462
1463 if (!queue_empty(&vm_page_queue_inactive)) {
1464 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1465
1466 if (m->clustered && (m->no_isync == TRUE)) {
1467 goto use_this_page;
1468 }
1469 }
1470 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1471 vm_zf_iterator = 0;
1472 } else {
1473 last_page_zf = 0;
1474 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1475 vm_zf_iterator = 0;
1476 }
1477 }
1478 if (queue_empty(&vm_page_queue_zf) ||
1479 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1480 !queue_empty(&vm_page_queue_inactive))) {
1481 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1482 last_page_zf = 0;
1483 } else {
1484 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1485 last_page_zf = 1;
1486 }
1487 use_this_page:
1488 assert(!m->active && m->inactive);
1489 assert(!m->laundry);
1490 assert(m->object != kernel_object);
1491
1492 /*
1493          * Try to lock object; since we've already got the
1494 * page queues lock, we can only 'try' for this one.
1495          * If the 'try' fails, we need to do a mutex_pause
1496 * to allow the owner of the object lock a chance to
1497 * run... otherwise, we're likely to trip over this
1498 * object in the same state as we work our way through
1499 * the queue... clumps of pages associated with the same
1500 * object are fairly typical on the inactive and active queues
1501 */
1502 if (m->object != object) {
1503 if (object != NULL) {
1504 vm_object_unlock(object);
1505 object = NULL;
1506 }
1507 if (!vm_object_lock_try(m->object)) {
1508 /*
1509 * Move page to end and continue.
1510 * Don't re-issue ticket
1511 */
1512 if (m->zero_fill) {
1513 queue_remove(&vm_page_queue_zf, m,
1514 vm_page_t, pageq);
1515 queue_enter(&vm_page_queue_zf, m,
1516 vm_page_t, pageq);
1517 } else {
1518 queue_remove(&vm_page_queue_inactive, m,
1519 vm_page_t, pageq);
1520 queue_enter(&vm_page_queue_inactive, m,
1521 vm_page_t, pageq);
1522 }
1523 vm_pageout_inactive_nolock++;
1524
1525 /*
1526 * force us to dump any collected free pages
1527 * and to pause before moving on
1528 */
1529 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1530
1531 goto done_with_inactivepage;
1532 }
1533 object = m->object;
1534 }
1535 /*
1536 * If the page belongs to a purgable object with no pending copies
1537 * against it, then we reap all of the pages in the object
1538 * and note that the object has been "emptied". It'll be up to the
1539          * application to discover this and recreate its contents if desired.
1540 */
1541 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1542 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1543 object->copy == VM_OBJECT_NULL) {
1544
1545 (void) vm_object_purge(object);
1546 vm_pageout_purged_objects++;
1547 /*
1548 * we've just taken all of the pages from this object,
1549 * so drop the lock now since we're not going to find
1550 * any more pages belonging to it anytime soon
1551 */
1552 vm_object_unlock(object);
1553 object = NULL;
1554
1555 inactive_burst_count = 0;
1556
1557 goto done_with_inactivepage;
1558 }
1559
1560 /*
1561 * Paging out pages of external objects which
1562 * are currently being created must be avoided.
1563          * The pager may need to allocate memory, possibly leading to a
1564          * deadlock between it and the pageout thread,
1565          * if such pages are finally chosen. The remaining assumption
1566          * is that there will eventually be enough available pages in the
1567 * inactive pool to page out in order to satisfy all memory
1568 * claimed by the thread which concurrently creates the pager.
1569 */
1570 if (!object->pager_initialized && object->pager_created) {
1571 /*
1572 * Move page to end and continue, hoping that
1573 * there will be enough other inactive pages to
1574 * page out so that the thread which currently
1575 * initializes the pager will succeed.
1576                  * Don't re-grant the ticket; the page should be
1577                  * pulled from the queue and paged out whenever
1578 * one of its logically adjacent fellows is
1579 * targeted.
1580 */
1581 if (m->zero_fill) {
1582 queue_remove(&vm_page_queue_zf, m,
1583 vm_page_t, pageq);
1584 queue_enter(&vm_page_queue_zf, m,
1585 vm_page_t, pageq);
1586 last_page_zf = 1;
1587 vm_zf_iterator = vm_zf_iterator_count - 1;
1588 } else {
1589 queue_remove(&vm_page_queue_inactive, m,
1590 vm_page_t, pageq);
1591 queue_enter(&vm_page_queue_inactive, m,
1592 vm_page_t, pageq);
1593 last_page_zf = 0;
1594 vm_zf_iterator = 1;
1595 }
1596 vm_pageout_inactive_avoid++;
1597
1598 goto done_with_inactivepage;
1599 }
1600 /*
1601 * Remove the page from the inactive list.
1602 */
1603 if (m->zero_fill) {
1604 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1605 } else {
1606 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1607 }
1608 m->pageq.next = NULL;
1609 m->pageq.prev = NULL;
1610 m->inactive = FALSE;
1611 if (!m->fictitious)
1612 vm_page_inactive_count--;
1613
1614 if (m->busy || !object->alive) {
1615 /*
1616 * Somebody is already playing with this page.
1617 * Leave it off the pageout queues.
1618 */
1619 vm_pageout_inactive_busy++;
1620
1621 goto done_with_inactivepage;
1622 }
1623
1624 /*
1625 * If it's absent or in error, we can reclaim the page.
1626 */
1627
1628 if (m->absent || m->error) {
1629 vm_pageout_inactive_absent++;
1630 reclaim_page:
1631 if (vm_pageout_deadlock_target) {
1632 vm_pageout_scan_inactive_throttle_success++;
1633 vm_pageout_deadlock_target--;
1634 }
1635 if (m->tabled)
1636 vm_page_remove(m); /* clears tabled, object, offset */
1637 if (m->absent)
1638 vm_object_absent_release(object);
1639
1640 assert(m->pageq.next == NULL &&
1641 m->pageq.prev == NULL);
1642 m->pageq.next = (queue_entry_t)local_freeq;
1643 local_freeq = m;
1644 local_freed++;
1645
1646 inactive_burst_count = 0;
1647
1648 goto done_with_inactivepage;
1649 }
1650
1651 assert(!m->private);
1652 assert(!m->fictitious);
1653
1654 /*
1655 * If already cleaning this page in place, convert from
1656 * "adjacent" to "target". We can leave the page mapped,
1657 * and vm_pageout_object_terminate will determine whether
1658 * to free or reactivate.
1659 */
1660
1661 if (m->cleaning) {
1662 m->busy = TRUE;
1663 m->pageout = TRUE;
1664 m->dump_cleaning = TRUE;
1665 vm_page_wire(m);
1666
1667 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1668
1669 inactive_burst_count = 0;
1670
1671 goto done_with_inactivepage;
1672 }
1673
1674 /*
1675 * If it's being used, reactivate.
1676 * (Fictitious pages are either busy or absent.)
1677 */
1678 if ( (!m->reference) ) {
1679 refmod_state = pmap_get_refmod(m->phys_page);
1680
1681 if (refmod_state & VM_MEM_REFERENCED)
1682 m->reference = TRUE;
1683 if (refmod_state & VM_MEM_MODIFIED)
1684 m->dirty = TRUE;
1685 }
1686 if (m->reference) {
1687 was_referenced:
1688 vm_page_activate(m);
1689 VM_STAT(reactivations++);
1690
1691 vm_pageout_inactive_used++;
1692 last_page_zf = 0;
1693 inactive_burst_count = 0;
1694
1695 goto done_with_inactivepage;
1696 }
1697
1698 XPR(XPR_VM_PAGEOUT,
1699 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1700 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1701
1702 /*
1703 * we've got a candidate page to steal...
1704 *
1705 * m->dirty is up to date courtesy of the
1706 * preceding check for m->reference... if
1707 * we get here, then m->reference had to be
1708 * FALSE which means we did a pmap_get_refmod
1709 * and updated both m->reference and m->dirty
1710 *
1711 * if it's dirty or precious we need to
1712          * see if the target queue is throttled...
1713          * if it is, we need to skip over it by moving it back
1714 * to the end of the inactive queue
1715 */
1716 inactive_throttled = FALSE;
1717
1718 if (m->dirty || m->precious) {
1719 if (object->internal) {
1720 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1721 inactive_throttled = TRUE;
1722 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1723 inactive_throttled = TRUE;
1724 }
1725 }
1726 if (inactive_throttled == TRUE) {
1727 if (m->zero_fill) {
1728 queue_enter(&vm_page_queue_zf, m,
1729 vm_page_t, pageq);
1730 } else {
1731 queue_enter(&vm_page_queue_inactive, m,
1732 vm_page_t, pageq);
1733 }
1734 if (!m->fictitious)
1735 vm_page_inactive_count++;
1736 m->inactive = TRUE;
1737
1738 vm_pageout_scan_inactive_throttled++;
1739
1740 goto done_with_inactivepage;
1741 }
1742 /*
1743 * we've got a page that we can steal...
1744 * eliminate all mappings and make sure
1745          * we have the up-to-date modified state;
1746 * first take the page BUSY, so that no new
1747 * mappings can be made
1748 */
1749 m->busy = TRUE;
1750
1751 /*
1752 * if we need to do a pmap_disconnect then we
1753 * need to re-evaluate m->dirty since the pmap_disconnect
1754 * provides the true state atomically... the
1755 * page was still mapped up to the pmap_disconnect
1756 * and may have been dirtied at the last microsecond
1757 *
1758 * we also check for the page being referenced 'late'
1759 * if it was, we first need to do a WAKEUP_DONE on it
1760 * since we already set m->busy = TRUE, before
1761 * going off to reactivate it
1762 *
1763 * if we don't need the pmap_disconnect, then
1764 * m->dirty is up to date courtesy of the
1765 * earlier check for m->reference... if
1766 * we get here, then m->reference had to be
1767 * FALSE which means we did a pmap_get_refmod
1768 * and updated both m->reference and m->dirty...
1769 */
1770 if (m->no_isync == FALSE) {
1771 refmod_state = pmap_disconnect(m->phys_page);
1772
1773 if (refmod_state & VM_MEM_MODIFIED)
1774 m->dirty = TRUE;
1775 if (refmod_state & VM_MEM_REFERENCED) {
1776 m->reference = TRUE;
1777
1778 PAGE_WAKEUP_DONE(m);
1779 goto was_referenced;
1780 }
1781 }
1782 /*
1783 * If it's clean and not precious, we can free the page.
1784 */
1785 if (!m->dirty && !m->precious) {
1786 vm_pageout_inactive_clean++;
1787 goto reclaim_page;
1788 }
1789 vm_pageout_cluster(m);
1790
1791 vm_pageout_inactive_dirty++;
1792
1793 inactive_burst_count = 0;
1794
1795 done_with_inactivepage:
1796 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1797
1798 if (object != NULL) {
1799 vm_object_unlock(object);
1800 object = NULL;
1801 }
1802 if (local_freeq) {
1803 vm_page_free_list(local_freeq);
1804
1805 local_freeq = 0;
1806 local_freed = 0;
1807 }
1808 delayed_unlock = 0;
1809 vm_page_unlock_queues();
1810 mutex_pause();
1811 }
1812 /*
1813 * back to top of pageout scan loop
1814 */
1815 }
1816 }
1817
1818
1819 int vm_page_free_count_init;
1820
1821 void
1822 vm_page_free_reserve(
1823 int pages)
1824 {
1825 int free_after_reserve;
1826
1827 vm_page_free_reserved += pages;
1828
1829 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1830
1831 vm_page_free_min = vm_page_free_reserved +
1832 VM_PAGE_FREE_MIN(free_after_reserve);
1833
1834 vm_page_free_target = vm_page_free_reserved +
1835 VM_PAGE_FREE_TARGET(free_after_reserve);
1836
1837 if (vm_page_free_target < vm_page_free_min + 5)
1838 vm_page_free_target = vm_page_free_min + 5;
1839 }
1840
1841 /*
1842 * vm_pageout is the high level pageout daemon.
1843 */
1844
1845 void
1846 vm_pageout_continue(void)
1847 {
1848 vm_pageout_scan_event_counter++;
1849 vm_pageout_scan();
1850 /* we hold vm_page_queue_free_lock now */
1851 assert(vm_page_free_wanted == 0);
1852 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1853 mutex_unlock(&vm_page_queue_free_lock);
1854
1855 counter(c_vm_pageout_block++);
1856 thread_block((thread_continue_t)vm_pageout_continue);
1857 /*NOTREACHED*/
1858 }
1859
1860
1861 /*
1862 * must be called with the
1863 * queues and object locks held
1864 */
1865 static void
1866 vm_pageout_queue_steal(vm_page_t m)
1867 {
1868 struct vm_pageout_queue *q;
1869
1870 if (m->object->internal == TRUE)
1871 q = &vm_pageout_queue_internal;
1872 else
1873 q = &vm_pageout_queue_external;
1874
1875 m->laundry = FALSE;
1876 m->pageout_queue = FALSE;
1877 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1878
1879 m->pageq.next = NULL;
1880 m->pageq.prev = NULL;
1881
1882 vm_object_paging_end(m->object);
1883
1884 q->pgo_laundry--;
1885 }
1886
1887
1888 #ifdef FAKE_DEADLOCK
1889
1890 #define FAKE_COUNT 5000
1891
1892 int internal_count = 0;
1893 int fake_deadlock = 0;
1894
1895 #endif
1896
1897 static void
1898 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1899 {
1900 vm_page_t m = NULL;
1901 vm_object_t object;
1902 boolean_t need_wakeup;
1903
1904 vm_page_lock_queues();
1905
1906 while ( !queue_empty(&q->pgo_pending) ) {
1907
1908 q->pgo_busy = TRUE;
1909 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1910 m->pageout_queue = FALSE;
1911 vm_page_unlock_queues();
1912
1913 m->pageq.next = NULL;
1914 m->pageq.prev = NULL;
1915 #ifdef FAKE_DEADLOCK
1916 if (q == &vm_pageout_queue_internal) {
1917 vm_offset_t addr;
1918 int pg_count;
1919
1920 internal_count++;
1921
1922 if ((internal_count == FAKE_COUNT)) {
1923
1924 pg_count = vm_page_free_count + vm_page_free_reserved;
1925
1926 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1927 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1928 }
1929 internal_count = 0;
1930 fake_deadlock++;
1931 }
1932 }
1933 #endif
1934 object = m->object;
1935
1936 if (!object->pager_initialized) {
1937 vm_object_lock(object);
1938
1939 /*
1940 * If there is no memory object for the page, create
1941 * one and hand it to the default pager.
1942 */
1943
1944 if (!object->pager_initialized)
1945 vm_object_collapse(object,
1946 (vm_object_offset_t) 0,
1947 TRUE);
1948 if (!object->pager_initialized)
1949 vm_object_pager_create(object);
1950 if (!object->pager_initialized) {
1951 /*
1952 * Still no pager for the object.
1953 * Reactivate the page.
1954 *
1955 * Should only happen if there is no
1956 * default pager.
1957 */
1958 m->list_req_pending = FALSE;
1959 m->cleaning = FALSE;
1960 m->pageout = FALSE;
1961 vm_page_unwire(m);
1962
1963 vm_pageout_throttle_up(m);
1964
1965 vm_page_lock_queues();
1966 vm_pageout_dirty_no_pager++;
1967 vm_page_activate(m);
1968 vm_page_unlock_queues();
1969
1970 /*
1971 * And we are done with it.
1972 */
1973 PAGE_WAKEUP_DONE(m);
1974
1975 vm_object_paging_end(object);
1976 vm_object_unlock(object);
1977
1978 vm_page_lock_queues();
1979 continue;
1980 } else if (object->pager == MEMORY_OBJECT_NULL) {
1981 /*
1982 * This pager has been destroyed by either
1983 * memory_object_destroy or vm_object_destroy, and
1984 * so there is nowhere for the page to go.
1985 * Just free the page... VM_PAGE_FREE takes
1986 * care of cleaning up all the state...
1987 * including doing the vm_pageout_throttle_up
1988 */
1989 VM_PAGE_FREE(m);
1990
1991 vm_object_paging_end(object);
1992 vm_object_unlock(object);
1993
1994 vm_page_lock_queues();
1995 continue;
1996 }
1997 vm_object_unlock(object);
1998 }
1999 /*
2000 * we expect the paging_in_progress reference to have
2001 * already been taken on the object before it was added
2002 * to the appropriate pageout I/O queue... this will
2003 * keep the object from being terminated and/or the
2004 * paging_offset from changing until the I/O has
2005 * completed... therefore no need to lock the object to
2006 * pull the paging_offset from it.
2007 *
2008 * Send the data to the pager.
2009 * any pageout clustering happens there
2010 */
2011 memory_object_data_return(object->pager,
2012 m->offset + object->paging_offset,
2013 PAGE_SIZE,
2014 NULL,
2015 NULL,
2016 FALSE,
2017 FALSE,
2018 0);
2019
2020 vm_object_lock(object);
2021 vm_object_paging_end(object);
2022 vm_object_unlock(object);
2023
2024 vm_page_lock_queues();
2025 }
2026 assert_wait((event_t) q, THREAD_UNINT);
2027
2028
2029 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2030 q->pgo_throttled = FALSE;
2031 need_wakeup = TRUE;
2032 } else
2033 need_wakeup = FALSE;
2034
2035 q->pgo_busy = FALSE;
2036 q->pgo_idle = TRUE;
2037 vm_page_unlock_queues();
2038
2039 if (need_wakeup == TRUE)
2040 thread_wakeup((event_t) &q->pgo_laundry);
2041
2042 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2043 /*NOTREACHED*/
2044 }
2045
2046
2047 static void
2048 vm_pageout_iothread_external(void)
2049 {
2050
2051 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2052 /*NOTREACHED*/
2053 }
2054
2055
2056 static void
2057 vm_pageout_iothread_internal(void)
2058 {
2059 thread_t self = current_thread();
2060
2061 self->options |= TH_OPT_VMPRIV;
2062
2063 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2064 /*NOTREACHED*/
2065 }
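/*
 * Each pageout queue gets its own I/O thread: the external thread
 * services file-backed (external) objects and the internal thread
 * services anonymous (internal) objects destined for the default
 * pager.  The internal thread also sets TH_OPT_VMPRIV above so that
 * it is treated as VM-privileged if it needs memory while pushing
 * pages out.
 */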
2066
2067 static void
2068 vm_pageout_garbage_collect(int collect)
2069 {
2070 if (collect) {
2071 stack_collect();
2072
2073 /*
2074 * consider_zone_gc should be last, because the other operations
2075 * might return memory to zones.
2076 */
2077 consider_machine_collect();
2078 consider_zone_gc();
2079
2080 consider_machine_adjust();
2081 }
2082
2083 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2084
2085 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2086 /*NOTREACHED*/
2087 }
2088
2089
2090
2091 void
2092 vm_pageout(void)
2093 {
2094 thread_t self = current_thread();
2095 thread_t thread;
2096 kern_return_t result;
2097 spl_t s;
2098
2099 /*
2100 * Set thread privileges.
2101 */
2102 s = splsched();
2103 thread_lock(self);
2104 self->priority = BASEPRI_PREEMPT - 1;
2105 set_sched_pri(self, self->priority);
2106 thread_unlock(self);
2107 splx(s);
2108
2109 /*
2110 * Initialize some paging parameters.
2111 */
2112
2113 if (vm_pageout_idle_wait == 0)
2114 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2115
2116 if (vm_pageout_burst_wait == 0)
2117 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2118
2119 if (vm_pageout_empty_wait == 0)
2120 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2121
2122 if (vm_pageout_deadlock_wait == 0)
2123 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2124
2125 if (vm_pageout_deadlock_relief == 0)
2126 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2127
2128 if (vm_pageout_inactive_relief == 0)
2129 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2130
2131 if (vm_pageout_burst_active_throttle == 0)
2132 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2133
2134 if (vm_pageout_burst_inactive_throttle == 0)
2135 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2136
2137 /*
2138 * Set kernel task to low backing store privileged
2139 * status
2140 */
2141 task_lock(kernel_task);
2142 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2143 task_unlock(kernel_task);
2144
2145 vm_page_free_count_init = vm_page_free_count;
2146 vm_zf_iterator = 0;
2147 /*
2148 * even if we've already called vm_page_free_reserve,
2149 * call it again here to ensure that the targets are
2150 * accurately calculated (it uses vm_page_free_count_init);
2151 * calling it with an arg of 0 will not change the reserve,
2152 * but will re-calculate free_min and free_target
2153 */
2154 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2155 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2156 } else
2157 vm_page_free_reserve(0);
2158
2159
2160 queue_init(&vm_pageout_queue_external.pgo_pending);
2161 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2162 vm_pageout_queue_external.pgo_laundry = 0;
2163 vm_pageout_queue_external.pgo_idle = FALSE;
2164 vm_pageout_queue_external.pgo_busy = FALSE;
2165 vm_pageout_queue_external.pgo_throttled = FALSE;
2166
2167 queue_init(&vm_pageout_queue_internal.pgo_pending);
2168 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2169 vm_pageout_queue_internal.pgo_laundry = 0;
2170 vm_pageout_queue_internal.pgo_idle = FALSE;
2171 vm_pageout_queue_internal.pgo_busy = FALSE;
2172 vm_pageout_queue_internal.pgo_throttled = FALSE;
2173
2174
2175 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2176 if (result != KERN_SUCCESS)
2177 panic("vm_pageout_iothread_internal: create failed");
2178
2179 thread_deallocate(thread);
2180
2181
2182 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2183 if (result != KERN_SUCCESS)
2184 panic("vm_pageout_iothread_external: create failed");
2185
2186 thread_deallocate(thread);
2187
2188
2189 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2190 if (result != KERN_SUCCESS)
2191 panic("vm_pageout_garbage_collect: create failed");
2192
2193 thread_deallocate(thread);
2194
2195
2196 vm_pageout_continue();
2197 /*NOTREACHED*/
2198 }
2199
2200
2201 static upl_t
2202 upl_create(
2203 int flags,
2204 upl_size_t size)
2205 {
2206 upl_t upl;
2207 int page_field_size; /* bit field in word size buf */
2208
2209 page_field_size = 0;
2210 if (flags & UPL_CREATE_LITE) {
2211 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2212 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2213 }
2214 if(flags & UPL_CREATE_INTERNAL) {
2215 upl = (upl_t)kalloc(sizeof(struct upl)
2216 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2217 + page_field_size);
2218 } else {
2219 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2220 }
2221 upl->flags = 0;
2222 upl->src_object = NULL;
2223 upl->kaddr = (vm_offset_t)0;
2224 upl->size = 0;
2225 upl->map_object = NULL;
2226 upl->ref_count = 1;
2227 upl->highest_page = 0;
2228 upl_lock_init(upl);
2229 #ifdef UPL_DEBUG
2230 upl->ubc_alias1 = 0;
2231 upl->ubc_alias2 = 0;
2232 #endif /* UPL_DEBUG */
2233 return(upl);
2234 }
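/*
 * Sizing sketch (assuming 4K pages): a UPL_CREATE_LITE request for a
 * 1MB range covers 256 pages, so the lite bitmap needs
 * ((256 + 7) >> 3) = 32 bytes, already a multiple of 4, so the
 * rounding step leaves it at 32.  A UPL_CREATE_INTERNAL | UPL_CREATE_LITE
 * upl is therefore allocated as
 *
 *	sizeof(struct upl) + 256 * sizeof(struct upl_page_info) + 32
 *
 * with the page_info array and the bitmap laid out immediately after
 * the upl structure itself (see vm_object_upl_request below, which
 * derives user_page_list and lite_list from those offsets).
 */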
2235
2236 static void
2237 upl_destroy(
2238 upl_t upl)
2239 {
2240 int page_field_size; /* bit field in word size buf */
2241
2242 #ifdef UPL_DEBUG
2243 {
2244 upl_t upl_ele;
2245 vm_object_t object;
2246 if (upl->map_object->pageout) {
2247 object = upl->map_object->shadow;
2248 } else {
2249 object = upl->map_object;
2250 }
2251 vm_object_lock(object);
2252 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2253 if(upl_ele == upl) {
2254 queue_remove(&object->uplq,
2255 upl_ele, upl_t, uplq);
2256 break;
2257 }
2258 }
2259 vm_object_unlock(object);
2260 }
2261 #endif /* UPL_DEBUG */
2262 /* drop a reference on the map_object whether or */
2263 /* not a pageout object is inserted */
2264 if(upl->map_object->pageout)
2265 vm_object_deallocate(upl->map_object);
2266
2267 page_field_size = 0;
2268 if (upl->flags & UPL_LITE) {
2269 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2270 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2271 }
2272 if(upl->flags & UPL_INTERNAL) {
2273 kfree(upl,
2274 sizeof(struct upl) +
2275 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2276 + page_field_size);
2277 } else {
2278 kfree(upl, sizeof(struct upl) + page_field_size);
2279 }
2280 }
2281
2282 void uc_upl_dealloc(upl_t upl);
2283 __private_extern__ void
2284 uc_upl_dealloc(
2285 upl_t upl)
2286 {
2287 upl->ref_count -= 1;
2288 if(upl->ref_count == 0) {
2289 upl_destroy(upl);
2290 }
2291 }
2292
2293 void
2294 upl_deallocate(
2295 upl_t upl)
2296 {
2297
2298 upl->ref_count -= 1;
2299 if(upl->ref_count == 0) {
2300 upl_destroy(upl);
2301 }
2302 }
2303
2304 /*
2305 * Statistics about UPL enforcement of copy-on-write obligations.
2306 */
2307 unsigned long upl_cow = 0;
2308 unsigned long upl_cow_again = 0;
2309 unsigned long upl_cow_contiguous = 0;
2310 unsigned long upl_cow_pages = 0;
2311 unsigned long upl_cow_again_pages = 0;
2312 unsigned long upl_cow_contiguous_pages = 0;
2313
2314 /*
2315 * Routine: vm_object_upl_request
2316 * Purpose:
2317 * Cause the population of a portion of a vm_object.
2318 * Depending on the nature of the request, the pages
2319 * returned may contain valid data or be uninitialized.
2320 * A page list structure, listing the physical pages,
2321 * will be returned upon request.
2322 * This function is called by the file system or any other
2323 * supplier of backing store to a pager.
2324 * IMPORTANT NOTE: The caller must still respect the relationship
2325 * between the vm_object and its backing memory object. The
2326 * caller MUST NOT substitute changes in the backing file
2327 * without first doing a memory_object_lock_request on the
2328 * target range unless it is known that the pages are not
2329 * shared with another entity at the pager level.
2330 * Copy_in_to:
2331 * if a page list structure is present,
2332 * return the mapped physical pages; where a
2333 * page is not present, return a non-initialized
2334 * one. If the no_sync bit is turned on, don't
2335 * call the pager unlock to synchronize with other
2336 * possible copies of the page. Leave pages busy
2337 * in the original object, if a page list structure
2338 * was specified. When a commit of the page list
2339 * pages is done, the dirty bit will be set for each one.
2340 * Copy_out_from:
2341 * If a page list structure is present, return
2342 * all mapped pages. Where a page does not exist,
2343 * map a zero-filled one. Leave pages busy in
2344 * the original object. If a page list structure
2345 * is not specified, this call is a no-op.
2346 *
2347 * Note: access of default pager objects has a rather interesting
2348 * twist. The caller of this routine, presumably the file system
2349 * page cache handling code, will never actually make a request
2350 * against a default pager backed object. Only the default
2351 * pager will make requests on backing store related vm_objects.
2352 * In this way the default pager can maintain the relationship
2353 * between backing store files (abstract memory objects) and
2354 * the vm_objects (cache objects) they support.
2355 *
2356 */
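/*
 * A minimal sketch of how a caller might drive this routine (the real
 * in-kernel callers are vm_map_create_upl() and
 * vm_object_super_upl_request() below; the flag combination here is
 * only illustrative):
 *
 *	upl_t			upl;
 *	upl_page_info_t		*pl;
 *	unsigned int		count = MAX_UPL_TRANSFER;
 *	boolean_t		empty;
 *	kern_return_t		kr;
 *
 *	kr = vm_object_upl_request(object, offset, size, &upl, NULL,
 *				   &count,
 *				   UPL_SET_INTERNAL | UPL_SET_LITE |
 *				   UPL_COPYOUT_FROM);
 *	if (kr == KERN_SUCCESS) {
 *		pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *		... issue the write against the pages described by pl ...
 *		upl_commit_range(upl, 0, size, 0, pl, count, &empty);
 *		upl_deallocate(upl);
 *	}
 */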
2357
2358 __private_extern__ kern_return_t
2359 vm_object_upl_request(
2360 vm_object_t object,
2361 vm_object_offset_t offset,
2362 upl_size_t size,
2363 upl_t *upl_ptr,
2364 upl_page_info_array_t user_page_list,
2365 unsigned int *page_list_count,
2366 int cntrl_flags)
2367 {
2368 vm_page_t dst_page = VM_PAGE_NULL;
2369 vm_object_offset_t dst_offset = offset;
2370 upl_size_t xfer_size = size;
2371 boolean_t do_m_lock = FALSE;
2372 boolean_t dirty;
2373 boolean_t hw_dirty;
2374 upl_t upl = NULL;
2375 unsigned int entry;
2376 #if MACH_CLUSTER_STATS
2377 boolean_t encountered_lrp = FALSE;
2378 #endif
2379 vm_page_t alias_page = NULL;
2380 int page_ticket;
2381 int refmod_state;
2382 wpl_array_t lite_list = NULL;
2383 vm_object_t last_copy_object;
2384
2385
2386 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2387 /*
2388 * For forward compatibility's sake,
2389 * reject any unknown flag.
2390 */
2391 return KERN_INVALID_VALUE;
2392 }
2393
2394 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2395 >> UPL_PAGE_TICKET_SHIFT;
2396
2397 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2398 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2399 }
2400
2401 if(cntrl_flags & UPL_SET_INTERNAL)
2402 if(page_list_count != NULL)
2403 *page_list_count = MAX_UPL_TRANSFER;
2404
2405 if((!object->internal) && (object->paging_offset != 0))
2406 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2407
2408 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2409 return KERN_SUCCESS;
2410 }
2411
2412 vm_object_lock(object);
2413 vm_object_paging_begin(object);
2414 vm_object_unlock(object);
2415
2416 if(upl_ptr) {
2417 if(cntrl_flags & UPL_SET_INTERNAL) {
2418 if(cntrl_flags & UPL_SET_LITE) {
2419 uintptr_t page_field_size;
2420 upl = upl_create(
2421 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2422 size);
2423 user_page_list = (upl_page_info_t *)
2424 (((uintptr_t)upl) + sizeof(struct upl));
2425 lite_list = (wpl_array_t)
2426 (((uintptr_t)user_page_list) +
2427 ((size/PAGE_SIZE) *
2428 sizeof(upl_page_info_t)));
2429 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2430 page_field_size =
2431 (page_field_size + 3) & 0xFFFFFFFC;
2432 bzero((char *)lite_list, page_field_size);
2433 upl->flags =
2434 UPL_LITE | UPL_INTERNAL;
2435 } else {
2436 upl = upl_create(UPL_CREATE_INTERNAL, size);
2437 user_page_list = (upl_page_info_t *)
2438 (((uintptr_t)upl) + sizeof(struct upl));
2439 upl->flags = UPL_INTERNAL;
2440 }
2441 } else {
2442 if(cntrl_flags & UPL_SET_LITE) {
2443 uintptr_t page_field_size;
2444 upl = upl_create(UPL_CREATE_LITE, size);
2445 lite_list = (wpl_array_t)
2446 (((uintptr_t)upl) + sizeof(struct upl));
2447 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2448 page_field_size =
2449 (page_field_size + 3) & 0xFFFFFFFC;
2450 bzero((char *)lite_list, page_field_size);
2451 upl->flags = UPL_LITE;
2452 } else {
2453 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2454 upl->flags = 0;
2455 }
2456 }
2457
2458 if (object->phys_contiguous) {
2459 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2460 object->copy != VM_OBJECT_NULL) {
2461 /* Honor copy-on-write obligations */
2462
2463 /*
2464 * XXX FBDP
2465 * We could still have a race...
2466 * A is here building the UPL for a write().
2467 * A pushes the pages to the current copy
2468 * object.
2469 * A returns the UPL to the caller.
2470 * B comes along and establishes another
2471 * private mapping on this object, inserting
2472 * a new copy object between the original
2473 * object and the old copy object.
2474 * B reads a page and gets the original contents
2475 * from the original object.
2476 * A modifies the page in the original object.
2477 * B reads the page again and sees A's changes,
2478 * which is wrong...
2479 *
2480 * The problem is that the pages are not
2481 * marked "busy" in the original object, so
2482 * nothing prevents B from reading it
2483 * before A's changes are completed.
2484 *
2485 * The "paging_in_progress" might protect us
2486 * from the insertion of a new copy object
2487 * though... To be verified.
2488 */
2489 vm_object_lock_request(object,
2490 offset,
2491 size,
2492 FALSE,
2493 MEMORY_OBJECT_COPY_SYNC,
2494 VM_PROT_NO_CHANGE);
2495 upl_cow_contiguous++;
2496 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2497 }
2498
2499 upl->map_object = object;
2500 /* don't need any shadow mappings for this one */
2501 /* since it is already I/O memory */
2502 upl->flags |= UPL_DEVICE_MEMORY;
2503
2504
2505 /* paging_in_progress protects paging_offset */
2506 upl->offset = offset + object->paging_offset;
2507 upl->size = size;
2508 *upl_ptr = upl;
2509 if(user_page_list) {
2510 user_page_list[0].phys_addr =
2511 (offset + object->shadow_offset)>>PAGE_SHIFT;
2512 user_page_list[0].device = TRUE;
2513 }
2514 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
2515
2516 if(page_list_count != NULL) {
2517 if (upl->flags & UPL_INTERNAL) {
2518 *page_list_count = 0;
2519 } else {
2520 *page_list_count = 1;
2521 }
2522 }
2523
2524 return KERN_SUCCESS;
2525 }
2526
2527 if(user_page_list)
2528 user_page_list[0].device = FALSE;
2529
2530 if(cntrl_flags & UPL_SET_LITE) {
2531 upl->map_object = object;
2532 } else {
2533 upl->map_object = vm_object_allocate(size);
2534 /*
2535 * No need to lock the new object: nobody else knows
2536 * about it yet, so it's all ours so far.
2537 */
2538 upl->map_object->shadow = object;
2539 upl->map_object->pageout = TRUE;
2540 upl->map_object->can_persist = FALSE;
2541 upl->map_object->copy_strategy =
2542 MEMORY_OBJECT_COPY_NONE;
2543 upl->map_object->shadow_offset = offset;
2544 upl->map_object->wimg_bits = object->wimg_bits;
2545 }
2546
2547 }
2548 if (!(cntrl_flags & UPL_SET_LITE)) {
2549 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2550 }
2551
2552 /*
2553 * ENCRYPTED SWAP:
2554 * Just mark the UPL as "encrypted" here.
2555 * We'll actually encrypt the pages later,
2556 * in upl_encrypt(), when the caller has
2557 * selected which pages need to go to swap.
2558 */
2559 if (cntrl_flags & UPL_ENCRYPT) {
2560 upl->flags |= UPL_ENCRYPTED;
2561 }
2562 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2563 upl->flags |= UPL_PAGEOUT;
2564 }
2565 vm_object_lock(object);
2566
2567 /* we can lock in the paging_offset once paging_in_progress is set */
2568 if(upl_ptr) {
2569 upl->size = size;
2570 upl->offset = offset + object->paging_offset;
2571 *upl_ptr = upl;
2572 #ifdef UPL_DEBUG
2573 queue_enter(&object->uplq, upl, upl_t, uplq);
2574 #endif /* UPL_DEBUG */
2575 }
2576
2577 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2578 object->copy != VM_OBJECT_NULL) {
2579 /* Honor copy-on-write obligations */
2580
2581 /*
2582 * The caller is gathering these pages and
2583 * might modify their contents. We need to
2584 * make sure that the copy object has its own
2585 * private copies of these pages before we let
2586 * the caller modify them.
2587 */
2588 vm_object_update(object,
2589 offset,
2590 size,
2591 NULL,
2592 NULL,
2593 FALSE, /* should_return */
2594 MEMORY_OBJECT_COPY_SYNC,
2595 VM_PROT_NO_CHANGE);
2596 upl_cow++;
2597 upl_cow_pages += size >> PAGE_SHIFT;
2598
2599 }
2600 /* remember which copy object we synchronized with */
2601 last_copy_object = object->copy;
2602
2603 entry = 0;
2604 if(cntrl_flags & UPL_COPYOUT_FROM) {
2605 upl->flags |= UPL_PAGE_SYNC_DONE;
2606
2607 while (xfer_size) {
2608 if((alias_page == NULL) &&
2609 !(cntrl_flags & UPL_SET_LITE)) {
2610 vm_object_unlock(object);
2611 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2612 vm_object_lock(object);
2613 }
2614 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2615 dst_page->fictitious ||
2616 dst_page->absent ||
2617 dst_page->error ||
2618 (dst_page->wire_count && !dst_page->pageout) ||
2619
2620 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2621 (dst_page->page_ticket != page_ticket) &&
2622 ((dst_page->page_ticket+1) != page_ticket)) ) {
2623
2624 if (user_page_list)
2625 user_page_list[entry].phys_addr = 0;
2626 } else {
2627 /*
2628 * grab this up front...
2629 * a high percentage of the time we're going to
2630 * need the hardware modification state a bit later
2631 * anyway... so we can eliminate an extra call into
2632 * the pmap layer by grabbing it here and recording it
2633 */
2634 refmod_state = pmap_get_refmod(dst_page->phys_page);
2635
2636 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2637 /*
2638 * we're only asking for DIRTY pages to be returned
2639 */
2640
2641 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2642 /*
2643 * if this is the page stolen by vm_pageout_scan to be
2644 * cleaned (as opposed to a buddy being clustered in),
2645 * or this request is not being driven by a PAGEOUT cluster,
2646 * then we only need to check for the page being dirty or
2647 * precious to decide whether to return it
2648 */
2649 if (dst_page->dirty || dst_page->precious ||
2650 (refmod_state & VM_MEM_MODIFIED)) {
2651 goto check_busy;
2652 }
2653 }
2654 /*
2655 * this is a request for a PAGEOUT cluster and this page
2656 * is merely along for the ride as a 'buddy'... not only
2657 * does it have to be dirty to be returned, but it also
2658 * can't have been referenced recently... note that we've
2659 * already filtered above based on whether this page is
2660 * currently on the inactive queue or it meets the page
2661 * ticket (generation count) check
2662 */
2663 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2664 ((refmod_state & VM_MEM_MODIFIED) ||
2665 dst_page->dirty || dst_page->precious) ) {
2666 goto check_busy;
2667 }
2668 /*
2669 * if we reach here, we're not to return
2670 * the page... go on to the next one
2671 */
2672 if (user_page_list)
2673 user_page_list[entry].phys_addr = 0;
2674 entry++;
2675 dst_offset += PAGE_SIZE_64;
2676 xfer_size -= PAGE_SIZE;
2677 continue;
2678 }
2679 check_busy:
2680 if(dst_page->busy &&
2681 (!(dst_page->list_req_pending &&
2682 dst_page->pageout))) {
2683 if(cntrl_flags & UPL_NOBLOCK) {
2684 if(user_page_list) {
2685 user_page_list[entry].phys_addr = 0;
2686 }
2687 entry++;
2688 dst_offset += PAGE_SIZE_64;
2689 xfer_size -= PAGE_SIZE;
2690 continue;
2691 }
2692 /*
2693 * someone else is playing with the
2694 * page. We will have to wait.
2695 */
2696 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2697 continue;
2698 }
2699 /* Someone else already cleaning the page? */
2700 if((dst_page->cleaning || dst_page->absent ||
2701 dst_page->wire_count != 0) &&
2702 !dst_page->list_req_pending) {
2703 if(user_page_list) {
2704 user_page_list[entry].phys_addr = 0;
2705 }
2706 entry++;
2707 dst_offset += PAGE_SIZE_64;
2708 xfer_size -= PAGE_SIZE;
2709 continue;
2710 }
2711 /* eliminate all mappings from the */
2712 /* original object and its progeny */
2713
2714 vm_page_lock_queues();
2715
2716 if (dst_page->pageout_queue == TRUE)
2717 /*
2718 * we've buddied up a page for a clustered pageout
2719 * that has already been moved to the pageout
2720 * queue by pageout_scan... we need to remove
2721 * it from the queue and drop the laundry count
2722 * on that queue
2723 */
2724 vm_pageout_queue_steal(dst_page);
2725 #if MACH_CLUSTER_STATS
2726 /* pageout statistics gathering. count */
2727 /* all the pages we will page out that */
2728 /* were not counted in the initial */
2729 /* vm_pageout_scan work */
2730 if(dst_page->list_req_pending)
2731 encountered_lrp = TRUE;
2732 if((dst_page->dirty ||
2733 (dst_page->object->internal &&
2734 dst_page->precious)) &&
2735 (dst_page->list_req_pending
2736 == FALSE)) {
2737 if(encountered_lrp) {
2738 CLUSTER_STAT
2739 (pages_at_higher_offsets++;)
2740 } else {
2741 CLUSTER_STAT
2742 (pages_at_lower_offsets++;)
2743 }
2744 }
2745 #endif
2746 /* Turn off busy indication on pending */
2747 /* pageout. Note: we can only get here */
2748 /* in the request pending case. */
2749 dst_page->list_req_pending = FALSE;
2750 dst_page->busy = FALSE;
2751 dst_page->cleaning = FALSE;
2752
2753 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2754 dirty = hw_dirty ? TRUE : dst_page->dirty;
2755
2756 if(cntrl_flags & UPL_SET_LITE) {
2757 int pg_num;
2758 pg_num = (dst_offset-offset)/PAGE_SIZE;
2759 lite_list[pg_num>>5] |=
2760 1 << (pg_num & 31);
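/*
 * pg_num >> 5 selects the 32-bit word of the lite bitmap and
 * (pg_num & 31) the bit within it; e.g. the 38th page of the
 * request (pg_num 37) sets bit 5 of lite_list[1].
 */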
2761 if (hw_dirty)
2762 pmap_clear_modify(dst_page->phys_page);
2763 /*
2764 * Record that this page has been
2765 * written out
2766 */
2767 #if MACH_PAGEMAP
2768 vm_external_state_set(
2769 object->existence_map,
2770 dst_page->offset);
2771 #endif /*MACH_PAGEMAP*/
2772
2773 /*
2774 * Mark original page as cleaning
2775 * in place.
2776 */
2777 dst_page->cleaning = TRUE;
2778 dst_page->dirty = TRUE;
2779 dst_page->precious = FALSE;
2780 } else {
2781 /* use pageclean setup, it is more */
2782 /* convenient even for the pageout */
2783 /* cases here */
2784
2785 vm_object_lock(upl->map_object);
2786 vm_pageclean_setup(dst_page,
2787 alias_page, upl->map_object,
2788 size - xfer_size);
2789 vm_object_unlock(upl->map_object);
2790
2791 alias_page->absent = FALSE;
2792 alias_page = NULL;
2793 }
2794
2795 if(!dirty) {
2796 dst_page->dirty = FALSE;
2797 dst_page->precious = TRUE;
2798 }
2799
2800 if(dst_page->pageout)
2801 dst_page->busy = TRUE;
2802
2803 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2804 /*
2805 * ENCRYPTED SWAP:
2806 * We want to deny access to the target page
2807 * because its contents are about to be
2808 * encrypted and the user would be very
2809 * confused to see encrypted data instead
2810 * of their data.
2811 */
2812 dst_page->busy = TRUE;
2813 }
2814 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2815 /*
2816 * deny access to the target page
2817 * while it is being worked on
2818 */
2819 if ((!dst_page->pageout) &&
2820 (dst_page->wire_count == 0)) {
2821 dst_page->busy = TRUE;
2822 dst_page->pageout = TRUE;
2823 vm_page_wire(dst_page);
2824 }
2825 }
2826
2827 if (dst_page->phys_page > upl->highest_page)
2828 upl->highest_page = dst_page->phys_page;
2829
2830 if(user_page_list) {
2831 user_page_list[entry].phys_addr
2832 = dst_page->phys_page;
2833 user_page_list[entry].dirty =
2834 dst_page->dirty;
2835 user_page_list[entry].pageout =
2836 dst_page->pageout;
2837 user_page_list[entry].absent =
2838 dst_page->absent;
2839 user_page_list[entry].precious =
2840 dst_page->precious;
2841 }
2842 vm_page_unlock_queues();
2843
2844 /*
2845 * ENCRYPTED SWAP:
2846 * The caller is gathering this page and might
2847 * access its contents later on. Decrypt the
2848 * page before adding it to the UPL, so that
2849 * the caller never sees encrypted data.
2850 */
2851 if (! (cntrl_flags & UPL_ENCRYPT) &&
2852 dst_page->encrypted) {
2853 assert(dst_page->busy);
2854
2855 vm_page_decrypt(dst_page, 0);
2856 vm_page_decrypt_for_upl_counter++;
2857
2858 /*
2859 * Retry this page, since anything
2860 * could have changed while we were
2861 * decrypting.
2862 */
2863 continue;
2864 }
2865 }
2866 entry++;
2867 dst_offset += PAGE_SIZE_64;
2868 xfer_size -= PAGE_SIZE;
2869 }
2870 } else {
2871 while (xfer_size) {
2872 if((alias_page == NULL) &&
2873 !(cntrl_flags & UPL_SET_LITE)) {
2874 vm_object_unlock(object);
2875 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2876 vm_object_lock(object);
2877 }
2878
2879 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2880 object->copy != last_copy_object) {
2881 /* Honor copy-on-write obligations */
2882
2883 /*
2884 * The copy object has changed since we
2885 * last synchronized for copy-on-write.
2886 * Another copy object might have been
2887 * inserted while we released the object's
2888 * lock. Since someone could have seen the
2889 * original contents of the remaining pages
2890 * through that new object, we have to
2891 * synchronize with it again for the remaining
2892 * pages only. The previous pages are "busy"
2893 * so they can not be seen through the new
2894 * mapping. The new mapping will see our
2895 * upcoming changes for those previous pages,
2896 * but that's OK since they couldn't see what
2897 * was there before. It's just a race anyway
2898 * and there's no guarantee of consistency or
2899 * atomicity. We just don't want new mappings
2900 * to see both the *before* and *after* pages.
2901 */
2902 if (object->copy != VM_OBJECT_NULL) {
2903 vm_object_update(
2904 object,
2905 dst_offset,/* current offset */
2906 xfer_size, /* remaining size */
2907 NULL,
2908 NULL,
2909 FALSE, /* should_return */
2910 MEMORY_OBJECT_COPY_SYNC,
2911 VM_PROT_NO_CHANGE);
2912 upl_cow_again++;
2913 upl_cow_again_pages +=
2914 xfer_size >> PAGE_SHIFT;
2915 }
2916 /* remember the copy object we synced with */
2917 last_copy_object = object->copy;
2918 }
2919
2920 dst_page = vm_page_lookup(object, dst_offset);
2921
2922 if(dst_page != VM_PAGE_NULL) {
2923 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2924 !((dst_page->list_req_pending)
2925 && (dst_page->absent))) {
2926 /* we are doing extended range */
2927 /* requests. we want to grab */
2928 /* pages around some which are */
2929 /* already present. */
2930 if(user_page_list) {
2931 user_page_list[entry].phys_addr = 0;
2932 }
2933 entry++;
2934 dst_offset += PAGE_SIZE_64;
2935 xfer_size -= PAGE_SIZE;
2936 continue;
2937 }
2938 if((dst_page->cleaning) &&
2939 !(dst_page->list_req_pending)) {
2940 /*someone else is writing to the */
2941 /* page. We will have to wait. */
2942 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2943 continue;
2944 }
2945 if ((dst_page->fictitious &&
2946 dst_page->list_req_pending)) {
2947 /* dump the fictitious page */
2948 dst_page->list_req_pending = FALSE;
2949 dst_page->clustered = FALSE;
2950
2951 vm_page_lock_queues();
2952 vm_page_free(dst_page);
2953 vm_page_unlock_queues();
2954
2955 dst_page = NULL;
2956 } else if ((dst_page->absent &&
2957 dst_page->list_req_pending)) {
2958 /* the default_pager case */
2959 dst_page->list_req_pending = FALSE;
2960 dst_page->busy = FALSE;
2961 }
2962 }
2963 if(dst_page == VM_PAGE_NULL) {
2964 if(object->private) {
2965 /*
2966 * This is a nasty wrinkle for users
2967 * of upl who encounter device or
2968 * private memory; however, it is
2969 * unavoidable: only a fault can
2970 * resolve the actual backing
2971 * physical page by asking the
2972 * backing device.
2973 */
2974 if(user_page_list) {
2975 user_page_list[entry].phys_addr = 0;
2976 }
2977 entry++;
2978 dst_offset += PAGE_SIZE_64;
2979 xfer_size -= PAGE_SIZE;
2980 continue;
2981 }
2982 /* need to allocate a page */
2983 dst_page = vm_page_alloc(object, dst_offset);
2984 if (dst_page == VM_PAGE_NULL) {
2985 vm_object_unlock(object);
2986 VM_PAGE_WAIT();
2987 vm_object_lock(object);
2988 continue;
2989 }
2990 dst_page->busy = FALSE;
2991 #if 0
2992 if(cntrl_flags & UPL_NO_SYNC) {
2993 dst_page->page_lock = 0;
2994 dst_page->unlock_request = 0;
2995 }
2996 #endif
2997 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2998 /*
2999 * if UPL_RET_ONLY_ABSENT was specified,
3000 * then we're definitely setting up a
3001 * UPL for a clustered read/pagein
3002 * operation... mark the pages as clustered
3003 * so vm_fault can correctly attribute them
3004 * to the 'pagein' bucket the first time
3005 * a fault happens on them
3006 */
3007 dst_page->clustered = TRUE;
3008 }
3009 dst_page->absent = TRUE;
3010 object->absent_count++;
3011 }
3012 #if 1
3013 if(cntrl_flags & UPL_NO_SYNC) {
3014 dst_page->page_lock = 0;
3015 dst_page->unlock_request = 0;
3016 }
3017 #endif /* 1 */
3018
3019 /*
3020 * ENCRYPTED SWAP:
3021 */
3022 if (cntrl_flags & UPL_ENCRYPT) {
3023 /*
3024 * The page is going to be encrypted when we
3025 * get it from the pager, so mark it so.
3026 */
3027 dst_page->encrypted = TRUE;
3028 } else {
3029 /*
3030 * Otherwise, the page will not contain
3031 * encrypted data.
3032 */
3033 dst_page->encrypted = FALSE;
3034 }
3035
3036 dst_page->overwriting = TRUE;
3037 if(dst_page->fictitious) {
3038 panic("need corner case for fictitious page");
3039 }
3040 if(dst_page->page_lock) {
3041 do_m_lock = TRUE;
3042 }
3043 if(upl_ptr) {
3044
3045 /* eliminate all mappings from the */
3046 /* original object and its progeny */
3047
3048 if(dst_page->busy) {
3049 /*someone else is playing with the */
3050 /* page. We will have to wait. */
3051 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3052 continue;
3053 }
3054 vm_page_lock_queues();
3055
3056 if( !(cntrl_flags & UPL_FILE_IO))
3057 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3058 else
3059 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3060 dirty = hw_dirty ? TRUE : dst_page->dirty;
3061
3062 if(cntrl_flags & UPL_SET_LITE) {
3063 int pg_num;
3064 pg_num = (dst_offset-offset)/PAGE_SIZE;
3065 lite_list[pg_num>>5] |=
3066 1 << (pg_num & 31);
3067 if (hw_dirty)
3068 pmap_clear_modify(dst_page->phys_page);
3069 /*
3070 * Record that this page has been
3071 * written out
3072 */
3073 #if MACH_PAGEMAP
3074 vm_external_state_set(
3075 object->existence_map,
3076 dst_page->offset);
3077 #endif /*MACH_PAGEMAP*/
3078
3079 /*
3080 * Mark original page as cleaning
3081 * in place.
3082 */
3083 dst_page->cleaning = TRUE;
3084 dst_page->dirty = TRUE;
3085 dst_page->precious = FALSE;
3086 } else {
3087 /* use pageclean setup, it is more */
3088 /* convenient even for the pageout */
3089 /* cases here */
3090 vm_object_lock(upl->map_object);
3091 vm_pageclean_setup(dst_page,
3092 alias_page, upl->map_object,
3093 size - xfer_size);
3094 vm_object_unlock(upl->map_object);
3095
3096 alias_page->absent = FALSE;
3097 alias_page = NULL;
3098 }
3099
3100 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3101 /* clean in place for read implies */
3102 /* that a write will be done on all */
3103 /* the pages that are dirty before */
3104 /* a UPL commit is done. The caller */
3105 /* is obligated to preserve the */
3106 /* contents of all pages marked */
3107 /* dirty. */
3108 upl->flags |= UPL_CLEAR_DIRTY;
3109 }
3110
3111 if(!dirty) {
3112 dst_page->dirty = FALSE;
3113 dst_page->precious = TRUE;
3114 }
3115
3116 if (dst_page->wire_count == 0) {
3117 /* deny access to the target page while */
3118 /* it is being worked on */
3119 dst_page->busy = TRUE;
3120 } else {
3121 vm_page_wire(dst_page);
3122 }
3123 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3124 /*
3125 * expect the page not to be used
3126 * since it's coming in as part
3127 * of a cluster and could be
3128 * speculative... pages that
3129 * are 'consumed' will get a
3130 * hardware reference
3131 */
3132 dst_page->reference = FALSE;
3133 } else {
3134 /*
3135 * expect the page to be used
3136 */
3137 dst_page->reference = TRUE;
3138 }
3139 dst_page->precious =
3140 (cntrl_flags & UPL_PRECIOUS)
3141 ? TRUE : FALSE;
3142
3143 if (dst_page->phys_page > upl->highest_page)
3144 upl->highest_page = dst_page->phys_page;
3145
3146 if(user_page_list) {
3147 user_page_list[entry].phys_addr
3148 = dst_page->phys_page;
3149 user_page_list[entry].dirty =
3150 dst_page->dirty;
3151 user_page_list[entry].pageout =
3152 dst_page->pageout;
3153 user_page_list[entry].absent =
3154 dst_page->absent;
3155 user_page_list[entry].precious =
3156 dst_page->precious;
3157 }
3158 vm_page_unlock_queues();
3159 }
3160 entry++;
3161 dst_offset += PAGE_SIZE_64;
3162 xfer_size -= PAGE_SIZE;
3163 }
3164 }
3165
3166 if (upl->flags & UPL_INTERNAL) {
3167 if(page_list_count != NULL)
3168 *page_list_count = 0;
3169 } else if (page_list_count != NULL &&
3170 *page_list_count > entry) {
3171 *page_list_count = entry;
3172 }
3173
3174 if(alias_page != NULL) {
3175 vm_page_lock_queues();
3176 vm_page_free(alias_page);
3177 vm_page_unlock_queues();
3178 }
3179
3180 if(do_m_lock) {
3181 vm_prot_t access_required;
3182 /* call back all associated pages from other users of the pager */
3183 /* all future updates will be on data which is based on the */
3184 /* changes we are going to make here. Note: it is assumed that */
3185 /* we already hold copies of the data so we will not be seeing */
3186 /* an avalanche of incoming data from the pager */
3187 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3188 ? VM_PROT_READ : VM_PROT_WRITE;
3189 while (TRUE) {
3190 kern_return_t rc;
3191
3192 if(!object->pager_ready) {
3193 wait_result_t wait_result;
3194
3195 wait_result = vm_object_sleep(object,
3196 VM_OBJECT_EVENT_PAGER_READY,
3197 THREAD_UNINT);
3198 if (wait_result != THREAD_AWAKENED) {
3199 vm_object_unlock(object);
3200 return KERN_FAILURE;
3201 }
3202 continue;
3203 }
3204
3205 vm_object_unlock(object);
3206 rc = memory_object_data_unlock(
3207 object->pager,
3208 dst_offset + object->paging_offset,
3209 size,
3210 access_required);
3211 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3212 return KERN_FAILURE;
3213 vm_object_lock(object);
3214
3215 if (rc == KERN_SUCCESS)
3216 break;
3217 }
3218
3219 /* let's wait on the last page requested */
3220 /* NOTE: we will have to update lock completed routine to signal */
3221 if(dst_page != VM_PAGE_NULL &&
3222 (access_required & dst_page->page_lock) != access_required) {
3223 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3224 vm_object_unlock(object);
3225 thread_block(THREAD_CONTINUE_NULL);
3226 return KERN_SUCCESS;
3227 }
3228 }
3229
3230 vm_object_unlock(object);
3231 return KERN_SUCCESS;
3232 }
3233
3234 /* JMM - Backward compatibility for now */
3235 kern_return_t
3236 vm_fault_list_request( /* forward */
3237 memory_object_control_t control,
3238 vm_object_offset_t offset,
3239 upl_size_t size,
3240 upl_t *upl_ptr,
3241 upl_page_info_t **user_page_list_ptr,
3242 int page_list_count,
3243 int cntrl_flags);
3244 kern_return_t
3245 vm_fault_list_request(
3246 memory_object_control_t control,
3247 vm_object_offset_t offset,
3248 upl_size_t size,
3249 upl_t *upl_ptr,
3250 upl_page_info_t **user_page_list_ptr,
3251 int page_list_count,
3252 int cntrl_flags)
3253 {
3254 unsigned int local_list_count;
3255 upl_page_info_t *user_page_list;
3256 kern_return_t kr;
3257
3258 if (user_page_list_ptr != NULL) {
3259 local_list_count = page_list_count;
3260 user_page_list = *user_page_list_ptr;
3261 } else {
3262 local_list_count = 0;
3263 user_page_list = NULL;
3264 }
3265 kr = memory_object_upl_request(control,
3266 offset,
3267 size,
3268 upl_ptr,
3269 user_page_list,
3270 &local_list_count,
3271 cntrl_flags);
3272
3273 if(kr != KERN_SUCCESS)
3274 return kr;
3275
3276 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3277 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3278 }
3279
3280 return KERN_SUCCESS;
3281 }
3282
3283
3284
3285 /*
3286 * Routine: vm_object_super_upl_request
3287 * Purpose:
3288 * Cause the population of a portion of a vm_object
3289 * in much the same way as memory_object_upl_request.
3290 * Depending on the nature of the request, the pages
3291 * returned may contain valid data or be uninitialized.
3292 * However, the region may be expanded up to the super
3293 * cluster size provided.
3294 */
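/*
 * Expansion sketch (assuming a 64K super_cluster and an object large
 * enough to hold it): a single-page pageout at offset 0x23000 has
 * base_offset = 0x23000 & ~0xffff = 0x20000; since 0x23000 + 0x1000
 * does not extend past 0x20000 + 0x10000, super_size stays 0x10000,
 * and the request passed to vm_object_upl_request() becomes the whole
 * 64K cluster starting at 0x20000 that surrounds the target page.
 */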
3295
3296 __private_extern__ kern_return_t
3297 vm_object_super_upl_request(
3298 vm_object_t object,
3299 vm_object_offset_t offset,
3300 upl_size_t size,
3301 upl_size_t super_cluster,
3302 upl_t *upl,
3303 upl_page_info_t *user_page_list,
3304 unsigned int *page_list_count,
3305 int cntrl_flags)
3306 {
3307 vm_page_t target_page;
3308 int ticket;
3309
3310
3311 if(object->paging_offset > offset)
3312 return KERN_FAILURE;
3313
3314 assert(object->paging_in_progress);
3315 offset = offset - object->paging_offset;
3316
3317 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3318
3319 vm_object_lock(object);
3320
3321 if((target_page = vm_page_lookup(object, offset))
3322 != VM_PAGE_NULL) {
3323 ticket = target_page->page_ticket;
3324 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3325 cntrl_flags = cntrl_flags |
3326 ((ticket << UPL_PAGE_TICKET_SHIFT)
3327 & UPL_PAGE_TICKET_MASK);
3328 }
3329 vm_object_unlock(object);
3330 }
3331
3332 if (super_cluster > size) {
3333
3334 vm_object_offset_t base_offset;
3335 upl_size_t super_size;
3336
3337 base_offset = (offset &
3338 ~((vm_object_offset_t) super_cluster - 1));
3339 super_size = (offset+size) > (base_offset + super_cluster) ?
3340 super_cluster<<1 : super_cluster;
3341 super_size = ((base_offset + super_size) > object->size) ?
3342 (object->size - base_offset) : super_size;
3343 if(offset > (base_offset + super_size))
3344 panic("vm_object_super_upl_request: Missed target pageout"
3345 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3346 offset, base_offset, super_size, super_cluster,
3347 size, object->paging_offset);
3348 /*
3349 * apparently there is a case where the vm requests a
3350 * page to be written out whose offset is beyond the
3351 * object size
3352 */
3353 if((offset + size) > (base_offset + super_size))
3354 super_size = (offset + size) - base_offset;
3355
3356 offset = base_offset;
3357 size = super_size;
3358 }
3359 return vm_object_upl_request(object, offset, size,
3360 upl, user_page_list, page_list_count,
3361 cntrl_flags);
3362 }
3363
3364
3365 kern_return_t
3366 vm_map_create_upl(
3367 vm_map_t map,
3368 vm_map_address_t offset,
3369 upl_size_t *upl_size,
3370 upl_t *upl,
3371 upl_page_info_array_t page_list,
3372 unsigned int *count,
3373 int *flags)
3374 {
3375 vm_map_entry_t entry;
3376 int caller_flags;
3377 int force_data_sync;
3378 int sync_cow_data;
3379 vm_object_t local_object;
3380 vm_map_offset_t local_offset;
3381 vm_map_offset_t local_start;
3382 kern_return_t ret;
3383
3384 caller_flags = *flags;
3385
3386 if (caller_flags & ~UPL_VALID_FLAGS) {
3387 /*
3388 * For forward compatibility's sake,
3389 * reject any unknown flag.
3390 */
3391 return KERN_INVALID_VALUE;
3392 }
3393
3394 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3395 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3396
3397 if(upl == NULL)
3398 return KERN_INVALID_ARGUMENT;
3399
3400
3401 REDISCOVER_ENTRY:
3402 vm_map_lock(map);
3403 if (vm_map_lookup_entry(map, offset, &entry)) {
3404 if (entry->object.vm_object == VM_OBJECT_NULL ||
3405 !entry->object.vm_object->phys_contiguous) {
3406 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3407 *upl_size = MAX_UPL_TRANSFER * page_size;
3408 }
3409 }
3410 if((entry->vme_end - offset) < *upl_size) {
3411 *upl_size = entry->vme_end - offset;
3412 }
3413 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3414 if (entry->object.vm_object == VM_OBJECT_NULL) {
3415 *flags = 0;
3416 } else if (entry->object.vm_object->private) {
3417 *flags = UPL_DEV_MEMORY;
3418 if (entry->object.vm_object->phys_contiguous) {
3419 *flags |= UPL_PHYS_CONTIG;
3420 }
3421 } else {
3422 *flags = 0;
3423 }
3424 vm_map_unlock(map);
3425 return KERN_SUCCESS;
3426 }
3427 /*
3428 * Create an object if necessary.
3429 */
3430 if (entry->object.vm_object == VM_OBJECT_NULL) {
3431 entry->object.vm_object = vm_object_allocate(
3432 (vm_size_t)(entry->vme_end - entry->vme_start));
3433 entry->offset = 0;
3434 }
3435 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3436 if (!(entry->protection & VM_PROT_WRITE)) {
3437 vm_map_unlock(map);
3438 return KERN_PROTECTION_FAILURE;
3439 }
3440 if (entry->needs_copy) {
3441 vm_map_t local_map;
3442 vm_object_t object;
3443 vm_map_offset_t offset_hi;
3444 vm_map_offset_t offset_lo;
3445 vm_object_offset_t new_offset;
3446 vm_prot_t prot;
3447 boolean_t wired;
3448 vm_behavior_t behavior;
3449 vm_map_version_t version;
3450 vm_map_t real_map;
3451
3452 local_map = map;
3453 vm_map_lock_write_to_read(map);
3454 if(vm_map_lookup_locked(&local_map,
3455 offset, VM_PROT_WRITE,
3456 &version, &object,
3457 &new_offset, &prot, &wired,
3458 &behavior, &offset_lo,
3459 &offset_hi, &real_map)) {
3460 vm_map_unlock(local_map);
3461 return KERN_FAILURE;
3462 }
3463 if (real_map != map) {
3464 vm_map_unlock(real_map);
3465 }
3466 vm_object_unlock(object);
3467 vm_map_unlock(local_map);
3468
3469 goto REDISCOVER_ENTRY;
3470 }
3471 }
3472 if (entry->is_sub_map) {
3473 vm_map_t submap;
3474
3475 submap = entry->object.sub_map;
3476 local_start = entry->vme_start;
3477 local_offset = entry->offset;
3478 vm_map_reference(submap);
3479 vm_map_unlock(map);
3480
3481 ret = (vm_map_create_upl(submap,
3482 local_offset + (offset - local_start),
3483 upl_size, upl, page_list, count,
3484 flags));
3485
3486 vm_map_deallocate(submap);
3487 return ret;
3488 }
3489
3490 if (sync_cow_data) {
3491 if (entry->object.vm_object->shadow
3492 || entry->object.vm_object->copy) {
3493
3494 local_object = entry->object.vm_object;
3495 local_start = entry->vme_start;
3496 local_offset = entry->offset;
3497 vm_object_reference(local_object);
3498 vm_map_unlock(map);
3499
3500 if (entry->object.vm_object->shadow &&
3501 entry->object.vm_object->copy) {
3502 vm_object_lock_request(
3503 local_object->shadow,
3504 (vm_object_offset_t)
3505 ((offset - local_start) +
3506 local_offset) +
3507 local_object->shadow_offset,
3508 *upl_size, FALSE,
3509 MEMORY_OBJECT_DATA_SYNC,
3510 VM_PROT_NO_CHANGE);
3511 }
3512 sync_cow_data = FALSE;
3513 vm_object_deallocate(local_object);
3514 goto REDISCOVER_ENTRY;
3515 }
3516 }
3517
3518 if (force_data_sync) {
3519
3520 local_object = entry->object.vm_object;
3521 local_start = entry->vme_start;
3522 local_offset = entry->offset;
3523 vm_object_reference(local_object);
3524 vm_map_unlock(map);
3525
3526 vm_object_lock_request(
3527 local_object,
3528 (vm_object_offset_t)
3529 ((offset - local_start) + local_offset),
3530 (vm_object_size_t)*upl_size, FALSE,
3531 MEMORY_OBJECT_DATA_SYNC,
3532 VM_PROT_NO_CHANGE);
3533 force_data_sync = FALSE;
3534 vm_object_deallocate(local_object);
3535 goto REDISCOVER_ENTRY;
3536 }
3537
3538 if(!(entry->object.vm_object->private)) {
3539 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3540 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3541 if(entry->object.vm_object->phys_contiguous) {
3542 *flags = UPL_PHYS_CONTIG;
3543 } else {
3544 *flags = 0;
3545 }
3546 } else {
3547 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3548 }
3549 local_object = entry->object.vm_object;
3550 local_offset = entry->offset;
3551 local_start = entry->vme_start;
3552 vm_object_reference(local_object);
3553 vm_map_unlock(map);
3554 if(caller_flags & UPL_SET_IO_WIRE) {
3555 ret = (vm_object_iopl_request(local_object,
3556 (vm_object_offset_t)
3557 ((offset - local_start)
3558 + local_offset),
3559 *upl_size,
3560 upl,
3561 page_list,
3562 count,
3563 caller_flags));
3564 } else {
3565 ret = (vm_object_upl_request(local_object,
3566 (vm_object_offset_t)
3567 ((offset - local_start)
3568 + local_offset),
3569 *upl_size,
3570 upl,
3571 page_list,
3572 count,
3573 caller_flags));
3574 }
3575 vm_object_deallocate(local_object);
3576 return(ret);
3577 }
3578
3579 vm_map_unlock(map);
3580 return(KERN_FAILURE);
3581
3582 }
3583
3584 /*
3585 * Internal routine to enter a UPL into a VM map.
3586 *
3587 * JMM - This should just be doable through the standard
3588 * vm_map_enter() API.
3589 */
3590 kern_return_t
3591 vm_map_enter_upl(
3592 vm_map_t map,
3593 upl_t upl,
3594 vm_map_offset_t *dst_addr)
3595 {
3596 vm_map_size_t size;
3597 vm_object_offset_t offset;
3598 vm_map_offset_t addr;
3599 vm_page_t m;
3600 kern_return_t kr;
3601
3602 if (upl == UPL_NULL)
3603 return KERN_INVALID_ARGUMENT;
3604
3605 upl_lock(upl);
3606
3607 /* check to see if already mapped */
3608 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3609 upl_unlock(upl);
3610 return KERN_FAILURE;
3611 }
3612
3613 if((!(upl->map_object->pageout)) &&
3614 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3615 (upl->map_object->phys_contiguous))) {
3616 vm_object_t object;
3617 vm_page_t alias_page;
3618 vm_object_offset_t new_offset;
3619 int pg_num;
3620 wpl_array_t lite_list;
3621
3622 if(upl->flags & UPL_INTERNAL) {
3623 lite_list = (wpl_array_t)
3624 ((((uintptr_t)upl) + sizeof(struct upl))
3625 + ((upl->size/PAGE_SIZE)
3626 * sizeof(upl_page_info_t)));
3627 } else {
3628 lite_list = (wpl_array_t)
3629 (((uintptr_t)upl) + sizeof(struct upl));
3630 }
3631 object = upl->map_object;
3632 upl->map_object = vm_object_allocate(upl->size);
3633 vm_object_lock(upl->map_object);
3634 upl->map_object->shadow = object;
3635 upl->map_object->pageout = TRUE;
3636 upl->map_object->can_persist = FALSE;
3637 upl->map_object->copy_strategy =
3638 MEMORY_OBJECT_COPY_NONE;
3639 upl->map_object->shadow_offset =
3640 upl->offset - object->paging_offset;
3641 upl->map_object->wimg_bits = object->wimg_bits;
3642 offset = upl->map_object->shadow_offset;
3643 new_offset = 0;
3644 size = upl->size;
3645
3646 vm_object_lock(object);
3647
3648 while(size) {
3649 pg_num = (new_offset)/PAGE_SIZE;
3650 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3651 vm_object_unlock(object);
3652 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3653 vm_object_lock(object);
3654 m = vm_page_lookup(object, offset);
3655 if (m == VM_PAGE_NULL) {
3656 panic("vm_upl_map: page missing\n");
3657 }
3658
3659 vm_object_paging_begin(object);
3660
3661 /*
3662 * Convert the fictitious page to a private
3663 * shadow of the real page.
3664 */
3665 assert(alias_page->fictitious);
3666 alias_page->fictitious = FALSE;
3667 alias_page->private = TRUE;
3668 alias_page->pageout = TRUE;
3669 alias_page->phys_page = m->phys_page;
3670
3671 vm_page_lock_queues();
3672 vm_page_wire(alias_page);
3673 vm_page_unlock_queues();
3674
3675 /*
3676 * ENCRYPTED SWAP:
3677 * The virtual page ("m") has to be wired in some way
3678 * here or its physical page ("m->phys_page") could
3679 * be recycled at any time.
3680 * Assuming this is enforced by the caller, we can't
3681 * get an encrypted page here. Since the encryption
3682 * key depends on the VM page's "pager" object and
3683 * the "paging_offset", we couldn't handle 2 pageable
3684 * VM pages (with different pagers and paging_offsets)
3685 * sharing the same physical page: we could end up
3686 * encrypting with one key (via one VM page) and
3687 * decrypting with another key (via the alias VM page).
3688 */
3689 ASSERT_PAGE_DECRYPTED(m);
3690
3691 vm_page_insert(alias_page,
3692 upl->map_object, new_offset);
3693 assert(!alias_page->wanted);
3694 alias_page->busy = FALSE;
3695 alias_page->absent = FALSE;
3696 }
3697
3698 size -= PAGE_SIZE;
3699 offset += PAGE_SIZE_64;
3700 new_offset += PAGE_SIZE_64;
3701 }
3702 vm_object_unlock(object);
3703 vm_object_unlock(upl->map_object);
3704 }
3705 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3706 offset = upl->offset - upl->map_object->paging_offset;
3707 else
3708 offset = 0;
3709
3710 size = upl->size;
3711
3712 vm_object_lock(upl->map_object);
3713 upl->map_object->ref_count++;
3714 vm_object_res_reference(upl->map_object);
3715 vm_object_unlock(upl->map_object);
3716
3717 *dst_addr = 0;
3718
3719
3720 /* NEED A UPL_MAP ALIAS */
3721 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3722 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3723 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3724
3725 if (kr != KERN_SUCCESS) {
3726 upl_unlock(upl);
3727 return(kr);
3728 }
3729
3730 vm_object_lock(upl->map_object);
3731
3732 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3733 m = vm_page_lookup(upl->map_object, offset);
3734 if(m) {
3735 unsigned int cache_attr;
3736 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3737
3738 PMAP_ENTER(map->pmap, addr,
3739 m, VM_PROT_ALL,
3740 cache_attr, TRUE);
3741 }
3742 offset+=PAGE_SIZE_64;
3743 }
3744 vm_object_unlock(upl->map_object);
3745
3746 upl->ref_count++; /* hold a reference for the mapping */
3747 upl->flags |= UPL_PAGE_LIST_MAPPED;
3748 upl->kaddr = *dst_addr;
3749 upl_unlock(upl);
3750 return KERN_SUCCESS;
3751 }
3752
3753 /*
3754 * Internal routine to remove a UPL mapping from a VM map.
3755 *
3756 * XXX - This should just be doable through a standard
3757 * vm_map_remove() operation. Otherwise, implicit clean-up
3758 * of the target map won't be able to correctly remove
3759 * these (and release the reference on the UPL). Having
3760 * to do this means we can't map these into user-space
3761 * maps yet.
3762 */
3763 kern_return_t
3764 vm_map_remove_upl(
3765 vm_map_t map,
3766 upl_t upl)
3767 {
3768 vm_address_t addr;
3769 upl_size_t size;
3770
3771 if (upl == UPL_NULL)
3772 return KERN_INVALID_ARGUMENT;
3773
3774 upl_lock(upl);
3775 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3776 addr = upl->kaddr;
3777 size = upl->size;
3778 assert(upl->ref_count > 1);
3779 upl->ref_count--; /* removing mapping ref */
3780 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3781 upl->kaddr = (vm_offset_t) 0;
3782 upl_unlock(upl);
3783
3784 vm_map_remove( map,
3785 vm_map_trunc_page(addr),
3786 vm_map_round_page(addr + size),
3787 VM_MAP_NO_FLAGS);
3788 return KERN_SUCCESS;
3789 }
3790 upl_unlock(upl);
3791 return KERN_FAILURE;
3792 }
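/*
 * A sketch of the expected pairing (illustrative only): a kernel
 * client that needs a virtual mapping of a UPL's pages does
 *
 *	vm_map_offset_t	addr;
 *
 *	kr = vm_map_enter_upl(kernel_map, upl, &addr);
 *	... touch the pages through addr ...
 *	kr = vm_map_remove_upl(kernel_map, upl);
 *
 * vm_map_enter_upl() takes an extra reference on the upl for the
 * mapping and vm_map_remove_upl() drops it, so the upl itself must
 * still be committed/aborted and deallocated by its creator.
 */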
3793
3794 kern_return_t
3795 upl_commit_range(
3796 upl_t upl,
3797 upl_offset_t offset,
3798 upl_size_t size,
3799 int flags,
3800 upl_page_info_t *page_list,
3801 mach_msg_type_number_t count,
3802 boolean_t *empty)
3803 {
3804 upl_size_t xfer_size = size;
3805 vm_object_t shadow_object;
3806 vm_object_t object = upl->map_object;
3807 vm_object_offset_t target_offset;
3808 int entry;
3809 wpl_array_t lite_list;
3810 int occupied;
3811 int delayed_unlock = 0;
3812 int clear_refmod = 0;
3813 boolean_t shadow_internal;
3814
3815 *empty = FALSE;
3816
3817 if (upl == UPL_NULL)
3818 return KERN_INVALID_ARGUMENT;
3819
3820
3821 if (count == 0)
3822 page_list = NULL;
3823
3824 if (object->pageout) {
3825 shadow_object = object->shadow;
3826 } else {
3827 shadow_object = object;
3828 }
3829
3830 upl_lock(upl);
3831
3832 if (upl->flags & UPL_ACCESS_BLOCKED) {
3833 /*
3834 * We used this UPL to block access to the pages by marking
3835 * them "busy". Now we need to clear the "busy" bit to allow
3836 * access to these pages again.
3837 */
3838 flags |= UPL_COMMIT_ALLOW_ACCESS;
3839 }
3840
3841 if (upl->flags & UPL_CLEAR_DIRTY)
3842 flags |= UPL_COMMIT_CLEAR_DIRTY;
3843
3844 if (upl->flags & UPL_DEVICE_MEMORY) {
3845 xfer_size = 0;
3846 } else if ((offset + size) > upl->size) {
3847 upl_unlock(upl);
3848 return KERN_FAILURE;
3849 }
3850
3851 if (upl->flags & UPL_INTERNAL) {
3852 lite_list = (wpl_array_t)
3853 ((((uintptr_t)upl) + sizeof(struct upl))
3854 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3855 } else {
3856 lite_list = (wpl_array_t)
3857 (((uintptr_t)upl) + sizeof(struct upl));
3858 }
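/*
 * The lite bitmap location mirrors the layout set up in upl_create():
 * for an INTERNAL upl it sits past both the upl structure and the
 * upl_page_info array; otherwise it immediately follows the upl
 * structure.
 */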
3859 if (object != shadow_object)
3860 vm_object_lock(object);
3861 vm_object_lock(shadow_object);
3862
3863 shadow_internal = shadow_object->internal;
3864
3865 entry = offset/PAGE_SIZE;
3866 target_offset = (vm_object_offset_t)offset;
3867
3868 while (xfer_size) {
3869 vm_page_t t,m;
3870 upl_page_info_t *p;
3871
3872 m = VM_PAGE_NULL;
3873
3874 if (upl->flags & UPL_LITE) {
3875 int pg_num;
3876
3877 pg_num = target_offset/PAGE_SIZE;
3878
3879 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3880 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3881 m = vm_page_lookup(shadow_object,
3882 target_offset + (upl->offset -
3883 shadow_object->paging_offset));
3884 }
3885 }
3886 if (object->pageout) {
3887 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3888 t->pageout = FALSE;
3889
3890 if (delayed_unlock) {
3891 delayed_unlock = 0;
3892 vm_page_unlock_queues();
3893 }
3894 VM_PAGE_FREE(t);
3895
3896 if (m == NULL) {
3897 m = vm_page_lookup(
3898 shadow_object,
3899 target_offset +
3900 object->shadow_offset);
3901 }
3902 if (m != VM_PAGE_NULL)
3903 vm_object_paging_end(m->object);
3904 }
3905 }
3906 if (m != VM_PAGE_NULL) {
3907
3908 clear_refmod = 0;
3909
3910 if (upl->flags & UPL_IO_WIRE) {
3911
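/*
 * "delayed_unlock" batches work under the page queues lock:
 * the lock is taken once, held for up to DELAYED_UNLOCK_LIMIT
 * pages, then dropped and re-taken, to bound the hold time.
 */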
3912 if (delayed_unlock == 0)
3913 vm_page_lock_queues();
3914
3915 vm_page_unwire(m);
3916
3917 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3918 delayed_unlock = 0;
3919 vm_page_unlock_queues();
3920 }
3921 if (page_list) {
3922 page_list[entry].phys_addr = 0;
3923 }
3924 if (flags & UPL_COMMIT_SET_DIRTY) {
3925 m->dirty = TRUE;
3926 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3927 m->dirty = FALSE;
3928 clear_refmod |= VM_MEM_MODIFIED;
3929 }
3930 if (flags & UPL_COMMIT_INACTIVATE) {
3931 m->reference = FALSE;
3932 clear_refmod |= VM_MEM_REFERENCED;
3933 vm_page_deactivate(m);
3934 }
3935 if (clear_refmod)
3936 pmap_clear_refmod(m->phys_page, clear_refmod);
3937
3938 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3939 /*
3940 * We blocked access to the pages in this UPL.
3941 * Clear the "busy" bit and wake up any waiter
3942 * for this page.
3943 */
3944 PAGE_WAKEUP_DONE(m);
3945 }
3946
3947 target_offset += PAGE_SIZE_64;
3948 xfer_size -= PAGE_SIZE;
3949 entry++;
3950 continue;
3951 }
3952 if (delayed_unlock == 0)
3953 vm_page_lock_queues();
3954 /*
3955 * make sure to clear the hardware
3956 * modify or reference bits before
3957 * releasing the BUSY bit on this page
3958 * otherwise we risk losing a legitimate
3959 * change of state
3960 */
3961 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3962 m->dirty = FALSE;
3963 clear_refmod |= VM_MEM_MODIFIED;
3964 }
3965 if (flags & UPL_COMMIT_INACTIVATE)
3966 clear_refmod |= VM_MEM_REFERENCED;
3967
3968 if (clear_refmod)
3969 pmap_clear_refmod(m->phys_page, clear_refmod);
3970
3971 if (page_list) {
3972 p = &(page_list[entry]);
3973 if(p->phys_addr && p->pageout && !m->pageout) {
3974 m->busy = TRUE;
3975 m->pageout = TRUE;
3976 vm_page_wire(m);
3977 } else if (page_list[entry].phys_addr &&
3978 !p->pageout && m->pageout &&
3979 !m->dump_cleaning) {
3980 m->pageout = FALSE;
3981 m->absent = FALSE;
3982 m->overwriting = FALSE;
3983 vm_page_unwire(m);
3984 PAGE_WAKEUP_DONE(m);
3985 }
3986 page_list[entry].phys_addr = 0;
3987 }
3988 m->dump_cleaning = FALSE;
3989 if(m->laundry) {
3990 vm_pageout_throttle_up(m);
3991 }
3992 if(m->pageout) {
3993 m->cleaning = FALSE;
3994 m->pageout = FALSE;
3995 #if MACH_CLUSTER_STATS
3996 if (m->wanted) vm_pageout_target_collisions++;
3997 #endif
3998 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3999 m->dirty = TRUE;
4000 else
4001 m->dirty = FALSE;
4002
4003 if(m->dirty) {
4004 vm_page_unwire(m);/* reactivates */
4005
4006 if (upl->flags & UPL_PAGEOUT) {
4007 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4008 VM_STAT(reactivations++);
4009 }
4010 PAGE_WAKEUP_DONE(m);
4011 } else {
4012 vm_page_free(m);/* clears busy, etc. */
4013
4014 if (upl->flags & UPL_PAGEOUT) {
4015 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4016
4017 if (page_list[entry].dirty)
4018 VM_STAT(pageouts++);
4019 }
4020 }
4021 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4022 delayed_unlock = 0;
4023 vm_page_unlock_queues();
4024 }
4025 target_offset += PAGE_SIZE_64;
4026 xfer_size -= PAGE_SIZE;
4027 entry++;
4028 continue;
4029 }
4030 #if MACH_CLUSTER_STATS
4031 m->dirty = pmap_is_modified(m->phys_page);
4032
4033 if (m->dirty) vm_pageout_cluster_dirtied++;
4034 else vm_pageout_cluster_cleaned++;
4035 if (m->wanted) vm_pageout_cluster_collisions++;
4036 #else
4037 m->dirty = 0;
4038 #endif
4039
4040 if((m->busy) && (m->cleaning)) {
4041 /* the request_page_list case */
4042 if(m->absent) {
4043 m->absent = FALSE;
4044 if(shadow_object->absent_count == 1)
4045 vm_object_absent_release(shadow_object);
4046 else
4047 shadow_object->absent_count--;
4048 }
4049 m->overwriting = FALSE;
4050 m->busy = FALSE;
4051 m->dirty = FALSE;
4052 } else if (m->overwriting) {
4053 /* alternate request page list, write to
4054 * page_list case. Occurs when the original
4055 * page was wired at the time of the list
4056 * request */
4057 assert(m->wire_count != 0);
4058 vm_page_unwire(m);/* reactivates */
4059 m->overwriting = FALSE;
4060 }
4061 m->cleaning = FALSE;
4062
4063 /* It is a part of the semantic of COPYOUT_FROM */
4064 /* UPLs that a commit implies cache sync */
4065 /* between the vm page and the backing store; */
4066 /* this can be used to strip the precious bit */
4067 /* as well as to clean */
4068 if (upl->flags & UPL_PAGE_SYNC_DONE)
4069 m->precious = FALSE;
4070
4071 if (flags & UPL_COMMIT_SET_DIRTY)
4072 m->dirty = TRUE;
4073
4074 if (flags & UPL_COMMIT_INACTIVATE) {
4075 m->reference = FALSE;
4076 vm_page_deactivate(m);
4077 } else if (!m->active && !m->inactive) {
4078 if (m->reference)
4079 vm_page_activate(m);
4080 else
4081 vm_page_deactivate(m);
4082 }
4083
4084 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4085 /*
4086 * We blocked access to the pages in this UPL.
4087 * Clear the "busy" bit on this page before we
4088 * wake up any waiter.
4089 */
4090 m->busy = FALSE;
4091 }
4092
4093 /*
4094 * Wakeup any thread waiting for the page to be un-cleaning.
4095 */
4096 PAGE_WAKEUP(m);
4097
4098 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4099 delayed_unlock = 0;
4100 vm_page_unlock_queues();
4101 }
4102 }
4103 target_offset += PAGE_SIZE_64;
4104 xfer_size -= PAGE_SIZE;
4105 entry++;
4106 }
4107 if (delayed_unlock)
4108 vm_page_unlock_queues();
4109
4110 occupied = 1;
4111
4112 if (upl->flags & UPL_DEVICE_MEMORY) {
4113 occupied = 0;
4114 } else if (upl->flags & UPL_LITE) {
4115 int pg_num;
4116 int i;
4117 pg_num = upl->size/PAGE_SIZE;
4118 pg_num = (pg_num + 31) >> 5;
4119 occupied = 0;
4120 for(i= 0; i<pg_num; i++) {
4121 if(lite_list[i] != 0) {
4122 occupied = 1;
4123 break;
4124 }
4125 }
4126 } else {
4127 if(queue_empty(&upl->map_object->memq)) {
4128 occupied = 0;
4129 }
4130 }
4131
4132 if(occupied == 0) {
4133 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4134 *empty = TRUE;
4135 }
4136 if(object == shadow_object)
4137 vm_object_paging_end(shadow_object);
4138 }
4139 vm_object_unlock(shadow_object);
4140 if (object != shadow_object)
4141 vm_object_unlock(object);
4142 upl_unlock(upl);
4143
4144 return KERN_SUCCESS;
4145 }
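
/*
 * A minimal usage sketch (hypothetical caller, illustration only,
 * kept under #if 0): once the I/O covered by a UPL has completed,
 * commit the transferred range in a single call, clearing the dirty
 * state and letting the pages be aged out.  "io_done_commit" and its
 * parameters are made-up names.
 */
#if 0
static kern_return_t
io_done_commit(
	upl_t			upl,
	upl_size_t		io_size,
	upl_page_info_t		*pl,
	mach_msg_type_number_t	pl_count)
{
	boolean_t	empty;

	return upl_commit_range(upl,
				(upl_offset_t) 0,	/* start of the UPL */
				io_size,		/* bytes transferred */
				UPL_COMMIT_CLEAR_DIRTY | UPL_COMMIT_INACTIVATE,
				pl, pl_count, &empty);
}
#endif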
4146
4147 kern_return_t
4148 upl_abort_range(
4149 upl_t upl,
4150 upl_offset_t offset,
4151 upl_size_t size,
4152 int error,
4153 boolean_t *empty)
4154 {
4155 upl_size_t xfer_size = size;
4156 vm_object_t shadow_object;
4157 vm_object_t object = upl->map_object;
4158 vm_object_offset_t target_offset;
4159 int entry;
4160 wpl_array_t lite_list;
4161 int occupied;
4162 boolean_t shadow_internal;
4163
4164 *empty = FALSE;
4165
4166 if (upl == UPL_NULL)
4167 return KERN_INVALID_ARGUMENT;
4168
4169 if (upl->flags & UPL_IO_WIRE) {
4170 return upl_commit_range(upl,
4171 offset, size, 0,
4172 NULL, 0, empty);
4173 }
4174
4175 if(object->pageout) {
4176 shadow_object = object->shadow;
4177 } else {
4178 shadow_object = object;
4179 }
4180
4181 upl_lock(upl);
4182 if(upl->flags & UPL_DEVICE_MEMORY) {
4183 xfer_size = 0;
4184 } else if ((offset + size) > upl->size) {
4185 upl_unlock(upl);
4186 return KERN_FAILURE;
4187 }
4188 if (object != shadow_object)
4189 vm_object_lock(object);
4190 vm_object_lock(shadow_object);
4191
4192 shadow_internal = shadow_object->internal;
4193
4194 if(upl->flags & UPL_INTERNAL) {
4195 lite_list = (wpl_array_t)
4196 ((((uintptr_t)upl) + sizeof(struct upl))
4197 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4198 } else {
4199 lite_list = (wpl_array_t)
4200 (((uintptr_t)upl) + sizeof(struct upl));
4201 }
4202
4203 entry = offset/PAGE_SIZE;
4204 target_offset = (vm_object_offset_t)offset;
4205 while(xfer_size) {
4206 vm_page_t t,m;
4207
4208 m = VM_PAGE_NULL;
4209 if(upl->flags & UPL_LITE) {
4210 int pg_num;
4211 pg_num = target_offset/PAGE_SIZE;
4212 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4213 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4214 m = vm_page_lookup(shadow_object,
4215 target_offset + (upl->offset -
4216 shadow_object->paging_offset));
4217 }
4218 }
4219 if(object->pageout) {
4220 if ((t = vm_page_lookup(object, target_offset))
4221 != NULL) {
4222 t->pageout = FALSE;
4223 VM_PAGE_FREE(t);
4224 if(m == NULL) {
4225 m = vm_page_lookup(
4226 shadow_object,
4227 target_offset +
4228 object->shadow_offset);
4229 }
4230 if(m != VM_PAGE_NULL)
4231 vm_object_paging_end(m->object);
4232 }
4233 }
4234 if(m != VM_PAGE_NULL) {
4235 vm_page_lock_queues();
4236 if(m->absent) {
4237 boolean_t must_free = TRUE;
4238
4239 /* COPYOUT = FALSE case */
4240 /* check for error conditions which must */
4241 /* be passed back to the page's customer */
4242 if(error & UPL_ABORT_RESTART) {
4243 m->restart = TRUE;
4244 m->absent = FALSE;
4245 vm_object_absent_release(m->object);
4246 m->page_error = KERN_MEMORY_ERROR;
4247 m->error = TRUE;
4248 must_free = FALSE;
4249 } else if(error & UPL_ABORT_UNAVAILABLE) {
4250 m->restart = FALSE;
4251 m->unusual = TRUE;
4252 must_free = FALSE;
4253 } else if(error & UPL_ABORT_ERROR) {
4254 m->restart = FALSE;
4255 m->absent = FALSE;
4256 vm_object_absent_release(m->object);
4257 m->page_error = KERN_MEMORY_ERROR;
4258 m->error = TRUE;
4259 must_free = FALSE;
4260 }
4261
4262 /*
4263 * ENCRYPTED SWAP:
4264 * If the page was already encrypted,
4265 * we don't really need to decrypt it
4266 * now. It will get decrypted later,
4267 * on demand, as soon as someone needs
4268 * to access its contents.
4269 */
4270
4271 m->cleaning = FALSE;
4272 m->overwriting = FALSE;
4273 PAGE_WAKEUP_DONE(m);
4274
4275 if (must_free == TRUE) {
4276 vm_page_free(m);
4277 } else {
4278 vm_page_activate(m);
4279 }
4280 vm_page_unlock_queues();
4281
4282 target_offset += PAGE_SIZE_64;
4283 xfer_size -= PAGE_SIZE;
4284 entry++;
4285 continue;
4286 }
4287 /*
4288 * Handle the trusted pager throttle.
4289 */
4290 if (m->laundry) {
4291 vm_pageout_throttle_up(m);
4292 }
4293 if(m->pageout) {
4294 assert(m->busy);
4295 assert(m->wire_count == 1);
4296 m->pageout = FALSE;
4297 vm_page_unwire(m);
4298 }
4299 m->dump_cleaning = FALSE;
4300 m->cleaning = FALSE;
4301 m->overwriting = FALSE;
4302 #if MACH_PAGEMAP
4303 vm_external_state_clr(
4304 m->object->existence_map, m->offset);
4305 #endif /* MACH_PAGEMAP */
4306 if(error & UPL_ABORT_DUMP_PAGES) {
4307 vm_page_free(m);
4308 pmap_disconnect(m->phys_page);
4309 } else {
4310 PAGE_WAKEUP_DONE(m);
4311 }
4312 vm_page_unlock_queues();
4313 }
4314 target_offset += PAGE_SIZE_64;
4315 xfer_size -= PAGE_SIZE;
4316 entry++;
4317 }
4318 occupied = 1;
4319 if (upl->flags & UPL_DEVICE_MEMORY) {
4320 occupied = 0;
4321 } else if (upl->flags & UPL_LITE) {
4322 int pg_num;
4323 int i;
4324 pg_num = upl->size/PAGE_SIZE;
4325 pg_num = (pg_num + 31) >> 5;
4326 occupied = 0;
4327 for(i= 0; i<pg_num; i++) {
4328 if(lite_list[i] != 0) {
4329 occupied = 1;
4330 break;
4331 }
4332 }
4333 } else {
4334 if(queue_empty(&upl->map_object->memq)) {
4335 occupied = 0;
4336 }
4337 }
4338
4339 if(occupied == 0) {
4340 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4341 *empty = TRUE;
4342 }
4343 if(object == shadow_object)
4344 vm_object_paging_end(shadow_object);
4345 }
4346 vm_object_unlock(shadow_object);
4347 if (object != shadow_object)
4348 vm_object_unlock(object);
4349
4350 upl_unlock(upl);
4351
4352 return KERN_SUCCESS;
4353 }
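
/*
 * A minimal abort sketch (hypothetical caller, illustration only,
 * kept under #if 0): if the backing I/O fails part way through,
 * abort the untransferred tail of the UPL, flagging absent pages in
 * error and dumping resident ones so a later fault can re-fetch them.
 * "io_failed_abort" and its parameters are made-up names.
 */
#if 0
static void
io_failed_abort(
	upl_t		upl,
	upl_offset_t	bytes_done,
	upl_size_t	total_size)
{
	boolean_t	empty;

	(void) upl_abort_range(upl, bytes_done, total_size - bytes_done,
			       UPL_ABORT_ERROR | UPL_ABORT_DUMP_PAGES,
			       &empty);
}
#endif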
4354
4355 kern_return_t
4356 upl_abort(
4357 upl_t upl,
4358 int error)
4359 {
4360 vm_object_t object = NULL;
4361 vm_object_t shadow_object = NULL;
4362 vm_object_offset_t offset;
4363 vm_object_offset_t shadow_offset;
4364 vm_object_offset_t target_offset;
4365 upl_size_t i;
4366 wpl_array_t lite_list;
4367 vm_page_t t,m;
4368 int occupied;
4369 boolean_t shadow_internal;
4370
4371 if (upl == UPL_NULL)
4372 return KERN_INVALID_ARGUMENT;
4373
4374 if (upl->flags & UPL_IO_WIRE) {
4375 boolean_t empty;
4376 return upl_commit_range(upl,
4377 0, upl->size, 0,
4378 NULL, 0, &empty);
4379 }
4380
4381 upl_lock(upl);
4382 if(upl->flags & UPL_DEVICE_MEMORY) {
4383 upl_unlock(upl);
4384 return KERN_SUCCESS;
4385 }
4386
4387 object = upl->map_object;
4388
4389 if (object == NULL) {
4390 panic("upl_abort: upl object is not backed by an object");
4391 upl_unlock(upl);
4392 return KERN_INVALID_ARGUMENT;
4393 }
4394
4395 if(object->pageout) {
4396 shadow_object = object->shadow;
4397 shadow_offset = object->shadow_offset;
4398 } else {
4399 shadow_object = object;
4400 shadow_offset = upl->offset - object->paging_offset;
4401 }
4402
4403 if(upl->flags & UPL_INTERNAL) {
4404 lite_list = (wpl_array_t)
4405 ((((uintptr_t)upl) + sizeof(struct upl))
4406 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4407 } else {
4408 lite_list = (wpl_array_t)
4409 (((uintptr_t)upl) + sizeof(struct upl));
4410 }
4411 offset = 0;
4412
4413 if (object != shadow_object)
4414 vm_object_lock(object);
4415 vm_object_lock(shadow_object);
4416
4417 shadow_internal = shadow_object->internal;
4418
4419 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4420 m = VM_PAGE_NULL;
4421 target_offset = offset + shadow_offset;
4422 if(upl->flags & UPL_LITE) {
4423 int pg_num;
4424 pg_num = offset/PAGE_SIZE;
4425 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4426 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4427 m = vm_page_lookup(
4428 shadow_object, target_offset);
4429 }
4430 }
4431 if(object->pageout) {
4432 if ((t = vm_page_lookup(object, offset)) != NULL) {
4433 t->pageout = FALSE;
4434 VM_PAGE_FREE(t);
4435 if(m == NULL) {
4436 m = vm_page_lookup(
4437 shadow_object, target_offset);
4438 }
4439 if(m != VM_PAGE_NULL)
4440 vm_object_paging_end(m->object);
4441 }
4442 }
4443 if(m != VM_PAGE_NULL) {
4444 vm_page_lock_queues();
4445 if(m->absent) {
4446 boolean_t must_free = TRUE;
4447
4448 /* COPYOUT = FALSE case */
4449 /* check for error conditions which must */
4450 /* be passed back to the page's customer */
4451 if(error & UPL_ABORT_RESTART) {
4452 m->restart = TRUE;
4453 m->absent = FALSE;
4454 vm_object_absent_release(m->object);
4455 m->page_error = KERN_MEMORY_ERROR;
4456 m->error = TRUE;
4457 must_free = FALSE;
4458 } else if(error & UPL_ABORT_UNAVAILABLE) {
4459 m->restart = FALSE;
4460 m->unusual = TRUE;
4461 must_free = FALSE;
4462 } else if(error & UPL_ABORT_ERROR) {
4463 m->restart = FALSE;
4464 m->absent = FALSE;
4465 vm_object_absent_release(m->object);
4466 m->page_error = KERN_MEMORY_ERROR;
4467 m->error = TRUE;
4468 must_free = FALSE;
4469 }
4470
4471 /*
4472 * ENCRYPTED SWAP:
4473 * If the page was already encrypted,
4474 * we don't really need to decrypt it
4475 * now. It will get decrypted later,
4476 * on demand, as soon as someone needs
4477 * to access its contents.
4478 */
4479
4480 m->cleaning = FALSE;
4481 m->overwriting = FALSE;
4482 PAGE_WAKEUP_DONE(m);
4483
4484 if (must_free == TRUE) {
4485 vm_page_free(m);
4486 } else {
4487 vm_page_activate(m);
4488 }
4489 vm_page_unlock_queues();
4490 continue;
4491 }
4492 /*
4493 * Handle the trusted pager throttle.
4494 */
4495 if (m->laundry) {
4496 vm_pageout_throttle_up(m);
4497 }
4498 if(m->pageout) {
4499 assert(m->busy);
4500 assert(m->wire_count == 1);
4501 m->pageout = FALSE;
4502 vm_page_unwire(m);
4503 }
4504 m->dump_cleaning = FALSE;
4505 m->cleaning = FALSE;
4506 m->overwriting = FALSE;
4507 #if MACH_PAGEMAP
4508 vm_external_state_clr(
4509 m->object->existence_map, m->offset);
4510 #endif /* MACH_PAGEMAP */
4511 if(error & UPL_ABORT_DUMP_PAGES) {
4512 vm_page_free(m);
4513 pmap_disconnect(m->phys_page);
4514 } else {
4515 PAGE_WAKEUP_DONE(m);
4516 }
4517 vm_page_unlock_queues();
4518 }
4519 }
4520 occupied = 1;
4521 if (upl->flags & UPL_DEVICE_MEMORY) {
4522 occupied = 0;
4523 } else if (upl->flags & UPL_LITE) {
4524 int pg_num;
4525 int j;
4526 pg_num = upl->size/PAGE_SIZE;
4527 pg_num = (pg_num + 31) >> 5;
4528 occupied = 0;
4529 for(j= 0; j<pg_num; j++) {
4530 if(lite_list[j] != 0) {
4531 occupied = 1;
4532 break;
4533 }
4534 }
4535 } else {
4536 if(queue_empty(&upl->map_object->memq)) {
4537 occupied = 0;
4538 }
4539 }
4540
4541 if(occupied == 0) {
4542 if(object == shadow_object)
4543 vm_object_paging_end(shadow_object);
4544 }
4545 vm_object_unlock(shadow_object);
4546 if (object != shadow_object)
4547 vm_object_unlock(object);
4548
4549 upl_unlock(upl);
4550 return KERN_SUCCESS;
4551 }
4552
4553 /* an option on commit should be wire */
4554 kern_return_t
4555 upl_commit(
4556 upl_t upl,
4557 upl_page_info_t *page_list,
4558 mach_msg_type_number_t count)
4559 {
4560 if (upl == UPL_NULL)
4561 return KERN_INVALID_ARGUMENT;
4562
4563 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4564 boolean_t empty;
4565 return upl_commit_range(upl, 0, upl->size, 0,
4566 page_list, count, &empty);
4567 }
4568
4569 if (count == 0)
4570 page_list = NULL;
4571
4572 upl_lock(upl);
4573 if (upl->flags & UPL_DEVICE_MEMORY)
4574 page_list = NULL;
4575
4576 if (upl->flags & UPL_ENCRYPTED) {
4577 /*
4578 * ENCRYPTED SWAP:
4579 * This UPL was encrypted, but we don't need
4580 * to decrypt here. We'll decrypt each page
4581 * later, on demand, as soon as someone needs
4582 * to access the page's contents.
4583 */
4584 }
4585
4586 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4587 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4588 vm_object_t shadow_object = upl->map_object->shadow;
4589 vm_object_t object = upl->map_object;
4590 vm_object_offset_t target_offset;
4591 upl_size_t xfer_end;
4592 int entry;
4593
4594 vm_page_t t, m;
4595 upl_page_info_t *p;
4596
4597 if (object != shadow_object)
4598 vm_object_lock(object);
4599 vm_object_lock(shadow_object);
4600
4601 entry = 0;
4602 target_offset = object->shadow_offset;
4603 xfer_end = upl->size + object->shadow_offset;
4604
4605 while(target_offset < xfer_end) {
4606
4607 if ((t = vm_page_lookup(object,
4608 target_offset - object->shadow_offset))
4609 == NULL) {
4610 target_offset += PAGE_SIZE_64;
4611 entry++;
4612 continue;
4613 }
4614
4615 m = vm_page_lookup(shadow_object, target_offset);
4616 if(m != VM_PAGE_NULL) {
4617 /*
4618 * ENCRYPTED SWAP:
4619 * If this page was encrypted, we
4620 * don't need to decrypt it here.
4621 * We'll decrypt it later, on demand,
4622 * as soon as someone needs to access
4623 * its contents.
4624 */
4625
4626 if (upl->flags & UPL_CLEAR_DIRTY) {
4627 pmap_clear_modify(m->phys_page);
4628 m->dirty = FALSE;
4629 }
4630 /* It is a part of the semantic of */
4631 /* COPYOUT_FROM UPLs that a commit */
4632 /* implies cache sync between the */
4633 /* vm page and the backing store; */
4634 /* this can be used to strip the */
4635 /* precious bit as well as clean */
4636 if (upl->flags & UPL_PAGE_SYNC_DONE)
4637 m->precious = FALSE;
4638
4639 if(page_list) {
4640 p = &(page_list[entry]);
4641 if(page_list[entry].phys_addr &&
4642 p->pageout && !m->pageout) {
4643 vm_page_lock_queues();
4644 m->busy = TRUE;
4645 m->pageout = TRUE;
4646 vm_page_wire(m);
4647 vm_page_unlock_queues();
4648 } else if (page_list[entry].phys_addr &&
4649 !p->pageout && m->pageout &&
4650 !m->dump_cleaning) {
4651 vm_page_lock_queues();
4652 m->pageout = FALSE;
4653 m->absent = FALSE;
4654 m->overwriting = FALSE;
4655 vm_page_unwire(m);
4656 PAGE_WAKEUP_DONE(m);
4657 vm_page_unlock_queues();
4658 }
4659 page_list[entry].phys_addr = 0;
4660 }
4661 }
4662 target_offset += PAGE_SIZE_64;
4663 entry++;
4664 }
4665 vm_object_unlock(shadow_object);
4666 if (object != shadow_object)
4667 vm_object_unlock(object);
4668
4669 }
4670 if (upl->flags & UPL_DEVICE_MEMORY) {
4671 vm_object_lock(upl->map_object->shadow);
4672 if(upl->map_object == upl->map_object->shadow)
4673 vm_object_paging_end(upl->map_object->shadow);
4674 vm_object_unlock(upl->map_object->shadow);
4675 }
4676 upl_unlock(upl);
4677 return KERN_SUCCESS;
4678 }
4679
4680
4681
4682 kern_return_t
4683 vm_object_iopl_request(
4684 vm_object_t object,
4685 vm_object_offset_t offset,
4686 upl_size_t size,
4687 upl_t *upl_ptr,
4688 upl_page_info_array_t user_page_list,
4689 unsigned int *page_list_count,
4690 int cntrl_flags)
4691 {
4692 vm_page_t dst_page;
4693 vm_object_offset_t dst_offset = offset;
4694 upl_size_t xfer_size = size;
4695 upl_t upl = NULL;
4696 unsigned int entry;
4697 wpl_array_t lite_list = NULL;
4698 int page_field_size;
4699 int delayed_unlock = 0;
4700 int no_zero_fill = FALSE;
4701 vm_page_t alias_page = NULL;
4702 kern_return_t ret;
4703 vm_prot_t prot;
4704
4705
4706 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4707 /*
4708 * For forward compatibility's sake,
4709 * reject any unknown flag.
4710 */
4711 return KERN_INVALID_VALUE;
4712 }
4713 if (vm_lopage_poolsize == 0)
4714 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4715
4716 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4717 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4718 return KERN_INVALID_VALUE;
4719
4720 if (object->phys_contiguous) {
4721 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4722 return KERN_INVALID_ADDRESS;
4723
4724 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4725 return KERN_INVALID_ADDRESS;
4726 }
4727 }
4728
4729 if (cntrl_flags & UPL_ENCRYPT) {
4730 /*
4731 * ENCRYPTED SWAP:
4732 * The paging path doesn't use this interface,
4733 * so we don't support the UPL_ENCRYPT flag
4734 * here. We won't encrypt the pages.
4735 */
4736 assert(! (cntrl_flags & UPL_ENCRYPT));
4737 }
4738
4739 if (cntrl_flags & UPL_NOZEROFILL)
4740 no_zero_fill = TRUE;
4741
4742 if (cntrl_flags & UPL_COPYOUT_FROM)
4743 prot = VM_PROT_READ;
4744 else
4745 prot = VM_PROT_READ | VM_PROT_WRITE;
4746
4747 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4748 size = MAX_UPL_TRANSFER * page_size;
4749 }
4750
4751 if(cntrl_flags & UPL_SET_INTERNAL)
4752 if(page_list_count != NULL)
4753 *page_list_count = MAX_UPL_TRANSFER;
4754 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4755 ((page_list_count != NULL) && (*page_list_count != 0)
4756 && *page_list_count < (size/page_size)))
4757 return KERN_INVALID_ARGUMENT;
4758
4759 if((!object->internal) && (object->paging_offset != 0))
4760 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4761
4762 if(object->phys_contiguous) {
4763 /* No paging operations are possible against this memory */
4764 /* and so no need for map object, ever */
4765 cntrl_flags |= UPL_SET_LITE;
4766 }
4767
4768 if(upl_ptr) {
4769 if(cntrl_flags & UPL_SET_INTERNAL) {
4770 if(cntrl_flags & UPL_SET_LITE) {
4771 upl = upl_create(
4772 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4773 size);
4774 user_page_list = (upl_page_info_t *)
4775 (((uintptr_t)upl) + sizeof(struct upl));
4776 lite_list = (wpl_array_t)
4777 (((uintptr_t)user_page_list) +
4778 ((size/PAGE_SIZE) *
4779 sizeof(upl_page_info_t)));
4780 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4781 page_field_size =
4782 (page_field_size + 3) & 0xFFFFFFFC;
4783 bzero((char *)lite_list, page_field_size);
4784 upl->flags =
4785 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4786 } else {
4787 upl = upl_create(UPL_CREATE_INTERNAL, size);
4788 user_page_list = (upl_page_info_t *)
4789 (((uintptr_t)upl)
4790 + sizeof(struct upl));
4791 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4792 }
4793 } else {
4794 if(cntrl_flags & UPL_SET_LITE) {
4795 upl = upl_create(UPL_CREATE_LITE, size);
4796 lite_list = (wpl_array_t)
4797 (((uintptr_t)upl) + sizeof(struct upl));
4798 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4799 page_field_size =
4800 (page_field_size + 3) & 0xFFFFFFFC;
4801 bzero((char *)lite_list, page_field_size);
4802 upl->flags = UPL_LITE | UPL_IO_WIRE;
4803 } else {
4804 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4805 upl->flags = UPL_IO_WIRE;
4806 }
4807 }
4808
4809 if(object->phys_contiguous) {
4810 upl->map_object = object;
4811 /* don't need any shadow mappings for this one */
4812 /* since it is already I/O memory */
4813 upl->flags |= UPL_DEVICE_MEMORY;
4814
4815 vm_object_lock(object);
4816 vm_object_paging_begin(object);
4817 vm_object_unlock(object);
4818
4819 /* paging in progress also protects the paging_offset */
4820 upl->offset = offset + object->paging_offset;
4821 upl->size = size;
4822 *upl_ptr = upl;
4823 if(user_page_list) {
4824 user_page_list[0].phys_addr =
4825 (offset + object->shadow_offset)>>PAGE_SHIFT;
4826 user_page_list[0].device = TRUE;
4827 }
4828 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4829
4830 if(page_list_count != NULL) {
4831 if (upl->flags & UPL_INTERNAL) {
4832 *page_list_count = 0;
4833 } else {
4834 *page_list_count = 1;
4835 }
4836 }
4837 return KERN_SUCCESS;
4838 }
4839 if(user_page_list)
4840 user_page_list[0].device = FALSE;
4841
4842 if(cntrl_flags & UPL_SET_LITE) {
4843 upl->map_object = object;
4844 } else {
4845 upl->map_object = vm_object_allocate(size);
4846 vm_object_lock(upl->map_object);
4847 upl->map_object->shadow = object;
4848 upl->map_object->pageout = TRUE;
4849 upl->map_object->can_persist = FALSE;
4850 upl->map_object->copy_strategy =
4851 MEMORY_OBJECT_COPY_NONE;
4852 upl->map_object->shadow_offset = offset;
4853 upl->map_object->wimg_bits = object->wimg_bits;
4854 vm_object_unlock(upl->map_object);
4855 }
4856 }
4857 vm_object_lock(object);
4858 vm_object_paging_begin(object);
4859
4860 if (!object->phys_contiguous) {
4861 /* Protect user space from future COW operations */
4862 object->true_share = TRUE;
4863 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4864 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4865 }
4866
4867 /* we can lock the upl offset now that paging_in_progress is set */
4868 if(upl_ptr) {
4869 upl->size = size;
4870 upl->offset = offset + object->paging_offset;
4871 *upl_ptr = upl;
4872 #ifdef UPL_DEBUG
4873 queue_enter(&object->uplq, upl, upl_t, uplq);
4874 #endif /* UPL_DEBUG */
4875 }
4876
4877 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4878 /*
4879 * The user requested that access to the pages in this UPL
4880 * be blocked until the UPL is committed or aborted.
4881 */
4882 upl->flags |= UPL_ACCESS_BLOCKED;
4883 }
4884
4885 entry = 0;
4886 while (xfer_size) {
4887 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4888 if (delayed_unlock) {
4889 delayed_unlock = 0;
4890 vm_page_unlock_queues();
4891 }
4892 vm_object_unlock(object);
4893 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4894 vm_object_lock(object);
4895 }
4896 dst_page = vm_page_lookup(object, dst_offset);
4897
4898 /*
4899 * ENCRYPTED SWAP:
4900 * If the page is encrypted, we need to decrypt it,
4901 * so force a soft page fault.
4902 */
4903 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4904 (dst_page->encrypted) ||
4905 (dst_page->unusual && (dst_page->error ||
4906 dst_page->restart ||
4907 dst_page->absent ||
4908 dst_page->fictitious ||
4909 (prot & dst_page->page_lock)))) {
4910 vm_fault_return_t result;
4911 do {
4912 vm_page_t top_page;
4913 kern_return_t error_code;
4914 int interruptible;
4915
4916 vm_object_offset_t lo_offset = offset;
4917 vm_object_offset_t hi_offset = offset + size;
4918
4919
4920 if (delayed_unlock) {
4921 delayed_unlock = 0;
4922 vm_page_unlock_queues();
4923 }
4924
4925 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4926 interruptible = THREAD_ABORTSAFE;
4927 } else {
4928 interruptible = THREAD_UNINT;
4929 }
4930
4931 result = vm_fault_page(object, dst_offset,
4932 prot | VM_PROT_WRITE, FALSE,
4933 interruptible,
4934 lo_offset, hi_offset,
4935 VM_BEHAVIOR_SEQUENTIAL,
4936 &prot, &dst_page, &top_page,
4937 (int *)0,
4938 &error_code, no_zero_fill, FALSE, NULL, 0);
4939
4940 switch(result) {
4941 case VM_FAULT_SUCCESS:
4942
4943 PAGE_WAKEUP_DONE(dst_page);
4944
4945 /*
4946 * Release paging references and
4947 * top-level placeholder page, if any.
4948 */
4949
4950 if(top_page != VM_PAGE_NULL) {
4951 vm_object_t local_object;
4952 local_object =
4953 top_page->object;
4954 if(top_page->object
4955 != dst_page->object) {
4956 vm_object_lock(
4957 local_object);
4958 VM_PAGE_FREE(top_page);
4959 vm_object_paging_end(
4960 local_object);
4961 vm_object_unlock(
4962 local_object);
4963 } else {
4964 VM_PAGE_FREE(top_page);
4965 vm_object_paging_end(
4966 local_object);
4967 }
4968 }
4969
4970 break;
4971
4972
4973 case VM_FAULT_RETRY:
4974 vm_object_lock(object);
4975 vm_object_paging_begin(object);
4976 break;
4977
4978 case VM_FAULT_FICTITIOUS_SHORTAGE:
4979 vm_page_more_fictitious();
4980 vm_object_lock(object);
4981 vm_object_paging_begin(object);
4982 break;
4983
4984 case VM_FAULT_MEMORY_SHORTAGE:
4985 if (vm_page_wait(interruptible)) {
4986 vm_object_lock(object);
4987 vm_object_paging_begin(object);
4988 break;
4989 }
4990 /* fall thru */
4991
4992 case VM_FAULT_INTERRUPTED:
4993 error_code = MACH_SEND_INTERRUPTED;
4994 case VM_FAULT_MEMORY_ERROR:
4995 ret = (error_code ? error_code:
4996 KERN_MEMORY_ERROR);
4997 vm_object_lock(object);
4998
4999 goto return_err;
5000 }
5001 } while ((result != VM_FAULT_SUCCESS)
5002 || (result == VM_FAULT_INTERRUPTED));
5003 }
5004
5005 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
5006 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
5007 vm_page_t low_page;
5008 int refmod;
5009
5010 /*
5011 * support devices that can't DMA above 32 bits
5012 * by substituting pages from a pool of low address
5013 * memory for any pages we find above the 4G mark.
5014 * We can't substitute if the page is already wired because
5015 * we don't know whether that physical address has been
5016 * handed out to some other 64 bit capable DMA device to use
5017 */
5018 if (dst_page->wire_count) {
5019 ret = KERN_PROTECTION_FAILURE;
5020 goto return_err;
5021 }
5022 if (delayed_unlock) {
5023 delayed_unlock = 0;
5024 vm_page_unlock_queues();
5025 }
5026 low_page = vm_page_grablo();
5027
5028 if (low_page == VM_PAGE_NULL) {
5029 ret = KERN_RESOURCE_SHORTAGE;
5030 goto return_err;
5031 }
5032 /*
5033 * from here until the vm_page_replace completes
5034 * we mustn't drop the object lock... we don't
5035 * want anyone refaulting this page in and using
5036 * it after we disconnect it... we want the fault
5037 * to find the new page being substituted.
5038 */
5039 refmod = pmap_disconnect(dst_page->phys_page);
5040
5041 vm_page_copy(dst_page, low_page);
5042
5043 low_page->reference = dst_page->reference;
5044 low_page->dirty = dst_page->dirty;
5045
5046 if (refmod & VM_MEM_REFERENCED)
5047 low_page->reference = TRUE;
5048 if (refmod & VM_MEM_MODIFIED)
5049 low_page->dirty = TRUE;
5050
5051 vm_page_lock_queues();
5052 vm_page_replace(low_page, object, dst_offset);
5053 /*
5054 * keep the queue lock since we're going to
5055 * need it immediately
5056 */
5057 delayed_unlock = 1;
5058
5059 dst_page = low_page;
5060 /*
5061 * vm_page_grablo returned the page marked
5062 * BUSY... we don't need a PAGE_WAKEUP_DONE
5063 * here, because we've never dropped the object lock
5064 */
5065 dst_page->busy = FALSE;
5066 }
5067 if (delayed_unlock == 0)
5068 vm_page_lock_queues();
5069 vm_page_wire(dst_page);
5070
5071 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5072 /*
5073 * Mark the page "busy" to block any future page fault
5074 * on this page. We'll also remove the mapping
5075 * of all these pages before leaving this routine.
5076 */
5077 assert(!dst_page->fictitious);
5078 dst_page->busy = TRUE;
5079 }
5080
5081 if (upl_ptr) {
5082 if (cntrl_flags & UPL_SET_LITE) {
5083 int pg_num;
5084 pg_num = (dst_offset-offset)/PAGE_SIZE;
5085 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5086 } else {
5087 /*
5088 * Convert the fictitious page to a
5089 * private shadow of the real page.
5090 */
5091 assert(alias_page->fictitious);
5092 alias_page->fictitious = FALSE;
5093 alias_page->private = TRUE;
5094 alias_page->pageout = TRUE;
5095 alias_page->phys_page = dst_page->phys_page;
5096 vm_page_wire(alias_page);
5097
5098 vm_page_insert(alias_page,
5099 upl->map_object, size - xfer_size);
5100 assert(!alias_page->wanted);
5101 alias_page->busy = FALSE;
5102 alias_page->absent = FALSE;
5103 }
5104
5105 /* expect the page to be used */
5106 dst_page->reference = TRUE;
5107
5108 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5109 dst_page->dirty = TRUE;
5110 alias_page = NULL;
5111
5112 if (dst_page->phys_page > upl->highest_page)
5113 upl->highest_page = dst_page->phys_page;
5114
5115 if (user_page_list) {
5116 user_page_list[entry].phys_addr
5117 = dst_page->phys_page;
5118 user_page_list[entry].dirty =
5119 dst_page->dirty;
5120 user_page_list[entry].pageout =
5121 dst_page->pageout;
5122 user_page_list[entry].absent =
5123 dst_page->absent;
5124 user_page_list[entry].precious =
5125 dst_page->precious;
5126 }
5127 }
5128 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5129 delayed_unlock = 0;
5130 vm_page_unlock_queues();
5131 }
5132 entry++;
5133 dst_offset += PAGE_SIZE_64;
5134 xfer_size -= PAGE_SIZE;
5135 }
5136 if (delayed_unlock)
5137 vm_page_unlock_queues();
5138
5139 if (upl->flags & UPL_INTERNAL) {
5140 if(page_list_count != NULL)
5141 *page_list_count = 0;
5142 } else if (page_list_count != NULL &&
5143 *page_list_count > entry) {
5144 *page_list_count = entry;
5145 }
5146
5147 if (alias_page != NULL) {
5148 vm_page_lock_queues();
5149 vm_page_free(alias_page);
5150 vm_page_unlock_queues();
5151 }
5152
5153 vm_object_unlock(object);
5154
5155 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5156 /*
5157 * We've marked all the pages "busy" so that future
5158 * page faults will block.
5159 * Now remove the mapping for these pages, so that they
5160 * can't be accessed without causing a page fault.
5161 */
5162 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5163 PMAP_NULL, 0, VM_PROT_NONE);
5164 }
5165
5166 return KERN_SUCCESS;
5167
5168
5169 return_err:
5170 if (delayed_unlock)
5171 vm_page_unlock_queues();
5172
5173 for (; offset < dst_offset; offset += PAGE_SIZE) {
5174 dst_page = vm_page_lookup(object, offset);
5175
5176 if (dst_page == VM_PAGE_NULL)
5177 panic("vm_object_iopl_request: Wired pages missing.\n");
5178 vm_page_lock_queues();
5179 vm_page_unwire(dst_page);
5180 vm_page_unlock_queues();
5181 VM_STAT(reactivations++);
5182 }
5183 vm_object_paging_end(object);
5184 vm_object_unlock(object);
5185 upl_destroy(upl);
5186
5187 return ret;
5188 }
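
/*
 * A minimal sketch of the I/O-wire path (hypothetical caller,
 * illustration only, kept under #if 0): wire a page-aligned range of
 * "object" for a device transfer, then commit the range when the
 * transfer completes.  The fixed-size page list and the names
 * "wire_for_device_io" / "EXAMPLE_IO_PAGES" are made up for the
 * example; a real caller sizes the list to its transfer.
 */
#if 0
#define EXAMPLE_IO_PAGES	8

static kern_return_t
wire_for_device_io(
	vm_object_t		object,
	vm_object_offset_t	offset,
	upl_size_t		size)	/* at most EXAMPLE_IO_PAGES pages */
{
	upl_t		upl = NULL;
	upl_page_info_t	pl[EXAMPLE_IO_PAGES];
	unsigned int	count = EXAMPLE_IO_PAGES;
	boolean_t	empty;
	kern_return_t	kr;

	kr = vm_object_iopl_request(object, offset, size,
				    &upl, pl, &count,
				    UPL_SET_IO_WIRE | UPL_SET_LITE);
	if (kr != KERN_SUCCESS)
		return kr;

	/* ... the device DMAs to/from the wired pages described by pl ... */

	return upl_commit_range(upl, (upl_offset_t) 0, size, 0,
				pl, count, &empty);
}
#endif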
5189
5190
5191 kern_return_t
5192 upl_transpose(
5193 upl_t upl1,
5194 upl_t upl2)
5195 {
5196 kern_return_t retval;
5197 boolean_t upls_locked;
5198 vm_object_t object1, object2;
5199
5200 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5201 return KERN_INVALID_ARGUMENT;
5202 }
5203
5204 upls_locked = FALSE;
5205
5206 /*
5207 * Since we need to lock both UPLs at the same time,
5208 * avoid deadlocks by always taking locks in the same order.
5209 */
5210 if (upl1 < upl2) {
5211 upl_lock(upl1);
5212 upl_lock(upl2);
5213 } else {
5214 upl_lock(upl2);
5215 upl_lock(upl1);
5216 }
5217 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5218
5219 object1 = upl1->map_object;
5220 object2 = upl2->map_object;
5221
5222 if (upl1->offset != 0 || upl2->offset != 0 ||
5223 upl1->size != upl2->size) {
5224 /*
5225 * We deal only with full objects, not subsets.
5226 * That's because we exchange the entire backing store info
5227 * for the objects: pager, resident pages, etc... We can't do
5228 * only part of it.
5229 */
5230 retval = KERN_INVALID_VALUE;
5231 goto done;
5232 }
5233
5234 /*
5235 * Transpose the VM objects' backing store.
5236 */
5237 retval = vm_object_transpose(object1, object2,
5238 (vm_object_size_t) upl1->size);
5239
5240 if (retval == KERN_SUCCESS) {
5241 /*
5242 * Make each UPL point to the correct VM object, i.e. the
5243 * object holding the pages that the UPL refers to...
5244 */
5245 upl1->map_object = object2;
5246 upl2->map_object = object1;
5247 }
5248
5249 done:
5250 /*
5251 * Cleanup.
5252 */
5253 if (upls_locked) {
5254 upl_unlock(upl1);
5255 upl_unlock(upl2);
5256 upls_locked = FALSE;
5257 }
5258
5259 return retval;
5260 }
5261
5262 /*
5263 * ENCRYPTED SWAP:
5264 *
5265 * Rationale: the user might have some encrypted data on disk (via
5266 * FileVault or any other mechanism). That data is then decrypted in
5267 * memory, which is safe as long as the machine is secure. But that
5268 * decrypted data in memory could be paged out to disk by the default
5269 * pager. The data would then be stored on disk in clear (not encrypted)
5270 * and it could be accessed by anyone who gets physical access to the
5271 * disk (if the laptop or the disk gets stolen for example). This weakens
5272 * the security offered by FileVault.
5273 *
5274 * Solution: the default pager will optionally request that all the
5275 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5276 * before it sends this UPL to disk via the vnode_pageout() path.
5277 *
5278 * Notes:
5279 *
5280 * To avoid disrupting the VM LRU algorithms, we want to keep the
5281 * clean-in-place mechanisms, which allow us to send some extra pages to
5282 * swap (clustering) without actually removing them from the user's
5283 * address space. We don't want the user to unknowingly access encrypted
5284 * data, so we have to actually remove the encrypted pages from the page
5285 * table. When the user accesses the data, the hardware will fail to
5286 * locate the virtual page in its page table and will trigger a page
5287 * fault. We can then decrypt the page and enter it in the page table
5288 * again. Whenever we allow the user to access the contents of a page,
5289 * we have to make sure it's not encrypted.
5290 *
5291 *
5292 */
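/*
 * In terms of the routines below, the flow is roughly: on the pageout
 * side, upl_encrypt() walks the UPL and calls vm_page_encrypt() on each
 * page after the pages have been unmapped from user space; on the
 * pagein side, the page fault path finds "page->encrypted" set and
 * calls vm_page_decrypt() before the page is entered in any pmap.
 */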
5293 /*
5294 * ENCRYPTED SWAP:
5295 * Reserve of virtual addresses in the kernel address space.
5296 * We need to map the physical pages in the kernel, so that we
5297 * can call the encryption/decryption routines with a kernel
5298 * virtual address. We keep this pool of pre-allocated kernel
5299 * virtual addresses so that we don't have to scan the kernel's
5300 * virtual address space each time we need to encrypt or decrypt
5301 * a physical page.
5302 * It would be nice to be able to encrypt and decrypt in physical
5303 * mode but that might not always be more efficient...
5304 */
5305 decl_simple_lock_data(,vm_paging_lock)
5306 #define VM_PAGING_NUM_PAGES 64
5307 vm_map_offset_t vm_paging_base_address = 0;
5308 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5309 int vm_paging_max_index = 0;
5310 unsigned long vm_paging_no_kernel_page = 0;
5311 unsigned long vm_paging_objects_mapped = 0;
5312 unsigned long vm_paging_pages_mapped = 0;
5313 unsigned long vm_paging_objects_mapped_slow = 0;
5314 unsigned long vm_paging_pages_mapped_slow = 0;
5315
5316 /*
5317 * ENCRYPTED SWAP:
5318 * vm_paging_map_object:
5319 * Maps part of a VM object's pages in the kernel
5320 * virtual address space, using the pre-allocated
5321 * kernel virtual addresses, if possible.
5322 * Context:
5323 * The VM object is locked. This lock will get
5324 * dropped and re-acquired though.
5325 */
5326 kern_return_t
5327 vm_paging_map_object(
5328 vm_map_offset_t *address,
5329 vm_page_t page,
5330 vm_object_t object,
5331 vm_object_offset_t offset,
5332 vm_map_size_t *size)
5333 {
5334 kern_return_t kr;
5335 vm_map_offset_t page_map_offset;
5336 vm_map_size_t map_size;
5337 vm_object_offset_t object_offset;
5338 #ifdef __ppc__
5339 int i;
5340 vm_map_entry_t map_entry;
5341 #endif /* __ppc__ */
5342
5343
5344 #ifdef __ppc__
5345 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5346 /*
5347 * Optimization for the PowerPC.
5348 * Use one of the pre-allocated kernel virtual addresses
5349 * and just enter the VM page in the kernel address space
5350 * at that virtual address.
5351 */
5352 vm_object_unlock(object);
5353 simple_lock(&vm_paging_lock);
5354
5355 if (vm_paging_base_address == 0) {
5356 /*
5357 * Initialize our pool of pre-allocated kernel
5358 * virtual addresses.
5359 */
5360 simple_unlock(&vm_paging_lock);
5361 page_map_offset = 0;
5362 kr = vm_map_find_space(kernel_map,
5363 &page_map_offset,
5364 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5365 0,
5366 0,
5367 &map_entry);
5368 if (kr != KERN_SUCCESS) {
5369 panic("vm_paging_map_object: "
5370 "kernel_map full\n");
5371 }
5372 map_entry->object.vm_object = kernel_object;
5373 map_entry->offset =
5374 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5375 vm_object_reference(kernel_object);
5376 vm_map_unlock(kernel_map);
5377
5378 simple_lock(&vm_paging_lock);
5379 if (vm_paging_base_address != 0) {
5380 /* someone raced us and won: undo */
5381 simple_unlock(&vm_paging_lock);
5382 kr = vm_map_remove(kernel_map,
5383 page_map_offset,
5384 page_map_offset +
5385 (VM_PAGING_NUM_PAGES
5386 * PAGE_SIZE),
5387 VM_MAP_NO_FLAGS);
5388 assert(kr == KERN_SUCCESS);
5389 simple_lock(&vm_paging_lock);
5390 } else {
5391 vm_paging_base_address = page_map_offset;
5392 }
5393 }
5394
5395 /*
5396 * Try and find an available kernel virtual address
5397 * from our pre-allocated pool.
5398 */
5399 page_map_offset = 0;
5400 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5401 if (vm_paging_page_inuse[i] == FALSE) {
5402 page_map_offset = vm_paging_base_address +
5403 (i * PAGE_SIZE);
5404 break;
5405 }
5406 }
5407
5408 if (page_map_offset != 0) {
5409 /*
5410 * We found a kernel virtual address;
5411 * map the physical page to that virtual address.
5412 */
5413 if (i > vm_paging_max_index) {
5414 vm_paging_max_index = i;
5415 }
5416 vm_paging_page_inuse[i] = TRUE;
5417 simple_unlock(&vm_paging_lock);
5418 pmap_map_block(kernel_pmap,
5419 page_map_offset,
5420 page->phys_page,
5421 1, /* Size is number of 4k pages */
5422 VM_PROT_DEFAULT,
5423 ((int) page->object->wimg_bits &
5424 VM_WIMG_MASK),
5425 0);
5426 vm_paging_objects_mapped++;
5427 vm_paging_pages_mapped++;
5428 *address = page_map_offset;
5429 vm_object_lock(object);
5430
5431 /* all done and mapped, ready to use ! */
5432 return KERN_SUCCESS;
5433 }
5434
5435 /*
5436 * We ran out of pre-allocated kernel virtual
5437 * addresses. Just map the page in the kernel
5438 * the slow and regular way.
5439 */
5440 vm_paging_no_kernel_page++;
5441 simple_unlock(&vm_paging_lock);
5442 vm_object_lock(object);
5443 }
5444 #endif /* __ppc__ */
5445
5446 object_offset = vm_object_trunc_page(offset);
5447 map_size = vm_map_round_page(*size);
5448
5449 /*
5450 * Try and map the required range of the object
5451 * in the kernel_map
5452 */
5453
5454 /* don't go beyond the object's end... */
5455 if (object_offset >= object->size) {
5456 map_size = 0;
5457 } else if (map_size > object->size - offset) {
5458 map_size = object->size - offset;
5459 }
5460
5461 vm_object_reference_locked(object); /* for the map entry */
5462 vm_object_unlock(object);
5463
5464 kr = vm_map_enter(kernel_map,
5465 address,
5466 map_size,
5467 0,
5468 VM_FLAGS_ANYWHERE,
5469 object,
5470 object_offset,
5471 FALSE,
5472 VM_PROT_DEFAULT,
5473 VM_PROT_ALL,
5474 VM_INHERIT_NONE);
5475 if (kr != KERN_SUCCESS) {
5476 *address = 0;
5477 *size = 0;
5478 vm_object_deallocate(object); /* for the map entry */
5479 return kr;
5480 }
5481
5482 *size = map_size;
5483
5484 /*
5485 * Enter the mapped pages in the page table now.
5486 */
5487 vm_object_lock(object);
5488 for (page_map_offset = 0;
5489 map_size != 0;
5490 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5491 unsigned int cache_attr;
5492
5493 page = vm_page_lookup(object, offset + page_map_offset);
5494 if (page == VM_PAGE_NULL) {
5495 panic("vm_paging_map_object: no page !?");
5496 }
5497 if (page->no_isync == TRUE) {
5498 pmap_sync_page_data_phys(page->phys_page);
5499 }
5500 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5501
5502 PMAP_ENTER(kernel_pmap,
5503 *address + page_map_offset,
5504 page,
5505 VM_PROT_DEFAULT,
5506 cache_attr,
5507 FALSE);
5508 }
5509
5510 vm_paging_objects_mapped_slow++;
5511 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5512
5513 return KERN_SUCCESS;
5514 }
5515
5516 /*
5517 * ENCRYPTED SWAP:
5518 * vm_paging_unmap_object:
5519 * Unmaps part of a VM object's pages from the kernel
5520 * virtual address space.
5521 * Context:
5522 * The VM object is locked. This lock will get
5523 * dropped and re-acquired though.
5524 */
5525 void
5526 vm_paging_unmap_object(
5527 vm_object_t object,
5528 vm_map_offset_t start,
5529 vm_map_offset_t end)
5530 {
5531 kern_return_t kr;
5532 #ifdef __ppc__
5533 int i;
5534 #endif /* __ppc__ */
5535
5536 if ((vm_paging_base_address == 0) ||
5537 ((start < vm_paging_base_address) ||
5538 (end > (vm_paging_base_address
5539 + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5540 /*
5541 * We didn't use our pre-allocated pool of
5542 * kernel virtual address. Deallocate the
5543 * virtual memory.
5544 */
5545 if (object != VM_OBJECT_NULL) {
5546 vm_object_unlock(object);
5547 }
5548 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5549 if (object != VM_OBJECT_NULL) {
5550 vm_object_lock(object);
5551 }
5552 assert(kr == KERN_SUCCESS);
5553 } else {
5554 /*
5555 * We used a kernel virtual address from our
5556 * pre-allocated pool. Put it back in the pool
5557 * for next time.
5558 */
5559 #ifdef __ppc__
5560 assert(end - start == PAGE_SIZE);
5561 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5562
5563 /* undo the pmap mapping */
5564 mapping_remove(kernel_pmap, start);
5565
5566 simple_lock(&vm_paging_lock);
5567 vm_paging_page_inuse[i] = FALSE;
5568 simple_unlock(&vm_paging_lock);
5569 #endif /* __ppc__ */
5570 }
5571 }
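
/*
 * A minimal sketch of how the two routines above pair up (hypothetical
 * caller, illustration only, kept under #if 0): map one busy page into
 * the kernel, operate on its contents, then unmap it.  The object must
 * be locked on entry, as documented above; "touch_one_page" is a
 * made-up name.
 */
#if 0
static void
touch_one_page(
	vm_object_t	object,
	vm_page_t	page)
{
	vm_map_offset_t	kernel_addr = 0;
	vm_map_size_t	map_size = PAGE_SIZE;

	if (vm_paging_map_object(&kernel_addr, page, object,
				 page->offset, &map_size) != KERN_SUCCESS)
		return;

	/* ... access the page's contents via (void *) kernel_addr ... */

	vm_paging_unmap_object(object, kernel_addr,
			       kernel_addr + map_size);
}
#endif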
5572
5573 /*
5574 * Encryption data.
5575 * "iv" is the "initial vector". Ideally, we want to
5576 * have a different one for each page we encrypt, so that
5577 * crackers can't find encryption patterns too easily.
5578 */
5579 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5580 boolean_t swap_crypt_ctx_initialized = FALSE;
5581 aes_32t swap_crypt_key[8]; /* big enough for a 256-bit key */
5582 aes_ctx swap_crypt_ctx;
5583 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5584
5585 #if DEBUG
5586 boolean_t swap_crypt_ctx_tested = FALSE;
5587 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5588 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5589 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5590 #endif /* DEBUG */
5591
5592 extern u_long random(void);
5593
5594 /*
5595 * Initialize the encryption context: key and key size.
5596 */
5597 void swap_crypt_ctx_initialize(void); /* forward */
5598 void
5599 swap_crypt_ctx_initialize(void)
5600 {
5601 unsigned int i;
5602
5603 /*
5604 * No need for locking to protect swap_crypt_ctx_initialized
5605 * because the first use of encryption will come from the
5606 * pageout thread (we won't pagein before there's been a pageout)
5607 * and there's only one pageout thread.
5608 */
5609 if (swap_crypt_ctx_initialized == FALSE) {
5610 for (i = 0;
5611 i < (sizeof (swap_crypt_key) /
5612 sizeof (swap_crypt_key[0]));
5613 i++) {
5614 swap_crypt_key[i] = random();
5615 }
5616 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5617 SWAP_CRYPT_AES_KEY_SIZE,
5618 &swap_crypt_ctx.encrypt);
5619 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5620 SWAP_CRYPT_AES_KEY_SIZE,
5621 &swap_crypt_ctx.decrypt);
5622 swap_crypt_ctx_initialized = TRUE;
5623 }
5624
5625 #if DEBUG
5626 /*
5627 * Validate the encryption algorithms.
5628 */
5629 if (swap_crypt_ctx_tested == FALSE) {
5630 /* initialize */
5631 for (i = 0; i < 4096; i++) {
5632 swap_crypt_test_page_ref[i] = (char) i;
5633 }
5634 /* encrypt */
5635 aes_encrypt_cbc(swap_crypt_test_page_ref,
5636 swap_crypt_null_iv,
5637 PAGE_SIZE / AES_BLOCK_SIZE,
5638 swap_crypt_test_page_encrypt,
5639 &swap_crypt_ctx.encrypt);
5640 /* decrypt */
5641 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5642 swap_crypt_null_iv,
5643 PAGE_SIZE / AES_BLOCK_SIZE,
5644 swap_crypt_test_page_decrypt,
5645 &swap_crypt_ctx.decrypt);
5646 /* compare result with original */
5647 for (i = 0; i < 4096; i ++) {
5648 if (swap_crypt_test_page_decrypt[i] !=
5649 swap_crypt_test_page_ref[i]) {
5650 panic("encryption test failed");
5651 }
5652 }
5653
5654 /* encrypt again */
5655 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5656 swap_crypt_null_iv,
5657 PAGE_SIZE / AES_BLOCK_SIZE,
5658 swap_crypt_test_page_decrypt,
5659 &swap_crypt_ctx.encrypt);
5660 /* decrypt in place */
5661 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5662 swap_crypt_null_iv,
5663 PAGE_SIZE / AES_BLOCK_SIZE,
5664 swap_crypt_test_page_decrypt,
5665 &swap_crypt_ctx.decrypt);
5666 for (i = 0; i < 4096; i ++) {
5667 if (swap_crypt_test_page_decrypt[i] !=
5668 swap_crypt_test_page_ref[i]) {
5669 panic("in place encryption test failed");
5670 }
5671 }
5672
5673 swap_crypt_ctx_tested = TRUE;
5674 }
5675 #endif /* DEBUG */
5676 }
5677
5678 /*
5679 * ENCRYPTED SWAP:
5680 * vm_page_encrypt:
5681 * Encrypt the given page, for secure paging.
5682 * The page might already be mapped at kernel virtual
5683 * address "kernel_mapping_offset". Otherwise, we need
5684 * to map it.
5685 *
5686 * Context:
5687 * The page's object is locked, but this lock will be released
5688 * and re-acquired.
5689 * The page is busy and not accessible by users (not entered in any pmap).
5690 */
5691 void
5692 vm_page_encrypt(
5693 vm_page_t page,
5694 vm_map_offset_t kernel_mapping_offset)
5695 {
5696 int clear_refmod = 0;
5697 kern_return_t kr;
5698 boolean_t page_was_referenced;
5699 boolean_t page_was_modified;
5700 vm_map_size_t kernel_mapping_size;
5701 vm_offset_t kernel_vaddr;
5702 union {
5703 unsigned char aes_iv[AES_BLOCK_SIZE];
5704 struct {
5705 memory_object_t pager_object;
5706 vm_object_offset_t paging_offset;
5707 } vm;
5708 } encrypt_iv;
5709
5710 if (! vm_pages_encrypted) {
5711 vm_pages_encrypted = TRUE;
5712 }
5713
5714 assert(page->busy);
5715 assert(page->dirty || page->precious);
5716
5717 if (page->encrypted) {
5718 /*
5719 * Already encrypted: no need to do it again.
5720 */
5721 vm_page_encrypt_already_encrypted_counter++;
5722 return;
5723 }
5724 ASSERT_PAGE_DECRYPTED(page);
5725
5726 /*
5727 * Gather the "reference" and "modified" status of the page.
5728 * We'll restore these values after the encryption, so that
5729 * the encryption is transparent to the rest of the system
5730 * and doesn't impact the VM's LRU logic.
5731 */
5732 page_was_referenced =
5733 (page->reference || pmap_is_referenced(page->phys_page));
5734 page_was_modified =
5735 (page->dirty || pmap_is_modified(page->phys_page));
5736
5737 if (kernel_mapping_offset == 0) {
5738 /*
5739 * The page hasn't already been mapped in kernel space
5740 * by the caller. Map it now, so that we can access
5741 * its contents and encrypt them.
5742 */
5743 kernel_mapping_size = PAGE_SIZE;
5744 kr = vm_paging_map_object(&kernel_mapping_offset,
5745 page,
5746 page->object,
5747 page->offset,
5748 &kernel_mapping_size);
5749 if (kr != KERN_SUCCESS) {
5750 panic("vm_page_encrypt: "
5751 "could not map page in kernel: 0x%x\n",
5752 kr);
5753 }
5754 } else {
5755 kernel_mapping_size = 0;
5756 }
5757 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5758
5759 if (swap_crypt_ctx_initialized == FALSE) {
5760 swap_crypt_ctx_initialize();
5761 }
5762 assert(swap_crypt_ctx_initialized);
5763
5764 /*
5765 * Prepare an "initial vector" for the encryption.
5766 * We use the "pager" and the "paging_offset" for that
5767 * page to obfuscate the encrypted data a bit more and
5768 * prevent crackers from finding patterns that they could
5769 * use to break the key.
5770 */
5771 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5772 encrypt_iv.vm.pager_object = page->object->pager;
5773 encrypt_iv.vm.paging_offset =
5774 page->object->paging_offset + page->offset;
5775
5776 vm_object_unlock(page->object);
5777
5778 /* encrypt the "initial vector" */
5779 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5780 swap_crypt_null_iv,
5781 1,
5782 &encrypt_iv.aes_iv[0],
5783 &swap_crypt_ctx.encrypt);
5784
5785 /*
5786 * Encrypt the page.
5787 */
5788 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5789 &encrypt_iv.aes_iv[0],
5790 PAGE_SIZE / AES_BLOCK_SIZE,
5791 (unsigned char *) kernel_vaddr,
5792 &swap_crypt_ctx.encrypt);
5793
5794 vm_page_encrypt_counter++;
5795
5796 vm_object_lock(page->object);
5797
5798 /*
5799 * Unmap the page from the kernel's address space,
5800 * if we had to map it ourselves. Otherwise, let
5801 * the caller undo the mapping if needed.
5802 */
5803 if (kernel_mapping_size != 0) {
5804 vm_paging_unmap_object(page->object,
5805 kernel_mapping_offset,
5806 kernel_mapping_offset + kernel_mapping_size);
5807 }
5808
5809 /*
5810 * Restore the "reference" and "modified" bits.
5811 * This should clean up any impact the encryption had
5812 * on them.
5813 */
5814 if (! page_was_referenced) {
5815 clear_refmod |= VM_MEM_REFERENCED;
5816 page->reference = FALSE;
5817 }
5818 if (! page_was_modified) {
5819 clear_refmod |= VM_MEM_MODIFIED;
5820 page->dirty = FALSE;
5821 }
5822 if (clear_refmod)
5823 pmap_clear_refmod(page->phys_page, clear_refmod);
5824
5825 page->encrypted = TRUE;
5826 }
5827
5828 /*
5829 * ENCRYPTED SWAP:
5830 * vm_page_decrypt:
5831 * Decrypt the given page.
5832 * The page might already be mapped at kernel virtual
5833 * address "kernel_mapping_offset". Otherwise, we need
5834 * to map it.
5835 *
5836 * Context:
5837 * The page's VM object is locked but will be unlocked and relocked.
5838 * The page is busy and not accessible by users (not entered in any pmap).
5839 */
5840 void
5841 vm_page_decrypt(
5842 vm_page_t page,
5843 vm_map_offset_t kernel_mapping_offset)
5844 {
5845 int clear_refmod = 0;
5846 kern_return_t kr;
5847 vm_map_size_t kernel_mapping_size;
5848 vm_offset_t kernel_vaddr;
5849 boolean_t page_was_referenced;
5850 union {
5851 unsigned char aes_iv[AES_BLOCK_SIZE];
5852 struct {
5853 memory_object_t pager_object;
5854 vm_object_offset_t paging_offset;
5855 } vm;
5856 } decrypt_iv;
5857
5858 assert(page->busy);
5859 assert(page->encrypted);
5860
5861 /*
5862 * Gather the "reference" status of the page.
5863 * We'll restore its value after the decryption, so that
5864 * the decryption is transparent to the rest of the system
5865 * and doesn't impact the VM's LRU logic.
5866 */
5867 page_was_referenced =
5868 (page->reference || pmap_is_referenced(page->phys_page));
5869
5870 if (kernel_mapping_offset == 0) {
5871 /*
5872 * The page hasn't already been mapped in kernel space
5873 * by the caller. Map it now, so that we can access
5874 * its contents and decrypt them.
5875 */
5876 kernel_mapping_size = PAGE_SIZE;
5877 kr = vm_paging_map_object(&kernel_mapping_offset,
5878 page,
5879 page->object,
5880 page->offset,
5881 &kernel_mapping_size);
5882 if (kr != KERN_SUCCESS) {
5883 panic("vm_page_decrypt: "
5884 "could not map page in kernel: 0x%x\n");
5885 }
5886 } else {
5887 kernel_mapping_size = 0;
5888 }
5889 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5890
5891 assert(swap_crypt_ctx_initialized);
5892
5893 /*
5894 * Prepare an "initial vector" for the decryption.
5895 * It has to be the same as the "initial vector" we
5896 * used to encrypt that page.
5897 */
5898 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5899 decrypt_iv.vm.pager_object = page->object->pager;
5900 decrypt_iv.vm.paging_offset =
5901 page->object->paging_offset + page->offset;
5902
5903 vm_object_unlock(page->object);
5904
5905 /* encrypt the "initial vector" */
5906 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5907 swap_crypt_null_iv,
5908 1,
5909 &decrypt_iv.aes_iv[0],
5910 &swap_crypt_ctx.encrypt);
5911
5912 /*
5913 * Decrypt the page.
5914 */
5915 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5916 &decrypt_iv.aes_iv[0],
5917 PAGE_SIZE / AES_BLOCK_SIZE,
5918 (unsigned char *) kernel_vaddr,
5919 &swap_crypt_ctx.decrypt);
5920 vm_page_decrypt_counter++;
5921
5922 vm_object_lock(page->object);
5923
5924 /*
5925 * Unmap the page from the kernel's address space,
5926 * if we had to map it ourselves. Otherwise, let
5927 * the caller undo the mapping if needed.
5928 */
5929 if (kernel_mapping_size != 0) {
5930 vm_paging_unmap_object(page->object,
5931 kernel_vaddr,
5932 kernel_vaddr + PAGE_SIZE);
5933 }
5934
5935 /*
5936 * After decryption, the page is actually clean.
5937 * It was encrypted as part of paging, which "cleans"
5938 * the "dirty" pages.
5939 * No one could access it after it was encrypted
5940 * and the decryption doesn't count.
5941 */
5942 page->dirty = FALSE;
5943 clear_refmod = VM_MEM_MODIFIED;
5944
5945 /* restore the "reference" bit */
5946 if (! page_was_referenced) {
5947 page->reference = FALSE;
5948 clear_refmod |= VM_MEM_REFERENCED;
5949 }
5950 pmap_clear_refmod(page->phys_page, clear_refmod);
5951
5952 page->encrypted = FALSE;
5953
5954 /*
5955 * We've just modified the page's contents via the data cache and part
5956 * of the new contents might still be in the cache and not yet in RAM.
5957 * Since the page is now available and might get gathered in a UPL to
5958 * be part of a DMA transfer from a driver that expects the memory to
5959 * be coherent at this point, we have to flush the data cache.
5960 */
5961 pmap_sync_page_attributes_phys(page->phys_page);
5962 /*
5963 * Since the page is not mapped yet, some code might assume that it
5964 * doesn't need to invalidate the instruction cache when writing to
5965 * that page. That code relies on "no_isync" being set, so that the
5966 * caches get synchronized when the page is first mapped. So we need
5967 * to set "no_isync" here too, despite the fact that we just
5968 * synchronized the caches above...
5969 */
5970 page->no_isync = TRUE;
5971 }
5972
5973 unsigned long upl_encrypt_upls = 0;
5974 unsigned long upl_encrypt_pages = 0;
5975
5976 /*
5977 * ENCRYPTED SWAP:
5978 *
5979 * upl_encrypt:
5980 * Encrypts all the pages in the UPL, within the specified range.
5981 *
5982 */
5983 void
5984 upl_encrypt(
5985 upl_t upl,
5986 upl_offset_t crypt_offset,
5987 upl_size_t crypt_size)
5988 {
5989 upl_size_t upl_size;
5990 upl_offset_t upl_offset;
5991 vm_object_t upl_object;
5992 vm_page_t page;
5993 vm_object_t shadow_object;
5994 vm_object_offset_t shadow_offset;
5995 vm_object_offset_t paging_offset;
5996 vm_object_offset_t base_offset;
5997
5998 upl_encrypt_upls++;
5999 upl_encrypt_pages += crypt_size / PAGE_SIZE;
6000
6001 upl_lock(upl);
6002
6003 upl_object = upl->map_object;
6004 upl_offset = upl->offset;
6005 upl_size = upl->size;
6006
6007 upl_unlock(upl);
6008
6009 vm_object_lock(upl_object);
6010
6011 /*
6012 * Find the VM object that contains the actual pages.
6013 */
6014 if (upl_object->pageout) {
6015 shadow_object = upl_object->shadow;
6016 /*
6017 * The offset in the shadow object is actually also
6018 * accounted for in upl->offset. It possibly shouldn't be
6019 * this way, but for now don't account for it twice.
6020 */
6021 shadow_offset = 0;
6022 assert(upl_object->paging_offset == 0); /* XXX ? */
6023 vm_object_lock(shadow_object);
6024 } else {
6025 shadow_object = upl_object;
6026 shadow_offset = 0;
6027 }
6028
6029 paging_offset = shadow_object->paging_offset;
6030 vm_object_paging_begin(shadow_object);
6031
6032 if (shadow_object != upl_object) {
6033 vm_object_unlock(shadow_object);
6034 }
6035 vm_object_unlock(upl_object);
6036
6037 base_offset = shadow_offset;
6038 base_offset += upl_offset;
6039 base_offset += crypt_offset;
6040 base_offset -= paging_offset;
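/*
 * To spell out the arithmetic above: a page at "crypt_offset" within
 * the UPL sits at (upl->offset + crypt_offset) relative to the pager,
 * and the shadow object indexes its pages by object offset, i.e. the
 * pager-relative offset minus the object's paging_offset (plus any
 * shadow_offset, which is zero here).  "base_offset" is therefore the
 * shadow-object offset of the first page to encrypt.
 */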
6041 /*
6042 * Unmap the pages, so that nobody can continue accessing them while
6043 * they're encrypted. After that point, all accesses to these pages
6044 * will cause a page fault and block while the page is being encrypted
6045 * (busy). After the encryption completes, any access will cause a
6046 * page fault and the page gets decrypted at that time.
6047 */
6048 assert(crypt_offset + crypt_size <= upl_size);
6049 vm_object_pmap_protect(shadow_object,
6050 base_offset,
6051 (vm_object_size_t)crypt_size,
6052 PMAP_NULL,
6053 0,
6054 VM_PROT_NONE);
6055
6056 /* XXX FBDP could the object have changed significantly here ? */
6057 vm_object_lock(shadow_object);
6058
6059 for (upl_offset = 0;
6060 upl_offset < crypt_size;
6061 upl_offset += PAGE_SIZE) {
6062 page = vm_page_lookup(shadow_object,
6063 base_offset + upl_offset);
6064 if (page == VM_PAGE_NULL) {
6065 panic("upl_encrypt: "
6066 "no page for (obj=%p,off=%lld+%d)!\n",
6067 shadow_object,
6068 base_offset,
6069 upl_offset);
6070 }
6071 vm_page_encrypt(page, 0);
6072 }
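/*
 * Passing 0 as the kernel mapping offset presumably follows the same
 * convention as in vm_page_decrypt() above: vm_page_encrypt() must
 * set up (and tear down) its own kernel mapping for each page, since
 * the pages are not mapped into the kernel here.
 */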
6073
6074 vm_object_paging_end(shadow_object);
6075 vm_object_unlock(shadow_object);
6076 }
6077
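/*
 * Callers use this to locate the UPL's internal page-info array,
 * which is laid out immediately after the upl structure itself, so
 * the offset is simply sizeof(struct upl).
 */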
6078 vm_size_t
6079 upl_get_internal_pagelist_offset(void)
6080 {
6081 return sizeof(struct upl);
6082 }
6083
6084 void
6085 upl_clear_dirty(
6086 upl_t upl,
6087 boolean_t value)
6088 {
6089 if (value) {
6090 upl->flags |= UPL_CLEAR_DIRTY;
6091 } else {
6092 upl->flags &= ~UPL_CLEAR_DIRTY;
6093 }
6094 }
6095
6096
6097 #ifdef MACH_BSD
6098
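/*
 * Thin accessors exported to BSD code: they wrap the UPL_*() page-info
 * macros so callers that don't pull in the UPL internals can still
 * query per-page state.
 */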
6099 boolean_t upl_page_present(upl_page_info_t *upl, int index)
6100 {
6101 return(UPL_PAGE_PRESENT(upl, index));
6102 }
6103 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6104 {
6105 return(UPL_DIRTY_PAGE(upl, index));
6106 }
6107 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6108 {
6109 return(UPL_VALID_PAGE(upl, index));
6110 }
6111 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6112 {
6113 return(UPL_PHYS_PAGE(upl, index));
6114 }
6115
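/*
 * Debugging aid: walk the inactive, zero-fill and active page queues,
 * count the dirty, pageout and precious pages on each, and print the
 * totals (the "IN Q" line covers the inactive and zero-fill queues
 * combined, "AC Q" the active queue).
 */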
6116 void
6117 vm_countdirtypages(void)
6118 {
6119 vm_page_t m;
6120 int dpages;
6121 int pgopages;
6122 int precpages;
6123
6124
6125 dpages=0;
6126 pgopages=0;
6127 precpages=0;
6128
6129 vm_page_lock_queues();
6130 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6131 do {
6132 if (m == (vm_page_t) 0) break;
6133
6134 if (m->dirty) dpages++;
6135 if (m->pageout) pgopages++;
6136 if (m->precious) precpages++;
6137
6138 assert(m->object != kernel_object);
6139 m = (vm_page_t) queue_next(&m->pageq);
6140 if (m == (vm_page_t) 0) break;
6141
6142 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6143 vm_page_unlock_queues();
6144
6145 vm_page_lock_queues();
6146 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6147 do {
6148 if (m == (vm_page_t) 0) break;
6149
6150 if (m->dirty) dpages++;
6151 if (m->pageout) pgopages++;
6152 if (m->precious) precpages++;
6153
6154 assert(m->object != kernel_object);
6155 m = (vm_page_t) queue_next(&m->pageq);
6156 if (m == (vm_page_t) 0) break;
6157
6158 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6159 vm_page_unlock_queues();
6160
6161 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6162
6163 dpages=0;
6164 pgopages=0;
6165 precpages=0;
6166
6167 vm_page_lock_queues();
6168 m = (vm_page_t) queue_first(&vm_page_queue_active);
6169
6170 do {
6171 if (m == (vm_page_t) 0) break;
6172 if (m->dirty) dpages++;
6173 if (m->pageout) pgopages++;
6174 if (m->precious) precpages++;
6175
6176 assert(m->object != kernel_object);
6177 m = (vm_page_t) queue_next(&m->pageq);
6178 if (m == (vm_page_t) 0) break;
6179
6180 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6181 vm_page_unlock_queues();
6182
6183 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6184
6185 }
6186 #endif /* MACH_BSD */
6187
6188 ppnum_t upl_get_highest_page(
6189 upl_t upl)
6190 {
6191 return upl->highest_page;
6192 }
6193
6194 #ifdef UPL_DEBUG
6195 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6196 {
6197 upl->ubc_alias1 = alias1;
6198 upl->ubc_alias2 = alias2;
6199 return KERN_SUCCESS;
6200 }
6201 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6202 {
6203 if(al)
6204 *al = upl->ubc_alias1;
6205 if(al2)
6206 *al2 = upl->ubc_alias2;
6207 return KERN_SUCCESS;
6208 }
6209 #endif /* UPL_DEBUG */
6210
6211
6212
6213 #if MACH_KDB
6214 #include <ddb/db_output.h>
6215 #include <ddb/db_print.h>
6216 #include <vm/vm_print.h>
6217
6218 #define printf kdbprintf
6219 void db_pageout(void);
6220
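/*
 * Kernel debugger (ddb) helpers: db_vm() prints the basic VM page
 * counts and paging targets, db_pageout() the pageout daemon's
 * statistics counters.
 */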
6221 void
6222 db_vm(void)
6223 {
6224
6225 iprintf("VM Statistics:\n");
6226 db_indent += 2;
6227 iprintf("pages:\n");
6228 db_indent += 2;
6229 iprintf("activ %5d inact %5d free %5d",
6230 vm_page_active_count, vm_page_inactive_count,
6231 vm_page_free_count);
6232 printf(" wire %5d gobbl %5d\n",
6233 vm_page_wire_count, vm_page_gobble_count);
6234 db_indent -= 2;
6235 iprintf("target:\n");
6236 db_indent += 2;
6237 iprintf("min %5d inact %5d free %5d",
6238 vm_page_free_min, vm_page_inactive_target,
6239 vm_page_free_target);
6240 printf(" resrv %5d\n", vm_page_free_reserved);
6241 db_indent -= 2;
6242 iprintf("pause:\n");
6243 db_pageout();
6244 db_indent -= 2;
6245 }
6246
6247 #if MACH_COUNTERS
6248 extern int c_laundry_pages_freed;
6249 #endif /* MACH_COUNTERS */
6250
6251 void
6252 db_pageout(void)
6253 {
6254 iprintf("Pageout Statistics:\n");
6255 db_indent += 2;
6256 iprintf("active %5d inactv %5d\n",
6257 vm_pageout_active, vm_pageout_inactive);
6258 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6259 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6260 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6261 iprintf("used %5d clean %5d dirty %5d\n",
6262 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6263 vm_pageout_inactive_dirty);
6264 #if MACH_COUNTERS
6265 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6266 #endif /* MACH_COUNTERS */
6267 #if MACH_CLUSTER_STATS
6268 iprintf("Cluster Statistics:\n");
6269 db_indent += 2;
6270 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6271 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6272 vm_pageout_cluster_collisions);
6273 iprintf("clusters %5d conversions %5d\n",
6274 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6275 db_indent -= 2;
6276 iprintf("Target Statistics:\n");
6277 db_indent += 2;
6278 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6279 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6280 vm_pageout_target_page_freed);
6281 db_indent -= 2;
6282 #endif /* MACH_CLUSTER_STATS */
6283 db_indent -= 2;
6284 }
6285
6286 #endif /* MACH_KDB */