1 /*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /*
23 * @OSF_COPYRIGHT@
24 */
25 /*
26 * Mach Operating System
27 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
28 * All Rights Reserved.
29 *
30 * Permission to use, copy, modify and distribute this software and its
31 * documentation is hereby granted, provided that both the copyright
32 * notice and this permission notice appear in all copies of the
33 * software, derivative works or modified versions, and any portions
34 * thereof, and that both notices appear in supporting documentation.
35 *
36 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
37 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
38 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
39 *
40 * Carnegie Mellon requests users of this software to return to
41 *
42 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
43 * School of Computer Science
44 * Carnegie Mellon University
45 * Pittsburgh PA 15213-3890
46 *
47 * any improvements or extensions that they make and grant Carnegie Mellon
48 * the rights to redistribute these changes.
49 */
50 /*
51 */
52 /*
53 * File: vm/vm_pageout.c
54 * Author: Avadis Tevanian, Jr., Michael Wayne Young
55 * Date: 1985
56 *
57 * The proverbial page-out daemon.
58 */
59
60 #include <stdint.h>
61
62 #include <debug.h>
63 #include <mach_pagemap.h>
64 #include <mach_cluster_stats.h>
65 #include <mach_kdb.h>
66 #include <advisory_pageout.h>
67
68 #include <mach/mach_types.h>
69 #include <mach/memory_object.h>
70 #include <mach/memory_object_default.h>
71 #include <mach/memory_object_control_server.h>
72 #include <mach/mach_host_server.h>
73 #include <mach/upl.h>
74 #include <mach/vm_map.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_statistics.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/counters.h>
80 #include <kern/host_statistics.h>
81 #include <kern/machine.h>
82 #include <kern/misc_protos.h>
83 #include <kern/thread.h>
84 #include <kern/xpr.h>
85 #include <kern/kalloc.h>
86
87 #include <machine/vm_tuning.h>
88
89 #include <vm/pmap.h>
90 #include <vm/vm_fault.h>
91 #include <vm/vm_map.h>
92 #include <vm/vm_object.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_protos.h> /* must be last */
96
97 /*
98 * ENCRYPTED SWAP:
99 */
100 #ifdef __ppc__
101 #include <ppc/mappings.h>
102 #endif /* __ppc__ */
103 #include <../bsd/crypto/aes/aes.h>
104
105 extern ipc_port_t memory_manager_default;
106
107
108 #ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE
109 #define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 10000 /* maximum iterations of the active queue to move pages to inactive */
110 #endif
111
112 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE
113 #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
114 #endif
115
116 #ifndef VM_PAGEOUT_DEADLOCK_RELIEF
117 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
118 #endif
119
120 #ifndef VM_PAGEOUT_INACTIVE_RELIEF
121 #define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
122 #endif
123
124 #ifndef VM_PAGE_LAUNDRY_MAX
125 #define VM_PAGE_LAUNDRY_MAX 16UL /* maximum pageouts on a given pageout queue */
126 #endif /* VM_PAGE_LAUNDRY_MAX */
127
128 #ifndef VM_PAGEOUT_BURST_WAIT
129 #define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */
130 #endif /* VM_PAGEOUT_BURST_WAIT */
131
132 #ifndef VM_PAGEOUT_EMPTY_WAIT
133 #define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
134 #endif /* VM_PAGEOUT_EMPTY_WAIT */
135
136 #ifndef VM_PAGEOUT_DEADLOCK_WAIT
137 #define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
138 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
139
140 #ifndef VM_PAGEOUT_IDLE_WAIT
141 #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
142 #endif /* VM_PAGEOUT_IDLE_WAIT */
143
144
145 /*
146 * To obtain a reasonable LRU approximation, the inactive queue
147 * needs to be large enough to give pages on it a chance to be
148 * referenced a second time. This macro defines the fraction
149 * of active+inactive pages that should be inactive.
150 * The pageout daemon uses it to update vm_page_inactive_target.
151 *
152 * If vm_page_free_count falls below vm_page_free_target and
153 * vm_page_inactive_count is below vm_page_inactive_target,
154 * then the pageout daemon starts running.
155 */
156
157 #ifndef VM_PAGE_INACTIVE_TARGET
158 #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 3)
159 #endif /* VM_PAGE_INACTIVE_TARGET */
160
161 /*
162 * Once the pageout daemon starts running, it keeps going
163 * until vm_page_free_count meets or exceeds vm_page_free_target.
164 */
165
166 #ifndef VM_PAGE_FREE_TARGET
167 #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
168 #endif /* VM_PAGE_FREE_TARGET */
169
170 /*
171 * The pageout daemon always starts running once vm_page_free_count
172 * falls below vm_page_free_min.
173 */
174
175 #ifndef VM_PAGE_FREE_MIN
176 #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
177 #endif /* VM_PAGE_FREE_MIN */
178
179 /*
180 * When vm_page_free_count falls below vm_page_free_reserved,
181 * only vm-privileged threads can allocate pages. vm-privilege
182 * allows the pageout daemon and default pager (and any other
183 * associated threads needed for default pageout) to continue
184 * operation by dipping into the reserved pool of pages.
185 */
186
187 #ifndef VM_PAGE_FREE_RESERVED
188 #define VM_PAGE_FREE_RESERVED(n) \
189 ((6 * VM_PAGE_LAUNDRY_MAX) + (n))
190 #endif /* VM_PAGE_FREE_RESERVED */
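
/*
 * Worked example (illustrative numbers only, not from any particular
 * configuration): with 90000 active+inactive pages,
 * VM_PAGE_INACTIVE_TARGET(90000) is 30000, i.e. roughly a third of the
 * pageable pages should sit on the inactive queue.  With 8000 pages free
 * after the reserve, VM_PAGE_FREE_TARGET(8000) is 15 + 8000/80 = 115 and
 * VM_PAGE_FREE_MIN(8000) is 10 + 8000/100 = 90.  With VM_PAGE_LAUNDRY_MAX
 * at 16, VM_PAGE_FREE_RESERVED(4) is 6*16 + 4 = 100.
 */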
191
192
193 /*
194 * must hold the page queues lock to
195 * manipulate this structure
196 */
197 struct vm_pageout_queue {
198 queue_head_t pgo_pending; /* laundry pages to be processed by pager's iothread */
199 unsigned int pgo_laundry; /* current count of laundry pages on queue or in flight */
200 unsigned int pgo_maxlaundry;
201
202 unsigned int pgo_idle:1, /* iothread is blocked waiting for work to do */
203 pgo_busy:1, /* iothread is currently processing request from pgo_pending */
204 pgo_throttled:1,/* vm_pageout_scan thread needs a wakeup when pgo_laundry drops */
205 :0;
206 };
207
208 #define VM_PAGE_Q_THROTTLED(q) \
209 ((q)->pgo_laundry >= (q)->pgo_maxlaundry)
210
211
212 /*
213 * Exported variable used to broadcast the activation of the pageout scan.
214 * The Working Set code uses this to throttle its use of pmap removes, so
215 * that code whose working set fits in memory when memory is not under
216 * contention does not keep encountering soft faults.
217 */
218
219 unsigned int vm_pageout_scan_event_counter = 0;
220
221 /*
222 * Forward declarations for internal routines.
223 */
224
225 static void vm_pageout_garbage_collect(int);
226 static void vm_pageout_iothread_continue(struct vm_pageout_queue *);
227 static void vm_pageout_iothread_external(void);
228 static void vm_pageout_iothread_internal(void);
229 static void vm_pageout_queue_steal(vm_page_t);
230
231 extern void vm_pageout_continue(void);
232 extern void vm_pageout_scan(void);
233
234 unsigned int vm_pageout_reserved_internal = 0;
235 unsigned int vm_pageout_reserved_really = 0;
236
237 unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
238 unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
239 unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
240 unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
241 unsigned int vm_pageout_deadlock_relief = 0;
242 unsigned int vm_pageout_inactive_relief = 0;
243 unsigned int vm_pageout_burst_active_throttle = 0;
244 unsigned int vm_pageout_burst_inactive_throttle = 0;
245
246 /*
247 * Protection against zero fill flushing live working sets derived
248 * from existing backing store and files
249 */
250 unsigned int vm_accellerate_zf_pageout_trigger = 400;
251 unsigned int vm_zf_iterator;
252 unsigned int vm_zf_iterator_count = 40;
253 unsigned int last_page_zf;
254 unsigned int vm_zf_count = 0;
255
256 /*
257 * These variables record the pageout daemon's actions:
258 * how many pages it looks at and what happens to those pages.
259 * No locking needed because only one thread modifies the variables.
260 */
261
262 unsigned int vm_pageout_active = 0; /* debugging */
263 unsigned int vm_pageout_inactive = 0; /* debugging */
264 unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
265 unsigned int vm_pageout_inactive_forced = 0; /* debugging */
266 unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
267 unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
268 unsigned int vm_pageout_inactive_busy = 0; /* debugging */
269 unsigned int vm_pageout_inactive_absent = 0; /* debugging */
270 unsigned int vm_pageout_inactive_used = 0; /* debugging */
271 unsigned int vm_pageout_inactive_clean = 0; /* debugging */
272 unsigned int vm_pageout_inactive_dirty = 0; /* debugging */
273 unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
274 unsigned int vm_pageout_purged_objects = 0; /* debugging */
275 unsigned int vm_stat_discard = 0; /* debugging */
276 unsigned int vm_stat_discard_sent = 0; /* debugging */
277 unsigned int vm_stat_discard_failure = 0; /* debugging */
278 unsigned int vm_stat_discard_throttle = 0; /* debugging */
279
280 unsigned int vm_pageout_scan_active_throttled = 0;
281 unsigned int vm_pageout_scan_inactive_throttled = 0;
282 unsigned int vm_pageout_scan_throttle = 0; /* debugging */
283 unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
284 unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
285 unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
286 unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
287 unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
288 /*
289 * Backing store throttle when BS is exhausted
290 */
291 unsigned int vm_backing_store_low = 0;
292
293 unsigned int vm_pageout_out_of_line = 0;
294 unsigned int vm_pageout_in_place = 0;
295
296 /*
297 * ENCRYPTED SWAP:
298 * counters and statistics...
299 */
300 unsigned long vm_page_decrypt_counter = 0;
301 unsigned long vm_page_decrypt_for_upl_counter = 0;
302 unsigned long vm_page_encrypt_counter = 0;
303 unsigned long vm_page_encrypt_abort_counter = 0;
304 unsigned long vm_page_encrypt_already_encrypted_counter = 0;
305 boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
306
307
308 struct vm_pageout_queue vm_pageout_queue_internal;
309 struct vm_pageout_queue vm_pageout_queue_external;
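
/*
 * A minimal sketch (not compiled) of how the two queues and the
 * VM_PAGE_Q_THROTTLED() macro above are typically consulted: pick the
 * internal or external queue based on the page's object, then test the
 * laundry count against the queue's limit.  The helper name is purely
 * illustrative; the page queues lock is assumed to be held by the caller.
 */
#if 0
static boolean_t
example_pageout_queue_is_throttled(vm_page_t m)
{
	struct vm_pageout_queue *q;

	if (m->object->internal == TRUE)
		q = &vm_pageout_queue_internal;
	else
		q = &vm_pageout_queue_external;

	return (VM_PAGE_Q_THROTTLED(q));
}
#endif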
310
311
312 /*
313 * Routine: vm_backing_store_disable
314 * Purpose:
315 * Suspend non-privileged threads wishing to extend
316 * backing store when we are low on backing store
317 * (Synchronized by caller)
318 */
319 void
320 vm_backing_store_disable(
321 boolean_t disable)
322 {
323 if(disable) {
324 vm_backing_store_low = 1;
325 } else {
326 if(vm_backing_store_low) {
327 vm_backing_store_low = 0;
328 thread_wakeup((event_t) &vm_backing_store_low);
329 }
330 }
331 }
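
/*
 * A hedged sketch (not compiled) of the waiting side that pairs with the
 * wakeup above: a non-privileged thread wanting to extend backing store
 * would sleep on &vm_backing_store_low until vm_backing_store_disable(FALSE)
 * wakes it.  The real waiters live elsewhere in the VM code and recheck the
 * flag under the appropriate lock; this only shows the event pairing.
 */
#if 0
static void
example_wait_for_backing_store(void)
{
	while (vm_backing_store_low) {
		assert_wait((event_t) &vm_backing_store_low, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
#endif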
332
333
334 /*
335 * Routine: vm_pageout_object_allocate
336 * Purpose:
337 * Allocate an object for use as out-of-line memory in a
338 * data_return/data_initialize message.
339 * The page must be in an unlocked object.
340 *
341 * If the page belongs to a trusted pager, cleaning in place
342 * will be used, which utilizes a special "pageout object"
343 * containing private alias pages for the real page frames.
344 * Untrusted pagers use normal out-of-line memory.
345 */
346 vm_object_t
347 vm_pageout_object_allocate(
348 vm_page_t m,
349 vm_size_t size,
350 vm_object_offset_t offset)
351 {
352 vm_object_t object = m->object;
353 vm_object_t new_object;
354
355 assert(object->pager_ready);
356
357 new_object = vm_object_allocate(size);
358
359 if (object->pager_trusted) {
360 assert (offset < object->size);
361
362 vm_object_lock(new_object);
363 new_object->pageout = TRUE;
364 new_object->shadow = object;
365 new_object->can_persist = FALSE;
366 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
367 new_object->shadow_offset = offset;
368 vm_object_unlock(new_object);
369
370 /*
371 * Take a paging reference on the object. This will be dropped
372 * in vm_pageout_object_terminate()
373 */
374 vm_object_lock(object);
375 vm_object_paging_begin(object);
376 vm_page_lock_queues();
377 vm_page_unlock_queues();
378 vm_object_unlock(object);
379
380 vm_pageout_in_place++;
381 } else
382 vm_pageout_out_of_line++;
383 return(new_object);
384 }
385
386 #if MACH_CLUSTER_STATS
387 unsigned long vm_pageout_cluster_dirtied = 0;
388 unsigned long vm_pageout_cluster_cleaned = 0;
389 unsigned long vm_pageout_cluster_collisions = 0;
390 unsigned long vm_pageout_cluster_clusters = 0;
391 unsigned long vm_pageout_cluster_conversions = 0;
392 unsigned long vm_pageout_target_collisions = 0;
393 unsigned long vm_pageout_target_page_dirtied = 0;
394 unsigned long vm_pageout_target_page_freed = 0;
395 #define CLUSTER_STAT(clause) clause
396 #else /* MACH_CLUSTER_STATS */
397 #define CLUSTER_STAT(clause)
398 #endif /* MACH_CLUSTER_STATS */
399
400 /*
401 * Routine: vm_pageout_object_terminate
402 * Purpose:
403 * Destroy the pageout_object allocated by
404 * vm_pageout_object_allocate(), and perform all of the
405 * required cleanup actions.
406 *
407 * In/Out conditions:
408 * The object must be locked, and will be returned locked.
409 */
410 void
411 vm_pageout_object_terminate(
412 vm_object_t object)
413 {
414 vm_object_t shadow_object;
415 boolean_t shadow_internal;
416
417 /*
418 * Deal with the deallocation (last reference) of a pageout object
419 * (used for cleaning-in-place) by dropping the paging references/
420 * freeing pages in the original object.
421 */
422
423 assert(object->pageout);
424 shadow_object = object->shadow;
425 vm_object_lock(shadow_object);
426 shadow_internal = shadow_object->internal;
427
428 while (!queue_empty(&object->memq)) {
429 vm_page_t p, m;
430 vm_object_offset_t offset;
431
432 p = (vm_page_t) queue_first(&object->memq);
433
434 assert(p->private);
435 assert(p->pageout);
436 p->pageout = FALSE;
437 assert(!p->cleaning);
438
439 offset = p->offset;
440 VM_PAGE_FREE(p);
441 p = VM_PAGE_NULL;
442
443 m = vm_page_lookup(shadow_object,
444 offset + object->shadow_offset);
445
446 if(m == VM_PAGE_NULL)
447 continue;
448 assert(m->cleaning);
449 /* used as a trigger on upl_commit etc to recognize the */
450 /* pageout daemon's subsequent desire to pageout a cleaning */
451 /* page. When the bit is on the upl commit code will */
452 /* respect the pageout bit in the target page over the */
453 /* caller's page list indication */
454 m->dump_cleaning = FALSE;
455
456 /*
457 * Account for the paging reference taken when
458 * m->cleaning was set on this page.
459 */
460 vm_object_paging_end(shadow_object);
461 assert((m->dirty) || (m->precious) ||
462 (m->busy && m->cleaning));
463
464 /*
465 * Handle the trusted pager throttle.
466 * Also decrement the burst throttle (if external).
467 */
468 vm_page_lock_queues();
469 if (m->laundry) {
470 vm_pageout_throttle_up(m);
471 }
472
473 /*
474 * Handle the "target" page(s). These pages are to be freed if
475 * successfully cleaned. Target pages are always busy, and are
476 * wired exactly once. The initial target pages are not mapped,
477 * (so cannot be referenced or modified) but converted target
478 * pages may have been modified between the selection as an
479 * adjacent page and conversion to a target.
480 */
481 if (m->pageout) {
482 assert(m->busy);
483 assert(m->wire_count == 1);
484 m->cleaning = FALSE;
485 m->pageout = FALSE;
486 #if MACH_CLUSTER_STATS
487 if (m->wanted) vm_pageout_target_collisions++;
488 #endif
489 /*
490 * Revoke all access to the page. Since the object is
491 * locked, and the page is busy, this prevents the page
492 * from being dirtied after the pmap_disconnect() call
493 * returns.
494 *
495 * Since the page is left "dirty" but "not modified", we
496 * can detect whether the page was redirtied during
497 * pageout by checking the modify state.
498 */
499 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
500 m->dirty = TRUE;
501 else
502 m->dirty = FALSE;
503
504 if (m->dirty) {
505 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
506 vm_page_unwire(m);/* reactivates */
507 VM_STAT(reactivations++);
508 PAGE_WAKEUP_DONE(m);
509 } else {
510 CLUSTER_STAT(vm_pageout_target_page_freed++;)
511 vm_page_free(m);/* clears busy, etc. */
512 }
513 vm_page_unlock_queues();
514 continue;
515 }
516 /*
517 * Handle the "adjacent" pages. These pages were cleaned in
518 * place, and should be left alone.
519 * If the page is not already on a queue, activate it if it
520 * was referenced; otherwise deactivate it.
521 */
522 if (!m->active && !m->inactive && !m->private) {
523 if (m->reference)
524 vm_page_activate(m);
525 else
526 vm_page_deactivate(m);
527 }
528 if((m->busy) && (m->cleaning)) {
529
530 /* the request_page_list case, (COPY_OUT_FROM FALSE) */
531 m->busy = FALSE;
532
533 /* We do not re-set m->dirty ! */
534 /* The page was busy so no extraneous activity */
535 /* could have occurred. COPY_INTO is a read into the */
536 /* new pages. CLEAN_IN_PLACE does actually write */
537 /* out the pages but handling outside of this code */
538 /* will take care of resetting dirty. We clear the */
539 /* modify however for the Programmed I/O case. */
540 pmap_clear_modify(m->phys_page);
541 if(m->absent) {
542 m->absent = FALSE;
543 if(shadow_object->absent_count == 1)
544 vm_object_absent_release(shadow_object);
545 else
546 shadow_object->absent_count--;
547 }
548 m->overwriting = FALSE;
549 } else if (m->overwriting) {
550 /* alternate request page list, write to page_list */
551 /* case. Occurs when the original page was wired */
552 /* at the time of the list request */
553 assert(m->wire_count != 0);
554 vm_page_unwire(m);/* reactivates */
555 m->overwriting = FALSE;
556 } else {
557 /*
558 * Set the dirty state according to whether or not the page was
559 * modified during the pageout. Note that we purposefully do
560 * NOT call pmap_clear_modify since the page is still mapped.
561 * If the page were to be dirtied between the 2 calls,
562 * this fact would be lost. This code is only necessary to
563 * maintain statistics, since the pmap module is always
564 * consulted if m->dirty is false.
565 */
566 #if MACH_CLUSTER_STATS
567 m->dirty = pmap_is_modified(m->phys_page);
568
569 if (m->dirty) vm_pageout_cluster_dirtied++;
570 else vm_pageout_cluster_cleaned++;
571 if (m->wanted) vm_pageout_cluster_collisions++;
572 #else
573 m->dirty = 0;
574 #endif
575 }
576 m->cleaning = FALSE;
577
578 /*
579 * Wakeup any thread waiting for the page to be un-cleaning.
580 */
581 PAGE_WAKEUP(m);
582 vm_page_unlock_queues();
583 }
584 /*
585 * Account for the paging reference taken in vm_paging_object_allocate.
586 */
587 vm_object_paging_end(shadow_object);
588 vm_object_unlock(shadow_object);
589
590 assert(object->ref_count == 0);
591 assert(object->paging_in_progress == 0);
592 assert(object->resident_page_count == 0);
593 return;
594 }
595
596 /*
597 * Routine: vm_pageout_setup
598 * Purpose:
599 * Set up a page for pageout (clean & flush).
600 *
601 * Move the page to a new object, as part of which it will be
602 * sent to its memory manager in a memory_object_data_write or
603 * memory_object_initialize message.
604 *
605 * The "new_object" and "new_offset" arguments
606 * indicate where the page should be moved.
607 *
608 * In/Out conditions:
609 * The page in question must not be on any pageout queues,
610 * and must be busy. The object to which it belongs
611 * must be unlocked, and the caller must hold a paging
612 * reference to it. The new_object must not be locked.
613 *
614 * This routine returns a pointer to a place-holder page,
615 * inserted at the same offset, to block out-of-order
616 * requests for the page. The place-holder page must
617 * be freed after the data_write or initialize message
618 * has been sent.
619 *
620 * The original page is put on a paging queue and marked
621 * not busy on exit.
622 */
623 vm_page_t
624 vm_pageout_setup(
625 register vm_page_t m,
626 register vm_object_t new_object,
627 vm_object_offset_t new_offset)
628 {
629 register vm_object_t old_object = m->object;
630 vm_object_offset_t paging_offset;
631 vm_object_offset_t offset;
632 register vm_page_t holding_page;
633 register vm_page_t new_m;
634 boolean_t need_to_wire = FALSE;
635
636
637 XPR(XPR_VM_PAGEOUT,
638 "vm_pageout_setup, obj 0x%X off 0x%X page 0x%X new obj 0x%X offset 0x%X\n",
639 (integer_t)m->object, (integer_t)m->offset,
640 (integer_t)m, (integer_t)new_object,
641 (integer_t)new_offset);
642 assert(m && m->busy && !m->absent && !m->fictitious && !m->error &&
643 !m->restart);
644
645 assert(m->dirty || m->precious);
646
647 /*
648 * Create a place-holder page where the old one was, to prevent
649 * attempted pageins of this page while we're unlocked.
650 */
651 VM_PAGE_GRAB_FICTITIOUS(holding_page);
652
653 vm_object_lock(old_object);
654
655 offset = m->offset;
656 paging_offset = offset + old_object->paging_offset;
657
658 if (old_object->pager_trusted) {
659 /*
660 * This pager is trusted, so we can clean this page
661 * in place. Leave it in the old object, and mark it
662 * cleaning & pageout.
663 */
664 new_m = holding_page;
665 holding_page = VM_PAGE_NULL;
666
667 /*
668 * Set up new page to be private shadow of real page.
669 */
670 new_m->phys_page = m->phys_page;
671 new_m->fictitious = FALSE;
672 new_m->pageout = TRUE;
673
674 /*
675 * Mark real page as cleaning (indicating that we hold a
676 * paging reference to be released via m_o_d_r_c) and
677 * pageout (indicating that the page should be freed
678 * when the pageout completes).
679 */
680 pmap_clear_modify(m->phys_page);
681 vm_page_lock_queues();
682 new_m->private = TRUE;
683 vm_page_wire(new_m);
684 m->cleaning = TRUE;
685 m->pageout = TRUE;
686
687 vm_page_wire(m);
688 assert(m->wire_count == 1);
689 vm_page_unlock_queues();
690
691 m->dirty = TRUE;
692 m->precious = FALSE;
693 m->page_lock = VM_PROT_NONE;
694 m->unusual = FALSE;
695 m->unlock_request = VM_PROT_NONE;
696 } else {
697 /*
698 * Cannot clean in place, so rip the old page out of the
699 * object, and stick the holding page in. Set new_m to the
700 * page in the new object.
701 */
702 vm_page_lock_queues();
703 VM_PAGE_QUEUES_REMOVE(m);
704 vm_page_remove(m);
705
706 vm_page_insert(holding_page, old_object, offset);
707 vm_page_unlock_queues();
708
709 m->dirty = TRUE;
710 m->precious = FALSE;
711 new_m = m;
712 new_m->page_lock = VM_PROT_NONE;
713 new_m->unlock_request = VM_PROT_NONE;
714
715 if (old_object->internal)
716 need_to_wire = TRUE;
717 }
718 /*
719 * Record that this page has been written out
720 */
721 #if MACH_PAGEMAP
722 vm_external_state_set(old_object->existence_map, offset);
723 #endif /* MACH_PAGEMAP */
724
725 vm_object_unlock(old_object);
726
727 vm_object_lock(new_object);
728
729 /*
730 * Put the page into the new object. If it is not wired
731 * (i.e., if it's the real page), it will be activated.
732 */
733
734 vm_page_lock_queues();
735 vm_page_insert(new_m, new_object, new_offset);
736 if (need_to_wire)
737 vm_page_wire(new_m);
738 else
739 vm_page_activate(new_m);
740 PAGE_WAKEUP_DONE(new_m);
741 vm_page_unlock_queues();
742
743 vm_object_unlock(new_object);
744
745 /*
746 * Return the placeholder page to simplify cleanup.
747 */
748 return (holding_page);
749 }
750
751 /*
752 * Routine: vm_pageclean_setup
753 *
754 * Purpose: setup a page to be cleaned (made non-dirty), but not
755 * necessarily flushed from the VM page cache.
756 * This is accomplished by cleaning in place.
757 *
758 * The page must not be busy, and the object and page
759 * queues must be locked.
760 *
761 */
762 void
763 vm_pageclean_setup(
764 vm_page_t m,
765 vm_page_t new_m,
766 vm_object_t new_object,
767 vm_object_offset_t new_offset)
768 {
769 vm_object_t old_object = m->object;
770 assert(!m->busy);
771 assert(!m->cleaning);
772
773 XPR(XPR_VM_PAGEOUT,
774 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
775 (integer_t)old_object, m->offset, (integer_t)m,
776 (integer_t)new_m, new_offset);
777
778 pmap_clear_modify(m->phys_page);
779 vm_object_paging_begin(old_object);
780
781 /*
782 * Record that this page has been written out
783 */
784 #if MACH_PAGEMAP
785 vm_external_state_set(old_object->existence_map, m->offset);
786 #endif /*MACH_PAGEMAP*/
787
788 /*
789 * Mark original page as cleaning in place.
790 */
791 m->cleaning = TRUE;
792 m->dirty = TRUE;
793 m->precious = FALSE;
794
795 /*
796 * Convert the fictitious page to a private shadow of
797 * the real page.
798 */
799 assert(new_m->fictitious);
800 new_m->fictitious = FALSE;
801 new_m->private = TRUE;
802 new_m->pageout = TRUE;
803 new_m->phys_page = m->phys_page;
804 vm_page_wire(new_m);
805
806 vm_page_insert(new_m, new_object, new_offset);
807 assert(!new_m->wanted);
808 new_m->busy = FALSE;
809 }
810
811 void
812 vm_pageclean_copy(
813 vm_page_t m,
814 vm_page_t new_m,
815 vm_object_t new_object,
816 vm_object_offset_t new_offset)
817 {
818 XPR(XPR_VM_PAGEOUT,
819 "vm_pageclean_copy, page 0x%X new_m 0x%X new_obj 0x%X offset 0x%X\n",
820 m, new_m, new_object, new_offset, 0);
821
822 assert((!m->busy) && (!m->cleaning));
823
824 assert(!new_m->private && !new_m->fictitious);
825
826 pmap_clear_modify(m->phys_page);
827
828 m->busy = TRUE;
829 vm_object_paging_begin(m->object);
830 vm_page_unlock_queues();
831 vm_object_unlock(m->object);
832
833 /*
834 * Copy the original page to the new page.
835 */
836 vm_page_copy(m, new_m);
837
838 /*
839 * Mark the old page as clean. A request to pmap_is_modified
840 * will get the right answer.
841 */
842 vm_object_lock(m->object);
843 m->dirty = FALSE;
844
845 vm_object_paging_end(m->object);
846
847 vm_page_lock_queues();
848 if (!m->active && !m->inactive)
849 vm_page_activate(m);
850 PAGE_WAKEUP_DONE(m);
851
852 vm_page_insert(new_m, new_object, new_offset);
853 vm_page_activate(new_m);
854 new_m->busy = FALSE; /* No other thread can be waiting */
855 }
856
857
858 /*
859 * Routine: vm_pageout_initialize_page
860 * Purpose:
861 * Causes the specified page to be initialized in
862 * the appropriate memory object. This routine is used to push
863 * pages into a copy-object when they are modified in the
864 * permanent object.
865 *
866 * The page is moved to a temporary object and paged out.
867 *
868 * In/out conditions:
869 * The page in question must not be on any pageout queues.
870 * The object to which it belongs must be locked.
871 * The page must be busy, but not hold a paging reference.
872 *
873 * Implementation:
874 * Move this page to a completely new object.
875 */
876 void
877 vm_pageout_initialize_page(
878 vm_page_t m)
879 {
880 vm_object_t object;
881 vm_object_offset_t paging_offset;
882 vm_page_t holding_page;
883
884
885 XPR(XPR_VM_PAGEOUT,
886 "vm_pageout_initialize_page, page 0x%X\n",
887 (integer_t)m, 0, 0, 0, 0);
888 assert(m->busy);
889
890 /*
891 * Verify that we really want to clean this page
892 */
893 assert(!m->absent);
894 assert(!m->error);
895 assert(m->dirty);
896
897 /*
898 * Create a paging reference to let us play with the object.
899 */
900 object = m->object;
901 paging_offset = m->offset + object->paging_offset;
902 vm_object_paging_begin(object);
903 if (m->absent || m->error || m->restart ||
904 (!m->dirty && !m->precious)) {
905 VM_PAGE_FREE(m);
906 panic("reservation without pageout?"); /* alan */
907 vm_object_unlock(object);
908 return;
909 }
910
911 /* set the page for future call to vm_fault_list_request */
912 holding_page = NULL;
913 vm_page_lock_queues();
914 pmap_clear_modify(m->phys_page);
915 m->dirty = TRUE;
916 m->busy = TRUE;
917 m->list_req_pending = TRUE;
918 m->cleaning = TRUE;
919 m->pageout = TRUE;
920 vm_page_wire(m);
921 vm_page_unlock_queues();
922 vm_object_unlock(object);
923
924 /*
925 * Write the data to its pager.
926 * Note that the data is passed by naming the new object,
927 * not a virtual address; the pager interface has been
928 * manipulated to use the "internal memory" data type.
929 * [The object reference from its allocation is donated
930 * to the eventual recipient.]
931 */
932 memory_object_data_initialize(object->pager,
933 paging_offset,
934 PAGE_SIZE);
935
936 vm_object_lock(object);
937 }
938
939 #if MACH_CLUSTER_STATS
940 #define MAXCLUSTERPAGES 16
941 struct {
942 unsigned long pages_in_cluster;
943 unsigned long pages_at_higher_offsets;
944 unsigned long pages_at_lower_offsets;
945 } cluster_stats[MAXCLUSTERPAGES];
946 #endif /* MACH_CLUSTER_STATS */
947
948 boolean_t allow_clustered_pageouts = FALSE;
949
950 /*
951 * vm_pageout_cluster:
952 *
953 * Given a page, queue it to the appropriate I/O thread,
954 * which will page it out and attempt to clean adjacent pages
955 * in the same operation.
956 *
957 * The page must be busy, and the object and queues locked. We will take a
958 * paging reference to prevent deallocation or collapse when we
959 * release the object lock back at the call site. The I/O thread
960 * is responsible for consuming this reference
961 *
962 * The page must not be on any pageout queue.
963 */
964
965 void
966 vm_pageout_cluster(vm_page_t m)
967 {
968 vm_object_t object = m->object;
969 struct vm_pageout_queue *q;
970
971
972 XPR(XPR_VM_PAGEOUT,
973 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
974 (integer_t)object, m->offset, (integer_t)m, 0, 0);
975
976 /*
977 * Only a certain kind of page is appreciated here.
978 */
979 assert(m->busy && (m->dirty || m->precious) && (m->wire_count == 0));
980 assert(!m->cleaning && !m->pageout && !m->inactive && !m->active);
981
982 /*
983 * protect the object from collapse -
984 * locking in the object's paging_offset.
985 */
986 vm_object_paging_begin(object);
987
988 /*
989 * set the page for future call to vm_fault_list_request
990 * page should already be marked busy
991 */
992 vm_page_wire(m);
993 m->list_req_pending = TRUE;
994 m->cleaning = TRUE;
995 m->pageout = TRUE;
996 m->laundry = TRUE;
997
998 if (object->internal == TRUE)
999 q = &vm_pageout_queue_internal;
1000 else
1001 q = &vm_pageout_queue_external;
1002 q->pgo_laundry++;
1003
1004 m->pageout_queue = TRUE;
1005 queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
1006
1007 if (q->pgo_idle == TRUE) {
1008 q->pgo_idle = FALSE;
1009 thread_wakeup((event_t) &q->pgo_pending);
1010 }
1011 }
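
/*
 * A minimal caller-side sketch (not compiled) of the contract described
 * above: the object and the page queues are locked, the page is busy and
 * dirty (or precious) and off the pageout queues, and vm_pageout_cluster()
 * takes its own paging reference before the caller drops the locks.  The
 * function name is illustrative only.
 */
#if 0
static void
example_hand_page_to_iothread(vm_page_t m)
{
	vm_object_t object = m->object;

	vm_object_lock(object);
	vm_page_lock_queues();

	assert(m->busy && (m->dirty || m->precious));

	vm_pageout_cluster(m);		/* queues m and wakes the iothread */

	vm_page_unlock_queues();
	vm_object_unlock(object);
}
#endif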
1012
1013
1014 unsigned long vm_pageout_throttle_up_count = 0;
1015
1016 /*
1017 * A page is back from laundry. See if there are some pages waiting to
1018 * go to laundry and if we can let some of them go now.
1019 *
1020 * Object and page queues must be locked.
1021 */
1022 void
1023 vm_pageout_throttle_up(
1024 vm_page_t m)
1025 {
1026 struct vm_pageout_queue *q;
1027
1028 vm_pageout_throttle_up_count++;
1029
1030 assert(m->laundry);
1031 assert(m->object != VM_OBJECT_NULL);
1032 assert(m->object != kernel_object);
1033
1034 if (m->object->internal == TRUE)
1035 q = &vm_pageout_queue_internal;
1036 else
1037 q = &vm_pageout_queue_external;
1038
1039 m->laundry = FALSE;
1040 q->pgo_laundry--;
1041
1042 if (q->pgo_throttled == TRUE) {
1043 q->pgo_throttled = FALSE;
1044 thread_wakeup((event_t) &q->pgo_laundry);
1045 }
1046 }
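
/*
 * For reference, a sketch of the waiting side that this wakeup pairs with
 * (taken from vm_pageout_scan() below; not compiled here): when the internal
 * queue is full, the scan flags it as throttled and sleeps on pgo_laundry
 * until vm_pageout_throttle_up() brings the count back down.
 */
#if 0
	iq->pgo_throttled = TRUE;
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE,
	                    msecs, 1000*NSEC_PER_USEC);
	vm_page_unlock_queues();
	thread_block(THREAD_CONTINUE_NULL);
	vm_page_lock_queues();
	iq->pgo_throttled = FALSE;
#endif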
1047
1048
1049 /*
1050 * vm_pageout_scan does the dirty work for the pageout daemon.
1051 * It returns with vm_page_queue_free_lock held and
1052 * vm_page_free_wanted == 0.
1053 */
1054
1055 #define DELAYED_UNLOCK_LIMIT (3 * MAX_UPL_TRANSFER)
1056
1057 #define FCS_IDLE 0
1058 #define FCS_DELAYED 1
1059 #define FCS_DEADLOCK_DETECTED 2
1060
1061 struct flow_control {
1062 int state;
1063 mach_timespec_t ts;
1064 };
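
/*
 * A sketch of how vm_pageout_scan() below drives flow_control.state while
 * the default pager's (internal) queue is throttled:
 *
 *	FCS_IDLE		arm a timer vm_pageout_deadlock_wait msecs out
 *				and move to FCS_DELAYED
 *	FCS_DELAYED		if the timer expires with the queue still
 *				throttled, assume a potential deadlock, set
 *				vm_pageout_deadlock_target and move to
 *				FCS_DEADLOCK_DETECTED
 *	FCS_DEADLOCK_DETECTED	move clean pages (or pages bound for the
 *				external pagers) until the target drains,
 *				then re-arm the timer (back to FCS_DELAYED)
 *
 * Once the internal queue is no longer throttled, the state returns to
 * FCS_IDLE.
 */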
1065
1066 void
1067 vm_pageout_scan(void)
1068 {
1069 unsigned int loop_count = 0;
1070 unsigned int inactive_burst_count = 0;
1071 unsigned int active_burst_count = 0;
1072 vm_page_t local_freeq = 0;
1073 int local_freed = 0;
1074 int delayed_unlock = 0;
1075 int need_internal_inactive = 0;
1076 int refmod_state = 0;
1077 int vm_pageout_deadlock_target = 0;
1078 struct vm_pageout_queue *iq;
1079 struct vm_pageout_queue *eq;
1080 struct flow_control flow_control;
1081 boolean_t active_throttled = FALSE;
1082 boolean_t inactive_throttled = FALSE;
1083 mach_timespec_t ts;
1084 unsigned int msecs = 0;
1085 vm_object_t object;
1086
1087
1088 flow_control.state = FCS_IDLE;
1089 iq = &vm_pageout_queue_internal;
1090 eq = &vm_pageout_queue_external;
1091
1092 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1093
1094 /*???*/ /*
1095 * We want to gradually dribble pages from the active queue
1096 * to the inactive queue. If we let the inactive queue get
1097 * very small, and then suddenly dump many pages into it,
1098 * those pages won't get a sufficient chance to be referenced
1099 * before we start taking them from the inactive queue.
1100 *
1101 * We must limit the rate at which we send pages to the pagers.
1102 * data_write messages consume memory, for message buffers and
1103 * for map-copy objects. If we get too far ahead of the pagers,
1104 * we can potentially run out of memory.
1105 *
1106 * We can use the laundry count to limit directly the number
1107 * of pages outstanding to the default pager. A similar
1108 * strategy for external pagers doesn't work, because
1109 * external pagers don't have to deallocate the pages sent them,
1110 * and because we might have to send pages to external pagers
1111 * even if they aren't processing writes. So we also
1112 * use a burst count to limit writes to external pagers.
1113 *
1114 * When memory is very tight, we can't rely on external pagers to
1115 * clean pages. They probably aren't running, because they
1116 * aren't vm-privileged. If we kept sending dirty pages to them,
1117 * we could exhaust the free list.
1118 */
1119 vm_page_lock_queues();
1120 delayed_unlock = 1;
1121
1122
1123 Restart:
1124 /*
1125 * Recalculate vm_page_inactive_target.
1126 */
1127 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
1128 vm_page_inactive_count);
1129 object = NULL;
1130
1131 for (;;) {
1132 vm_page_t m;
1133
1134 if (delayed_unlock == 0)
1135 vm_page_lock_queues();
1136
1137 active_burst_count = vm_page_active_count;
1138
1139 if (active_burst_count > vm_pageout_burst_active_throttle)
1140 active_burst_count = vm_pageout_burst_active_throttle;
1141
1142 /*
1143 * Move pages from active to inactive.
1144 */
1145 while ((need_internal_inactive ||
1146 vm_page_inactive_count < vm_page_inactive_target) &&
1147 !queue_empty(&vm_page_queue_active) &&
1148 ((active_burst_count--) > 0)) {
1149
1150 vm_pageout_active++;
1151
1152 m = (vm_page_t) queue_first(&vm_page_queue_active);
1153
1154 assert(m->active && !m->inactive);
1155 assert(!m->laundry);
1156 assert(m->object != kernel_object);
1157
1158 /*
1159 * Try to lock object; since we've already got the
1160 * page queues lock, we can only 'try' for this one.
1161 * if the 'try' fails, we need to do a mutex_pause
1162 * to allow the owner of the object lock a chance to
1163 * run... otherwise, we're likely to trip over this
1164 * object in the same state as we work our way through
1165 * the queue... clumps of pages associated with the same
1166 * object are fairly typical on the inactive and active queues
1167 */
1168 if (m->object != object) {
1169 if (object != NULL) {
1170 vm_object_unlock(object);
1171 object = NULL;
1172 }
1173 if (!vm_object_lock_try(m->object)) {
1174 /*
1175 * move page to end of active queue and continue
1176 */
1177 queue_remove(&vm_page_queue_active, m,
1178 vm_page_t, pageq);
1179 queue_enter(&vm_page_queue_active, m,
1180 vm_page_t, pageq);
1181
1182 goto done_with_activepage;
1183 }
1184 object = m->object;
1185 }
1186 /*
1187 * if the page is BUSY, then we pull it
1188 * off the active queue and leave it alone.
1189 * when BUSY is cleared, it will get stuck
1190 * back on the appropriate queue
1191 */
1192 if (m->busy) {
1193 queue_remove(&vm_page_queue_active, m,
1194 vm_page_t, pageq);
1195 m->pageq.next = NULL;
1196 m->pageq.prev = NULL;
1197
1198 if (!m->fictitious)
1199 vm_page_active_count--;
1200 m->active = FALSE;
1201
1202 goto done_with_activepage;
1203 }
1204 if (need_internal_inactive) {
1205 /*
1206 * If we're unable to make forward progress
1207 * with the current set of pages on the
1208 * inactive queue due to busy objects or
1209 * throttled pageout queues, then
1210 * move a page that is already clean
1211 * or belongs to a pageout queue that
1212 * isn't currently throttled
1213 */
1214 active_throttled = FALSE;
1215
1216 if (object->internal) {
1217 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1218 active_throttled = TRUE;
1219 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1220 active_throttled = TRUE;
1221 }
1222 if (active_throttled == TRUE) {
1223 if (!m->dirty) {
1224 refmod_state = pmap_get_refmod(m->phys_page);
1225
1226 if (refmod_state & VM_MEM_REFERENCED)
1227 m->reference = TRUE;
1228 if (refmod_state & VM_MEM_MODIFIED)
1229 m->dirty = TRUE;
1230 }
1231 if (m->dirty || m->precious) {
1232 /*
1233 * page is dirty and targets a THROTTLED queue
1234 * so all we can do is move it back to the
1235 * end of the active queue to get it out
1236 * of the way
1237 */
1238 queue_remove(&vm_page_queue_active, m,
1239 vm_page_t, pageq);
1240 queue_enter(&vm_page_queue_active, m,
1241 vm_page_t, pageq);
1242
1243 vm_pageout_scan_active_throttled++;
1244
1245 goto done_with_activepage;
1246 }
1247 }
1248 vm_pageout_scan_active_throttle_success++;
1249 need_internal_inactive--;
1250 }
1251 /*
1252 * Deactivate the page while holding the object
1253 * locked, so we know the page is still not busy.
1254 * This should prevent races between pmap_enter
1255 * and pmap_clear_reference. The page might be
1256 * absent or fictitious, but vm_page_deactivate
1257 * can handle that.
1258 */
1259 vm_page_deactivate(m);
1260 done_with_activepage:
1261 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1262
1263 if (object != NULL) {
1264 vm_object_unlock(object);
1265 object = NULL;
1266 }
1267 if (local_freeq) {
1268 vm_page_free_list(local_freeq);
1269
1270 local_freeq = 0;
1271 local_freed = 0;
1272 }
1273 delayed_unlock = 0;
1274 vm_page_unlock_queues();
1275
1276 mutex_pause();
1277 vm_page_lock_queues();
1278 /*
1279 * continue the while loop processing
1280 * the active queue... need to hold
1281 * the page queues lock
1282 */
1283 continue;
1284 }
1285 }
1286
1287
1288
1289 /**********************************************************************
1290 * above this point we're playing with the active queue
1291 * below this point we're playing with the throttling mechanisms
1292 * and the inactive queue
1293 **********************************************************************/
1294
1295
1296
1297 /*
1298 * We are done if we have met our target *and*
1299 * nobody is still waiting for a page.
1300 */
1301 if (vm_page_free_count + local_freed >= vm_page_free_target) {
1302 if (object != NULL) {
1303 vm_object_unlock(object);
1304 object = NULL;
1305 }
1306 if (local_freeq) {
1307 vm_page_free_list(local_freeq);
1308
1309 local_freeq = 0;
1310 local_freed = 0;
1311 }
1312 mutex_lock(&vm_page_queue_free_lock);
1313
1314 if ((vm_page_free_count >= vm_page_free_target) &&
1315 (vm_page_free_wanted == 0)) {
1316
1317 vm_page_unlock_queues();
1318
1319 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1320 return;
1321 }
1322 mutex_unlock(&vm_page_queue_free_lock);
1323 }
1324
1325
1326 /*
1327 * Sometimes we have to pause:
1328 * 1) No inactive pages - nothing to do.
1329 * 2) Flow control - default pageout queue is full
1330 * 3) Loop control - no acceptable pages found on the inactive queue
1331 * within the last vm_pageout_burst_inactive_throttle iterations
1332 */
1333 if ((queue_empty(&vm_page_queue_inactive) && queue_empty(&vm_page_queue_zf))) {
1334 vm_pageout_scan_empty_throttle++;
1335 msecs = vm_pageout_empty_wait;
1336 goto vm_pageout_scan_delay;
1337
1338 } else if (inactive_burst_count >= vm_pageout_burst_inactive_throttle) {
1339 vm_pageout_scan_burst_throttle++;
1340 msecs = vm_pageout_burst_wait;
1341 goto vm_pageout_scan_delay;
1342
1343 } else if (VM_PAGE_Q_THROTTLED(iq)) {
1344
1345 switch (flow_control.state) {
1346
1347 case FCS_IDLE:
1348 reset_deadlock_timer:
1349 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
1350 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
1351 clock_get_system_nanotime(
1352 &flow_control.ts.tv_sec,
1353 (uint32_t *) &flow_control.ts.tv_nsec);
1354 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
1355
1356 flow_control.state = FCS_DELAYED;
1357 msecs = vm_pageout_deadlock_wait;
1358
1359 break;
1360
1361 case FCS_DELAYED:
1362 clock_get_system_nanotime(
1363 &ts.tv_sec,
1364 (uint32_t *) &ts.tv_nsec);
1365
1366 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
1367 /*
1368 * the pageout thread for the default pager is potentially
1369 * deadlocked since the
1370 * default pager queue has been throttled for more than the
1371 * allowable time... we need to move some clean pages or dirty
1372 * pages belonging to the external pagers if they aren't throttled
1373 * vm_page_free_wanted represents the number of threads currently
1374 * blocked waiting for pages... we'll move one page for each of
1375 * these plus a fixed amount to break the logjam... once we're done
1376 * moving this number of pages, we'll re-enter the FCS_DELAYED state
1377 * with a new timeout target since we have no way of knowing
1378 * whether we've broken the deadlock except through observation
1379 * of the queue associated with the default pager... we need to
1380 * stop moving pages and allow the system to run to see what
1381 * state it settles into.
1382 */
1383 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted;
1384 vm_pageout_scan_deadlock_detected++;
1385 flow_control.state = FCS_DEADLOCK_DETECTED;
1386
1387 thread_wakeup((event_t) &vm_pageout_garbage_collect);
1388 goto consider_inactive;
1389 }
1390 /*
1391 * just resniff instead of trying
1392 * to compute a new delay time... we're going to be
1393 * awakened immediately upon a laundry completion,
1394 * so we won't wait any longer than necessary
1395 */
1396 msecs = vm_pageout_idle_wait;
1397 break;
1398
1399 case FCS_DEADLOCK_DETECTED:
1400 if (vm_pageout_deadlock_target)
1401 goto consider_inactive;
1402 goto reset_deadlock_timer;
1403
1404 }
1405 vm_pageout_scan_throttle++;
1406 iq->pgo_throttled = TRUE;
1407 vm_pageout_scan_delay:
1408 if (object != NULL) {
1409 vm_object_unlock(object);
1410 object = NULL;
1411 }
1412 if (local_freeq) {
1413 vm_page_free_list(local_freeq);
1414
1415 local_freeq = 0;
1416 local_freed = 0;
1417 }
1418 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
1419
1420 counter(c_vm_pageout_scan_block++);
1421
1422 vm_page_unlock_queues();
1423
1424 thread_block(THREAD_CONTINUE_NULL);
1425
1426 vm_page_lock_queues();
1427 delayed_unlock = 1;
1428
1429 iq->pgo_throttled = FALSE;
1430
1431 if (loop_count >= vm_page_inactive_count) {
1432 if (VM_PAGE_Q_THROTTLED(eq) || VM_PAGE_Q_THROTTLED(iq)) {
1433 /*
1434 * Make sure we move enough "appropriate"
1435 * pages to the inactive queue before trying
1436 * again.
1437 */
1438 need_internal_inactive = vm_pageout_inactive_relief;
1439 }
1440 loop_count = 0;
1441 }
1442 inactive_burst_count = 0;
1443
1444 goto Restart;
1445 /*NOTREACHED*/
1446 }
1447
1448
1449 flow_control.state = FCS_IDLE;
1450 consider_inactive:
1451 loop_count++;
1452 inactive_burst_count++;
1453 vm_pageout_inactive++;
1454
1455 if (!queue_empty(&vm_page_queue_inactive)) {
1456 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1457
1458 if (m->clustered && (m->no_isync == TRUE)) {
1459 goto use_this_page;
1460 }
1461 }
1462 if (vm_zf_count < vm_accellerate_zf_pageout_trigger) {
1463 vm_zf_iterator = 0;
1464 } else {
1465 last_page_zf = 0;
1466 if((vm_zf_iterator+=1) >= vm_zf_iterator_count) {
1467 vm_zf_iterator = 0;
1468 }
1469 }
1470 if (queue_empty(&vm_page_queue_zf) ||
1471 (((last_page_zf) || (vm_zf_iterator == 0)) &&
1472 !queue_empty(&vm_page_queue_inactive))) {
1473 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
1474 last_page_zf = 0;
1475 } else {
1476 m = (vm_page_t) queue_first(&vm_page_queue_zf);
1477 last_page_zf = 1;
1478 }
1479 use_this_page:
1480 assert(!m->active && m->inactive);
1481 assert(!m->laundry);
1482 assert(m->object != kernel_object);
1483
1484 /*
1485 * Try to lock object; since we've already got the
1486 * page queues lock, we can only 'try' for this one.
1487 * if the 'try' fails, we need to do a mutex_pause
1488 * to allow the owner of the object lock a chance to
1489 * run... otherwise, we're likely to trip over this
1490 * object in the same state as we work our way through
1491 * the queue... clumps of pages associated with the same
1492 * object are fairly typical on the inactive and active queues
1493 */
1494 if (m->object != object) {
1495 if (object != NULL) {
1496 vm_object_unlock(object);
1497 object = NULL;
1498 }
1499 if (!vm_object_lock_try(m->object)) {
1500 /*
1501 * Move page to end and continue.
1502 * Don't re-issue ticket
1503 */
1504 if (m->zero_fill) {
1505 queue_remove(&vm_page_queue_zf, m,
1506 vm_page_t, pageq);
1507 queue_enter(&vm_page_queue_zf, m,
1508 vm_page_t, pageq);
1509 } else {
1510 queue_remove(&vm_page_queue_inactive, m,
1511 vm_page_t, pageq);
1512 queue_enter(&vm_page_queue_inactive, m,
1513 vm_page_t, pageq);
1514 }
1515 vm_pageout_inactive_nolock++;
1516
1517 /*
1518 * force us to dump any collected free pages
1519 * and to pause before moving on
1520 */
1521 delayed_unlock = DELAYED_UNLOCK_LIMIT + 1;
1522
1523 goto done_with_inactivepage;
1524 }
1525 object = m->object;
1526 }
1527 /*
1528 * If the page belongs to a purgable object with no pending copies
1529 * against it, then we reap all of the pages in the object
1530 * and note that the object has been "emptied". It'll be up to the
1531 * application to discover this and recreate its contents if desired.
1532 */
1533 if ((object->purgable == VM_OBJECT_PURGABLE_VOLATILE ||
1534 object->purgable == VM_OBJECT_PURGABLE_EMPTY) &&
1535 object->copy == VM_OBJECT_NULL) {
1536
1537 (void) vm_object_purge(object);
1538 vm_pageout_purged_objects++;
1539 /*
1540 * we've just taken all of the pages from this object,
1541 * so drop the lock now since we're not going to find
1542 * any more pages belonging to it anytime soon
1543 */
1544 vm_object_unlock(object);
1545 object = NULL;
1546
1547 inactive_burst_count = 0;
1548
1549 goto done_with_inactivepage;
1550 }
1551
1552 /*
1553 * Paging out pages of external objects which
1554 * are currently being created must be avoided.
1555 * The pager may need to allocate memory, thus leading to a
1556 * possible deadlock between it and the pageout thread,
1557 * if such pages are finally chosen. The remaining assumption
1558 * is that there will finally be enough available pages in the
1559 * inactive pool to page out in order to satisfy all memory
1560 * claimed by the thread which concurrently creates the pager.
1561 */
1562 if (!object->pager_initialized && object->pager_created) {
1563 /*
1564 * Move page to end and continue, hoping that
1565 * there will be enough other inactive pages to
1566 * page out so that the thread which currently
1567 * initializes the pager will succeed.
1568 * Don't re-grant the ticket; the page should be
1569 * pulled from the queue and paged out whenever
1570 * one of its logically adjacent fellows is
1571 * targeted.
1572 */
1573 if (m->zero_fill) {
1574 queue_remove(&vm_page_queue_zf, m,
1575 vm_page_t, pageq);
1576 queue_enter(&vm_page_queue_zf, m,
1577 vm_page_t, pageq);
1578 last_page_zf = 1;
1579 vm_zf_iterator = vm_zf_iterator_count - 1;
1580 } else {
1581 queue_remove(&vm_page_queue_inactive, m,
1582 vm_page_t, pageq);
1583 queue_enter(&vm_page_queue_inactive, m,
1584 vm_page_t, pageq);
1585 last_page_zf = 0;
1586 vm_zf_iterator = 1;
1587 }
1588 vm_pageout_inactive_avoid++;
1589
1590 goto done_with_inactivepage;
1591 }
1592 /*
1593 * Remove the page from the inactive list.
1594 */
1595 if (m->zero_fill) {
1596 queue_remove(&vm_page_queue_zf, m, vm_page_t, pageq);
1597 } else {
1598 queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq);
1599 }
1600 m->pageq.next = NULL;
1601 m->pageq.prev = NULL;
1602 m->inactive = FALSE;
1603 if (!m->fictitious)
1604 vm_page_inactive_count--;
1605
1606 if (m->busy || !object->alive) {
1607 /*
1608 * Somebody is already playing with this page.
1609 * Leave it off the pageout queues.
1610 */
1611 vm_pageout_inactive_busy++;
1612
1613 goto done_with_inactivepage;
1614 }
1615
1616 /*
1617 * If it's absent or in error, we can reclaim the page.
1618 */
1619
1620 if (m->absent || m->error) {
1621 vm_pageout_inactive_absent++;
1622 reclaim_page:
1623 if (vm_pageout_deadlock_target) {
1624 vm_pageout_scan_inactive_throttle_success++;
1625 vm_pageout_deadlock_target--;
1626 }
1627 if (m->tabled)
1628 vm_page_remove(m); /* clears tabled, object, offset */
1629 if (m->absent)
1630 vm_object_absent_release(object);
1631
1632 assert(m->pageq.next == NULL &&
1633 m->pageq.prev == NULL);
1634 m->pageq.next = (queue_entry_t)local_freeq;
1635 local_freeq = m;
1636 local_freed++;
1637
1638 inactive_burst_count = 0;
1639
1640 goto done_with_inactivepage;
1641 }
1642
1643 assert(!m->private);
1644 assert(!m->fictitious);
1645
1646 /*
1647 * If already cleaning this page in place, convert from
1648 * "adjacent" to "target". We can leave the page mapped,
1649 * and vm_pageout_object_terminate will determine whether
1650 * to free or reactivate.
1651 */
1652
1653 if (m->cleaning) {
1654 m->busy = TRUE;
1655 m->pageout = TRUE;
1656 m->dump_cleaning = TRUE;
1657 vm_page_wire(m);
1658
1659 CLUSTER_STAT(vm_pageout_cluster_conversions++);
1660
1661 inactive_burst_count = 0;
1662
1663 goto done_with_inactivepage;
1664 }
1665
1666 /*
1667 * If it's being used, reactivate.
1668 * (Fictitious pages are either busy or absent.)
1669 */
1670 if ( (!m->reference) ) {
1671 refmod_state = pmap_get_refmod(m->phys_page);
1672
1673 if (refmod_state & VM_MEM_REFERENCED)
1674 m->reference = TRUE;
1675 if (refmod_state & VM_MEM_MODIFIED)
1676 m->dirty = TRUE;
1677 }
1678 if (m->reference) {
1679 was_referenced:
1680 vm_page_activate(m);
1681 VM_STAT(reactivations++);
1682
1683 vm_pageout_inactive_used++;
1684 last_page_zf = 0;
1685 inactive_burst_count = 0;
1686
1687 goto done_with_inactivepage;
1688 }
1689
1690 XPR(XPR_VM_PAGEOUT,
1691 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
1692 (integer_t)object, (integer_t)m->offset, (integer_t)m, 0,0);
1693
1694 /*
1695 * we've got a candidate page to steal...
1696 *
1697 * m->dirty is up to date courtesy of the
1698 * preceding check for m->reference... if
1699 * we get here, then m->reference had to be
1700 * FALSE which means we did a pmap_get_refmod
1701 * and updated both m->reference and m->dirty
1702 *
1703 * if it's dirty or precious we need to
1704 * see if the target queue is throttled;
1705 * if it is, we need to skip over it by moving it back
1706 * to the end of the inactive queue
1707 */
1708 inactive_throttled = FALSE;
1709
1710 if (m->dirty || m->precious) {
1711 if (object->internal) {
1712 if ((VM_PAGE_Q_THROTTLED(iq) || !IP_VALID(memory_manager_default)))
1713 inactive_throttled = TRUE;
1714 } else if (VM_PAGE_Q_THROTTLED(eq)) {
1715 inactive_throttled = TRUE;
1716 }
1717 }
1718 if (inactive_throttled == TRUE) {
1719 if (m->zero_fill) {
1720 queue_enter(&vm_page_queue_zf, m,
1721 vm_page_t, pageq);
1722 } else {
1723 queue_enter(&vm_page_queue_inactive, m,
1724 vm_page_t, pageq);
1725 }
1726 if (!m->fictitious)
1727 vm_page_inactive_count++;
1728 m->inactive = TRUE;
1729
1730 vm_pageout_scan_inactive_throttled++;
1731
1732 goto done_with_inactivepage;
1733 }
1734 /*
1735 * we've got a page that we can steal...
1736 * eliminate all mappings and make sure
1737 * we have the up-to-date modified state
1738 * first take the page BUSY, so that no new
1739 * mappings can be made
1740 */
1741 m->busy = TRUE;
1742
1743 /*
1744 * if we need to do a pmap_disconnect then we
1745 * need to re-evaluate m->dirty since the pmap_disconnect
1746 * provides the true state atomically... the
1747 * page was still mapped up to the pmap_disconnect
1748 * and may have been dirtied at the last microsecond
1749 *
1750 * we also check for the page being referenced 'late'
1751 * if it was, we first need to do a WAKEUP_DONE on it
1752 * since we already set m->busy = TRUE, before
1753 * going off to reactivate it
1754 *
1755 * if we don't need the pmap_disconnect, then
1756 * m->dirty is up to date courtesy of the
1757 * earlier check for m->reference... if
1758 * we get here, then m->reference had to be
1759 * FALSE which means we did a pmap_get_refmod
1760 * and updated both m->reference and m->dirty...
1761 */
1762 if (m->no_isync == FALSE) {
1763 refmod_state = pmap_disconnect(m->phys_page);
1764
1765 if (refmod_state & VM_MEM_MODIFIED)
1766 m->dirty = TRUE;
1767 if (refmod_state & VM_MEM_REFERENCED) {
1768 m->reference = TRUE;
1769
1770 PAGE_WAKEUP_DONE(m);
1771 goto was_referenced;
1772 }
1773 }
1774 /*
1775 * If it's clean and not precious, we can free the page.
1776 */
1777 if (!m->dirty && !m->precious) {
1778 vm_pageout_inactive_clean++;
1779 goto reclaim_page;
1780 }
1781 vm_pageout_cluster(m);
1782
1783 vm_pageout_inactive_dirty++;
1784
1785 inactive_burst_count = 0;
1786
1787 done_with_inactivepage:
1788 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
1789
1790 if (object != NULL) {
1791 vm_object_unlock(object);
1792 object = NULL;
1793 }
1794 if (local_freeq) {
1795 vm_page_free_list(local_freeq);
1796
1797 local_freeq = 0;
1798 local_freed = 0;
1799 }
1800 delayed_unlock = 0;
1801 vm_page_unlock_queues();
1802 mutex_pause();
1803 }
1804 /*
1805 * back to top of pageout scan loop
1806 */
1807 }
1808 }
1809
1810
1811 int vm_page_free_count_init;
1812
1813 void
1814 vm_page_free_reserve(
1815 int pages)
1816 {
1817 int free_after_reserve;
1818
1819 vm_page_free_reserved += pages;
1820
1821 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
1822
1823 vm_page_free_min = vm_page_free_reserved +
1824 VM_PAGE_FREE_MIN(free_after_reserve);
1825
1826 vm_page_free_target = vm_page_free_reserved +
1827 VM_PAGE_FREE_TARGET(free_after_reserve);
1828
1829 if (vm_page_free_target < vm_page_free_min + 5)
1830 vm_page_free_target = vm_page_free_min + 5;
1831 }
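
/*
 * Worked example (illustrative numbers only): if vm_page_free_count_init is
 * 8100 and vm_page_free_reserved ends up at 100, then free_after_reserve is
 * 8000, so vm_page_free_min becomes 100 + (10 + 8000/100) = 190 and
 * vm_page_free_target becomes 100 + (15 + 8000/80) = 215, which already
 * satisfies the "target >= min + 5" floor enforced above.
 */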
1832
1833 /*
1834 * vm_pageout is the high level pageout daemon.
1835 */
1836
1837 void
1838 vm_pageout_continue(void)
1839 {
1840 vm_pageout_scan_event_counter++;
1841 vm_pageout_scan();
1842 /* we hold vm_page_queue_free_lock now */
1843 assert(vm_page_free_wanted == 0);
1844 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
1845 mutex_unlock(&vm_page_queue_free_lock);
1846
1847 counter(c_vm_pageout_block++);
1848 thread_block((thread_continue_t)vm_pageout_continue);
1849 /*NOTREACHED*/
1850 }
1851
1852
1853 /*
1854 * must be called with the
1855 * queues and object locks held
1856 */
1857 static void
1858 vm_pageout_queue_steal(vm_page_t m)
1859 {
1860 struct vm_pageout_queue *q;
1861
1862 if (m->object->internal == TRUE)
1863 q = &vm_pageout_queue_internal;
1864 else
1865 q = &vm_pageout_queue_external;
1866
1867 m->laundry = FALSE;
1868 m->pageout_queue = FALSE;
1869 queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
1870
1871 m->pageq.next = NULL;
1872 m->pageq.prev = NULL;
1873
1874 vm_object_paging_end(m->object);
1875
1876 q->pgo_laundry--;
1877 }
1878
1879
1880 #ifdef FAKE_DEADLOCK
1881
1882 #define FAKE_COUNT 5000
1883
1884 int internal_count = 0;
1885 int fake_deadlock = 0;
1886
1887 #endif
1888
1889 static void
1890 vm_pageout_iothread_continue(struct vm_pageout_queue *q)
1891 {
1892 vm_page_t m = NULL;
1893 vm_object_t object;
1894 boolean_t need_wakeup;
1895
1896 vm_page_lock_queues();
1897
1898 while ( !queue_empty(&q->pgo_pending) ) {
1899
1900 q->pgo_busy = TRUE;
1901 queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
1902 m->pageout_queue = FALSE;
1903 vm_page_unlock_queues();
1904
1905 m->pageq.next = NULL;
1906 m->pageq.prev = NULL;
1907 #ifdef FAKE_DEADLOCK
1908 if (q == &vm_pageout_queue_internal) {
1909 vm_offset_t addr;
1910 int pg_count;
1911
1912 internal_count++;
1913
1914 if ((internal_count == FAKE_COUNT)) {
1915
1916 pg_count = vm_page_free_count + vm_page_free_reserved;
1917
1918 if (kmem_alloc(kernel_map, &addr, PAGE_SIZE * pg_count) == KERN_SUCCESS) {
1919 kmem_free(kernel_map, addr, PAGE_SIZE * pg_count);
1920 }
1921 internal_count = 0;
1922 fake_deadlock++;
1923 }
1924 }
1925 #endif
1926 object = m->object;
1927
1928 if (!object->pager_initialized) {
1929 vm_object_lock(object);
1930
1931 /*
1932 * If there is no memory object for the page, create
1933 * one and hand it to the default pager.
1934 */
1935
1936 if (!object->pager_initialized)
1937 vm_object_collapse(object,
1938 (vm_object_offset_t) 0,
1939 TRUE);
1940 if (!object->pager_initialized)
1941 vm_object_pager_create(object);
1942 if (!object->pager_initialized) {
1943 /*
1944 * Still no pager for the object.
1945 * Reactivate the page.
1946 *
1947 * Should only happen if there is no
1948 * default pager.
1949 */
1950 m->list_req_pending = FALSE;
1951 m->cleaning = FALSE;
1952 m->pageout = FALSE;
1953 vm_page_unwire(m);
1954
1955 vm_pageout_throttle_up(m);
1956
1957 vm_page_lock_queues();
1958 vm_pageout_dirty_no_pager++;
1959 vm_page_activate(m);
1960 vm_page_unlock_queues();
1961
1962 /*
1963 * And we are done with it.
1964 */
1965 PAGE_WAKEUP_DONE(m);
1966
1967 vm_object_paging_end(object);
1968 vm_object_unlock(object);
1969
1970 vm_page_lock_queues();
1971 continue;
1972 } else if (object->pager == MEMORY_OBJECT_NULL) {
1973 /*
1974 * This pager has been destroyed by either
1975 * memory_object_destroy or vm_object_destroy, and
1976 * so there is nowhere for the page to go.
1977 * Just free the page... VM_PAGE_FREE takes
1978 * care of cleaning up all the state...
1979 * including doing the vm_pageout_throttle_up
1980 */
1981 VM_PAGE_FREE(m);
1982
1983 vm_object_paging_end(object);
1984 vm_object_unlock(object);
1985
1986 vm_page_lock_queues();
1987 continue;
1988 }
1989 vm_object_unlock(object);
1990 }
1991 /*
1992 * we expect the paging_in_progress reference to have
1993 * already been taken on the object before it was added
1994 * to the appropriate pageout I/O queue... this will
1995 * keep the object from being terminated and/or the
1996 * paging_offset from changing until the I/O has
1997 * completed... therefore no need to lock the object to
1998 * pull the paging_offset from it.
1999 *
2000 * Send the data to the pager.
2001 * Any pageout clustering happens there.
2002 */
2003 memory_object_data_return(object->pager,
2004 m->offset + object->paging_offset,
2005 PAGE_SIZE,
2006 NULL,
2007 NULL,
2008 FALSE,
2009 FALSE,
2010 0);
2011
2012 vm_object_lock(object);
2013 vm_object_paging_end(object);
2014 vm_object_unlock(object);
2015
2016 vm_page_lock_queues();
2017 }
2018 assert_wait((event_t) q, THREAD_UNINT);
2019
2020
2021 if (q->pgo_throttled == TRUE && !VM_PAGE_Q_THROTTLED(q)) {
2022 q->pgo_throttled = FALSE;
2023 need_wakeup = TRUE;
2024 } else
2025 need_wakeup = FALSE;
2026
2027 q->pgo_busy = FALSE;
2028 q->pgo_idle = TRUE;
2029 vm_page_unlock_queues();
2030
2031 if (need_wakeup == TRUE)
2032 thread_wakeup((event_t) &q->pgo_laundry);
2033
2034 thread_block_parameter((thread_continue_t)vm_pageout_iothread_continue, (void *) &q->pgo_pending);
2035 /*NOTREACHED*/
2036 }
2037
2038
2039 static void
2040 vm_pageout_iothread_external(void)
2041 {
2042
2043 vm_pageout_iothread_continue(&vm_pageout_queue_external);
2044 /*NOTREACHED*/
2045 }
2046
2047
2048 static void
2049 vm_pageout_iothread_internal(void)
2050 {
2051 thread_t self = current_thread();
2052
2053 self->options |= TH_OPT_VMPRIV;
2054
2055 vm_pageout_iothread_continue(&vm_pageout_queue_internal);
2056 /*NOTREACHED*/
2057 }
2058
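/*
 * vm_pageout_garbage_collect:
 * reclaim memory held by kernel stacks, zones and machine-dependent
 * caches, then block until woken up to collect again.
 */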
2059 static void
2060 vm_pageout_garbage_collect(int collect)
2061 {
2062 if (collect) {
2063 stack_collect();
2064
2065 /*
2066 * consider_zone_gc should be last, because the other operations
2067 * might return memory to zones.
2068 */
2069 consider_machine_collect();
2070 consider_zone_gc();
2071
2072 consider_machine_adjust();
2073 }
2074
2075 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
2076
2077 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
2078 /*NOTREACHED*/
2079 }
2080
2081
2082
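/*
 * vm_pageout:
 * entry point for the pageout daemon... set up the scheduling
 * priority and paging parameters, initialize the pageout queues,
 * start the I/O and garbage collection threads, then run the
 * scan loop forever via vm_pageout_continue().
 */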
2083 void
2084 vm_pageout(void)
2085 {
2086 thread_t self = current_thread();
2087 thread_t thread;
2088 kern_return_t result;
2089 spl_t s;
2090
2091 /*
2092 * Set thread privileges.
2093 */
2094 s = splsched();
2095 thread_lock(self);
2096 self->priority = BASEPRI_PREEMPT - 1;
2097 set_sched_pri(self, self->priority);
2098 thread_unlock(self);
2099 splx(s);
2100
2101 /*
2102 * Initialize some paging parameters.
2103 */
2104
2105 if (vm_pageout_idle_wait == 0)
2106 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
2107
2108 if (vm_pageout_burst_wait == 0)
2109 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
2110
2111 if (vm_pageout_empty_wait == 0)
2112 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
2113
2114 if (vm_pageout_deadlock_wait == 0)
2115 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
2116
2117 if (vm_pageout_deadlock_relief == 0)
2118 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
2119
2120 if (vm_pageout_inactive_relief == 0)
2121 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
2122
2123 if (vm_pageout_burst_active_throttle == 0)
2124 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
2125
2126 if (vm_pageout_burst_inactive_throttle == 0)
2127 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
2128
2129 /*
2130 * Set kernel task to low backing store privileged
2131 * status
2132 */
2133 task_lock(kernel_task);
2134 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
2135 task_unlock(kernel_task);
2136
2137 vm_page_free_count_init = vm_page_free_count;
2138 vm_zf_iterator = 0;
2139 /*
2140 * even if we've already called vm_page_free_reserve,
2141 * call it again here to ensure that the targets are
2142 * accurately calculated (it uses vm_page_free_count_init);
2143 * calling it with an arg of 0 will not change the reserve
2144 * but will re-calculate free_min and free_target
2145 */
2146 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
2147 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
2148 } else
2149 vm_page_free_reserve(0);
2150
2151
2152 queue_init(&vm_pageout_queue_external.pgo_pending);
2153 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2154 vm_pageout_queue_external.pgo_laundry = 0;
2155 vm_pageout_queue_external.pgo_idle = FALSE;
2156 vm_pageout_queue_external.pgo_busy = FALSE;
2157 vm_pageout_queue_external.pgo_throttled = FALSE;
2158
2159 queue_init(&vm_pageout_queue_internal.pgo_pending);
2160 vm_pageout_queue_internal.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
2161 vm_pageout_queue_internal.pgo_laundry = 0;
2162 vm_pageout_queue_internal.pgo_idle = FALSE;
2163 vm_pageout_queue_internal.pgo_busy = FALSE;
2164 vm_pageout_queue_internal.pgo_throttled = FALSE;
2165
2166
2167 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, NULL, BASEPRI_PREEMPT - 1, &thread);
2168 if (result != KERN_SUCCESS)
2169 panic("vm_pageout_iothread_internal: create failed");
2170
2171 thread_deallocate(thread);
2172
2173
2174 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, BASEPRI_PREEMPT - 1, &thread);
2175 if (result != KERN_SUCCESS)
2176 panic("vm_pageout_iothread_external: create failed");
2177
2178 thread_deallocate(thread);
2179
2180
2181 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL, BASEPRI_PREEMPT - 2, &thread);
2182 if (result != KERN_SUCCESS)
2183 panic("vm_pageout_garbage_collect: create failed");
2184
2185 thread_deallocate(thread);
2186
2187
2188 vm_pageout_continue();
2189 /*NOTREACHED*/
2190 }
2191
2192
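/*
 * upl_create:
 * allocate and initialize a upl structure... for UPL_CREATE_INTERNAL
 * the page-info array is allocated inline with the upl, and for
 * UPL_CREATE_LITE additional room is reserved for the "lite" page
 * bitmap.
 */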
2193 static upl_t
2194 upl_create(
2195 int flags,
2196 upl_size_t size)
2197 {
2198 upl_t upl;
2199 int page_field_size; /* bit field in word size buf */
2200
2201 page_field_size = 0;
2202 if (flags & UPL_CREATE_LITE) {
2203 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2204 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2205 }
2206 if(flags & UPL_CREATE_INTERNAL) {
2207 upl = (upl_t)kalloc(sizeof(struct upl)
2208 + (sizeof(struct upl_page_info)*(size/PAGE_SIZE))
2209 + page_field_size);
2210 } else {
2211 upl = (upl_t)kalloc(sizeof(struct upl) + page_field_size);
2212 }
2213 upl->flags = 0;
2214 upl->src_object = NULL;
2215 upl->kaddr = (vm_offset_t)0;
2216 upl->size = 0;
2217 upl->map_object = NULL;
2218 upl->ref_count = 1;
2219 upl->highest_page = 0;
2220 upl_lock_init(upl);
2221 #ifdef UPL_DEBUG
2222 upl->ubc_alias1 = 0;
2223 upl->ubc_alias2 = 0;
2224 #endif /* UPL_DEBUG */
2225 return(upl);
2226 }
2227
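/*
 * upl_destroy:
 * tear down a upl... drop the reference on the map_object when a
 * pageout object was inserted, then free the upl along with any
 * inline page-info array and lite bitmap storage.
 */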
2228 static void
2229 upl_destroy(
2230 upl_t upl)
2231 {
2232 int page_field_size; /* bit field in word size buf */
2233
2234 #ifdef UPL_DEBUG
2235 {
2236 upl_t upl_ele;
2237 vm_object_t object;
2238 if (upl->map_object->pageout) {
2239 object = upl->map_object->shadow;
2240 } else {
2241 object = upl->map_object;
2242 }
2243 vm_object_lock(object);
2244 queue_iterate(&object->uplq, upl_ele, upl_t, uplq) {
2245 if(upl_ele == upl) {
2246 queue_remove(&object->uplq,
2247 upl_ele, upl_t, uplq);
2248 break;
2249 }
2250 }
2251 vm_object_unlock(object);
2252 }
2253 #endif /* UPL_DEBUG */
2254 /* drop a reference on the map_object whether or */
2255 /* not a pageout object is inserted */
2256 if(upl->map_object->pageout)
2257 vm_object_deallocate(upl->map_object);
2258
2259 page_field_size = 0;
2260 if (upl->flags & UPL_LITE) {
2261 page_field_size = ((upl->size/PAGE_SIZE) + 7) >> 3;
2262 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2263 }
2264 if(upl->flags & UPL_INTERNAL) {
2265 kfree(upl,
2266 sizeof(struct upl) +
2267 (sizeof(struct upl_page_info) * (upl->size/PAGE_SIZE))
2268 + page_field_size);
2269 } else {
2270 kfree(upl, sizeof(struct upl) + page_field_size);
2271 }
2272 }
2273
2274 void uc_upl_dealloc(upl_t upl);
2275 __private_extern__ void
2276 uc_upl_dealloc(
2277 upl_t upl)
2278 {
2279 upl->ref_count -= 1;
2280 if(upl->ref_count == 0) {
2281 upl_destroy(upl);
2282 }
2283 }
2284
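/*
 * upl_deallocate:
 * drop a reference on the upl and destroy it once the
 * reference count reaches zero.
 */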
2285 void
2286 upl_deallocate(
2287 upl_t upl)
2288 {
2289
2290 upl->ref_count -= 1;
2291 if(upl->ref_count == 0) {
2292 upl_destroy(upl);
2293 }
2294 }
2295
2296 /*
2297 * Statistics about UPL enforcement of copy-on-write obligations.
2298 */
2299 unsigned long upl_cow = 0;
2300 unsigned long upl_cow_again = 0;
2301 unsigned long upl_cow_contiguous = 0;
2302 unsigned long upl_cow_pages = 0;
2303 unsigned long upl_cow_again_pages = 0;
2304 unsigned long upl_cow_contiguous_pages = 0;
2305
2306 /*
2307 * Routine: vm_object_upl_request
2308 * Purpose:
2309 * Cause the population of a portion of a vm_object.
2310 * Depending on the nature of the request, the pages
2311 * returned may contain valid data or be uninitialized.
2312 * A page list structure, listing the physical pages,
2313 * will be returned upon request.
2314 * This function is called by the file system or any other
2315 * supplier of backing store to a pager.
2316 * IMPORTANT NOTE: The caller must still respect the relationship
2317 * between the vm_object and its backing memory object. The
2318 * caller MUST NOT substitute changes in the backing file
2319 * without first doing a memory_object_lock_request on the
2320 * target range unless it is known that the pages are not
2321 * shared with another entity at the pager level.
2322 * Copy_in_to:
2323 * if a page list structure is present
2324 * return the mapped physical pages, where a
2325 * page is not present, return a non-initialized
2326 * one. If the no_sync bit is turned on, don't
2327 * call the pager unlock to synchronize with other
2328 * possible copies of the page. Leave pages busy
2329 * in the original object, if a page list structure
2330 * was specified. When a commit of the page list
2331 * pages is done, the dirty bit will be set for each one.
2332 * Copy_out_from:
2333 * If a page list structure is present, return
2334 * all mapped pages. Where a page does not exist
2335 * map a zero filled one. Leave pages busy in
2336 * the original object. If a page list structure
2337 * is not specified, this call is a no-op.
2338 *
2339 * Note: access of default pager objects has a rather interesting
2340 * twist. The caller of this routine, presumably the file system
2341 * page cache handling code, will never actually make a request
2342 * against a default pager backed object. Only the default
2343 * pager will make requests on backing store related vm_objects.
2344 * In this way the default pager can maintain the relationship
2345 * between backing store files (abstract memory objects) and
2346 * the vm_objects (cache objects) they support.
2347 *
2348 */
2349
2350 __private_extern__ kern_return_t
2351 vm_object_upl_request(
2352 vm_object_t object,
2353 vm_object_offset_t offset,
2354 upl_size_t size,
2355 upl_t *upl_ptr,
2356 upl_page_info_array_t user_page_list,
2357 unsigned int *page_list_count,
2358 int cntrl_flags)
2359 {
2360 vm_page_t dst_page = VM_PAGE_NULL;
2361 vm_object_offset_t dst_offset = offset;
2362 upl_size_t xfer_size = size;
2363 boolean_t do_m_lock = FALSE;
2364 boolean_t dirty;
2365 boolean_t hw_dirty;
2366 upl_t upl = NULL;
2367 unsigned int entry;
2368 #if MACH_CLUSTER_STATS
2369 boolean_t encountered_lrp = FALSE;
2370 #endif
2371 vm_page_t alias_page = NULL;
2372 int page_ticket;
2373 int refmod_state;
2374 wpl_array_t lite_list = NULL;
2375 vm_object_t last_copy_object;
2376
2377
2378 if (cntrl_flags & ~UPL_VALID_FLAGS) {
2379 /*
2380 * For forward compatibility's sake,
2381 * reject any unknown flag.
2382 */
2383 return KERN_INVALID_VALUE;
2384 }
2385
2386 page_ticket = (cntrl_flags & UPL_PAGE_TICKET_MASK)
2387 >> UPL_PAGE_TICKET_SHIFT;
2388
2389 if(((size/PAGE_SIZE) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
2390 size = MAX_UPL_TRANSFER * PAGE_SIZE;
2391 }
2392
2393 if(cntrl_flags & UPL_SET_INTERNAL)
2394 if(page_list_count != NULL)
2395 *page_list_count = MAX_UPL_TRANSFER;
2396
2397 if((!object->internal) && (object->paging_offset != 0))
2398 panic("vm_object_upl_request: external object with non-zero paging offset\n");
2399
2400 if((cntrl_flags & UPL_COPYOUT_FROM) && (upl_ptr == NULL)) {
2401 return KERN_SUCCESS;
2402 }
2403
2404 vm_object_lock(object);
2405 vm_object_paging_begin(object);
2406 vm_object_unlock(object);
2407
2408 if(upl_ptr) {
2409 if(cntrl_flags & UPL_SET_INTERNAL) {
2410 if(cntrl_flags & UPL_SET_LITE) {
2411 uintptr_t page_field_size;
2412 upl = upl_create(
2413 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
2414 size);
2415 user_page_list = (upl_page_info_t *)
2416 (((uintptr_t)upl) + sizeof(struct upl));
2417 lite_list = (wpl_array_t)
2418 (((uintptr_t)user_page_list) +
2419 ((size/PAGE_SIZE) *
2420 sizeof(upl_page_info_t)));
2421 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2422 page_field_size =
2423 (page_field_size + 3) & 0xFFFFFFFC;
2424 bzero((char *)lite_list, page_field_size);
2425 upl->flags =
2426 UPL_LITE | UPL_INTERNAL;
2427 } else {
2428 upl = upl_create(UPL_CREATE_INTERNAL, size);
2429 user_page_list = (upl_page_info_t *)
2430 (((uintptr_t)upl) + sizeof(struct upl));
2431 upl->flags = UPL_INTERNAL;
2432 }
2433 } else {
2434 if(cntrl_flags & UPL_SET_LITE) {
2435 uintptr_t page_field_size;
2436 upl = upl_create(UPL_CREATE_LITE, size);
2437 lite_list = (wpl_array_t)
2438 (((uintptr_t)upl) + sizeof(struct upl));
2439 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
2440 page_field_size =
2441 (page_field_size + 3) & 0xFFFFFFFC;
2442 bzero((char *)lite_list, page_field_size);
2443 upl->flags = UPL_LITE;
2444 } else {
2445 upl = upl_create(UPL_CREATE_EXTERNAL, size);
2446 upl->flags = 0;
2447 }
2448 }
2449
2450 if (object->phys_contiguous) {
2451 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2452 object->copy != VM_OBJECT_NULL) {
2453 /* Honor copy-on-write obligations */
2454
2455 /*
2456 * XXX FBDP
2457 * We could still have a race...
2458 * A is here building the UPL for a write().
2459 * A pushes the pages to the current copy
2460 * object.
2461 * A returns the UPL to the caller.
2462 * B comes along and establishes another
2463 * private mapping on this object, inserting
2464 * a new copy object between the original
2465 * object and the old copy object.
2466 * B reads a page and gets the original contents
2467 * from the original object.
2468 * A modifies the page in the original object.
2469 * B reads the page again and sees A's changes,
2470 * which is wrong...
2471 *
2472 * The problem is that the pages are not
2473 * marked "busy" in the original object, so
2474 * nothing prevents B from reading it before
2475 * A's changes are completed.
2476 *
2477 * The "paging_in_progress" might protect us
2478 * from the insertion of a new copy object
2479 * though... To be verified.
2480 */
2481 vm_object_lock_request(object,
2482 offset,
2483 size,
2484 FALSE,
2485 MEMORY_OBJECT_COPY_SYNC,
2486 VM_PROT_NO_CHANGE);
2487 upl_cow_contiguous++;
2488 upl_cow_contiguous_pages += size >> PAGE_SHIFT;
2489 }
2490
2491 upl->map_object = object;
2492 /* don't need any shadow mappings for this one */
2493 /* since it is already I/O memory */
2494 upl->flags |= UPL_DEVICE_MEMORY;
2495
2496
2497 /* paging_in_progress protects paging_offset */
2498 upl->offset = offset + object->paging_offset;
2499 upl->size = size;
2500 *upl_ptr = upl;
2501 if(user_page_list) {
2502 user_page_list[0].phys_addr =
2503 (offset + object->shadow_offset)>>PAGE_SHIFT;
2504 user_page_list[0].device = TRUE;
2505 }
2506 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
2507
2508 if(page_list_count != NULL) {
2509 if (upl->flags & UPL_INTERNAL) {
2510 *page_list_count = 0;
2511 } else {
2512 *page_list_count = 1;
2513 }
2514 }
2515
2516 return KERN_SUCCESS;
2517 }
2518
2519 if(user_page_list)
2520 user_page_list[0].device = FALSE;
2521
2522 if(cntrl_flags & UPL_SET_LITE) {
2523 upl->map_object = object;
2524 } else {
2525 upl->map_object = vm_object_allocate(size);
2526 /*
2527 * No need to lock the new object: nobody else knows
2528 * about it yet, so it's all ours so far.
2529 */
2530 upl->map_object->shadow = object;
2531 upl->map_object->pageout = TRUE;
2532 upl->map_object->can_persist = FALSE;
2533 upl->map_object->copy_strategy =
2534 MEMORY_OBJECT_COPY_NONE;
2535 upl->map_object->shadow_offset = offset;
2536 upl->map_object->wimg_bits = object->wimg_bits;
2537 }
2538
2539 }
2540 if (!(cntrl_flags & UPL_SET_LITE)) {
2541 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2542 }
2543
2544 /*
2545 * ENCRYPTED SWAP:
2546 * Just mark the UPL as "encrypted" here.
2547 * We'll actually encrypt the pages later,
2548 * in upl_encrypt(), when the caller has
2549 * selected which pages need to go to swap.
2550 */
2551 if (cntrl_flags & UPL_ENCRYPT) {
2552 upl->flags |= UPL_ENCRYPTED;
2553 }
2554 if (cntrl_flags & UPL_FOR_PAGEOUT) {
2555 upl->flags |= UPL_PAGEOUT;
2556 }
2557 vm_object_lock(object);
2558
2559 /* we can lock in the paging_offset once paging_in_progress is set */
2560 if(upl_ptr) {
2561 upl->size = size;
2562 upl->offset = offset + object->paging_offset;
2563 *upl_ptr = upl;
2564 #ifdef UPL_DEBUG
2565 queue_enter(&object->uplq, upl, upl_t, uplq);
2566 #endif /* UPL_DEBUG */
2567 }
2568
2569 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2570 object->copy != VM_OBJECT_NULL) {
2571 /* Honor copy-on-write obligations */
2572
2573 /*
2574 * The caller is gathering these pages and
2575 * might modify their contents. We need to
2576 * make sure that the copy object has its own
2577 * private copies of these pages before we let
2578 * the caller modify them.
2579 */
2580 vm_object_update(object,
2581 offset,
2582 size,
2583 NULL,
2584 NULL,
2585 FALSE, /* should_return */
2586 MEMORY_OBJECT_COPY_SYNC,
2587 VM_PROT_NO_CHANGE);
2588 upl_cow++;
2589 upl_cow_pages += size >> PAGE_SHIFT;
2590
2591 }
2592 /* remember which copy object we synchronized with */
2593 last_copy_object = object->copy;
2594
2595 entry = 0;
2596 if(cntrl_flags & UPL_COPYOUT_FROM) {
2597 upl->flags |= UPL_PAGE_SYNC_DONE;
2598
2599 while (xfer_size) {
2600 if((alias_page == NULL) &&
2601 !(cntrl_flags & UPL_SET_LITE)) {
2602 vm_object_unlock(object);
2603 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2604 vm_object_lock(object);
2605 }
2606 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
2607 dst_page->fictitious ||
2608 dst_page->absent ||
2609 dst_page->error ||
2610 (dst_page->wire_count && !dst_page->pageout) ||
2611
2612 ((!dst_page->inactive) && (cntrl_flags & UPL_FOR_PAGEOUT) &&
2613 (dst_page->page_ticket != page_ticket) &&
2614 ((dst_page->page_ticket+1) != page_ticket)) ) {
2615
2616 if (user_page_list)
2617 user_page_list[entry].phys_addr = 0;
2618 } else {
2619 /*
2620 * grab this up front...
2621 * a high percentage of the time we're going to
2622 * need the hardware modification state a bit later
2623 * anyway... so we can eliminate an extra call into
2624 * the pmap layer by grabbing it here and recording it
2625 */
2626 refmod_state = pmap_get_refmod(dst_page->phys_page);
2627
2628 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
2629 /*
2630 * we're only asking for DIRTY pages to be returned
2631 */
2632
2633 if (dst_page->list_req_pending || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
2634 /*
2635 * if we were the page stolen by vm_pageout_scan to be
2636 * cleaned (as opposed to a buddy being clustered in),
2637 * or this request is not being driven by a PAGEOUT cluster,
2638 * then we only need to check for the page being dirty or
2639 * precious to decide whether to return it
2640 */
2641 if (dst_page->dirty || dst_page->precious ||
2642 (refmod_state & VM_MEM_MODIFIED)) {
2643 goto check_busy;
2644 }
2645 }
2646 /*
2647 * this is a request for a PAGEOUT cluster and this page
2648 * is merely along for the ride as a 'buddy'... not only
2649 * does it have to be dirty to be returned, but it also
2650 * can't have been referenced recently... note that we've
2651 * already filtered above based on whether this page is
2652 * currently on the inactive queue or it meets the page
2653 * ticket (generation count) check
2654 */
2655 if ( !(refmod_state & VM_MEM_REFERENCED) &&
2656 ((refmod_state & VM_MEM_MODIFIED) ||
2657 dst_page->dirty || dst_page->precious) ) {
2658 goto check_busy;
2659 }
2660 /*
2661 * if we reach here, we're not to return
2662 * the page... go on to the next one
2663 */
2664 if (user_page_list)
2665 user_page_list[entry].phys_addr = 0;
2666 entry++;
2667 dst_offset += PAGE_SIZE_64;
2668 xfer_size -= PAGE_SIZE;
2669 continue;
2670 }
2671 check_busy:
2672 if(dst_page->busy &&
2673 (!(dst_page->list_req_pending &&
2674 dst_page->pageout))) {
2675 if(cntrl_flags & UPL_NOBLOCK) {
2676 if(user_page_list) {
2677 user_page_list[entry].phys_addr = 0;
2678 }
2679 entry++;
2680 dst_offset += PAGE_SIZE_64;
2681 xfer_size -= PAGE_SIZE;
2682 continue;
2683 }
2684 /*
2685 * someone else is playing with the
2686 * page. We will have to wait.
2687 */
2688 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
2689 continue;
2690 }
2691 /* Someone else already cleaning the page? */
2692 if((dst_page->cleaning || dst_page->absent ||
2693 dst_page->wire_count != 0) &&
2694 !dst_page->list_req_pending) {
2695 if(user_page_list) {
2696 user_page_list[entry].phys_addr = 0;
2697 }
2698 entry++;
2699 dst_offset += PAGE_SIZE_64;
2700 xfer_size -= PAGE_SIZE;
2701 continue;
2702 }
2703 /* eliminate all mappings from the */
2704 /* original object and its progeny */
2705
2706 vm_page_lock_queues();
2707
2708 if (dst_page->pageout_queue == TRUE)
2709 /*
2710 * we've buddied up a page for a clustered pageout
2711 * that has already been moved to the pageout
2712 * queue by pageout_scan... we need to remove
2713 * it from the queue and drop the laundry count
2714 * on that queue
2715 */
2716 vm_pageout_queue_steal(dst_page);
2717 #if MACH_CLUSTER_STATS
2718 /* pageout statistics gathering. count */
2719 /* all the pages we will page out that */
2720 /* were not counted in the initial */
2721 /* vm_pageout_scan work */
2722 if(dst_page->list_req_pending)
2723 encountered_lrp = TRUE;
2724 if((dst_page->dirty ||
2725 (dst_page->object->internal &&
2726 dst_page->precious)) &&
2727 (dst_page->list_req_pending
2728 == FALSE)) {
2729 if(encountered_lrp) {
2730 CLUSTER_STAT
2731 (pages_at_higher_offsets++;)
2732 } else {
2733 CLUSTER_STAT
2734 (pages_at_lower_offsets++;)
2735 }
2736 }
2737 #endif
2738 /* Turn off busy indication on pending */
2739 /* pageout. Note: we can only get here */
2740 /* in the request pending case. */
2741 dst_page->list_req_pending = FALSE;
2742 dst_page->busy = FALSE;
2743 dst_page->cleaning = FALSE;
2744
2745 hw_dirty = refmod_state & VM_MEM_MODIFIED;
2746 dirty = hw_dirty ? TRUE : dst_page->dirty;
2747
2748 if(cntrl_flags & UPL_SET_LITE) {
2749 int pg_num;
2750 pg_num = (dst_offset-offset)/PAGE_SIZE;
2751 lite_list[pg_num>>5] |=
2752 1 << (pg_num & 31);
2753 if (hw_dirty)
2754 pmap_clear_modify(dst_page->phys_page);
2755 /*
2756 * Record that this page has been
2757 * written out
2758 */
2759 #if MACH_PAGEMAP
2760 vm_external_state_set(
2761 object->existence_map,
2762 dst_page->offset);
2763 #endif /*MACH_PAGEMAP*/
2764
2765 /*
2766 * Mark original page as cleaning
2767 * in place.
2768 */
2769 dst_page->cleaning = TRUE;
2770 dst_page->dirty = TRUE;
2771 dst_page->precious = FALSE;
2772 } else {
2773 /* use pageclean setup, it is more */
2774 /* convenient even for the pageout */
2775 /* cases here */
2776
2777 vm_object_lock(upl->map_object);
2778 vm_pageclean_setup(dst_page,
2779 alias_page, upl->map_object,
2780 size - xfer_size);
2781 vm_object_unlock(upl->map_object);
2782
2783 alias_page->absent = FALSE;
2784 alias_page = NULL;
2785 }
2786
2787 if(!dirty) {
2788 dst_page->dirty = FALSE;
2789 dst_page->precious = TRUE;
2790 }
2791
2792 if(dst_page->pageout)
2793 dst_page->busy = TRUE;
2794
2795 if ( (cntrl_flags & UPL_ENCRYPT) ) {
2796 /*
2797 * ENCRYPTED SWAP:
2798 * We want to deny access to the target page
2799 * because its contents are about to be
2800 * encrypted and the user would be very
2801 * confused to see encrypted data instead
2802 * of their data.
2803 */
2804 dst_page->busy = TRUE;
2805 }
2806 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
2807 /*
2808 * deny access to the target page
2809 * while it is being worked on
2810 */
2811 if ((!dst_page->pageout) &&
2812 (dst_page->wire_count == 0)) {
2813 dst_page->busy = TRUE;
2814 dst_page->pageout = TRUE;
2815 vm_page_wire(dst_page);
2816 }
2817 }
2818
2819 if (dst_page->phys_page > upl->highest_page)
2820 upl->highest_page = dst_page->phys_page;
2821
2822 if(user_page_list) {
2823 user_page_list[entry].phys_addr
2824 = dst_page->phys_page;
2825 user_page_list[entry].dirty =
2826 dst_page->dirty;
2827 user_page_list[entry].pageout =
2828 dst_page->pageout;
2829 user_page_list[entry].absent =
2830 dst_page->absent;
2831 user_page_list[entry].precious =
2832 dst_page->precious;
2833 }
2834 vm_page_unlock_queues();
2835
2836 /*
2837 * ENCRYPTED SWAP:
2838 * The caller is gathering this page and might
2839 * access its contents later on. Decrypt the
2840 * page before adding it to the UPL, so that
2841 * the caller never sees encrypted data.
2842 */
2843 if (! (cntrl_flags & UPL_ENCRYPT) &&
2844 dst_page->encrypted) {
2845 assert(dst_page->busy);
2846
2847 vm_page_decrypt(dst_page, 0);
2848 vm_page_decrypt_for_upl_counter++;
2849
2850 /*
2851 * Retry this page, since anything
2852 * could have changed while we were
2853 * decrypting.
2854 */
2855 continue;
2856 }
2857 }
2858 entry++;
2859 dst_offset += PAGE_SIZE_64;
2860 xfer_size -= PAGE_SIZE;
2861 }
2862 } else {
2863 while (xfer_size) {
2864 if((alias_page == NULL) &&
2865 !(cntrl_flags & UPL_SET_LITE)) {
2866 vm_object_unlock(object);
2867 VM_PAGE_GRAB_FICTITIOUS(alias_page);
2868 vm_object_lock(object);
2869 }
2870
2871 if ((cntrl_flags & UPL_WILL_MODIFY) &&
2872 object->copy != last_copy_object) {
2873 /* Honor copy-on-write obligations */
2874
2875 /*
2876 * The copy object has changed since we
2877 * last synchronized for copy-on-write.
2878 * Another copy object might have been
2879 * inserted while we released the object's
2880 * lock. Since someone could have seen the
2881 * original contents of the remaining pages
2882 * through that new object, we have to
2883 * synchronize with it again for the remaining
2884 * pages only. The previous pages are "busy"
2885 * so they can not be seen through the new
2886 * mapping. The new mapping will see our
2887 * upcoming changes for those previous pages,
2888 * but that's OK since they couldn't see what
2889 * was there before. It's just a race anyway
2890 * and there's no guarantee of consistency or
2891 * atomicity. We just don't want new mappings
2892 * to see both the *before* and *after* pages.
2893 */
2894 if (object->copy != VM_OBJECT_NULL) {
2895 vm_object_update(
2896 object,
2897 dst_offset,/* current offset */
2898 xfer_size, /* remaining size */
2899 NULL,
2900 NULL,
2901 FALSE, /* should_return */
2902 MEMORY_OBJECT_COPY_SYNC,
2903 VM_PROT_NO_CHANGE);
2904 upl_cow_again++;
2905 upl_cow_again_pages +=
2906 xfer_size >> PAGE_SHIFT;
2907 }
2908 /* remember the copy object we synced with */
2909 last_copy_object = object->copy;
2910 }
2911
2912 dst_page = vm_page_lookup(object, dst_offset);
2913
2914 if(dst_page != VM_PAGE_NULL) {
2915 if((cntrl_flags & UPL_RET_ONLY_ABSENT) &&
2916 !((dst_page->list_req_pending)
2917 && (dst_page->absent))) {
2918 /* we are doing extended range */
2919 /* requests. we want to grab */
2920 /* pages around some which are */
2921 /* already present. */
2922 if(user_page_list) {
2923 user_page_list[entry].phys_addr = 0;
2924 }
2925 entry++;
2926 dst_offset += PAGE_SIZE_64;
2927 xfer_size -= PAGE_SIZE;
2928 continue;
2929 }
2930 if((dst_page->cleaning) &&
2931 !(dst_page->list_req_pending)) {
2932 /*someone else is writing to the */
2933 /* page. We will have to wait. */
2934 PAGE_SLEEP(object,dst_page,THREAD_UNINT);
2935 continue;
2936 }
2937 if ((dst_page->fictitious &&
2938 dst_page->list_req_pending)) {
2939 /* dump the fictitious page */
2940 dst_page->list_req_pending = FALSE;
2941 dst_page->clustered = FALSE;
2942
2943 vm_page_lock_queues();
2944 vm_page_free(dst_page);
2945 vm_page_unlock_queues();
2946
2947 dst_page = NULL;
2948 } else if ((dst_page->absent &&
2949 dst_page->list_req_pending)) {
2950 /* the default_pager case */
2951 dst_page->list_req_pending = FALSE;
2952 dst_page->busy = FALSE;
2953 }
2954 }
2955 if(dst_page == VM_PAGE_NULL) {
2956 if(object->private) {
2957 /*
2958 * This is a nasty wrinkle for users
2959 * of upl who encounter device or
2960 * private memory; however, it is
2961 * unavoidable: only a fault can
2962 * resolve the actual backing
2963 * physical page by asking the
2964 * backing device.
2965 */
2966 if(user_page_list) {
2967 user_page_list[entry].phys_addr = 0;
2968 }
2969 entry++;
2970 dst_offset += PAGE_SIZE_64;
2971 xfer_size -= PAGE_SIZE;
2972 continue;
2973 }
2974 /* need to allocate a page */
2975 dst_page = vm_page_alloc(object, dst_offset);
2976 if (dst_page == VM_PAGE_NULL) {
2977 vm_object_unlock(object);
2978 VM_PAGE_WAIT();
2979 vm_object_lock(object);
2980 continue;
2981 }
2982 dst_page->busy = FALSE;
2983 #if 0
2984 if(cntrl_flags & UPL_NO_SYNC) {
2985 dst_page->page_lock = 0;
2986 dst_page->unlock_request = 0;
2987 }
2988 #endif
2989 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
2990 /*
2991 * if UPL_RET_ONLY_ABSENT was specified,
2992 * then we're definitely setting up a
2993 * upl for a clustered read/pagein
2994 * operation... mark the pages as clustered
2995 * so vm_fault can correctly attribute them
2996 * to the 'pagein' bucket the first time
2997 * a fault happens on them
2998 */
2999 dst_page->clustered = TRUE;
3000 }
3001 dst_page->absent = TRUE;
3002 object->absent_count++;
3003 }
3004 #if 1
3005 if(cntrl_flags & UPL_NO_SYNC) {
3006 dst_page->page_lock = 0;
3007 dst_page->unlock_request = 0;
3008 }
3009 #endif /* 1 */
3010
3011 /*
3012 * ENCRYPTED SWAP:
3013 */
3014 if (cntrl_flags & UPL_ENCRYPT) {
3015 /*
3016 * The page is going to be encrypted when we
3017 * get it from the pager, so mark it so.
3018 */
3019 dst_page->encrypted = TRUE;
3020 } else {
3021 /*
3022 * Otherwise, the page will not contain
3023 * encrypted data.
3024 */
3025 dst_page->encrypted = FALSE;
3026 }
3027
3028 dst_page->overwriting = TRUE;
3029 if(dst_page->fictitious) {
3030 panic("need corner case for fictitious page");
3031 }
3032 if(dst_page->page_lock) {
3033 do_m_lock = TRUE;
3034 }
3035 if(upl_ptr) {
3036
3037 /* eliminate all mappings from the */
3038 /* original object and its progeny */
3039
3040 if(dst_page->busy) {
3041 /*someone else is playing with the */
3042 /* page. We will have to wait. */
3043 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
3044 continue;
3045 }
3046 vm_page_lock_queues();
3047
3048 if( !(cntrl_flags & UPL_FILE_IO))
3049 hw_dirty = pmap_disconnect(dst_page->phys_page) & VM_MEM_MODIFIED;
3050 else
3051 hw_dirty = pmap_get_refmod(dst_page->phys_page) & VM_MEM_MODIFIED;
3052 dirty = hw_dirty ? TRUE : dst_page->dirty;
3053
3054 if(cntrl_flags & UPL_SET_LITE) {
3055 int pg_num;
3056 pg_num = (dst_offset-offset)/PAGE_SIZE;
3057 lite_list[pg_num>>5] |=
3058 1 << (pg_num & 31);
3059 if (hw_dirty)
3060 pmap_clear_modify(dst_page->phys_page);
3061 /*
3062 * Record that this page has been
3063 * written out
3064 */
3065 #if MACH_PAGEMAP
3066 vm_external_state_set(
3067 object->existence_map,
3068 dst_page->offset);
3069 #endif /*MACH_PAGEMAP*/
3070
3071 /*
3072 * Mark original page as cleaning
3073 * in place.
3074 */
3075 dst_page->cleaning = TRUE;
3076 dst_page->dirty = TRUE;
3077 dst_page->precious = FALSE;
3078 } else {
3079 /* use pageclean setup, it is more */
3080 /* convenient even for the pageout */
3081 /* cases here */
3082 vm_object_lock(upl->map_object);
3083 vm_pageclean_setup(dst_page,
3084 alias_page, upl->map_object,
3085 size - xfer_size);
3086 vm_object_unlock(upl->map_object);
3087
3088 alias_page->absent = FALSE;
3089 alias_page = NULL;
3090 }
3091
3092 if(cntrl_flags & UPL_CLEAN_IN_PLACE) {
3093 /* clean in place for read implies */
3094 /* that a write will be done on all */
3095 /* the pages that are dirty before */
3096 /* a upl commit is done. The caller */
3097 /* is obligated to preserve the */
3098 /* contents of all pages marked */
3099 /* dirty. */
3100 upl->flags |= UPL_CLEAR_DIRTY;
3101 }
3102
3103 if(!dirty) {
3104 dst_page->dirty = FALSE;
3105 dst_page->precious = TRUE;
3106 }
3107
3108 if (dst_page->wire_count == 0) {
3109 /* deny access to the target page while */
3110 /* it is being worked on */
3111 dst_page->busy = TRUE;
3112 } else {
3113 vm_page_wire(dst_page);
3114 }
3115 if(cntrl_flags & UPL_RET_ONLY_ABSENT) {
3116 /*
3117 * expect the page not to be used
3118 * since it's coming in as part
3119 * of a cluster and could be
3120 * speculative... pages that
3121 * are 'consumed' will get a
3122 * hardware reference
3123 */
3124 dst_page->reference = FALSE;
3125 } else {
3126 /*
3127 * expect the page to be used
3128 */
3129 dst_page->reference = TRUE;
3130 }
3131 dst_page->precious =
3132 (cntrl_flags & UPL_PRECIOUS)
3133 ? TRUE : FALSE;
3134
3135 if (dst_page->phys_page > upl->highest_page)
3136 upl->highest_page = dst_page->phys_page;
3137
3138 if(user_page_list) {
3139 user_page_list[entry].phys_addr
3140 = dst_page->phys_page;
3141 user_page_list[entry].dirty =
3142 dst_page->dirty;
3143 user_page_list[entry].pageout =
3144 dst_page->pageout;
3145 user_page_list[entry].absent =
3146 dst_page->absent;
3147 user_page_list[entry].precious =
3148 dst_page->precious;
3149 }
3150 vm_page_unlock_queues();
3151 }
3152 entry++;
3153 dst_offset += PAGE_SIZE_64;
3154 xfer_size -= PAGE_SIZE;
3155 }
3156 }
3157
3158 if (upl->flags & UPL_INTERNAL) {
3159 if(page_list_count != NULL)
3160 *page_list_count = 0;
3161 } else if (*page_list_count > entry) {
3162 if(page_list_count != NULL)
3163 *page_list_count = entry;
3164 }
3165
3166 if(alias_page != NULL) {
3167 vm_page_lock_queues();
3168 vm_page_free(alias_page);
3169 vm_page_unlock_queues();
3170 }
3171
3172 if(do_m_lock) {
3173 vm_prot_t access_required;
3174 /* call back all associated pages from other users of the pager */
3175 /* all future updates will be on data which is based on the */
3176 /* changes we are going to make here. Note: it is assumed that */
3177 /* we already hold copies of the data so we will not be seeing */
3178 /* an avalanche of incoming data from the pager */
3179 access_required = (cntrl_flags & UPL_COPYOUT_FROM)
3180 ? VM_PROT_READ : VM_PROT_WRITE;
3181 while (TRUE) {
3182 kern_return_t rc;
3183
3184 if(!object->pager_ready) {
3185 wait_result_t wait_result;
3186
3187 wait_result = vm_object_sleep(object,
3188 VM_OBJECT_EVENT_PAGER_READY,
3189 THREAD_UNINT);
3190 if (wait_result != THREAD_AWAKENED) {
3191 vm_object_unlock(object);
3192 return KERN_FAILURE;
3193 }
3194 continue;
3195 }
3196
3197 vm_object_unlock(object);
3198 rc = memory_object_data_unlock(
3199 object->pager,
3200 dst_offset + object->paging_offset,
3201 size,
3202 access_required);
3203 if (rc != KERN_SUCCESS && rc != MACH_SEND_INTERRUPTED)
3204 return KERN_FAILURE;
3205 vm_object_lock(object);
3206
3207 if (rc == KERN_SUCCESS)
3208 break;
3209 }
3210
3211 /* let's wait on the last page requested */
3212 /* NOTE: we will have to update lock completed routine to signal */
3213 if(dst_page != VM_PAGE_NULL &&
3214 (access_required & dst_page->page_lock) != access_required) {
3215 PAGE_ASSERT_WAIT(dst_page, THREAD_UNINT);
3216 vm_object_unlock(object);
3217 thread_block(THREAD_CONTINUE_NULL);
3218 return KERN_SUCCESS;
3219 }
3220 }
3221
3222 vm_object_unlock(object);
3223 return KERN_SUCCESS;
3224 }
3225
3226 /* JMM - Backward compatibility for now */
3227 kern_return_t
3228 vm_fault_list_request( /* forward */
3229 memory_object_control_t control,
3230 vm_object_offset_t offset,
3231 upl_size_t size,
3232 upl_t *upl_ptr,
3233 upl_page_info_t **user_page_list_ptr,
3234 int page_list_count,
3235 int cntrl_flags);
3236 kern_return_t
3237 vm_fault_list_request(
3238 memory_object_control_t control,
3239 vm_object_offset_t offset,
3240 upl_size_t size,
3241 upl_t *upl_ptr,
3242 upl_page_info_t **user_page_list_ptr,
3243 int page_list_count,
3244 int cntrl_flags)
3245 {
3246 unsigned int local_list_count;
3247 upl_page_info_t *user_page_list;
3248 kern_return_t kr;
3249
3250 if (user_page_list_ptr != NULL) {
3251 local_list_count = page_list_count;
3252 user_page_list = *user_page_list_ptr;
3253 } else {
3254 local_list_count = 0;
3255 user_page_list = NULL;
3256 }
3257 kr = memory_object_upl_request(control,
3258 offset,
3259 size,
3260 upl_ptr,
3261 user_page_list,
3262 &local_list_count,
3263 cntrl_flags);
3264
3265 if(kr != KERN_SUCCESS)
3266 return kr;
3267
3268 if ((user_page_list_ptr != NULL) && (cntrl_flags & UPL_INTERNAL)) {
3269 *user_page_list_ptr = UPL_GET_INTERNAL_PAGE_LIST(*upl_ptr);
3270 }
3271
3272 return KERN_SUCCESS;
3273 }
3274
3275
3276
3277 /*
3278 * Routine: vm_object_super_upl_request
3279 * Purpose:
3280 * Cause the population of a portion of a vm_object
3281 * in much the same way as memory_object_upl_request.
3282 * Depending on the nature of the request, the pages
3283 * returned may contain valid data or be uninitialized.
3284 * However, the region may be expanded up to the super
3285 * cluster size provided.
3286 */
3287
3288 __private_extern__ kern_return_t
3289 vm_object_super_upl_request(
3290 vm_object_t object,
3291 vm_object_offset_t offset,
3292 upl_size_t size,
3293 upl_size_t super_cluster,
3294 upl_t *upl,
3295 upl_page_info_t *user_page_list,
3296 unsigned int *page_list_count,
3297 int cntrl_flags)
3298 {
3299 vm_page_t target_page;
3300 int ticket;
3301
3302
3303 if(object->paging_offset > offset)
3304 return KERN_FAILURE;
3305
3306 assert(object->paging_in_progress);
3307 offset = offset - object->paging_offset;
3308
3309 if(cntrl_flags & UPL_FOR_PAGEOUT) {
3310
3311 vm_object_lock(object);
3312
3313 if((target_page = vm_page_lookup(object, offset))
3314 != VM_PAGE_NULL) {
3315 ticket = target_page->page_ticket;
3316 cntrl_flags = cntrl_flags & ~(int)UPL_PAGE_TICKET_MASK;
3317 cntrl_flags = cntrl_flags |
3318 ((ticket << UPL_PAGE_TICKET_SHIFT)
3319 & UPL_PAGE_TICKET_MASK);
3320 }
3321 vm_object_unlock(object);
3322 }
3323
3324 if (super_cluster > size) {
3325
3326 vm_object_offset_t base_offset;
3327 upl_size_t super_size;
3328
3329 base_offset = (offset &
3330 ~((vm_object_offset_t) super_cluster - 1));
3331 super_size = (offset+size) > (base_offset + super_cluster) ?
3332 super_cluster<<1 : super_cluster;
3333 super_size = ((base_offset + super_size) > object->size) ?
3334 (object->size - base_offset) : super_size;
3335 if(offset > (base_offset + super_size))
3336 panic("vm_object_super_upl_request: Missed target pageout"
3337 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
3338 offset, base_offset, super_size, super_cluster,
3339 size, object->paging_offset);
3340 /*
3341 * apparently there is a case where the vm requests a
3342 * page to be written out whose offset is beyond the
3343 * object size
3344 */
3345 if((offset + size) > (base_offset + super_size))
3346 super_size = (offset + size) - base_offset;
3347
3348 offset = base_offset;
3349 size = super_size;
3350 }
3351 return vm_object_upl_request(object, offset, size,
3352 upl, user_page_list, page_list_count,
3353 cntrl_flags);
3354 }
3355
3356
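/*
 * vm_map_create_upl:
 * resolve the map entry backing [offset, offset + *upl_size) in the
 * given map and build a UPL against the underlying VM object,
 * recursing through submaps and synchronizing with any shadow or
 * copy objects first when the caller flags require it.
 */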
3357 kern_return_t
3358 vm_map_create_upl(
3359 vm_map_t map,
3360 vm_map_address_t offset,
3361 upl_size_t *upl_size,
3362 upl_t *upl,
3363 upl_page_info_array_t page_list,
3364 unsigned int *count,
3365 int *flags)
3366 {
3367 vm_map_entry_t entry;
3368 int caller_flags;
3369 int force_data_sync;
3370 int sync_cow_data;
3371 vm_object_t local_object;
3372 vm_map_offset_t local_offset;
3373 vm_map_offset_t local_start;
3374 kern_return_t ret;
3375
3376 caller_flags = *flags;
3377
3378 if (caller_flags & ~UPL_VALID_FLAGS) {
3379 /*
3380 * For forward compatibility's sake,
3381 * reject any unknown flag.
3382 */
3383 return KERN_INVALID_VALUE;
3384 }
3385
3386 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
3387 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
3388
3389 if(upl == NULL)
3390 return KERN_INVALID_ARGUMENT;
3391
3392
3393 REDISCOVER_ENTRY:
3394 vm_map_lock(map);
3395 if (vm_map_lookup_entry(map, offset, &entry)) {
3396 if (entry->object.vm_object == VM_OBJECT_NULL ||
3397 !entry->object.vm_object->phys_contiguous) {
3398 if((*upl_size/page_size) > MAX_UPL_TRANSFER) {
3399 *upl_size = MAX_UPL_TRANSFER * page_size;
3400 }
3401 }
3402 if((entry->vme_end - offset) < *upl_size) {
3403 *upl_size = entry->vme_end - offset;
3404 }
3405 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
3406 if (entry->object.vm_object == VM_OBJECT_NULL) {
3407 *flags = 0;
3408 } else if (entry->object.vm_object->private) {
3409 *flags = UPL_DEV_MEMORY;
3410 if (entry->object.vm_object->phys_contiguous) {
3411 *flags |= UPL_PHYS_CONTIG;
3412 }
3413 } else {
3414 *flags = 0;
3415 }
3416 vm_map_unlock(map);
3417 return KERN_SUCCESS;
3418 }
3419 /*
3420 * Create an object if necessary.
3421 */
3422 if (entry->object.vm_object == VM_OBJECT_NULL) {
3423 entry->object.vm_object = vm_object_allocate(
3424 (vm_size_t)(entry->vme_end - entry->vme_start));
3425 entry->offset = 0;
3426 }
3427 if (!(caller_flags & UPL_COPYOUT_FROM)) {
3428 if (!(entry->protection & VM_PROT_WRITE)) {
3429 vm_map_unlock(map);
3430 return KERN_PROTECTION_FAILURE;
3431 }
3432 if (entry->needs_copy) {
3433 vm_map_t local_map;
3434 vm_object_t object;
3435 vm_map_offset_t offset_hi;
3436 vm_map_offset_t offset_lo;
3437 vm_object_offset_t new_offset;
3438 vm_prot_t prot;
3439 boolean_t wired;
3440 vm_behavior_t behavior;
3441 vm_map_version_t version;
3442 vm_map_t real_map;
3443
3444 local_map = map;
3445 vm_map_lock_write_to_read(map);
3446 if(vm_map_lookup_locked(&local_map,
3447 offset, VM_PROT_WRITE,
3448 &version, &object,
3449 &new_offset, &prot, &wired,
3450 &behavior, &offset_lo,
3451 &offset_hi, &real_map)) {
3452 vm_map_unlock(local_map);
3453 return KERN_FAILURE;
3454 }
3455 if (real_map != map) {
3456 vm_map_unlock(real_map);
3457 }
3458 vm_object_unlock(object);
3459 vm_map_unlock(local_map);
3460
3461 goto REDISCOVER_ENTRY;
3462 }
3463 }
3464 if (entry->is_sub_map) {
3465 vm_map_t submap;
3466
3467 submap = entry->object.sub_map;
3468 local_start = entry->vme_start;
3469 local_offset = entry->offset;
3470 vm_map_reference(submap);
3471 vm_map_unlock(map);
3472
3473 ret = (vm_map_create_upl(submap,
3474 local_offset + (offset - local_start),
3475 upl_size, upl, page_list, count,
3476 flags));
3477
3478 vm_map_deallocate(submap);
3479 return ret;
3480 }
3481
3482 if (sync_cow_data) {
3483 if (entry->object.vm_object->shadow
3484 || entry->object.vm_object->copy) {
3485
3486 local_object = entry->object.vm_object;
3487 local_start = entry->vme_start;
3488 local_offset = entry->offset;
3489 vm_object_reference(local_object);
3490 vm_map_unlock(map);
3491
3492 if (entry->object.vm_object->shadow &&
3493 entry->object.vm_object->copy) {
3494 vm_object_lock_request(
3495 local_object->shadow,
3496 (vm_object_offset_t)
3497 ((offset - local_start) +
3498 local_offset) +
3499 local_object->shadow_offset,
3500 *upl_size, FALSE,
3501 MEMORY_OBJECT_DATA_SYNC,
3502 VM_PROT_NO_CHANGE);
3503 }
3504 sync_cow_data = FALSE;
3505 vm_object_deallocate(local_object);
3506 goto REDISCOVER_ENTRY;
3507 }
3508 }
3509
3510 if (force_data_sync) {
3511
3512 local_object = entry->object.vm_object;
3513 local_start = entry->vme_start;
3514 local_offset = entry->offset;
3515 vm_object_reference(local_object);
3516 vm_map_unlock(map);
3517
3518 vm_object_lock_request(
3519 local_object,
3520 (vm_object_offset_t)
3521 ((offset - local_start) + local_offset),
3522 (vm_object_size_t)*upl_size, FALSE,
3523 MEMORY_OBJECT_DATA_SYNC,
3524 VM_PROT_NO_CHANGE);
3525 force_data_sync = FALSE;
3526 vm_object_deallocate(local_object);
3527 goto REDISCOVER_ENTRY;
3528 }
3529
3530 if(!(entry->object.vm_object->private)) {
3531 if(*upl_size > (MAX_UPL_TRANSFER*PAGE_SIZE))
3532 *upl_size = (MAX_UPL_TRANSFER*PAGE_SIZE);
3533 if(entry->object.vm_object->phys_contiguous) {
3534 *flags = UPL_PHYS_CONTIG;
3535 } else {
3536 *flags = 0;
3537 }
3538 } else {
3539 *flags = UPL_DEV_MEMORY | UPL_PHYS_CONTIG;
3540 }
3541 local_object = entry->object.vm_object;
3542 local_offset = entry->offset;
3543 local_start = entry->vme_start;
3544 vm_object_reference(local_object);
3545 vm_map_unlock(map);
3546 if(caller_flags & UPL_SET_IO_WIRE) {
3547 ret = (vm_object_iopl_request(local_object,
3548 (vm_object_offset_t)
3549 ((offset - local_start)
3550 + local_offset),
3551 *upl_size,
3552 upl,
3553 page_list,
3554 count,
3555 caller_flags));
3556 } else {
3557 ret = (vm_object_upl_request(local_object,
3558 (vm_object_offset_t)
3559 ((offset - local_start)
3560 + local_offset),
3561 *upl_size,
3562 upl,
3563 page_list,
3564 count,
3565 caller_flags));
3566 }
3567 vm_object_deallocate(local_object);
3568 return(ret);
3569 }
3570
3571 vm_map_unlock(map);
3572 return(KERN_FAILURE);
3573
3574 }
3575
3576 /*
3577 * Internal routine to enter a UPL into a VM map.
3578 *
3579 * JMM - This should just be doable through the standard
3580 * vm_map_enter() API.
3581 */
3582 kern_return_t
3583 vm_map_enter_upl(
3584 vm_map_t map,
3585 upl_t upl,
3586 vm_map_offset_t *dst_addr)
3587 {
3588 vm_map_size_t size;
3589 vm_object_offset_t offset;
3590 vm_map_offset_t addr;
3591 vm_page_t m;
3592 kern_return_t kr;
3593
3594 if (upl == UPL_NULL)
3595 return KERN_INVALID_ARGUMENT;
3596
3597 upl_lock(upl);
3598
3599 /* check to see if already mapped */
3600 if(UPL_PAGE_LIST_MAPPED & upl->flags) {
3601 upl_unlock(upl);
3602 return KERN_FAILURE;
3603 }
3604
3605 if((!(upl->map_object->pageout)) &&
3606 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) ||
3607 (upl->map_object->phys_contiguous))) {
3608 vm_object_t object;
3609 vm_page_t alias_page;
3610 vm_object_offset_t new_offset;
3611 int pg_num;
3612 wpl_array_t lite_list;
3613
3614 if(upl->flags & UPL_INTERNAL) {
3615 lite_list = (wpl_array_t)
3616 ((((uintptr_t)upl) + sizeof(struct upl))
3617 + ((upl->size/PAGE_SIZE)
3618 * sizeof(upl_page_info_t)));
3619 } else {
3620 lite_list = (wpl_array_t)
3621 (((uintptr_t)upl) + sizeof(struct upl));
3622 }
3623 object = upl->map_object;
3624 upl->map_object = vm_object_allocate(upl->size);
3625 vm_object_lock(upl->map_object);
3626 upl->map_object->shadow = object;
3627 upl->map_object->pageout = TRUE;
3628 upl->map_object->can_persist = FALSE;
3629 upl->map_object->copy_strategy =
3630 MEMORY_OBJECT_COPY_NONE;
3631 upl->map_object->shadow_offset =
3632 upl->offset - object->paging_offset;
3633 upl->map_object->wimg_bits = object->wimg_bits;
3634 offset = upl->map_object->shadow_offset;
3635 new_offset = 0;
3636 size = upl->size;
3637
3638 vm_object_lock(object);
3639
3640 while(size) {
3641 pg_num = (new_offset)/PAGE_SIZE;
3642 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3643 vm_object_unlock(object);
3644 VM_PAGE_GRAB_FICTITIOUS(alias_page);
3645 vm_object_lock(object);
3646 m = vm_page_lookup(object, offset);
3647 if (m == VM_PAGE_NULL) {
3648 panic("vm_upl_map: page missing\n");
3649 }
3650
3651 vm_object_paging_begin(object);
3652
3653 /*
3654 * Convert the fictitious page to a private
3655 * shadow of the real page.
3656 */
3657 assert(alias_page->fictitious);
3658 alias_page->fictitious = FALSE;
3659 alias_page->private = TRUE;
3660 alias_page->pageout = TRUE;
3661 alias_page->phys_page = m->phys_page;
3662
3663 vm_page_lock_queues();
3664 vm_page_wire(alias_page);
3665 vm_page_unlock_queues();
3666
3667 /*
3668 * ENCRYPTED SWAP:
3669 * The virtual page ("m") has to be wired in some way
3670 * here or its physical page ("m->phys_page") could
3671 * be recycled at any time.
3672 * Assuming this is enforced by the caller, we can't
3673 * get an encrypted page here. Since the encryption
3674 * key depends on the VM page's "pager" object and
3675 * the "paging_offset", we couldn't handle 2 pageable
3676 * VM pages (with different pagers and paging_offsets)
3677 * sharing the same physical page: we could end up
3678 * encrypting with one key (via one VM page) and
3679 * decrypting with another key (via the alias VM page).
3680 */
3681 ASSERT_PAGE_DECRYPTED(m);
3682
3683 vm_page_insert(alias_page,
3684 upl->map_object, new_offset);
3685 assert(!alias_page->wanted);
3686 alias_page->busy = FALSE;
3687 alias_page->absent = FALSE;
3688 }
3689
3690 size -= PAGE_SIZE;
3691 offset += PAGE_SIZE_64;
3692 new_offset += PAGE_SIZE_64;
3693 }
3694 vm_object_unlock(object);
3695 vm_object_unlock(upl->map_object);
3696 }
3697 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || upl->map_object->phys_contiguous)
3698 offset = upl->offset - upl->map_object->paging_offset;
3699 else
3700 offset = 0;
3701
3702 size = upl->size;
3703
3704 vm_object_lock(upl->map_object);
3705 upl->map_object->ref_count++;
3706 vm_object_res_reference(upl->map_object);
3707 vm_object_unlock(upl->map_object);
3708
3709 *dst_addr = 0;
3710
3711
3712 /* NEED A UPL_MAP ALIAS */
3713 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3714 VM_FLAGS_ANYWHERE, upl->map_object, offset, FALSE,
3715 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
3716
3717 if (kr != KERN_SUCCESS) {
3718 upl_unlock(upl);
3719 return(kr);
3720 }
3721
3722 vm_object_lock(upl->map_object);
3723
3724 for(addr=*dst_addr; size > 0; size-=PAGE_SIZE,addr+=PAGE_SIZE) {
3725 m = vm_page_lookup(upl->map_object, offset);
3726 if(m) {
3727 unsigned int cache_attr;
3728 cache_attr = ((unsigned int)m->object->wimg_bits) & VM_WIMG_MASK;
3729
3730 PMAP_ENTER(map->pmap, addr,
3731 m, VM_PROT_ALL,
3732 cache_attr, TRUE);
3733 }
3734 offset+=PAGE_SIZE_64;
3735 }
3736 vm_object_unlock(upl->map_object);
3737
3738 upl->ref_count++; /* hold a reference for the mapping */
3739 upl->flags |= UPL_PAGE_LIST_MAPPED;
3740 upl->kaddr = *dst_addr;
3741 upl_unlock(upl);
3742 return KERN_SUCCESS;
3743 }
3744
3745 /*
3746 * Internal routine to remove a UPL mapping from a VM map.
3747 *
3748 * XXX - This should just be doable through a standard
3749 * vm_map_remove() operation. Otherwise, implicit clean-up
3750 * of the target map won't be able to correctly remove
3751 * these (and release the reference on the UPL). Having
3752 * to do this means we can't map these into user-space
3753 * maps yet.
3754 */
3755 kern_return_t
3756 vm_map_remove_upl(
3757 vm_map_t map,
3758 upl_t upl)
3759 {
3760 vm_address_t addr;
3761 upl_size_t size;
3762
3763 if (upl == UPL_NULL)
3764 return KERN_INVALID_ARGUMENT;
3765
3766 upl_lock(upl);
3767 if(upl->flags & UPL_PAGE_LIST_MAPPED) {
3768 addr = upl->kaddr;
3769 size = upl->size;
3770 assert(upl->ref_count > 1);
3771 upl->ref_count--; /* removing mapping ref */
3772 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
3773 upl->kaddr = (vm_offset_t) 0;
3774 upl_unlock(upl);
3775
3776 vm_map_remove( map,
3777 vm_map_trunc_page(addr),
3778 vm_map_round_page(addr + size),
3779 VM_MAP_NO_FLAGS);
3780 return KERN_SUCCESS;
3781 }
3782 upl_unlock(upl);
3783 return KERN_FAILURE;
3784 }
3785
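/*
 * upl_commit_range:
 * commit a range of pages described by the upl back to their
 * VM object... per-page state (busy, wired, dirty, pageout) is
 * cleared or updated according to the commit flags and any
 * waiters on the pages are woken up.
 */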
3786 kern_return_t
3787 upl_commit_range(
3788 upl_t upl,
3789 upl_offset_t offset,
3790 upl_size_t size,
3791 int flags,
3792 upl_page_info_t *page_list,
3793 mach_msg_type_number_t count,
3794 boolean_t *empty)
3795 {
3796 upl_size_t xfer_size = size;
3797 vm_object_t shadow_object;
3798 vm_object_t object = upl->map_object;
3799 vm_object_offset_t target_offset;
3800 int entry;
3801 wpl_array_t lite_list;
3802 int occupied;
3803 int delayed_unlock = 0;
3804 int clear_refmod = 0;
3805 boolean_t shadow_internal;
3806
3807 *empty = FALSE;
3808
3809 if (upl == UPL_NULL)
3810 return KERN_INVALID_ARGUMENT;
3811
3812
3813 if (count == 0)
3814 page_list = NULL;
3815
3816 if (object->pageout) {
3817 shadow_object = object->shadow;
3818 } else {
3819 shadow_object = object;
3820 }
3821
3822 upl_lock(upl);
3823
3824 if (upl->flags & UPL_ACCESS_BLOCKED) {
3825 /*
3826 * We used this UPL to block access to the pages by marking
3827 * them "busy". Now we need to clear the "busy" bit to allow
3828 * access to these pages again.
3829 */
3830 flags |= UPL_COMMIT_ALLOW_ACCESS;
3831 }
3832
3833 if (upl->flags & UPL_CLEAR_DIRTY)
3834 flags |= UPL_COMMIT_CLEAR_DIRTY;
3835
3836 if (upl->flags & UPL_DEVICE_MEMORY) {
3837 xfer_size = 0;
3838 } else if ((offset + size) > upl->size) {
3839 upl_unlock(upl);
3840 return KERN_FAILURE;
3841 }
3842
3843 if (upl->flags & UPL_INTERNAL) {
3844 lite_list = (wpl_array_t)
3845 ((((uintptr_t)upl) + sizeof(struct upl))
3846 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
3847 } else {
3848 lite_list = (wpl_array_t)
3849 (((uintptr_t)upl) + sizeof(struct upl));
3850 }
3851 if (object != shadow_object)
3852 vm_object_lock(object);
3853 vm_object_lock(shadow_object);
3854
3855 shadow_internal = shadow_object->internal;
3856
3857 entry = offset/PAGE_SIZE;
3858 target_offset = (vm_object_offset_t)offset;
3859
3860 while (xfer_size) {
3861 vm_page_t t,m;
3862 upl_page_info_t *p;
3863
3864 m = VM_PAGE_NULL;
3865
3866 if (upl->flags & UPL_LITE) {
3867 int pg_num;
3868
3869 pg_num = target_offset/PAGE_SIZE;
3870
3871 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
3872 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
3873 m = vm_page_lookup(shadow_object,
3874 target_offset + (upl->offset -
3875 shadow_object->paging_offset));
3876 }
3877 }
3878 if (object->pageout) {
3879 if ((t = vm_page_lookup(object, target_offset)) != NULL) {
3880 t->pageout = FALSE;
3881
3882 if (delayed_unlock) {
3883 delayed_unlock = 0;
3884 vm_page_unlock_queues();
3885 }
3886 VM_PAGE_FREE(t);
3887
3888 if (m == NULL) {
3889 m = vm_page_lookup(
3890 shadow_object,
3891 target_offset +
3892 object->shadow_offset);
3893 }
3894 if (m != VM_PAGE_NULL)
3895 vm_object_paging_end(m->object);
3896 }
3897 }
3898 if (m != VM_PAGE_NULL) {
3899
3900 clear_refmod = 0;
3901
3902 if (upl->flags & UPL_IO_WIRE) {
3903
3904 if (delayed_unlock == 0)
3905 vm_page_lock_queues();
3906
3907 vm_page_unwire(m);
3908
3909 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
3910 delayed_unlock = 0;
3911 vm_page_unlock_queues();
3912 }
3913 if (page_list) {
3914 page_list[entry].phys_addr = 0;
3915 }
3916 if (flags & UPL_COMMIT_SET_DIRTY) {
3917 m->dirty = TRUE;
3918 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3919 m->dirty = FALSE;
3920 clear_refmod |= VM_MEM_MODIFIED;
3921 }
3922 if (flags & UPL_COMMIT_INACTIVATE) {
3923 m->reference = FALSE;
3924 clear_refmod |= VM_MEM_REFERENCED;
3925 vm_page_deactivate(m);
3926 }
3927 if (clear_refmod)
3928 pmap_clear_refmod(m->phys_page, clear_refmod);
3929
3930 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
3931 /*
3932 * We blocked access to the pages in this UPL.
3933 * Clear the "busy" bit and wake up any waiter
3934 * for this page.
3935 */
3936 PAGE_WAKEUP_DONE(m);
3937 }
3938
3939 target_offset += PAGE_SIZE_64;
3940 xfer_size -= PAGE_SIZE;
3941 entry++;
3942 continue;
3943 }
3944 if (delayed_unlock == 0)
3945 vm_page_lock_queues();
3946 /*
3947 * make sure to clear the hardware
3948 * modify or reference bits before
3949 * releasing the BUSY bit on this page;
3950 * otherwise we risk losing a legitimate
3951 * change of state
3952 */
3953 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
3954 m->dirty = FALSE;
3955 clear_refmod |= VM_MEM_MODIFIED;
3956 }
3957 if (flags & UPL_COMMIT_INACTIVATE)
3958 clear_refmod |= VM_MEM_REFERENCED;
3959
3960 if (clear_refmod)
3961 pmap_clear_refmod(m->phys_page, clear_refmod);
3962
3963 if (page_list) {
3964 p = &(page_list[entry]);
3965 if(p->phys_addr && p->pageout && !m->pageout) {
3966 m->busy = TRUE;
3967 m->pageout = TRUE;
3968 vm_page_wire(m);
3969 } else if (page_list[entry].phys_addr &&
3970 !p->pageout && m->pageout &&
3971 !m->dump_cleaning) {
3972 m->pageout = FALSE;
3973 m->absent = FALSE;
3974 m->overwriting = FALSE;
3975 vm_page_unwire(m);
3976 PAGE_WAKEUP_DONE(m);
3977 }
3978 page_list[entry].phys_addr = 0;
3979 }
3980 m->dump_cleaning = FALSE;
3981 if(m->laundry) {
3982 vm_pageout_throttle_up(m);
3983 }
3984 if(m->pageout) {
3985 m->cleaning = FALSE;
3986 m->pageout = FALSE;
3987 #if MACH_CLUSTER_STATS
3988 if (m->wanted) vm_pageout_target_collisions++;
3989 #endif
3990 if (pmap_disconnect(m->phys_page) & VM_MEM_MODIFIED)
3991 m->dirty = TRUE;
3992 else
3993 m->dirty = FALSE;
3994
3995 if(m->dirty) {
3996 vm_page_unwire(m);/* reactivates */
3997
3998 if (upl->flags & UPL_PAGEOUT) {
3999 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
4000 VM_STAT(reactivations++);
4001 }
4002 PAGE_WAKEUP_DONE(m);
4003 } else {
4004 vm_page_free(m);/* clears busy, etc. */
4005
4006 if (upl->flags & UPL_PAGEOUT) {
4007 CLUSTER_STAT(vm_pageout_target_page_freed++;)
4008
4009 if (page_list[entry].dirty)
4010 VM_STAT(pageouts++);
4011 }
4012 }
4013 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4014 delayed_unlock = 0;
4015 vm_page_unlock_queues();
4016 }
4017 target_offset += PAGE_SIZE_64;
4018 xfer_size -= PAGE_SIZE;
4019 entry++;
4020 continue;
4021 }
4022 #if MACH_CLUSTER_STATS
4023 m->dirty = pmap_is_modified(m->phys_page);
4024
4025 if (m->dirty) vm_pageout_cluster_dirtied++;
4026 else vm_pageout_cluster_cleaned++;
4027 if (m->wanted) vm_pageout_cluster_collisions++;
4028 #else
4029 m->dirty = 0;
4030 #endif
4031
4032 if((m->busy) && (m->cleaning)) {
4033 /* the request_page_list case */
4034 if(m->absent) {
4035 m->absent = FALSE;
4036 if(shadow_object->absent_count == 1)
4037 vm_object_absent_release(shadow_object);
4038 else
4039 shadow_object->absent_count--;
4040 }
4041 m->overwriting = FALSE;
4042 m->busy = FALSE;
4043 m->dirty = FALSE;
4044 } else if (m->overwriting) {
4045 /* alternate request page list, write to
4046 * page_list case. Occurs when the original
4047 * page was wired at the time of the list
4048 * request */
4049 assert(m->wire_count != 0);
4050 vm_page_unwire(m);/* reactivates */
4051 m->overwriting = FALSE;
4052 }
4053 m->cleaning = FALSE;
4054
4055 /* It is part of the semantics of COPYOUT_FROM */
4056 /* UPLs that a commit implies a cache sync */
4057 /* between the vm page and the backing store; */
4058 /* this can be used to strip the precious bit */
4059 /* as well as clean */
4060 if (upl->flags & UPL_PAGE_SYNC_DONE)
4061 m->precious = FALSE;
4062
4063 if (flags & UPL_COMMIT_SET_DIRTY)
4064 m->dirty = TRUE;
4065
4066 if (flags & UPL_COMMIT_INACTIVATE) {
4067 m->reference = FALSE;
4068 vm_page_deactivate(m);
4069 } else if (!m->active && !m->inactive) {
4070 if (m->reference)
4071 vm_page_activate(m);
4072 else
4073 vm_page_deactivate(m);
4074 }
4075
4076 if (flags & UPL_COMMIT_ALLOW_ACCESS) {
4077 /*
4078 * We blocked access to the pages in this UPL.
4079 * Clear the "busy" bit on this page before we
4080 * wake up any waiter.
4081 */
4082 m->busy = FALSE;
4083 }
4084
4085 /*
4086 * Wakeup any thread waiting for the page to be un-cleaning.
4087 */
4088 PAGE_WAKEUP(m);
4089
4090 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
4091 delayed_unlock = 0;
4092 vm_page_unlock_queues();
4093 }
4094 }
4095 target_offset += PAGE_SIZE_64;
4096 xfer_size -= PAGE_SIZE;
4097 entry++;
4098 }
4099 if (delayed_unlock)
4100 vm_page_unlock_queues();
4101
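/*
 * Work out whether any pages are still held by this UPL: device
 * memory never is, a lite UPL is empty once every bitmap word is
 * clear, and an object-backed UPL is empty once its map object's
 * page queue is.
 */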
4102 occupied = 1;
4103
4104 if (upl->flags & UPL_DEVICE_MEMORY) {
4105 occupied = 0;
4106 } else if (upl->flags & UPL_LITE) {
4107 int pg_num;
4108 int i;
4109 pg_num = upl->size/PAGE_SIZE;
4110 pg_num = (pg_num + 31) >> 5;
4111 occupied = 0;
4112 for(i= 0; i<pg_num; i++) {
4113 if(lite_list[i] != 0) {
4114 occupied = 1;
4115 break;
4116 }
4117 }
4118 } else {
4119 if(queue_empty(&upl->map_object->memq)) {
4120 occupied = 0;
4121 }
4122 }
4123
4124 if(occupied == 0) {
4125 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4126 *empty = TRUE;
4127 }
4128 if(object == shadow_object)
4129 vm_object_paging_end(shadow_object);
4130 }
4131 vm_object_unlock(shadow_object);
4132 if (object != shadow_object)
4133 vm_object_unlock(object);
4134 upl_unlock(upl);
4135
4136 return KERN_SUCCESS;
4137 }
4138
4139 kern_return_t
4140 upl_abort_range(
4141 upl_t upl,
4142 upl_offset_t offset,
4143 upl_size_t size,
4144 int error,
4145 boolean_t *empty)
4146 {
4147 upl_size_t xfer_size = size;
4148 vm_object_t shadow_object;
4149 vm_object_t object = upl->map_object;
4150 vm_object_offset_t target_offset;
4151 int entry;
4152 wpl_array_t lite_list;
4153 int occupied;
4154 boolean_t shadow_internal;
4155
4156 *empty = FALSE;
4157
4158 if (upl == UPL_NULL)
4159 return KERN_INVALID_ARGUMENT;
4160
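/*
 * An I/O-wired UPL has no pageout state to unwind, so aborting a
 * range of it reduces to committing that range, which unwires the
 * pages.
 */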
4161 if (upl->flags & UPL_IO_WIRE) {
4162 return upl_commit_range(upl,
4163 offset, size, 0,
4164 NULL, 0, empty);
4165 }
4166
4167 if(object->pageout) {
4168 shadow_object = object->shadow;
4169 } else {
4170 shadow_object = object;
4171 }
4172
4173 upl_lock(upl);
4174 if(upl->flags & UPL_DEVICE_MEMORY) {
4175 xfer_size = 0;
4176 } else if ((offset + size) > upl->size) {
4177 upl_unlock(upl);
4178 return KERN_FAILURE;
4179 }
4180 if (object != shadow_object)
4181 vm_object_lock(object);
4182 vm_object_lock(shadow_object);
4183
4184 shadow_internal = shadow_object->internal;
4185
4186 if(upl->flags & UPL_INTERNAL) {
4187 lite_list = (wpl_array_t)
4188 ((((uintptr_t)upl) + sizeof(struct upl))
4189 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4190 } else {
4191 lite_list = (wpl_array_t)
4192 (((uintptr_t)upl) + sizeof(struct upl));
4193 }
4194
4195 entry = offset/PAGE_SIZE;
4196 target_offset = (vm_object_offset_t)offset;
4197 while(xfer_size) {
4198 vm_page_t t,m;
4199
4200 m = VM_PAGE_NULL;
4201 if(upl->flags & UPL_LITE) {
4202 int pg_num;
4203 pg_num = target_offset/PAGE_SIZE;
4204 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4205 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4206 m = vm_page_lookup(shadow_object,
4207 target_offset + (upl->offset -
4208 shadow_object->paging_offset));
4209 }
4210 }
4211 if(object->pageout) {
4212 if ((t = vm_page_lookup(object, target_offset))
4213 != NULL) {
4214 t->pageout = FALSE;
4215 VM_PAGE_FREE(t);
4216 if(m == NULL) {
4217 m = vm_page_lookup(
4218 shadow_object,
4219 target_offset +
4220 object->shadow_offset);
4221 }
4222 if(m != VM_PAGE_NULL)
4223 vm_object_paging_end(m->object);
4224 }
4225 }
4226 if(m != VM_PAGE_NULL) {
4227 vm_page_lock_queues();
4228 if(m->absent) {
4229 boolean_t must_free = TRUE;
4230
4231 /* COPYOUT = FALSE case */
4232 /* check for error conditions which must */
4233 /* be passed back to the page's customer */
4234 if(error & UPL_ABORT_RESTART) {
4235 m->restart = TRUE;
4236 m->absent = FALSE;
4237 vm_object_absent_release(m->object);
4238 m->page_error = KERN_MEMORY_ERROR;
4239 m->error = TRUE;
4240 must_free = FALSE;
4241 } else if(error & UPL_ABORT_UNAVAILABLE) {
4242 m->restart = FALSE;
4243 m->unusual = TRUE;
4244 must_free = FALSE;
4245 } else if(error & UPL_ABORT_ERROR) {
4246 m->restart = FALSE;
4247 m->absent = FALSE;
4248 vm_object_absent_release(m->object);
4249 m->page_error = KERN_MEMORY_ERROR;
4250 m->error = TRUE;
4251 must_free = FALSE;
4252 }
4253
4254 /*
4255 * ENCRYPTED SWAP:
4256 * If the page was already encrypted,
4257 * we don't really need to decrypt it
4258 * now. It will get decrypted later,
4259 * on demand, as soon as someone needs
4260 * to access its contents.
4261 */
4262
4263 m->cleaning = FALSE;
4264 m->overwriting = FALSE;
4265 PAGE_WAKEUP_DONE(m);
4266
4267 if (must_free == TRUE) {
4268 vm_page_free(m);
4269 } else {
4270 vm_page_activate(m);
4271 }
4272 vm_page_unlock_queues();
4273
4274 target_offset += PAGE_SIZE_64;
4275 xfer_size -= PAGE_SIZE;
4276 entry++;
4277 continue;
4278 }
4279 /*
4280 * Handle the trusted pager throttle.
4281 */
4282 if (m->laundry) {
4283 vm_pageout_throttle_up(m);
4284 }
4285 if(m->pageout) {
4286 assert(m->busy);
4287 assert(m->wire_count == 1);
4288 m->pageout = FALSE;
4289 vm_page_unwire(m);
4290 }
4291 m->dump_cleaning = FALSE;
4292 m->cleaning = FALSE;
4293 m->overwriting = FALSE;
4294 #if MACH_PAGEMAP
4295 vm_external_state_clr(
4296 m->object->existence_map, m->offset);
4297 #endif /* MACH_PAGEMAP */
4298 if(error & UPL_ABORT_DUMP_PAGES) {
4299 vm_page_free(m);
4300 pmap_disconnect(m->phys_page);
4301 } else {
4302 PAGE_WAKEUP_DONE(m);
4303 }
4304 vm_page_unlock_queues();
4305 }
4306 target_offset += PAGE_SIZE_64;
4307 xfer_size -= PAGE_SIZE;
4308 entry++;
4309 }
4310 occupied = 1;
4311 if (upl->flags & UPL_DEVICE_MEMORY) {
4312 occupied = 0;
4313 } else if (upl->flags & UPL_LITE) {
4314 int pg_num;
4315 int i;
4316 pg_num = upl->size/PAGE_SIZE;
4317 pg_num = (pg_num + 31) >> 5;
4318 occupied = 0;
4319 for(i= 0; i<pg_num; i++) {
4320 if(lite_list[i] != 0) {
4321 occupied = 1;
4322 break;
4323 }
4324 }
4325 } else {
4326 if(queue_empty(&upl->map_object->memq)) {
4327 occupied = 0;
4328 }
4329 }
4330
4331 if(occupied == 0) {
4332 if(upl->flags & UPL_COMMIT_NOTIFY_EMPTY) {
4333 *empty = TRUE;
4334 }
4335 if(object == shadow_object)
4336 vm_object_paging_end(shadow_object);
4337 }
4338 vm_object_unlock(shadow_object);
4339 if (object != shadow_object)
4340 vm_object_unlock(object);
4341
4342 upl_unlock(upl);
4343
4344 return KERN_SUCCESS;
4345 }
4346
4347 kern_return_t
4348 upl_abort(
4349 upl_t upl,
4350 int error)
4351 {
4352 vm_object_t object = NULL;
4353 vm_object_t shadow_object = NULL;
4354 vm_object_offset_t offset;
4355 vm_object_offset_t shadow_offset;
4356 vm_object_offset_t target_offset;
4357 upl_size_t i;
4358 wpl_array_t lite_list;
4359 vm_page_t t,m;
4360 int occupied;
4361 boolean_t shadow_internal;
4362
4363 if (upl == UPL_NULL)
4364 return KERN_INVALID_ARGUMENT;
4365
4366 if (upl->flags & UPL_IO_WIRE) {
4367 boolean_t empty;
4368 return upl_commit_range(upl,
4369 0, upl->size, 0,
4370 NULL, 0, &empty);
4371 }
4372
4373 upl_lock(upl);
4374 if(upl->flags & UPL_DEVICE_MEMORY) {
4375 upl_unlock(upl);
4376 return KERN_SUCCESS;
4377 }
4378
4379 object = upl->map_object;
4380
4381 if (object == NULL) {
4382 panic("upl_abort: upl object is not backed by an object");
4383 upl_unlock(upl);
4384 return KERN_INVALID_ARGUMENT;
4385 }
4386
4387 if(object->pageout) {
4388 shadow_object = object->shadow;
4389 shadow_offset = object->shadow_offset;
4390 } else {
4391 shadow_object = object;
4392 shadow_offset = upl->offset - object->paging_offset;
4393 }
4394
4395 if(upl->flags & UPL_INTERNAL) {
4396 lite_list = (wpl_array_t)
4397 ((((uintptr_t)upl) + sizeof(struct upl))
4398 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
4399 } else {
4400 lite_list = (wpl_array_t)
4401 (((uintptr_t)upl) + sizeof(struct upl));
4402 }
4403 offset = 0;
4404
4405 if (object != shadow_object)
4406 vm_object_lock(object);
4407 vm_object_lock(shadow_object);
4408
4409 shadow_internal = shadow_object->internal;
4410
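/*
 * Unlike upl_abort_range, this walks the entire UPL a page at a
 * time; the per-page handling below mirrors the range version.
 */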
4411 for(i = 0; i<(upl->size); i+=PAGE_SIZE, offset += PAGE_SIZE_64) {
4412 m = VM_PAGE_NULL;
4413 target_offset = offset + shadow_offset;
4414 if(upl->flags & UPL_LITE) {
4415 int pg_num;
4416 pg_num = offset/PAGE_SIZE;
4417 if(lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
4418 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
4419 m = vm_page_lookup(
4420 shadow_object, target_offset);
4421 }
4422 }
4423 if(object->pageout) {
4424 if ((t = vm_page_lookup(object, offset)) != NULL) {
4425 t->pageout = FALSE;
4426 VM_PAGE_FREE(t);
4427 if(m == NULL) {
4428 m = vm_page_lookup(
4429 shadow_object, target_offset);
4430 }
4431 if(m != VM_PAGE_NULL)
4432 vm_object_paging_end(m->object);
4433 }
4434 }
4435 if(m != VM_PAGE_NULL) {
4436 vm_page_lock_queues();
4437 if(m->absent) {
4438 boolean_t must_free = TRUE;
4439
4440 /* COPYOUT = FALSE case */
4441 /* check for error conditions which must */
4442 /* be passed back to the page's customer */
4443 if(error & UPL_ABORT_RESTART) {
4444 m->restart = TRUE;
4445 m->absent = FALSE;
4446 vm_object_absent_release(m->object);
4447 m->page_error = KERN_MEMORY_ERROR;
4448 m->error = TRUE;
4449 must_free = FALSE;
4450 } else if(error & UPL_ABORT_UNAVAILABLE) {
4451 m->restart = FALSE;
4452 m->unusual = TRUE;
4453 must_free = FALSE;
4454 } else if(error & UPL_ABORT_ERROR) {
4455 m->restart = FALSE;
4456 m->absent = FALSE;
4457 vm_object_absent_release(m->object);
4458 m->page_error = KERN_MEMORY_ERROR;
4459 m->error = TRUE;
4460 must_free = FALSE;
4461 }
4462
4463 /*
4464 * ENCRYPTED SWAP:
4465 * If the page was already encrypted,
4466 * we don't really need to decrypt it
4467 * now. It will get decrypted later,
4468 * on demand, as soon as someone needs
4469 * to access its contents.
4470 */
4471
4472 m->cleaning = FALSE;
4473 m->overwriting = FALSE;
4474 PAGE_WAKEUP_DONE(m);
4475
4476 if (must_free == TRUE) {
4477 vm_page_free(m);
4478 } else {
4479 vm_page_activate(m);
4480 }
4481 vm_page_unlock_queues();
4482 continue;
4483 }
4484 /*
4485 * Handle the trusted pager throttle.
4486 */
4487 if (m->laundry) {
4488 vm_pageout_throttle_up(m);
4489 }
4490 if(m->pageout) {
4491 assert(m->busy);
4492 assert(m->wire_count == 1);
4493 m->pageout = FALSE;
4494 vm_page_unwire(m);
4495 }
4496 m->dump_cleaning = FALSE;
4497 m->cleaning = FALSE;
4498 m->overwriting = FALSE;
4499 #if MACH_PAGEMAP
4500 vm_external_state_clr(
4501 m->object->existence_map, m->offset);
4502 #endif /* MACH_PAGEMAP */
4503 if(error & UPL_ABORT_DUMP_PAGES) {
4504 vm_page_free(m);
4505 pmap_disconnect(m->phys_page);
4506 } else {
4507 PAGE_WAKEUP_DONE(m);
4508 }
4509 vm_page_unlock_queues();
4510 }
4511 }
4512 occupied = 1;
4513 if (upl->flags & UPL_DEVICE_MEMORY) {
4514 occupied = 0;
4515 } else if (upl->flags & UPL_LITE) {
4516 int pg_num;
4517 int j;
4518 pg_num = upl->size/PAGE_SIZE;
4519 pg_num = (pg_num + 31) >> 5;
4520 occupied = 0;
4521 for(j= 0; j<pg_num; j++) {
4522 if(lite_list[j] != 0) {
4523 occupied = 1;
4524 break;
4525 }
4526 }
4527 } else {
4528 if(queue_empty(&upl->map_object->memq)) {
4529 occupied = 0;
4530 }
4531 }
4532
4533 if(occupied == 0) {
4534 if(object == shadow_object)
4535 vm_object_paging_end(shadow_object);
4536 }
4537 vm_object_unlock(shadow_object);
4538 if (object != shadow_object)
4539 vm_object_unlock(object);
4540
4541 upl_unlock(upl);
4542 return KERN_SUCCESS;
4543 }
4544
4545 /* an option on commit should be wire */
4546 kern_return_t
4547 upl_commit(
4548 upl_t upl,
4549 upl_page_info_t *page_list,
4550 mach_msg_type_number_t count)
4551 {
4552 if (upl == UPL_NULL)
4553 return KERN_INVALID_ARGUMENT;
4554
4555 if(upl->flags & (UPL_LITE | UPL_IO_WIRE)) {
4556 boolean_t empty;
4557 return upl_commit_range(upl, 0, upl->size, 0,
4558 page_list, count, &empty);
4559 }
4560
4561 if (count == 0)
4562 page_list = NULL;
4563
4564 upl_lock(upl);
4565 if (upl->flags & UPL_DEVICE_MEMORY)
4566 page_list = NULL;
4567
4568 if (upl->flags & UPL_ENCRYPTED) {
4569 /*
4570 * ENCRYPTED SWAP:
4571 * This UPL was encrypted, but we don't need
4572 * to decrypt here. We'll decrypt each page
4573 * later, on demand, as soon as someone needs
4574 * to access the page's contents.
4575 */
4576 }
4577
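/*
 * Only walk the pages when there is per-page work to do: clearing
 * dirty bits, stripping the precious bit after a page sync, or
 * honoring a caller-supplied page list.
 */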
4578 if ((upl->flags & UPL_CLEAR_DIRTY) ||
4579 (upl->flags & UPL_PAGE_SYNC_DONE) || page_list) {
4580 vm_object_t shadow_object = upl->map_object->shadow;
4581 vm_object_t object = upl->map_object;
4582 vm_object_offset_t target_offset;
4583 upl_size_t xfer_end;
4584 int entry;
4585
4586 vm_page_t t, m;
4587 upl_page_info_t *p;
4588
4589 if (object != shadow_object)
4590 vm_object_lock(object);
4591 vm_object_lock(shadow_object);
4592
4593 entry = 0;
4594 target_offset = object->shadow_offset;
4595 xfer_end = upl->size + object->shadow_offset;
4596
4597 while(target_offset < xfer_end) {
4598
4599 if ((t = vm_page_lookup(object,
4600 target_offset - object->shadow_offset))
4601 == NULL) {
4602 target_offset += PAGE_SIZE_64;
4603 entry++;
4604 continue;
4605 }
4606
4607 m = vm_page_lookup(shadow_object, target_offset);
4608 if(m != VM_PAGE_NULL) {
4609 /*
4610 * ENCRYPTED SWAP:
4611 * If this page was encrypted, we
4612 * don't need to decrypt it here.
4613 * We'll decrypt it later, on demand,
4614 * as soon as someone needs to access
4615 * its contents.
4616 */
4617
4618 if (upl->flags & UPL_CLEAR_DIRTY) {
4619 pmap_clear_modify(m->phys_page);
4620 m->dirty = FALSE;
4621 }
4622 /* It is part of the semantics of */
4623 /* COPYOUT_FROM UPLs that a commit */
4624 /* implies a cache sync between the */
4625 /* vm page and the backing store; */
4626 /* this can be used to strip the */
4627 /* precious bit as well as clean */
4628 if (upl->flags & UPL_PAGE_SYNC_DONE)
4629 m->precious = FALSE;
4630
4631 if(page_list) {
4632 p = &(page_list[entry]);
4633 if(page_list[entry].phys_addr &&
4634 p->pageout && !m->pageout) {
4635 vm_page_lock_queues();
4636 m->busy = TRUE;
4637 m->pageout = TRUE;
4638 vm_page_wire(m);
4639 vm_page_unlock_queues();
4640 } else if (page_list[entry].phys_addr &&
4641 !p->pageout && m->pageout &&
4642 !m->dump_cleaning) {
4643 vm_page_lock_queues();
4644 m->pageout = FALSE;
4645 m->absent = FALSE;
4646 m->overwriting = FALSE;
4647 vm_page_unwire(m);
4648 PAGE_WAKEUP_DONE(m);
4649 vm_page_unlock_queues();
4650 }
4651 page_list[entry].phys_addr = 0;
4652 }
4653 }
4654 target_offset += PAGE_SIZE_64;
4655 entry++;
4656 }
4657 vm_object_unlock(shadow_object);
4658 if (object != shadow_object)
4659 vm_object_unlock(object);
4660
4661 }
4662 if (upl->flags & UPL_DEVICE_MEMORY) {
4663 vm_object_lock(upl->map_object->shadow);
4664 if(upl->map_object == upl->map_object->shadow)
4665 vm_object_paging_end(upl->map_object->shadow);
4666 vm_object_unlock(upl->map_object->shadow);
4667 }
4668 upl_unlock(upl);
4669 return KERN_SUCCESS;
4670 }
4671
4672
4673
4674 kern_return_t
4675 vm_object_iopl_request(
4676 vm_object_t object,
4677 vm_object_offset_t offset,
4678 upl_size_t size,
4679 upl_t *upl_ptr,
4680 upl_page_info_array_t user_page_list,
4681 unsigned int *page_list_count,
4682 int cntrl_flags)
4683 {
4684 vm_page_t dst_page;
4685 vm_object_offset_t dst_offset = offset;
4686 upl_size_t xfer_size = size;
4687 upl_t upl = NULL;
4688 unsigned int entry;
4689 wpl_array_t lite_list = NULL;
4690 int page_field_size;
4691 int delayed_unlock = 0;
4692 int no_zero_fill = FALSE;
4693 vm_page_t alias_page = NULL;
4694 kern_return_t ret;
4695 vm_prot_t prot;
4696
4697
4698 if (cntrl_flags & ~UPL_VALID_FLAGS) {
4699 /*
4700 * For forward compatibility's sake,
4701 * reject any unknown flag.
4702 */
4703 return KERN_INVALID_VALUE;
4704 }
4705 if (vm_lopage_poolsize == 0)
4706 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
4707
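/*
 * UPL_NEED_32BIT_ADDR is only honored for wired lite UPLs.  For a
 * physically contiguous object the backing pages can't be
 * substituted, so the whole range must already sit below
 * max_valid_dma_address.
 */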
4708 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
4709 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
4710 return KERN_INVALID_VALUE;
4711
4712 if (object->phys_contiguous) {
4713 if ((offset + object->shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
4714 return KERN_INVALID_ADDRESS;
4715
4716 if (((offset + object->shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
4717 return KERN_INVALID_ADDRESS;
4718 }
4719 }
4720
4721 if (cntrl_flags & UPL_ENCRYPT) {
4722 /*
4723 * ENCRYPTED SWAP:
4724 * The paging path doesn't use this interface,
4725 * so we don't support the UPL_ENCRYPT flag
4726 * here. We won't encrypt the pages.
4727 */
4728 assert(! (cntrl_flags & UPL_ENCRYPT));
4729 }
4730
4731 if (cntrl_flags & UPL_NOZEROFILL)
4732 no_zero_fill = TRUE;
4733
4734 if (cntrl_flags & UPL_COPYOUT_FROM)
4735 prot = VM_PROT_READ;
4736 else
4737 prot = VM_PROT_READ | VM_PROT_WRITE;
4738
4739 if(((size/page_size) > MAX_UPL_TRANSFER) && !object->phys_contiguous) {
4740 size = MAX_UPL_TRANSFER * page_size;
4741 }
4742
4743 if(cntrl_flags & UPL_SET_INTERNAL)
4744 if(page_list_count != NULL)
4745 *page_list_count = MAX_UPL_TRANSFER;
4746 if(((cntrl_flags & UPL_SET_INTERNAL) && !(object->phys_contiguous)) &&
4747 ((page_list_count != NULL) && (*page_list_count != 0)
4748 && *page_list_count < (size/page_size)))
4749 return KERN_INVALID_ARGUMENT;
4750
4751 if((!object->internal) && (object->paging_offset != 0))
4752 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
4753
4754 if(object->phys_contiguous) {
4755 /* No paging operations are possible against this memory */
4756 /* and so no need for map object, ever */
4757 cntrl_flags |= UPL_SET_LITE;
4758 }
4759
4760 if(upl_ptr) {
4761 if(cntrl_flags & UPL_SET_INTERNAL) {
4762 if(cntrl_flags & UPL_SET_LITE) {
4763 upl = upl_create(
4764 UPL_CREATE_INTERNAL | UPL_CREATE_LITE,
4765 size);
4766 user_page_list = (upl_page_info_t *)
4767 (((uintptr_t)upl) + sizeof(struct upl));
4768 lite_list = (wpl_array_t)
4769 (((uintptr_t)user_page_list) +
4770 ((size/PAGE_SIZE) *
4771 sizeof(upl_page_info_t)));
4772 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4773 page_field_size =
4774 (page_field_size + 3) & 0xFFFFFFFC;
4775 bzero((char *)lite_list, page_field_size);
4776 upl->flags =
4777 UPL_LITE | UPL_INTERNAL | UPL_IO_WIRE;
4778 } else {
4779 upl = upl_create(UPL_CREATE_INTERNAL, size);
4780 user_page_list = (upl_page_info_t *)
4781 (((uintptr_t)upl)
4782 + sizeof(struct upl));
4783 upl->flags = UPL_INTERNAL | UPL_IO_WIRE;
4784 }
4785 } else {
4786 if(cntrl_flags & UPL_SET_LITE) {
4787 upl = upl_create(UPL_CREATE_LITE, size);
4788 lite_list = (wpl_array_t)
4789 (((uintptr_t)upl) + sizeof(struct upl));
4790 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
4791 page_field_size =
4792 (page_field_size + 3) & 0xFFFFFFFC;
4793 bzero((char *)lite_list, page_field_size);
4794 upl->flags = UPL_LITE | UPL_IO_WIRE;
4795 } else {
4796 upl = upl_create(UPL_CREATE_EXTERNAL, size);
4797 upl->flags = UPL_IO_WIRE;
4798 }
4799 }
4800
4801 if(object->phys_contiguous) {
4802 upl->map_object = object;
4803 /* don't need any shadow mappings for this one */
4804 /* since it is already I/O memory */
4805 upl->flags |= UPL_DEVICE_MEMORY;
4806
4807 vm_object_lock(object);
4808 vm_object_paging_begin(object);
4809 vm_object_unlock(object);
4810
4811 /* paging in progress also protects the paging_offset */
4812 upl->offset = offset + object->paging_offset;
4813 upl->size = size;
4814 *upl_ptr = upl;
4815 if(user_page_list) {
4816 user_page_list[0].phys_addr =
4817 (offset + object->shadow_offset)>>PAGE_SHIFT;
4818 user_page_list[0].device = TRUE;
4819 }
4820 upl->highest_page = (offset + object->shadow_offset + size - 1)>>PAGE_SHIFT;
4821
4822 if(page_list_count != NULL) {
4823 if (upl->flags & UPL_INTERNAL) {
4824 *page_list_count = 0;
4825 } else {
4826 *page_list_count = 1;
4827 }
4828 }
4829 return KERN_SUCCESS;
4830 }
4831 if(user_page_list)
4832 user_page_list[0].device = FALSE;
4833
4834 if(cntrl_flags & UPL_SET_LITE) {
4835 upl->map_object = object;
4836 } else {
4837 upl->map_object = vm_object_allocate(size);
4838 vm_object_lock(upl->map_object);
4839 upl->map_object->shadow = object;
4840 upl->map_object->pageout = TRUE;
4841 upl->map_object->can_persist = FALSE;
4842 upl->map_object->copy_strategy =
4843 MEMORY_OBJECT_COPY_NONE;
4844 upl->map_object->shadow_offset = offset;
4845 upl->map_object->wimg_bits = object->wimg_bits;
4846 vm_object_unlock(upl->map_object);
4847 }
4848 }
4849 vm_object_lock(object);
4850 vm_object_paging_begin(object);
4851
4852 if (!object->phys_contiguous) {
4853 /* Protect user space from future COW operations */
4854 object->true_share = TRUE;
4855 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
4856 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4857 }
4858
4859 /* we can lock the upl offset now that paging_in_progress is set */
4860 if(upl_ptr) {
4861 upl->size = size;
4862 upl->offset = offset + object->paging_offset;
4863 *upl_ptr = upl;
4864 #ifdef UPL_DEBUG
4865 queue_enter(&object->uplq, upl, upl_t, uplq);
4866 #endif /* UPL_DEBUG */
4867 }
4868
4869 if (cntrl_flags & UPL_BLOCK_ACCESS) {
4870 /*
4871 * The user requested that access to the pages in this UPL
4872 * be blocked until the UPL is committed or aborted.
4873 */
4874 upl->flags |= UPL_ACCESS_BLOCKED;
4875 }
4876
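/*
 * Main loop: for each page in the request, fault it in if it is
 * missing, encrypted or otherwise not usable, optionally swap in a
 * page from the low-memory pool for 32-bit DMA, wire it, and record
 * it in the lite bitmap (or the shadow object) and the caller's
 * page list.
 */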
4877 entry = 0;
4878 while (xfer_size) {
4879 if((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
4880 if (delayed_unlock) {
4881 delayed_unlock = 0;
4882 vm_page_unlock_queues();
4883 }
4884 vm_object_unlock(object);
4885 VM_PAGE_GRAB_FICTITIOUS(alias_page);
4886 vm_object_lock(object);
4887 }
4888 dst_page = vm_page_lookup(object, dst_offset);
4889
4890 /*
4891 * ENCRYPTED SWAP:
4892 * If the page is encrypted, we need to decrypt it,
4893 * so force a soft page fault.
4894 */
4895 if ((dst_page == VM_PAGE_NULL) || (dst_page->busy) ||
4896 (dst_page->encrypted) ||
4897 (dst_page->unusual && (dst_page->error ||
4898 dst_page->restart ||
4899 dst_page->absent ||
4900 dst_page->fictitious ||
4901 (prot & dst_page->page_lock)))) {
4902 vm_fault_return_t result;
4903 do {
4904 vm_page_t top_page;
4905 kern_return_t error_code;
4906 int interruptible;
4907
4908 vm_object_offset_t lo_offset = offset;
4909 vm_object_offset_t hi_offset = offset + size;
4910
4911
4912 if (delayed_unlock) {
4913 delayed_unlock = 0;
4914 vm_page_unlock_queues();
4915 }
4916
4917 if(cntrl_flags & UPL_SET_INTERRUPTIBLE) {
4918 interruptible = THREAD_ABORTSAFE;
4919 } else {
4920 interruptible = THREAD_UNINT;
4921 }
4922
4923 result = vm_fault_page(object, dst_offset,
4924 prot | VM_PROT_WRITE, FALSE,
4925 interruptible,
4926 lo_offset, hi_offset,
4927 VM_BEHAVIOR_SEQUENTIAL,
4928 &prot, &dst_page, &top_page,
4929 (int *)0,
4930 &error_code, no_zero_fill, FALSE, NULL, 0);
4931
4932 switch(result) {
4933 case VM_FAULT_SUCCESS:
4934
4935 PAGE_WAKEUP_DONE(dst_page);
4936
4937 /*
4938 * Release paging references and
4939 * top-level placeholder page, if any.
4940 */
4941
4942 if(top_page != VM_PAGE_NULL) {
4943 vm_object_t local_object;
4944 local_object =
4945 top_page->object;
4946 if(top_page->object
4947 != dst_page->object) {
4948 vm_object_lock(
4949 local_object);
4950 VM_PAGE_FREE(top_page);
4951 vm_object_paging_end(
4952 local_object);
4953 vm_object_unlock(
4954 local_object);
4955 } else {
4956 VM_PAGE_FREE(top_page);
4957 vm_object_paging_end(
4958 local_object);
4959 }
4960 }
4961
4962 break;
4963
4964
4965 case VM_FAULT_RETRY:
4966 vm_object_lock(object);
4967 vm_object_paging_begin(object);
4968 break;
4969
4970 case VM_FAULT_FICTITIOUS_SHORTAGE:
4971 vm_page_more_fictitious();
4972 vm_object_lock(object);
4973 vm_object_paging_begin(object);
4974 break;
4975
4976 case VM_FAULT_MEMORY_SHORTAGE:
4977 if (vm_page_wait(interruptible)) {
4978 vm_object_lock(object);
4979 vm_object_paging_begin(object);
4980 break;
4981 }
4982 /* fall thru */
4983
4984 case VM_FAULT_INTERRUPTED:
4985 error_code = MACH_SEND_INTERRUPTED;
4986 case VM_FAULT_MEMORY_ERROR:
4987 ret = (error_code ? error_code:
4988 KERN_MEMORY_ERROR);
4989 vm_object_lock(object);
4990
4991 goto return_err;
4992 }
4993 } while ((result != VM_FAULT_SUCCESS)
4994 || (result == VM_FAULT_INTERRUPTED));
4995 }
4996
4997 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
4998 dst_page->phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
4999 vm_page_t low_page;
5000 int refmod;
5001
5002 /*
5003 * support devices that can't DMA above 32 bits
5004 * by substituting pages from a pool of low-address
5005 * memory for any pages we find above the 4G mark.
5006 * We can't substitute if the page is already wired, because
5007 * we don't know whether that physical address has been
5008 * handed out to some other 64-bit-capable DMA device to use.
5009 */
5010 if (dst_page->wire_count) {
5011 ret = KERN_PROTECTION_FAILURE;
5012 goto return_err;
5013 }
5014 if (delayed_unlock) {
5015 delayed_unlock = 0;
5016 vm_page_unlock_queues();
5017 }
5018 low_page = vm_page_grablo();
5019
5020 if (low_page == VM_PAGE_NULL) {
5021 ret = KERN_RESOURCE_SHORTAGE;
5022 goto return_err;
5023 }
5024 /*
5025 * from here until the vm_page_replace completes
5026 * we mustn't drop the object lock... we don't
5027 * want anyone refaulting this page in and using
5028 * it after we disconnect it... we want the fault
5029 * to find the new page being substituted.
5030 */
5031 refmod = pmap_disconnect(dst_page->phys_page);
5032
5033 vm_page_copy(dst_page, low_page);
5034
5035 low_page->reference = dst_page->reference;
5036 low_page->dirty = dst_page->dirty;
5037
5038 if (refmod & VM_MEM_REFERENCED)
5039 low_page->reference = TRUE;
5040 if (refmod & VM_MEM_MODIFIED)
5041 low_page->dirty = TRUE;
5042
5043 vm_page_lock_queues();
5044 vm_page_replace(low_page, object, dst_offset);
5045 /*
5046 * keep the queue lock since we're going to
5047 * need it immediately
5048 */
5049 delayed_unlock = 1;
5050
5051 dst_page = low_page;
5052 /*
5053 * vm_page_grablo returned the page marked
5054 * BUSY... we don't need a PAGE_WAKEUP_DONE
5055 * here, because we've never dropped the object lock
5056 */
5057 dst_page->busy = FALSE;
5058 }
5059 if (delayed_unlock == 0)
5060 vm_page_lock_queues();
5061 vm_page_wire(dst_page);
5062
5063 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5064 /*
5065 * Mark the page "busy" to block any future page fault
5066 * on this page. We'll also remove the mapping
5067 * of all these pages before leaving this routine.
5068 */
5069 assert(!dst_page->fictitious);
5070 dst_page->busy = TRUE;
5071 }
5072
5073 if (upl_ptr) {
5074 if (cntrl_flags & UPL_SET_LITE) {
5075 int pg_num;
5076 pg_num = (dst_offset-offset)/PAGE_SIZE;
5077 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5078 } else {
5079 /*
5080 * Convert the fictitious page to a
5081 * private shadow of the real page.
5082 */
5083 assert(alias_page->fictitious);
5084 alias_page->fictitious = FALSE;
5085 alias_page->private = TRUE;
5086 alias_page->pageout = TRUE;
5087 alias_page->phys_page = dst_page->phys_page;
5088 vm_page_wire(alias_page);
5089
5090 vm_page_insert(alias_page,
5091 upl->map_object, size - xfer_size);
5092 assert(!alias_page->wanted);
5093 alias_page->busy = FALSE;
5094 alias_page->absent = FALSE;
5095 }
5096
5097 /* expect the page to be used */
5098 dst_page->reference = TRUE;
5099
5100 if (!(cntrl_flags & UPL_COPYOUT_FROM))
5101 dst_page->dirty = TRUE;
5102 alias_page = NULL;
5103
5104 if (dst_page->phys_page > upl->highest_page)
5105 upl->highest_page = dst_page->phys_page;
5106
5107 if (user_page_list) {
5108 user_page_list[entry].phys_addr
5109 = dst_page->phys_page;
5110 user_page_list[entry].dirty =
5111 dst_page->dirty;
5112 user_page_list[entry].pageout =
5113 dst_page->pageout;
5114 user_page_list[entry].absent =
5115 dst_page->absent;
5116 user_page_list[entry].precious =
5117 dst_page->precious;
5118 }
5119 }
5120 if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
5121 delayed_unlock = 0;
5122 vm_page_unlock_queues();
5123 }
5124 entry++;
5125 dst_offset += PAGE_SIZE_64;
5126 xfer_size -= PAGE_SIZE;
5127 }
5128 if (delayed_unlock)
5129 vm_page_unlock_queues();
5130
5131 if (upl->flags & UPL_INTERNAL) {
5132 if(page_list_count != NULL)
5133 *page_list_count = 0;
5134 } else if (*page_list_count > entry) {
5135 if(page_list_count != NULL)
5136 *page_list_count = entry;
5137 }
5138
5139 if (alias_page != NULL) {
5140 vm_page_lock_queues();
5141 vm_page_free(alias_page);
5142 vm_page_unlock_queues();
5143 }
5144
5145 vm_object_unlock(object);
5146
5147 if (cntrl_flags & UPL_BLOCK_ACCESS) {
5148 /*
5149 * We've marked all the pages "busy" so that future
5150 * page faults will block.
5151 * Now remove the mapping for these pages, so that they
5152 * can't be accessed without causing a page fault.
5153 */
5154 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
5155 PMAP_NULL, 0, VM_PROT_NONE);
5156 }
5157
5158 return KERN_SUCCESS;
5159
5160
5161 return_err:
5162 if (delayed_unlock)
5163 vm_page_unlock_queues();
5164
5165 for (; offset < dst_offset; offset += PAGE_SIZE) {
5166 dst_page = vm_page_lookup(object, offset);
5167
5168 if (dst_page == VM_PAGE_NULL)
5169 panic("vm_object_iopl_request: Wired pages missing. \n");
5170 vm_page_lock_queues();
5171 vm_page_unwire(dst_page);
5172 vm_page_unlock_queues();
5173 VM_STAT(reactivations++);
5174 }
5175 vm_object_paging_end(object);
5176 vm_object_unlock(object);
5177 upl_destroy(upl);
5178
5179 return ret;
5180 }
5181
5182
5183 kern_return_t
5184 upl_transpose(
5185 upl_t upl1,
5186 upl_t upl2)
5187 {
5188 kern_return_t retval;
5189 boolean_t upls_locked;
5190 vm_object_t object1, object2;
5191
5192 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2) {
5193 return KERN_INVALID_ARGUMENT;
5194 }
5195
5196 upls_locked = FALSE;
5197
5198 /*
5199 * Since we need to lock both UPLs at the same time,
5200 * avoid deadlocks by always taking locks in the same order.
5201 */
5202 if (upl1 < upl2) {
5203 upl_lock(upl1);
5204 upl_lock(upl2);
5205 } else {
5206 upl_lock(upl2);
5207 upl_lock(upl1);
5208 }
5209 upls_locked = TRUE; /* the UPLs will need to be unlocked */
5210
5211 object1 = upl1->map_object;
5212 object2 = upl2->map_object;
5213
5214 if (upl1->offset != 0 || upl2->offset != 0 ||
5215 upl1->size != upl2->size) {
5216 /*
5217 * We deal only with full objects, not subsets.
5218 * That's because we exchange the entire backing store info
5219 * for the objects: pager, resident pages, etc... We can't do
5220 * only part of it.
5221 */
5222 retval = KERN_INVALID_VALUE;
5223 goto done;
5224 }
5225
5226 /*
5227 * Transpose the VM objects' backing store.
5228 */
5229 retval = vm_object_transpose(object1, object2,
5230 (vm_object_size_t) upl1->size);
5231
5232 if (retval == KERN_SUCCESS) {
5233 /*
5234 * Make each UPL point to the correct VM object, i.e. the
5235 * object holding the pages that the UPL refers to...
5236 */
5237 upl1->map_object = object2;
5238 upl2->map_object = object1;
5239 }
5240
5241 done:
5242 /*
5243 * Cleanup.
5244 */
5245 if (upls_locked) {
5246 upl_unlock(upl1);
5247 upl_unlock(upl2);
5248 upls_locked = FALSE;
5249 }
5250
5251 return retval;
5252 }
5253
5254 /*
5255 * ENCRYPTED SWAP:
5256 *
5257 * Rationale: the user might have some encrypted data on disk (via
5258 * FileVault or any other mechanism). That data is then decrypted in
5259 * memory, which is safe as long as the machine is secure. But that
5260 * decrypted data in memory could be paged out to disk by the default
5261 * pager. The data would then be stored on disk in clear (not encrypted)
5262 * and it could be accessed by anyone who gets physical access to the
5263 * disk (if the laptop or the disk gets stolen for example). This weakens
5264 * the security offered by FileVault.
5265 *
5266 * Solution: the default pager will optionally request that all the
5267 * pages it gathers for pageout be encrypted, via the UPL interfaces,
5268 * before it sends this UPL to disk via the vnode_pageout() path.
5269 *
5270 * Notes:
5271 *
5272 * To avoid disrupting the VM LRU algorithms, we want to keep the
5273 * clean-in-place mechanisms, which allow us to send some extra pages to
5274 * swap (clustering) without actually removing them from the user's
5275 * address space. We don't want the user to unknowingly access encrypted
5276 * data, so we have to actually remove the encrypted pages from the page
5277 * table. When the user accesses the data, the hardware will fail to
5278 * locate the virtual page in its page table and will trigger a page
5279 * fault. We can then decrypt the page and enter it in the page table
5280 * again. Whenever we allow the user to access the contents of a page,
5281 * we have to make sure it's not encrypted.
5282 *
5283 *
5284 */
5285 /*
5286 * ENCRYPTED SWAP:
5287 * Reserve of virtual addresses in the kernel address space.
5288 * We need to map the physical pages in the kernel, so that we
5289 * can call the encryption/decryption routines with a kernel
5290 * virtual address. We keep this pool of pre-allocated kernel
5291 * virtual addresses so that we don't have to scan the kernel's
5292 * virtual address space each time we need to encrypt or decrypt
5293 * a physical page.
5294 * It would be nice to be able to encrypt and decrypt in physical
5295 * mode but that might not always be more efficient...
5296 */
5297 decl_simple_lock_data(,vm_paging_lock)
5298 #define VM_PAGING_NUM_PAGES 64
5299 vm_map_offset_t vm_paging_base_address = 0;
5300 boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
5301 int vm_paging_max_index = 0;
5302 unsigned long vm_paging_no_kernel_page = 0;
5303 unsigned long vm_paging_objects_mapped = 0;
5304 unsigned long vm_paging_pages_mapped = 0;
5305 unsigned long vm_paging_objects_mapped_slow = 0;
5306 unsigned long vm_paging_pages_mapped_slow = 0;
5307
5308 /*
5309 * ENCRYPTED SWAP:
5310 * vm_paging_map_object:
5311 * Maps part of a VM object's pages in the kernel
5312 * virtual address space, using the pre-allocated
5313 * kernel virtual addresses, if possible.
5314 * Context:
5315 * The VM object is locked. This lock will get
5316 * dropped and re-acquired though.
5317 */
5318 kern_return_t
5319 vm_paging_map_object(
5320 vm_map_offset_t *address,
5321 vm_page_t page,
5322 vm_object_t object,
5323 vm_object_offset_t offset,
5324 vm_map_size_t *size)
5325 {
5326 kern_return_t kr;
5327 vm_map_offset_t page_map_offset;
5328 vm_map_size_t map_size;
5329 vm_object_offset_t object_offset;
5330 #ifdef __ppc__
5331 int i;
5332 vm_map_entry_t map_entry;
5333 #endif /* __ppc__ */
5334
5335
5336 #ifdef __ppc__
5337 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
5338 /*
5339 * Optimization for the PowerPC.
5340 * Use one of the pre-allocated kernel virtual addresses
5341 * and just enter the VM page in the kernel address space
5342 * at that virtual address.
5343 */
5344 vm_object_unlock(object);
5345 simple_lock(&vm_paging_lock);
5346
5347 if (vm_paging_base_address == 0) {
5348 /*
5349 * Initialize our pool of pre-allocated kernel
5350 * virtual addresses.
5351 */
5352 simple_unlock(&vm_paging_lock);
5353 page_map_offset = 0;
5354 kr = vm_map_find_space(kernel_map,
5355 &page_map_offset,
5356 VM_PAGING_NUM_PAGES * PAGE_SIZE,
5357 0,
5358 0,
5359 &map_entry);
5360 if (kr != KERN_SUCCESS) {
5361 panic("vm_paging_map_object: "
5362 "kernel_map full\n");
5363 }
5364 map_entry->object.vm_object = kernel_object;
5365 map_entry->offset =
5366 page_map_offset - VM_MIN_KERNEL_ADDRESS;
5367 vm_object_reference(kernel_object);
5368 vm_map_unlock(kernel_map);
5369
5370 simple_lock(&vm_paging_lock);
5371 if (vm_paging_base_address != 0) {
5372 /* someone raced us and won: undo */
5373 simple_unlock(&vm_paging_lock);
5374 kr = vm_map_remove(kernel_map,
5375 page_map_offset,
5376 page_map_offset +
5377 (VM_PAGING_NUM_PAGES
5378 * PAGE_SIZE),
5379 VM_MAP_NO_FLAGS);
5380 assert(kr == KERN_SUCCESS);
5381 simple_lock(&vm_paging_lock);
5382 } else {
5383 vm_paging_base_address = page_map_offset;
5384 }
5385 }
5386
5387 /*
5388 * Try and find an available kernel virtual address
5389 * from our pre-allocated pool.
5390 */
5391 page_map_offset = 0;
5392 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
5393 if (vm_paging_page_inuse[i] == FALSE) {
5394 page_map_offset = vm_paging_base_address +
5395 (i * PAGE_SIZE);
5396 break;
5397 }
5398 }
5399
5400 if (page_map_offset != 0) {
5401 /*
5402 * We found a kernel virtual address;
5403 * map the physical page to that virtual address.
5404 */
5405 if (i > vm_paging_max_index) {
5406 vm_paging_max_index = i;
5407 }
5408 vm_paging_page_inuse[i] = TRUE;
5409 simple_unlock(&vm_paging_lock);
5410 pmap_map_block(kernel_pmap,
5411 page_map_offset,
5412 page->phys_page,
5413 1, /* Size is number of 4k pages */
5414 VM_PROT_DEFAULT,
5415 ((int) page->object->wimg_bits &
5416 VM_WIMG_MASK),
5417 0);
5418 vm_paging_objects_mapped++;
5419 vm_paging_pages_mapped++;
5420 *address = page_map_offset;
5421 vm_object_lock(object);
5422
5423 /* all done and mapped, ready to use ! */
5424 return KERN_SUCCESS;
5425 }
5426
5427 /*
5428 * We ran out of pre-allocated kernel virtual
5429 * addresses. Just map the page in the kernel
5430 * the slow and regular way.
5431 */
5432 vm_paging_no_kernel_page++;
5433 simple_unlock(&vm_paging_lock);
5434 vm_object_lock(object);
5435 }
5436 #endif /* __ppc__ */
5437
5438 object_offset = vm_object_trunc_page(offset);
5439 map_size = vm_map_round_page(*size);
5440
5441 /*
5442 * Try and map the required range of the object
5443 * in the kernel_map
5444 */
5445
5446 /* don't go beyond the object's end... */
5447 if (object_offset >= object->size) {
5448 map_size = 0;
5449 } else if (map_size > object->size - offset) {
5450 map_size = object->size - offset;
5451 }
5452
5453 vm_object_reference_locked(object); /* for the map entry */
5454 vm_object_unlock(object);
5455
5456 kr = vm_map_enter(kernel_map,
5457 address,
5458 map_size,
5459 0,
5460 VM_FLAGS_ANYWHERE,
5461 object,
5462 object_offset,
5463 FALSE,
5464 VM_PROT_DEFAULT,
5465 VM_PROT_ALL,
5466 VM_INHERIT_NONE);
5467 if (kr != KERN_SUCCESS) {
5468 *address = 0;
5469 *size = 0;
5470 vm_object_deallocate(object); /* for the map entry */
5471 return kr;
5472 }
5473
5474 *size = map_size;
5475
5476 /*
5477 * Enter the mapped pages in the page table now.
5478 */
5479 vm_object_lock(object);
5480 for (page_map_offset = 0;
5481 map_size != 0;
5482 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
5483 unsigned int cache_attr;
5484
5485 page = vm_page_lookup(object, offset + page_map_offset);
5486 if (page == VM_PAGE_NULL) {
5487 panic("vm_paging_map_object: no page !?");
5488 }
5489 if (page->no_isync == TRUE) {
5490 pmap_sync_page_data_phys(page->phys_page);
5491 }
5492 cache_attr = ((unsigned int) object->wimg_bits) & VM_WIMG_MASK;
5493
5494 PMAP_ENTER(kernel_pmap,
5495 *address + page_map_offset,
5496 page,
5497 VM_PROT_DEFAULT,
5498 cache_attr,
5499 FALSE);
5500 }
5501
5502 vm_paging_objects_mapped_slow++;
5503 vm_paging_pages_mapped_slow += map_size / PAGE_SIZE_64;
5504
5505 return KERN_SUCCESS;
5506 }
5507
5508 /*
5509 * ENCRYPTED SWAP:
5510 * vm_paging_unmap_object:
5511 * Unmaps part of a VM object's pages from the kernel
5512 * virtual address space.
5513 * Context:
5514 * The VM object is locked. This lock will get
5515 * dropped and re-acquired though.
5516 */
5517 void
5518 vm_paging_unmap_object(
5519 vm_object_t object,
5520 vm_map_offset_t start,
5521 vm_map_offset_t end)
5522 {
5523 kern_return_t kr;
5524 #ifdef __ppc__
5525 int i;
5526 #endif /* __ppc__ */
5527
5528 if ((vm_paging_base_address == 0) ||
5529 ((start < vm_paging_base_address) ||
5530 (end > (vm_paging_base_address
5531 + (VM_PAGING_NUM_PAGES * PAGE_SIZE))))) {
5532 /*
5533 * We didn't use our pre-allocated pool of
5534 * kernel virtual address. Deallocate the
5535 * virtual memory.
5536 */
5537 if (object != VM_OBJECT_NULL) {
5538 vm_object_unlock(object);
5539 }
5540 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
5541 if (object != VM_OBJECT_NULL) {
5542 vm_object_lock(object);
5543 }
5544 assert(kr == KERN_SUCCESS);
5545 } else {
5546 /*
5547 * We used a kernel virtual address from our
5548 * pre-allocated pool. Put it back in the pool
5549 * for next time.
5550 */
5551 #ifdef __ppc__
5552 assert(end - start == PAGE_SIZE);
5553 i = (start - vm_paging_base_address) >> PAGE_SHIFT;
5554
5555 /* undo the pmap mapping */
5556 mapping_remove(kernel_pmap, start);
5557
5558 simple_lock(&vm_paging_lock);
5559 vm_paging_page_inuse[i] = FALSE;
5560 simple_unlock(&vm_paging_lock);
5561 #endif /* __ppc__ */
5562 }
5563 }
5564
5565 /*
5566 * Encryption data.
5567 * "iv" is the "initial vector". Ideally, we want to
5568 * have a different one for each page we encrypt, so that
5569 * crackers can't find encryption patterns too easily.
5570 */
5571 #define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
5572 boolean_t swap_crypt_ctx_initialized = FALSE;
5573 aes_32t swap_crypt_key[8]; /* big enough for a 256 key */
5574 aes_ctx swap_crypt_ctx;
5575 const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
5576
5577 #if DEBUG
5578 boolean_t swap_crypt_ctx_tested = FALSE;
5579 unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
5580 unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
5581 unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
5582 #endif /* DEBUG */
5583
5584 extern u_long random(void);
5585
5586 /*
5587 * Initialize the encryption context: key and key size.
5588 */
5589 void swap_crypt_ctx_initialize(void); /* forward */
5590 void
5591 swap_crypt_ctx_initialize(void)
5592 {
5593 unsigned int i;
5594
5595 /*
5596 * No need for locking to protect swap_crypt_ctx_initialized
5597 * because the first use of encryption will come from the
5598 * pageout thread (we won't pagein before there's been a pageout)
5599 * and there's only one pageout thread.
5600 */
5601 if (swap_crypt_ctx_initialized == FALSE) {
5602 for (i = 0;
5603 i < (sizeof (swap_crypt_key) /
5604 sizeof (swap_crypt_key[0]));
5605 i++) {
5606 swap_crypt_key[i] = random();
5607 }
5608 aes_encrypt_key((const unsigned char *) swap_crypt_key,
5609 SWAP_CRYPT_AES_KEY_SIZE,
5610 &swap_crypt_ctx.encrypt);
5611 aes_decrypt_key((const unsigned char *) swap_crypt_key,
5612 SWAP_CRYPT_AES_KEY_SIZE,
5613 &swap_crypt_ctx.decrypt);
5614 swap_crypt_ctx_initialized = TRUE;
5615 }
5616
5617 #if DEBUG
5618 /*
5619 * Validate the encryption algorithms.
5620 */
5621 if (swap_crypt_ctx_tested == FALSE) {
5622 /* initialize */
5623 for (i = 0; i < 4096; i++) {
5624 swap_crypt_test_page_ref[i] = (char) i;
5625 }
5626 /* encrypt */
5627 aes_encrypt_cbc(swap_crypt_test_page_ref,
5628 swap_crypt_null_iv,
5629 PAGE_SIZE / AES_BLOCK_SIZE,
5630 swap_crypt_test_page_encrypt,
5631 &swap_crypt_ctx.encrypt);
5632 /* decrypt */
5633 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
5634 swap_crypt_null_iv,
5635 PAGE_SIZE / AES_BLOCK_SIZE,
5636 swap_crypt_test_page_decrypt,
5637 &swap_crypt_ctx.decrypt);
5638 /* compare result with original */
5639 for (i = 0; i < 4096; i ++) {
5640 if (swap_crypt_test_page_decrypt[i] !=
5641 swap_crypt_test_page_ref[i]) {
5642 panic("encryption test failed");
5643 }
5644 }
5645
5646 /* encrypt again */
5647 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
5648 swap_crypt_null_iv,
5649 PAGE_SIZE / AES_BLOCK_SIZE,
5650 swap_crypt_test_page_decrypt,
5651 &swap_crypt_ctx.encrypt);
5652 /* decrypt in place */
5653 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
5654 swap_crypt_null_iv,
5655 PAGE_SIZE / AES_BLOCK_SIZE,
5656 swap_crypt_test_page_decrypt,
5657 &swap_crypt_ctx.decrypt);
5658 for (i = 0; i < 4096; i ++) {
5659 if (swap_crypt_test_page_decrypt[i] !=
5660 swap_crypt_test_page_ref[i]) {
5661 panic("in place encryption test failed");
5662 }
5663 }
5664
5665 swap_crypt_ctx_tested = TRUE;
5666 }
5667 #endif /* DEBUG */
5668 }
5669
5670 /*
5671 * ENCRYPTED SWAP:
5672 * vm_page_encrypt:
5673 * Encrypt the given page, for secure paging.
5674 * The page might already be mapped at kernel virtual
5675 * address "kernel_mapping_offset". Otherwise, we need
5676 * to map it.
5677 *
5678 * Context:
5679 * The page's object is locked, but this lock will be released
5680 * and re-acquired.
5681 * The page is busy and not accessible by users (not entered in any pmap).
5682 */
5683 void
5684 vm_page_encrypt(
5685 vm_page_t page,
5686 vm_map_offset_t kernel_mapping_offset)
5687 {
5688 int clear_refmod = 0;
5689 kern_return_t kr;
5690 boolean_t page_was_referenced;
5691 boolean_t page_was_modified;
5692 vm_map_size_t kernel_mapping_size;
5693 vm_offset_t kernel_vaddr;
5694 union {
5695 unsigned char aes_iv[AES_BLOCK_SIZE];
5696 struct {
5697 memory_object_t pager_object;
5698 vm_object_offset_t paging_offset;
5699 } vm;
5700 } encrypt_iv;
5701
5702 if (! vm_pages_encrypted) {
5703 vm_pages_encrypted = TRUE;
5704 }
5705
5706 assert(page->busy);
5707 assert(page->dirty || page->precious);
5708
5709 if (page->encrypted) {
5710 /*
5711 * Already encrypted: no need to do it again.
5712 */
5713 vm_page_encrypt_already_encrypted_counter++;
5714 return;
5715 }
5716 ASSERT_PAGE_DECRYPTED(page);
5717
5718 /*
5719 * Gather the "reference" and "modified" status of the page.
5720 * We'll restore these values after the encryption, so that
5721 * the encryption is transparent to the rest of the system
5722 * and doesn't impact the VM's LRU logic.
5723 */
5724 page_was_referenced =
5725 (page->reference || pmap_is_referenced(page->phys_page));
5726 page_was_modified =
5727 (page->dirty || pmap_is_modified(page->phys_page));
5728
5729 if (kernel_mapping_offset == 0) {
5730 /*
5731 * The page hasn't already been mapped in kernel space
5732 * by the caller. Map it now, so that we can access
5733 * its contents and encrypt them.
5734 */
5735 kernel_mapping_size = PAGE_SIZE;
5736 kr = vm_paging_map_object(&kernel_mapping_offset,
5737 page,
5738 page->object,
5739 page->offset,
5740 &kernel_mapping_size);
5741 if (kr != KERN_SUCCESS) {
5742 panic("vm_page_encrypt: "
5743 "could not map page in kernel: 0x%x\n",
5744 kr);
5745 }
5746 } else {
5747 kernel_mapping_size = 0;
5748 }
5749 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5750
5751 if (swap_crypt_ctx_initialized == FALSE) {
5752 swap_crypt_ctx_initialize();
5753 }
5754 assert(swap_crypt_ctx_initialized);
5755
5756 /*
5757 * Prepare an "initial vector" for the encryption.
5758 * We use the "pager" and the "paging_offset" for that
5759 * page to obfuscate the encrypted data a bit more and
5760 * prevent crackers from finding patterns that they could
5761 * use to break the key.
5762 */
5763 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
5764 encrypt_iv.vm.pager_object = page->object->pager;
5765 encrypt_iv.vm.paging_offset =
5766 page->object->paging_offset + page->offset;
5767
5768 vm_object_unlock(page->object);
5769
5770 /* encrypt the "initial vector" */
5771 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
5772 swap_crypt_null_iv,
5773 1,
5774 &encrypt_iv.aes_iv[0],
5775 &swap_crypt_ctx.encrypt);
5776
5777 /*
5778 * Encrypt the page.
5779 */
5780 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
5781 &encrypt_iv.aes_iv[0],
5782 PAGE_SIZE / AES_BLOCK_SIZE,
5783 (unsigned char *) kernel_vaddr,
5784 &swap_crypt_ctx.encrypt);
5785
5786 vm_page_encrypt_counter++;
5787
5788 vm_object_lock(page->object);
5789
5790 /*
5791 * Unmap the page from the kernel's address space,
5792 * if we had to map it ourselves. Otherwise, let
5793 * the caller undo the mapping if needed.
5794 */
5795 if (kernel_mapping_size != 0) {
5796 vm_paging_unmap_object(page->object,
5797 kernel_mapping_offset,
5798 kernel_mapping_offset + kernel_mapping_size);
5799 }
5800
5801 /*
5802 * Restore the "reference" and "modified" bits.
5803 * This should clean up any impact the encryption had
5804 * on them.
5805 */
5806 if (! page_was_referenced) {
5807 clear_refmod |= VM_MEM_REFERENCED;
5808 page->reference = FALSE;
5809 }
5810 if (! page_was_modified) {
5811 clear_refmod |= VM_MEM_MODIFIED;
5812 page->dirty = FALSE;
5813 }
5814 if (clear_refmod)
5815 pmap_clear_refmod(page->phys_page, clear_refmod);
5816
5817 page->encrypted = TRUE;
5818 }
5819
5820 /*
5821 * ENCRYPTED SWAP:
5822 * vm_page_decrypt:
5823 * Decrypt the given page.
5824 * The page might already be mapped at kernel virtual
5825 * address "kernel_mapping_offset". Otherwise, we need
5826 * to map it.
5827 *
5828 * Context:
5829 * The page's VM object is locked but will be unlocked and relocked.
5830 * The page is busy and not accessible by users (not entered in any pmap).
5831 */
5832 void
5833 vm_page_decrypt(
5834 vm_page_t page,
5835 vm_map_offset_t kernel_mapping_offset)
5836 {
5837 int clear_refmod = 0;
5838 kern_return_t kr;
5839 vm_map_size_t kernel_mapping_size;
5840 vm_offset_t kernel_vaddr;
5841 boolean_t page_was_referenced;
5842 union {
5843 unsigned char aes_iv[AES_BLOCK_SIZE];
5844 struct {
5845 memory_object_t pager_object;
5846 vm_object_offset_t paging_offset;
5847 } vm;
5848 } decrypt_iv;
5849
5850 assert(page->busy);
5851 assert(page->encrypted);
5852
5853 /*
5854 * Gather the "reference" status of the page.
5855 * We'll restore its value after the decryption, so that
5856 * the decryption is transparent to the rest of the system
5857 * and doesn't impact the VM's LRU logic.
5858 */
5859 page_was_referenced =
5860 (page->reference || pmap_is_referenced(page->phys_page));
5861
5862 if (kernel_mapping_offset == 0) {
5863 /*
5864 * The page hasn't already been mapped in kernel space
5865 * by the caller. Map it now, so that we can access
5866 * its contents and decrypt them.
5867 */
5868 kernel_mapping_size = PAGE_SIZE;
5869 kr = vm_paging_map_object(&kernel_mapping_offset,
5870 page,
5871 page->object,
5872 page->offset,
5873 &kernel_mapping_size);
5874 if (kr != KERN_SUCCESS) {
5875 panic("vm_page_decrypt: "
5876 "could not map page in kernel: 0x%x\n");
5877 }
5878 } else {
5879 kernel_mapping_size = 0;
5880 }
5881 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
5882
5883 assert(swap_crypt_ctx_initialized);
5884
5885 /*
5886 * Prepare an "initial vector" for the decryption.
5887 * It has to be the same as the "initial vector" we
5888 * used to encrypt that page.
5889 */
5890 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
5891 decrypt_iv.vm.pager_object = page->object->pager;
5892 decrypt_iv.vm.paging_offset =
5893 page->object->paging_offset + page->offset;
5894
5895 vm_object_unlock(page->object);
5896
5897 /* encrypt the "initial vector" */
5898 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
5899 swap_crypt_null_iv,
5900 1,
5901 &decrypt_iv.aes_iv[0],
5902 &swap_crypt_ctx.encrypt);
5903
5904 /*
5905 * Decrypt the page.
5906 */
5907 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
5908 &decrypt_iv.aes_iv[0],
5909 PAGE_SIZE / AES_BLOCK_SIZE,
5910 (unsigned char *) kernel_vaddr,
5911 &swap_crypt_ctx.decrypt);
5912 vm_page_decrypt_counter++;
5913
5914 vm_object_lock(page->object);
5915
5916 /*
5917 * Unmap the page from the kernel's address space,
5918 * if we had to map it ourselves. Otherwise, let
5919 * the caller undo the mapping if needed.
5920 */
5921 if (kernel_mapping_size != 0) {
5922 vm_paging_unmap_object(page->object,
5923 kernel_vaddr,
5924 kernel_vaddr + PAGE_SIZE);
5925 }
5926
5927 /*
5928 * After decryption, the page is actually clean.
5929 * It was encrypted as part of paging, which "cleans"
5930 * the "dirty" pages.
5931 * No one could access it after it was encrypted
5932 * and the decryption doesn't count.
5933 */
5934 page->dirty = FALSE;
5935 clear_refmod = VM_MEM_MODIFIED;
5936
5937 /* restore the "reference" bit */
5938 if (! page_was_referenced) {
5939 page->reference = FALSE;
5940 clear_refmod |= VM_MEM_REFERENCED;
5941 }
5942 pmap_clear_refmod(page->phys_page, clear_refmod);
5943
5944 page->encrypted = FALSE;
5945
5946 /*
5947 * We've just modified the page's contents via the data cache and part
5948 * of the new contents might still be in the cache and not yet in RAM.
5949 * Since the page is now available and might get gathered in a UPL to
5950 * be part of a DMA transfer from a driver that expects the memory to
5951 * be coherent at this point, we have to flush the data cache.
5952 */
5953 pmap_sync_page_attributes_phys(page->phys_page);
5954 /*
5955 * Since the page is not mapped yet, some code might assume that it
5956 * doesn't need to invalidate the instruction cache when writing to
5957 * that page. That code relies on "no_isync" being set, so that the
5958 * caches get synchronized when the page is first mapped. So we need
5959 * to set "no_isync" here too, despite the fact that we just
5960 * synchronized the caches above...
5961 */
5962 page->no_isync = TRUE;
5963 }
5964
5965 unsigned long upl_encrypt_upls = 0;
5966 unsigned long upl_encrypt_pages = 0;
5967
5968 /*
5969 * ENCRYPTED SWAP:
5970 *
5971 * upl_encrypt:
5972 * Encrypts all the pages in the UPL within the specified range:
5973 * the pages are unmapped from all pmaps, then encrypted in place.
5974 */
5975 void
5976 upl_encrypt(
5977 upl_t upl,
5978 upl_offset_t crypt_offset,
5979 upl_size_t crypt_size)
5980 {
5981 upl_size_t upl_size;
5982 upl_offset_t upl_offset;
5983 vm_object_t upl_object;
5984 vm_page_t page;
5985 vm_object_t shadow_object;
5986 vm_object_offset_t shadow_offset;
5987 vm_object_offset_t paging_offset;
5988 vm_object_offset_t base_offset;
5989
5990 upl_encrypt_upls++;
5991 upl_encrypt_pages += crypt_size / PAGE_SIZE;
5992
5993 upl_lock(upl);
5994
5995 upl_object = upl->map_object;
5996 upl_offset = upl->offset;
5997 upl_size = upl->size;
5998
5999 upl_unlock(upl);
6000
6001 vm_object_lock(upl_object);
6002
6003 /*
6004 * Find the VM object that contains the actual pages.
6005 */
6006 if (upl_object->pageout) {
6007 shadow_object = upl_object->shadow;
6008 /*
6009 * The offset in the shadow object is actually also
6010 * accounted for in upl->offset. It possibly shouldn't be
6011 * this way, but for now don't account for it twice.
6012 */
6013 shadow_offset = 0;
6014 assert(upl_object->paging_offset == 0); /* XXX ? */
6015 vm_object_lock(shadow_object);
6016 } else {
6017 shadow_object = upl_object;
6018 shadow_offset = 0;
6019 }
6020
6021 paging_offset = shadow_object->paging_offset;
6022 vm_object_paging_begin(shadow_object);
6023
6024 if (shadow_object != upl_object) {
6025 vm_object_unlock(shadow_object);
6026 }
6027 vm_object_unlock(upl_object);
6028
6029 base_offset = shadow_offset;
6030 base_offset += upl_offset;
6031 base_offset += crypt_offset;
6032 base_offset -= paging_offset;
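/*
 * base_offset is now the offset, within the shadow object, of the first
 * page to encrypt: upl->offset includes the object's paging_offset, so
 * that amount is subtracted back out before the vm_page_lookup() calls
 * below.
 */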
6033 /*
6034 * Unmap the pages, so that nobody can continue accessing them while
6035 * they're encrypted. After that point, all accesses to these pages
6036 * will cause a page fault and block while the page is being encrypted
6037 * (busy). After the encryption completes, any access will cause a
6038 * page fault and the page gets decrypted at that time.
6039 */
6040 assert(crypt_offset + crypt_size <= upl_size);
6041 vm_object_pmap_protect(shadow_object,
6042 base_offset,
6043 (vm_object_size_t)crypt_size,
6044 PMAP_NULL,
6045 0,
6046 VM_PROT_NONE);
6047
6048 /* XXX FBDP could the object have changed significantly here ? */
6049 vm_object_lock(shadow_object);
6050
6051 for (upl_offset = 0;
6052 upl_offset < crypt_size;
6053 upl_offset += PAGE_SIZE) {
6054 page = vm_page_lookup(shadow_object,
6055 base_offset + upl_offset);
6056 if (page == VM_PAGE_NULL) {
6057 panic("upl_encrypt: "
6058 "no page for (obj=%p,off=%lld+%d)!\n",
6059 shadow_object,
6060 base_offset,
6061 upl_offset);
6062 }
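/*
 * Passing 0 as the kernel mapping offset asks vm_page_encrypt() to map
 * and unmap the page itself, the same way vm_page_decrypt() above does
 * when its caller doesn't provide a mapping.
 */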
6063 vm_page_encrypt(page, 0);
6064 }
6065
6066 vm_object_paging_end(shadow_object);
6067 vm_object_unlock(shadow_object);
6068 }
6069
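/*
 * The internal page-info array is laid out immediately after the upl
 * structure, so its offset within a UPL buffer is simply sizeof(struct upl).
 */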
6070 vm_size_t
6071 upl_get_internal_pagelist_offset(void)
6072 {
6073 return sizeof(struct upl);
6074 }
6075
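/*
 * Set or clear UPL_CLEAR_DIRTY on this UPL; the commit path consults
 * this flag when deciding whether to clear the pages' dirty state.
 */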
6076 void
6077 upl_clear_dirty(
6078 upl_t upl,
6079 boolean_t value)
6080 {
6081 if (value) {
6082 upl->flags |= UPL_CLEAR_DIRTY;
6083 } else {
6084 upl->flags &= ~UPL_CLEAR_DIRTY;
6085 }
6086 }
6087
6088
6089 #ifdef MACH_BSD
6090
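/*
 * Out-of-line accessors for the upl_page_info array, for BSD-side
 * callers that use these functions rather than the UPL_* macros.
 */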
6091 boolean_t upl_page_present(upl_page_info_t *upl, int index)
6092 {
6093 return(UPL_PAGE_PRESENT(upl, index));
6094 }
6095 boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
6096 {
6097 return(UPL_DIRTY_PAGE(upl, index));
6098 }
6099 boolean_t upl_valid_page(upl_page_info_t *upl, int index)
6100 {
6101 return(UPL_VALID_PAGE(upl, index));
6102 }
6103 ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
6104 {
6105 return(UPL_PHYS_PAGE(upl, index));
6106 }
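/*
 * Illustrative sketch (not code from this file): a caller holding an
 * internal UPL covering "size" bytes could walk its page list with the
 * accessors above, roughly:
 *
 *	upl_page_info_t *pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
 *	int i;
 *
 *	for (i = 0; i < (int)(size / PAGE_SIZE); i++)
 *		if (upl_valid_page(pl, i) && upl_dirty_page(pl, i))
 *			process_page(upl_phys_page(pl, i));
 *
 * "process_page" is a hypothetical consumer; UPL_GET_INTERNAL_PAGE_LIST()
 * only applies to internally-allocated UPLs.
 */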
6107
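/*
 * Debugging aid: walk the inactive, zero-fill and active page queues and
 * print how many dirty, pageout and precious pages they hold.  The first
 * line of output combines the inactive and zero-fill queues; the second
 * covers the active queue.
 */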
6108 void
6109 vm_countdirtypages(void)
6110 {
6111 vm_page_t m;
6112 int dpages;
6113 int pgopages;
6114 int precpages;
6115
6116
6117 dpages = 0;
6118 pgopages = 0;
6119 precpages = 0;
6120
6121 vm_page_lock_queues();
6122 m = (vm_page_t) queue_first(&vm_page_queue_inactive);
6123 do {
6124 if (m == (vm_page_t) 0) break;
6125
6126 if (m->dirty) dpages++;
6127 if (m->pageout) pgopages++;
6128 if (m->precious) precpages++;
6129
6130 assert(m->object != kernel_object);
6131 m = (vm_page_t) queue_next(&m->pageq);
6132 if (m == (vm_page_t) 0) break;
6133
6134 } while (!queue_end(&vm_page_queue_inactive, (queue_entry_t) m));
6135 vm_page_unlock_queues();
6136
6137 vm_page_lock_queues();
6138 m = (vm_page_t) queue_first(&vm_page_queue_zf);
6139 do {
6140 if (m == (vm_page_t) 0) break;
6141
6142 if (m->dirty) dpages++;
6143 if (m->pageout) pgopages++;
6144 if (m->precious) precpages++;
6145
6146 assert(m->object != kernel_object);
6147 m = (vm_page_t) queue_next(&m->pageq);
6148 if (m == (vm_page_t) 0) break;
6149
6150 } while (!queue_end(&vm_page_queue_zf, (queue_entry_t) m));
6151 vm_page_unlock_queues();
6152
6153 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
6154
6155 dpages = 0;
6156 pgopages = 0;
6157 precpages = 0;
6158
6159 vm_page_lock_queues();
6160 m = (vm_page_t) queue_first(&vm_page_queue_active);
6161
6162 do {
6163 if (m == (vm_page_t) 0) break;
6164 if (m->dirty) dpages++;
6165 if (m->pageout) pgopages++;
6166 if (m->precious) precpages++;
6167
6168 assert(m->object != kernel_object);
6169 m = (vm_page_t) queue_next(&m->pageq);
6170 if (m == (vm_page_t) 0) break;
6171
6172 } while (!queue_end(&vm_page_queue_active, (queue_entry_t) m));
6173 vm_page_unlock_queues();
6174
6175 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
6176
6177 }
6178 #endif /* MACH_BSD */
6179
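/*
 * Highest physical page number of any page entered into this UPL;
 * callers can use it, for example, to check DMA addressability limits.
 */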
6180 ppnum_t upl_get_highest_page(
6181 upl_t upl)
6182 {
6183 return upl->highest_page;
6184 }
6185
6186 #ifdef UPL_DEBUG
6187 kern_return_t upl_ubc_alias_set(upl_t upl, unsigned int alias1, unsigned int alias2)
6188 {
6189 upl->ubc_alias1 = alias1;
6190 upl->ubc_alias2 = alias2;
6191 return KERN_SUCCESS;
6192 }
6193 int upl_ubc_alias_get(upl_t upl, unsigned int * al, unsigned int * al2)
6194 {
6195 if (al)
6196 *al = upl->ubc_alias1;
6197 if (al2)
6198 *al2 = upl->ubc_alias2;
6199 return KERN_SUCCESS;
6200 }
6201 #endif /* UPL_DEBUG */
6202
6203
6204
6205 #if MACH_KDB
6206 #include <ddb/db_output.h>
6207 #include <ddb/db_print.h>
6208 #include <vm/vm_print.h>
6209
6210 #define printf kdbprintf
6211 void db_pageout(void);
6212
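/*
 * Kernel debugger (ddb) helpers: db_vm() dumps basic page counts and
 * pageout targets, then calls db_pageout() for the pageout statistics.
 */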
6213 void
6214 db_vm(void)
6215 {
6216
6217 iprintf("VM Statistics:\n");
6218 db_indent += 2;
6219 iprintf("pages:\n");
6220 db_indent += 2;
6221 iprintf("activ %5d inact %5d free %5d",
6222 vm_page_active_count, vm_page_inactive_count,
6223 vm_page_free_count);
6224 printf(" wire %5d gobbl %5d\n",
6225 vm_page_wire_count, vm_page_gobble_count);
6226 db_indent -= 2;
6227 iprintf("target:\n");
6228 db_indent += 2;
6229 iprintf("min %5d inact %5d free %5d",
6230 vm_page_free_min, vm_page_inactive_target,
6231 vm_page_free_target);
6232 printf(" resrv %5d\n", vm_page_free_reserved);
6233 db_indent -= 2;
6234 iprintf("pause:\n");
6235 db_pageout();
6236 db_indent -= 2;
6237 }
6238
6239 #if MACH_COUNTERS
6240 extern int c_laundry_pages_freed;
6241 #endif /* MACH_COUNTERS */
6242
6243 void
6244 db_pageout(void)
6245 {
6246 iprintf("Pageout Statistics:\n");
6247 db_indent += 2;
6248 iprintf("active %5d inactv %5d\n",
6249 vm_pageout_active, vm_pageout_inactive);
6250 iprintf("nolock %5d avoid %5d busy %5d absent %5d\n",
6251 vm_pageout_inactive_nolock, vm_pageout_inactive_avoid,
6252 vm_pageout_inactive_busy, vm_pageout_inactive_absent);
6253 iprintf("used %5d clean %5d dirty %5d\n",
6254 vm_pageout_inactive_used, vm_pageout_inactive_clean,
6255 vm_pageout_inactive_dirty);
6256 #if MACH_COUNTERS
6257 iprintf("laundry_pages_freed %d\n", c_laundry_pages_freed);
6258 #endif /* MACH_COUNTERS */
6259 #if MACH_CLUSTER_STATS
6260 iprintf("Cluster Statistics:\n");
6261 db_indent += 2;
6262 iprintf("dirtied %5d cleaned %5d collisions %5d\n",
6263 vm_pageout_cluster_dirtied, vm_pageout_cluster_cleaned,
6264 vm_pageout_cluster_collisions);
6265 iprintf("clusters %5d conversions %5d\n",
6266 vm_pageout_cluster_clusters, vm_pageout_cluster_conversions);
6267 db_indent -= 2;
6268 iprintf("Target Statistics:\n");
6269 db_indent += 2;
6270 iprintf("collisions %5d page_dirtied %5d page_freed %5d\n",
6271 vm_pageout_target_collisions, vm_pageout_target_page_dirtied,
6272 vm_pageout_target_page_freed);
6273 db_indent -= 2;
6274 #endif /* MACH_CLUSTER_STATS */
6275 db_indent -= 2;
6276 }
6277
6278 #endif /* MACH_KDB */