1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <libkern/OSAtomic.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/kern_return.h>
71 #include <mach/message.h> /* for error codes */
72 #include <mach/vm_param.h>
73 #include <mach/vm_behavior.h>
74 #include <mach/memory_object.h>
75 /* For memory_object_data_{request,unlock} */
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/host_statistics.h>
80 #include <kern/counters.h>
81 #include <kern/task.h>
82 #include <kern/thread.h>
83 #include <kern/sched_prim.h>
84 #include <kern/host.h>
85 #include <kern/xpr.h>
86 #include <kern/mach_param.h>
87 #include <kern/macro_help.h>
88 #include <kern/zalloc.h>
89 #include <kern/misc_protos.h>
90
91 #include <vm/vm_fault.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_kern.h>
96 #include <vm/pmap.h>
97 #include <vm/vm_pageout.h>
98 #include <vm/vm_protos.h>
99 #include <vm/vm_external.h>
100 #include <vm/memory_object.h>
101 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
102 #include <vm/vm_shared_region.h>
103
104 #define VM_FAULT_CLASSIFY 0
105
106 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
107
108 int vm_object_pagein_throttle = 16;
109
110 /*
111 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control; it
112 * kicks in when swap space runs out. 64-bit programs have massive address spaces and, if they're buggy, can leak
113 * enormous amounts of memory and run the system completely out of swap space. If this happens, we
114 * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
115 * keep the UI active so that the user has a chance to kill the offending task before the system
116 * completely hangs.
117 *
118 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
119 * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
120 * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
121 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
122 */
123
124 extern boolean_t thread_is_io_throttled(void);
125 extern void throttle_lowpri_io(int);
126
127 uint64_t vm_hard_throttle_threshold;
128
129 extern unsigned int dp_pages_free, dp_pages_reserve;
130
131 #define NEED_TO_HARD_THROTTLE_THIS_TASK() (((dp_pages_free + dp_pages_reserve < 2000) && \
132 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
133 (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \
134 (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
135 (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))
136
137
138 #define HARD_THROTTLE_DELAY 20000 /* 20000 us == 20 ms */
139 #define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */
140
141
142 extern int cs_debug;
143
144 boolean_t current_thread_aborted(void);
145
146 /* Forward declarations of internal routines. */
147 extern kern_return_t vm_fault_wire_fast(
148 vm_map_t map,
149 vm_map_offset_t va,
150 vm_map_entry_t entry,
151 pmap_t pmap,
152 vm_map_offset_t pmap_addr);
153
154 extern void vm_fault_continue(void);
155
156 extern void vm_fault_copy_cleanup(
157 vm_page_t page,
158 vm_page_t top_page);
159
160 extern void vm_fault_copy_dst_cleanup(
161 vm_page_t page);
162
163 #if VM_FAULT_CLASSIFY
164 extern void vm_fault_classify(vm_object_t object,
165 vm_object_offset_t offset,
166 vm_prot_t fault_type);
167
168 extern void vm_fault_classify_init(void);
169 #endif
170
171 unsigned long vm_pmap_enter_blocked = 0;
172 unsigned long vm_pmap_enter_retried = 0;
173
174 unsigned long vm_cs_validates = 0;
175 unsigned long vm_cs_revalidates = 0;
176 unsigned long vm_cs_query_modified = 0;
177 unsigned long vm_cs_validated_dirtied = 0;
178 unsigned long vm_cs_bitmap_validated = 0;
179 #if CONFIG_ENFORCE_SIGNED_CODE
180 int cs_enforcement_disable=0;
181 #else
182 static const int cs_enforcement_disable=1;
183 #endif
184
185 /*
186 * Routine: vm_fault_init
187 * Purpose:
188 * Initialize our private data structures.
189 */
190 void
191 vm_fault_init(void)
192 {
193 #if !SECURE_KERNEL
194 #if CONFIG_ENFORCE_SIGNED_CODE
195 PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
196 sizeof (cs_enforcement_disable));
197 #endif
198 PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
199 #endif
200
201 /*
202 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
203 * computed as a percentage of available memory, and the percentage used is scaled inversely with
204 * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
205 * and reduce the value down to 10% for very large memory configurations. This helps give us a
206 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
207 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
208 */
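/*
 * For example: with 4GB of ram the percentage is 35 - MIN(4, 25) = 31, giving a
 * threshold of ~1.24GB; for machines with 25GB or more it bottoms out at 35 - 25 = 10%,
 * e.g. ~6.4GB on a 64GB machine.
 */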
209
210 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
211 }
212
213 /*
214 * Routine: vm_fault_cleanup
215 * Purpose:
216 * Clean up the result of vm_fault_page.
217 * Results:
218 * The paging reference for "object" is released.
219 * "object" is unlocked.
220 * If "top_page" is not null, "top_page" is
221 * freed and the paging reference for the object
222 * containing it is released.
223 *
224 * In/out conditions:
225 * "object" must be locked.
226 */
227 void
228 vm_fault_cleanup(
229 register vm_object_t object,
230 register vm_page_t top_page)
231 {
232 vm_object_paging_end(object);
233 vm_object_unlock(object);
234
235 if (top_page != VM_PAGE_NULL) {
236 object = top_page->object;
237
238 vm_object_lock(object);
239 VM_PAGE_FREE(top_page);
240 vm_object_paging_end(object);
241 vm_object_unlock(object);
242 }
243 }
244
245 #if MACH_CLUSTER_STATS
246 #define MAXCLUSTERPAGES 16
247 struct {
248 unsigned long pages_in_cluster;
249 unsigned long pages_at_higher_offsets;
250 unsigned long pages_at_lower_offsets;
251 } cluster_stats_in[MAXCLUSTERPAGES];
252 #define CLUSTER_STAT(clause) clause
253 #define CLUSTER_STAT_HIGHER(x) \
254 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
255 #define CLUSTER_STAT_LOWER(x) \
256 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
257 #define CLUSTER_STAT_CLUSTER(x) \
258 ((cluster_stats_in[(x)].pages_in_cluster)++)
259 #else /* MACH_CLUSTER_STATS */
260 #define CLUSTER_STAT(clause)
261 #endif /* MACH_CLUSTER_STATS */
262
263 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
264
265
266 boolean_t vm_page_deactivate_behind = TRUE;
267 /*
268 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
269 */
270 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
271 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
272 /* we use it to size an array on the stack */
273
274 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
275
276 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
277
278 /*
279 * vm_fault_is_sequential
280 *
281 * Determine if sequential access is in progress
282 * in accordance with the behavior specified.
283 * Update state to indicate current access pattern.
284 *
285 * object must have at least the shared lock held
286 */
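/*
 * Note: object->sequential is a signed count of bytes in the current run; it grows by
 * PAGE_SIZE for each access that extends a forward run, shrinks by PAGE_SIZE for each
 * access that extends a backward run, is clamped at +/- MAX_SEQUENTIAL_RUN, and is
 * reset to 0 whenever the pattern stops looking sequential.
 */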
287 static
288 void
289 vm_fault_is_sequential(
290 vm_object_t object,
291 vm_object_offset_t offset,
292 vm_behavior_t behavior)
293 {
294 vm_object_offset_t last_alloc;
295 int sequential;
296 int orig_sequential;
297
298 last_alloc = object->last_alloc;
299 sequential = object->sequential;
300 orig_sequential = sequential;
301
302 switch (behavior) {
303 case VM_BEHAVIOR_RANDOM:
304 /*
305 * reset indicator of sequential behavior
306 */
307 sequential = 0;
308 break;
309
310 case VM_BEHAVIOR_SEQUENTIAL:
311 if (offset && last_alloc == offset - PAGE_SIZE_64) {
312 /*
313 * advance indicator of sequential behavior
314 */
315 if (sequential < MAX_SEQUENTIAL_RUN)
316 sequential += PAGE_SIZE;
317 } else {
318 /*
319 * reset indicator of sequential behavior
320 */
321 sequential = 0;
322 }
323 break;
324
325 case VM_BEHAVIOR_RSEQNTL:
326 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
327 /*
328 * advance indicator of sequential behavior
329 */
330 if (sequential > -MAX_SEQUENTIAL_RUN)
331 sequential -= PAGE_SIZE;
332 } else {
333 /*
334 * reset indicator of sequential behavior
335 */
336 sequential = 0;
337 }
338 break;
339
340 case VM_BEHAVIOR_DEFAULT:
341 default:
342 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
343 /*
344 * advance indicator of sequential behavior
345 */
346 if (sequential < 0)
347 sequential = 0;
348 if (sequential < MAX_SEQUENTIAL_RUN)
349 sequential += PAGE_SIZE;
350
351 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
352 /*
353 * advance indicator of sequential behavior
354 */
355 if (sequential > 0)
356 sequential = 0;
357 if (sequential > -MAX_SEQUENTIAL_RUN)
358 sequential -= PAGE_SIZE;
359 } else {
360 /*
361 * reset indicator of sequential behavior
362 */
363 sequential = 0;
364 }
365 break;
366 }
367 if (sequential != orig_sequential) {
368 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
369 /*
370 * if someone else has already updated object->sequential
371 * don't bother trying to update it or object->last_alloc
372 */
373 return;
374 }
375 }
376 /*
377 * I'd like to do this with an OSCompareAndSwap64, but that
378 * doesn't exist for PPC... however, it shouldn't matter
379 * that much... last_alloc is maintained so that we can determine
380 * if a sequential access pattern is taking place... if only
381 * one thread is banging on this object, no problem with the unprotected
382 * update... if 2 or more threads are banging away, we run the risk of
383 * someone seeing a mangled update... however, in the face of multiple
384 * accesses, no sequential access pattern can develop anyway, so we
385 * haven't lost any real info.
386 */
387 object->last_alloc = offset;
388 }
389
390
391 int vm_page_deactivate_behind_count = 0;
392
393 /*
394 * vm_fault_deactivate_behind
395 *
396 * Determine if sequential access is in progress
397 * in accordance with the behavior specified. If
398 * so, compute a potential page to deactivate and
399 * deactivate it.
400 *
401 * object must be locked.
402 *
403 * return TRUE if we actually deactivate a page
404 */
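/*
 * For VM_BEHAVIOR_SEQUENTIAL/RSEQNTL runs we deactivate just the single page directly
 * behind (or, for reverse runs, ahead of) the faulting offset.  For VM_BEHAVIOR_DEFAULT,
 * once the run reaches vm_default_behind pages we deactivate, every
 * VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER pages, the cluster of that many pages trailing
 * the faulting offset by vm_default_behind pages.
 */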
405 static
406 boolean_t
407 vm_fault_deactivate_behind(
408 vm_object_t object,
409 vm_object_offset_t offset,
410 vm_behavior_t behavior)
411 {
412 int n;
413 int pages_in_run = 0;
414 int max_pages_in_run = 0;
415 int sequential_run;
416 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
417 vm_object_offset_t run_offset = 0;
418 vm_object_offset_t pg_offset = 0;
419 vm_page_t m;
420 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
421
422 pages_in_run = 0;
423 #if TRACEFAULTPAGE
424 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
425 #endif
426
427 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
428 /*
429 * Do not deactivate pages from the kernel object: they
430 * are not intended to become pageable... or the
431 * deactivate-behind mechanism has been disabled.
432 */
433 return FALSE;
434 }
435 if ((sequential_run = object->sequential)) {
436 if (sequential_run < 0) {
437 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
438 sequential_run = 0 - sequential_run;
439 } else {
440 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
441 }
442 }
443 switch (behavior) {
444 case VM_BEHAVIOR_RANDOM:
445 break;
446 case VM_BEHAVIOR_SEQUENTIAL:
447 if (sequential_run >= (int)PAGE_SIZE) {
448 run_offset = 0 - PAGE_SIZE_64;
449 max_pages_in_run = 1;
450 }
451 break;
452 case VM_BEHAVIOR_RSEQNTL:
453 if (sequential_run >= (int)PAGE_SIZE) {
454 run_offset = PAGE_SIZE_64;
455 max_pages_in_run = 1;
456 }
457 break;
458 case VM_BEHAVIOR_DEFAULT:
459 default:
460 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
461
462 /*
463 * determine if the run of sequential access has been
464 * long enough on an object with default access behavior
465 * to consider it for deactivation
466 */
467 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
468 /*
469 * the comparisons between offset and behind are done
470 * in this kind of odd fashion in order to prevent wrap around
471 * at the end points
472 */
473 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
474 if (offset >= behind) {
475 run_offset = 0 - behind;
476 pg_offset = PAGE_SIZE_64;
477 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
478 }
479 } else {
480 if (offset < -behind) {
481 run_offset = behind;
482 pg_offset = 0 - PAGE_SIZE_64;
483 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
484 }
485 }
486 }
487 break;
488 }
489 }
490 for (n = 0; n < max_pages_in_run; n++) {
491 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
492
493 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
494 page_run[pages_in_run++] = m;
495 pmap_clear_reference(m->phys_page);
496 }
497 }
498 if (pages_in_run) {
499 vm_page_lockspin_queues();
500
501 for (n = 0; n < pages_in_run; n++) {
502
503 m = page_run[n];
504
505 vm_page_deactivate_internal(m, FALSE);
506
507 vm_page_deactivate_behind_count++;
508 #if TRACEFAULTPAGE
509 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
510 #endif
511 }
512 vm_page_unlock_queues();
513
514 return TRUE;
515 }
516 return FALSE;
517 }
518
519
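/*
 * vm_page_throttled returns the delay in microseconds to impose on the current thread
 * before it may create another page, or 0 for no throttling.  Threads marked
 * TH_OPT_VMPRIV are never throttled.  The hard delay is driven by
 * NEED_TO_HARD_THROTTLE_THIS_TASK(); the soft delay applies when free pages are below
 * vm_page_throttle_limit and this thread has created more than vm_page_creation_throttle
 * pages, unless it has been creating them slowly (less than roughly
 * vm_page_creation_throttle / 6 pages per second, measured over more than 6 seconds).
 */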
520 static int
521 vm_page_throttled(void)
522 {
523 clock_sec_t elapsed_sec;
524 clock_sec_t tv_sec;
525 clock_usec_t tv_usec;
526
527 thread_t thread = current_thread();
528
529 if (thread->options & TH_OPT_VMPRIV)
530 return (0);
531
532 thread->t_page_creation_count++;
533
534 if (NEED_TO_HARD_THROTTLE_THIS_TASK())
535 return (HARD_THROTTLE_DELAY);
536
537 if (vm_page_free_count < vm_page_throttle_limit &&
538 thread->t_page_creation_count > vm_page_creation_throttle) {
539
540 clock_get_system_microtime(&tv_sec, &tv_usec);
541
542 elapsed_sec = tv_sec - thread->t_page_creation_time;
543
544 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
545
546 if (elapsed_sec >= 60) {
547 /*
548 * we'll reset our stats to give a well behaved app
549 * that was unlucky enough to accumulate a bunch of pages
550 * over a long period of time a chance to get out of
551 * the throttled state... we reset the counter and timestamp
552 * so that if it stays under the rate limit for the next second
553 * it will be back in our good graces... if it exceeds it, it
554 * will remain in the throttled state
555 */
556 thread->t_page_creation_time = tv_sec;
557 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
558 }
559 ++vm_page_throttle_count;
560
561 return (SOFT_THROTTLE_DELAY);
562 }
563 thread->t_page_creation_time = tv_sec;
564 thread->t_page_creation_count = 0;
565 }
566 return (0);
567 }
568
569
570 /*
571 * check for various conditions that would
572 * prevent us from creating a ZF page...
573 * cleanup is based on being called from vm_fault_page
574 *
575 * object must be locked
576 * object == m->object
577 */
578 static vm_fault_return_t
579 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
580 {
581 int throttle_delay;
582
583 if (object->shadow_severed ||
584 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
585 /*
586 * Either:
587 * 1. the shadow chain was severed,
588 * 2. the purgeable object is volatile or empty and is marked
589 * to fault on access while volatile.
590 * Just have to return an error at this point
591 */
592 if (m != VM_PAGE_NULL)
593 VM_PAGE_FREE(m);
594 vm_fault_cleanup(object, first_m);
595
596 thread_interrupt_level(interruptible_state);
597
598 return (VM_FAULT_MEMORY_ERROR);
599 }
600 if (vm_backing_store_low) {
601 /*
602 * are we protecting the system from
603 * backing store exhaustion? If so,
604 * sleep unless we are privileged.
605 */
606 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
607
608 if (m != VM_PAGE_NULL)
609 VM_PAGE_FREE(m);
610 vm_fault_cleanup(object, first_m);
611
612 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
613
614 thread_block(THREAD_CONTINUE_NULL);
615 thread_interrupt_level(interruptible_state);
616
617 return (VM_FAULT_RETRY);
618 }
619 }
620 if ((throttle_delay = vm_page_throttled())) {
621 /*
622 * we're throttling zero-fills...
623 * treat this as if we couldn't grab a page
624 */
625 if (m != VM_PAGE_NULL)
626 VM_PAGE_FREE(m);
627 vm_fault_cleanup(object, first_m);
628
629 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
630
631 delay(throttle_delay);
632
633 if (current_thread_aborted()) {
634 thread_interrupt_level(interruptible_state);
635 return VM_FAULT_INTERRUPTED;
636 }
637 thread_interrupt_level(interruptible_state);
638
639 return (VM_FAULT_MEMORY_SHORTAGE);
640 }
641 return (VM_FAULT_SUCCESS);
642 }
643
644
645 /*
646 * do the work to zero fill a page and
647 * inject it into the correct paging queue
648 *
649 * m->object must be locked
650 * page queue lock must NOT be held
651 */
652 static int
653 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
654 {
655 int my_fault = DBG_ZERO_FILL_FAULT;
656
657 /*
658 * This is a zero-fill page fault...
659 *
660 * Checking the page lock is a waste of
661 * time; this page was absent, so
662 * it can't be page locked by a pager.
663 *
664 * we also consider it undefined
665 * with respect to instruction
666 * execution. i.e. it is the responsibility
667 * of higher layers to call for an instruction
668 * sync after changing the contents and before
669 * sending a program into this area. We
670 * choose this approach for performance
671 */
672 m->pmapped = TRUE;
673
674 m->cs_validated = FALSE;
675 m->cs_tainted = FALSE;
676
677 if (no_zero_fill == TRUE) {
678 my_fault = DBG_NZF_PAGE_FAULT;
679 } else {
680 vm_page_zero_fill(m);
681
682 VM_STAT_INCR(zero_fill_count);
683 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
684 }
685 assert(!m->laundry);
686 assert(m->object != kernel_object);
687 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
688
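	/*
	 * with no dynamic pager configured, anonymous pages in an object that isn't
	 * purgeable-empty have nowhere to be cleaned to, so park freshly zero-filled
	 * pages on the throttled queue (presumably to keep them off the pageable
	 * queues that the pageout daemon scans).
	 */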
689 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
690 (m->object->purgable == VM_PURGABLE_DENY ||
691 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
692 m->object->purgable == VM_PURGABLE_VOLATILE )) {
693
694 vm_page_lockspin_queues();
695
696 assert(!VM_PAGE_WIRED(m));
697
698 /*
699 * can't be on the pageout queue since we don't
700 * have a pager to try and clean to
701 */
702 assert(!m->pageout_queue);
703
704 VM_PAGE_QUEUES_REMOVE(m);
705
706 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
707 m->throttled = TRUE;
708 vm_page_throttled_count++;
709
710 vm_page_unlock_queues();
711 }
712 return (my_fault);
713 }
714
715
716 /*
717 * Routine: vm_fault_page
718 * Purpose:
719 * Find the resident page for the virtual memory
720 * specified by the given virtual memory object
721 * and offset.
722 * Additional arguments:
723 * The required permissions for the page is given
724 * in "fault_type". Desired permissions are included
725 * in "protection".
726 * fault_info is passed along to determine pagein cluster
727 * limits... it contains the expected reference pattern,
728 * cluster size if available, etc...
729 *
730 * If the desired page is known to be resident (for
731 * example, because it was previously wired down), asserting
732 * the "unwiring" parameter will speed the search.
733 *
734 * If the operation can be interrupted (by thread_abort
735 * or thread_terminate), then the "interruptible"
736 * parameter should be asserted.
737 *
738 * Results:
739 * The page containing the proper data is returned
740 * in "result_page".
741 *
742 * In/out conditions:
743 * The source object must be locked and referenced,
744 * and must donate one paging reference. The reference
745 * is not affected. The paging reference and lock are
746 * consumed.
747 *
748 * If the call succeeds, the object in which "result_page"
749 * resides is left locked and holding a paging reference.
750 * If this is not the original object, a busy page in the
751 * original object is returned in "top_page", to prevent other
752 * callers from pursuing this same data, along with a paging
753 * reference for the original object. The "top_page" should
754 * be destroyed when this guarantee is no longer required.
755 * The "result_page" is also left busy. It is not removed
756 * from the pageout queues.
757 * Special Case:
758 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
759 * fault succeeded but there's no VM page (i.e. the VM object
760 * does not actually hold VM pages, but device memory or
761 * large pages). The object is still locked and we still hold a
762 * paging_in_progress reference.
763 */
764 unsigned int vm_fault_page_blocked_access = 0;
765 unsigned int vm_fault_page_forced_retry = 0;
766
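/*
 * Rough shape of the loop below: starting at first_object/first_offset we walk the
 * shadow chain, keeping a busy placeholder page in the top object.  At each level we
 * either find a resident page (waiting out busy/cleaning/encrypted states as needed),
 * ask that object's pager for the data, or, at the bottom of the chain, zero-fill a
 * page back in the top object.  Copy-on-write copies and pushes into the copy object
 * are handled after the "PAGE HAS BEEN FOUND" point.
 */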
767 vm_fault_return_t
768 vm_fault_page(
769 /* Arguments: */
770 vm_object_t first_object, /* Object to begin search */
771 vm_object_offset_t first_offset, /* Offset into object */
772 vm_prot_t fault_type, /* What access is requested */
773 boolean_t must_be_resident,/* Must page be resident? */
774 /* Modifies in place: */
775 vm_prot_t *protection, /* Protection for mapping */
776 /* Returns: */
777 vm_page_t *result_page, /* Page found, if successful */
778 vm_page_t *top_page, /* Page in top object, if
779 * not result_page. */
780 int *type_of_fault, /* if non-null, fill in with type of fault
781 * COW, zero-fill, etc... returned in trace point */
782 /* More arguments: */
783 kern_return_t *error_code, /* code if page is in error */
784 boolean_t no_zero_fill, /* don't zero fill absent pages */
785 #if MACH_PAGEMAP
786 boolean_t data_supply, /* treat as data_supply if
787 * it is a write fault and a full
788 * page is provided */
789 #else
790 __unused boolean_t data_supply,
791 #endif
792 vm_object_fault_info_t fault_info)
793 {
794 vm_page_t m;
795 vm_object_t object;
796 vm_object_offset_t offset;
797 vm_page_t first_m;
798 vm_object_t next_object;
799 vm_object_t copy_object;
800 boolean_t look_for_page;
801 boolean_t force_fault_retry = FALSE;
802 vm_prot_t access_required = fault_type;
803 vm_prot_t wants_copy_flag;
804 CLUSTER_STAT(int pages_at_higher_offsets;)
805 CLUSTER_STAT(int pages_at_lower_offsets;)
806 kern_return_t wait_result;
807 boolean_t interruptible_state;
808 boolean_t data_already_requested = FALSE;
809 vm_behavior_t orig_behavior;
810 vm_size_t orig_cluster_size;
811 vm_fault_return_t error;
812 int my_fault;
813 uint32_t try_failed_count;
814 int interruptible; /* how may the fault be interrupted? */
815 memory_object_t pager;
816 vm_fault_return_t retval;
817
818 /*
819 * MACH page map - an optional optimization where a bit map is maintained
820 * by the VM subsystem for internal objects to indicate which pages of
821 * the object currently reside on backing store. This existence map
822 * duplicates information maintained by the vnode pager. It is
823 * created at the time of the first pageout against the object, i.e.
824 * at the same time the pager for the object is created. The optimization
825 * is designed to eliminate pager interaction overhead, if it is
826 * 'known' that the page does not exist on backing store.
827 *
828 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
829 * either marked as paged out in the existence map for the object or no
830 * existence map exists for the object. MUST_ASK_PAGER() is one of the
831 * criteria in the decision to invoke the pager. It is also used as one
832 * of the criteria to terminate the scan for adjacent pages in a clustered
833 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
834 * permanent objects. Note also that if the pager for an internal object
835 * has not been created, the pager is not invoked regardless of the value
836 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
837 * for which a pager has been created.
838 *
839 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
840 * is marked as paged out in the existence map for the object.
841 * PAGED_OUT() is used to determine if a page has already been pushed
842 * into a copy object in order to avoid a redundant page out operation.
843 */
844 #if MACH_PAGEMAP
845 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
846 != VM_EXTERNAL_STATE_ABSENT)
847 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
848 == VM_EXTERNAL_STATE_EXISTS)
849 #else
850 #define MUST_ASK_PAGER(o, f) (TRUE)
851 #define PAGED_OUT(o, f) (FALSE)
852 #endif
853
854 /*
855 * Recovery actions
856 */
857 #define RELEASE_PAGE(m) \
858 MACRO_BEGIN \
859 PAGE_WAKEUP_DONE(m); \
860 if (!m->active && !m->inactive && !m->throttled) { \
861 vm_page_lockspin_queues(); \
862 if (!m->active && !m->inactive && !m->throttled) \
863 vm_page_activate(m); \
864 vm_page_unlock_queues(); \
865 } \
866 MACRO_END
867
868 #if TRACEFAULTPAGE
869 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
870 #endif
871
872 interruptible = fault_info->interruptible;
873 interruptible_state = thread_interrupt_level(interruptible);
874
875 /*
876 * INVARIANTS (through entire routine):
877 *
878 * 1) At all times, we must either have the object
879 * lock or a busy page in some object to prevent
880 * some other thread from trying to bring in
881 * the same page.
882 *
883 * Note that we cannot hold any locks during the
884 * pager access or when waiting for memory, so
885 * we use a busy page then.
886 *
887 * 2) To prevent another thread from racing us down the
888 * shadow chain and entering a new page in the top
889 * object before we do, we must keep a busy page in
890 * the top object while following the shadow chain.
891 *
892 * 3) We must increment paging_in_progress on any object
893 * for which we have a busy page before dropping
894 * the object lock
895 *
896 * 4) We leave busy pages on the pageout queues.
897 * If the pageout daemon comes across a busy page,
898 * it will remove the page from the pageout queues.
899 */
900
901 object = first_object;
902 offset = first_offset;
903 first_m = VM_PAGE_NULL;
904 access_required = fault_type;
905
906
907 XPR(XPR_VM_FAULT,
908 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
909 object, offset, fault_type, *protection, 0);
910
911 /*
912 * default type of fault
913 */
914 my_fault = DBG_CACHE_HIT_FAULT;
915
916 while (TRUE) {
917 #if TRACEFAULTPAGE
918 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
919 #endif
920 if (!object->alive) {
921 /*
922 * object is no longer valid
923 * clean up and return error
924 */
925 vm_fault_cleanup(object, first_m);
926 thread_interrupt_level(interruptible_state);
927
928 return (VM_FAULT_MEMORY_ERROR);
929 }
930
931 if (!object->pager_created && object->phys_contiguous) {
932 /*
933 * A physically-contiguous object without a pager:
934 * must be a "large page" object. We do not deal
935 * with VM pages for this object.
936 */
937 m = VM_PAGE_NULL;
938 goto phys_contig_object;
939 }
940
941 if (object->blocked_access) {
942 /*
943 * Access to this VM object has been blocked.
944 * Replace our "paging_in_progress" reference with
945 * a "activity_in_progress" reference and wait for
946 * access to be unblocked.
947 */
948 vm_object_activity_begin(object);
949 vm_object_paging_end(object);
950 while (object->blocked_access) {
951 vm_object_sleep(object,
952 VM_OBJECT_EVENT_UNBLOCKED,
953 THREAD_UNINT);
954 }
955 vm_fault_page_blocked_access++;
956 vm_object_paging_begin(object);
957 vm_object_activity_end(object);
958 }
959
960 /*
961 * See whether the page at 'offset' is resident
962 */
963 m = vm_page_lookup(object, offset);
964 #if TRACEFAULTPAGE
965 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
966 #endif
967 if (m != VM_PAGE_NULL) {
968
969 if (m->busy) {
970 /*
971 * The page is being brought in,
972 * wait for it and then retry.
973 */
974 #if TRACEFAULTPAGE
975 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
976 #endif
977 wait_result = PAGE_SLEEP(object, m, interruptible);
978
979 XPR(XPR_VM_FAULT,
980 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
981 object, offset,
982 m, 0, 0);
983 counter(c_vm_fault_page_block_busy_kernel++);
984
985 if (wait_result != THREAD_AWAKENED) {
986 vm_fault_cleanup(object, first_m);
987 thread_interrupt_level(interruptible_state);
988
989 if (wait_result == THREAD_RESTART)
990 return (VM_FAULT_RETRY);
991 else
992 return (VM_FAULT_INTERRUPTED);
993 }
994 continue;
995 }
996 if (m->laundry) {
997 m->pageout = FALSE;
998
999 if (!m->cleaning)
1000 vm_pageout_steal_laundry(m, FALSE);
1001 }
1002 if (m->phys_page == vm_page_guard_addr) {
1003 /*
1004 * Guard page: off limits !
1005 */
1006 if (fault_type == VM_PROT_NONE) {
1007 /*
1008 * The fault is not requesting any
1009 * access to the guard page, so it must
1010 * be just to wire or unwire it.
1011 * Let's pretend it succeeded...
1012 */
1013 m->busy = TRUE;
1014 *result_page = m;
1015 assert(first_m == VM_PAGE_NULL);
1016 *top_page = first_m;
1017 if (type_of_fault)
1018 *type_of_fault = DBG_GUARD_FAULT;
1019 thread_interrupt_level(interruptible_state);
1020 return VM_FAULT_SUCCESS;
1021 } else {
1022 /*
1023 * The fault requests access to the
1024 * guard page: let's deny that !
1025 */
1026 vm_fault_cleanup(object, first_m);
1027 thread_interrupt_level(interruptible_state);
1028 return VM_FAULT_MEMORY_ERROR;
1029 }
1030 }
1031
1032 if (m->error) {
1033 /*
1034 * The page is in error, give up now.
1035 */
1036 #if TRACEFAULTPAGE
1037 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1038 #endif
1039 if (error_code)
1040 *error_code = KERN_MEMORY_ERROR;
1041 VM_PAGE_FREE(m);
1042
1043 vm_fault_cleanup(object, first_m);
1044 thread_interrupt_level(interruptible_state);
1045
1046 return (VM_FAULT_MEMORY_ERROR);
1047 }
1048 if (m->restart) {
1049 /*
1050 * The pager wants us to restart
1051 * at the top of the chain,
1052 * typically because it has moved the
1053 * page to another pager, then do so.
1054 */
1055 #if TRACEFAULTPAGE
1056 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1057 #endif
1058 VM_PAGE_FREE(m);
1059
1060 vm_fault_cleanup(object, first_m);
1061 thread_interrupt_level(interruptible_state);
1062
1063 return (VM_FAULT_RETRY);
1064 }
1065 if (m->absent) {
1066 /*
1067 * The page isn't busy, but is absent,
1068 * therefore it's deemed "unavailable".
1069 *
1070 * Remove the non-existent page (unless it's
1071 * in the top object) and move on down to the
1072 * next object (if there is one).
1073 */
1074 #if TRACEFAULTPAGE
1075 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1076 #endif
1077 next_object = object->shadow;
1078
1079 if (next_object == VM_OBJECT_NULL) {
1080 /*
1081 * Absent page at bottom of shadow
1082 * chain; zero fill the page we left
1083 * busy in the first object, and free
1084 * the absent page.
1085 */
1086 assert(!must_be_resident);
1087
1088 /*
1089 * check for any conditions that prevent
1090 * us from creating a new zero-fill page
1091 * vm_fault_check will do all of the
1092 * fault cleanup in the case of an error condition
1093 * including resetting the thread_interrupt_level
1094 */
1095 error = vm_fault_check(object, m, first_m, interruptible_state);
1096
1097 if (error != VM_FAULT_SUCCESS)
1098 return (error);
1099
1100 XPR(XPR_VM_FAULT,
1101 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1102 object, offset,
1103 m,
1104 first_object, 0);
1105
1106 if (object != first_object) {
1107 /*
1108 * free the absent page we just found
1109 */
1110 VM_PAGE_FREE(m);
1111
1112 /*
1113 * drop reference and lock on current object
1114 */
1115 vm_object_paging_end(object);
1116 vm_object_unlock(object);
1117
1118 /*
1119 * grab the original page we
1120 * 'soldered' in place and
1121 * retake lock on 'first_object'
1122 */
1123 m = first_m;
1124 first_m = VM_PAGE_NULL;
1125
1126 object = first_object;
1127 offset = first_offset;
1128
1129 vm_object_lock(object);
1130 } else {
1131 /*
1132 * we're going to use the absent page we just found
1133 * so convert it to a 'busy' page
1134 */
1135 m->absent = FALSE;
1136 m->busy = TRUE;
1137 }
1138 /*
1139 * zero-fill the page and put it on
1140 * the correct paging queue
1141 */
1142 my_fault = vm_fault_zero_page(m, no_zero_fill);
1143
1144 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1145 m->absent = TRUE;
1146
1147 break;
1148 } else {
1149 if (must_be_resident)
1150 vm_object_paging_end(object);
1151 else if (object != first_object) {
1152 vm_object_paging_end(object);
1153 VM_PAGE_FREE(m);
1154 } else {
1155 first_m = m;
1156 m->absent = FALSE;
1157 m->busy = TRUE;
1158
1159 vm_page_lockspin_queues();
1160
1161 assert(!m->pageout_queue);
1162 VM_PAGE_QUEUES_REMOVE(m);
1163
1164 vm_page_unlock_queues();
1165 }
1166 XPR(XPR_VM_FAULT,
1167 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1168 object, offset,
1169 next_object,
1170 offset+object->vo_shadow_offset,0);
1171
1172 offset += object->vo_shadow_offset;
1173 fault_info->lo_offset += object->vo_shadow_offset;
1174 fault_info->hi_offset += object->vo_shadow_offset;
1175 access_required = VM_PROT_READ;
1176
1177 vm_object_lock(next_object);
1178 vm_object_unlock(object);
1179 object = next_object;
1180 vm_object_paging_begin(object);
1181
1182 /*
1183 * reset to default type of fault
1184 */
1185 my_fault = DBG_CACHE_HIT_FAULT;
1186
1187 continue;
1188 }
1189 }
1190 if ((m->cleaning)
1191 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1192 && (fault_type & VM_PROT_WRITE)) {
1193 /*
1194 * This is a copy-on-write fault that will
1195 * cause us to revoke access to this page, but
1196 * this page is in the process of being cleaned
1197 * in a clustered pageout. We must wait until
1198 * the cleaning operation completes before
1199 * revoking access to the original page,
1200 * otherwise we might attempt to remove a
1201 * wired mapping.
1202 */
1203 #if TRACEFAULTPAGE
1204 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1205 #endif
1206 XPR(XPR_VM_FAULT,
1207 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1208 object, offset,
1209 m, 0, 0);
1210 /*
1211 * take an extra ref so that object won't die
1212 */
1213 vm_object_reference_locked(object);
1214
1215 vm_fault_cleanup(object, first_m);
1216
1217 counter(c_vm_fault_page_block_backoff_kernel++);
1218 vm_object_lock(object);
1219 assert(object->ref_count > 0);
1220
1221 m = vm_page_lookup(object, offset);
1222
1223 if (m != VM_PAGE_NULL && m->cleaning) {
1224 PAGE_ASSERT_WAIT(m, interruptible);
1225
1226 vm_object_unlock(object);
1227 wait_result = thread_block(THREAD_CONTINUE_NULL);
1228 vm_object_deallocate(object);
1229
1230 goto backoff;
1231 } else {
1232 vm_object_unlock(object);
1233
1234 vm_object_deallocate(object);
1235 thread_interrupt_level(interruptible_state);
1236
1237 return (VM_FAULT_RETRY);
1238 }
1239 }
1240 if (type_of_fault == NULL && m->speculative &&
1241 !(fault_info != NULL && fault_info->stealth)) {
1242 /*
1243 * If we were passed a non-NULL pointer for
1244 * "type_of_fault", than we came from
1245 * vm_fault... we'll let it deal with
1246 * this condition, since it
1247 * needs to see m->speculative to correctly
1248 * account the pageins, otherwise...
1249 * take it off the speculative queue, we'll
1250 * let the caller of vm_fault_page deal
1251 * with getting it onto the correct queue
1252 *
1253 * If the caller specified in fault_info that
1254 * it wants a "stealth" fault, we also leave
1255 * the page in the speculative queue.
1256 */
1257 vm_page_lockspin_queues();
1258 if (m->speculative)
1259 VM_PAGE_QUEUES_REMOVE(m);
1260 vm_page_unlock_queues();
1261 }
1262
1263 if (m->encrypted) {
1264 /*
1265 * ENCRYPTED SWAP:
1266 * the user needs access to a page that we
1267 * encrypted before paging it out.
1268 * Decrypt the page now.
1269 * Keep it busy to prevent anyone from
1270 * accessing it during the decryption.
1271 */
1272 m->busy = TRUE;
1273 vm_page_decrypt(m, 0);
1274 assert(object == m->object);
1275 assert(m->busy);
1276 PAGE_WAKEUP_DONE(m);
1277
1278 /*
1279 * Retry from the top, in case
1280 * something changed while we were
1281 * decrypting.
1282 */
1283 continue;
1284 }
1285 ASSERT_PAGE_DECRYPTED(m);
1286
1287 if (m->object->code_signed) {
1288 /*
1289 * CODE SIGNING:
1290 * We just paged in a page from a signed
1291 * memory object but we don't need to
1292 * validate it now. We'll validate it if and
1293 * when it gets mapped into a user address
1294 * space for the first time or when the page
1295 * gets copied to another object as a result
1296 * of a copy-on-write.
1297 */
1298 }
1299
1300 /*
1301 * We mark the page busy and leave it on
1302 * the pageout queues. If the pageout
1303 * daemon comes across it, then it will
1304 * remove the page from the queue, but not the object
1305 */
1306 #if TRACEFAULTPAGE
1307 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1308 #endif
1309 XPR(XPR_VM_FAULT,
1310 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1311 object, offset, m, 0, 0);
1312 assert(!m->busy);
1313 assert(!m->absent);
1314
1315 m->busy = TRUE;
1316 break;
1317 }
1318
1319
1320 /*
1321 * we get here when there is no page present in the object at
1322 * the offset we're interested in... we'll allocate a page
1323 * at this point if the pager associated with
1324 * this object can provide the data or we're the top object...
1325 * object is locked; m == NULL
1326 */
1327 if (must_be_resident)
1328 goto dont_look_for_page;
1329
1330 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1331
1332 #if TRACEFAULTPAGE
1333 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1334 #endif
1335 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1336 /*
1337 * Allocate a new page for this object/offset pair as a placeholder
1338 */
1339 m = vm_page_grab();
1340 #if TRACEFAULTPAGE
1341 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1342 #endif
1343 if (m == VM_PAGE_NULL) {
1344
1345 vm_fault_cleanup(object, first_m);
1346 thread_interrupt_level(interruptible_state);
1347
1348 return (VM_FAULT_MEMORY_SHORTAGE);
1349 }
1350
1351 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1352 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1353 } else {
1354 vm_page_insert(m, object, offset);
1355 }
1356 }
1357 if (look_for_page) {
1358 kern_return_t rc;
1359
1360 /*
1361 * If the memory manager is not ready, we
1362 * cannot make requests.
1363 */
1364 if (!object->pager_ready) {
1365 #if TRACEFAULTPAGE
1366 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1367 #endif
1368 if (m != VM_PAGE_NULL)
1369 VM_PAGE_FREE(m);
1370
1371 XPR(XPR_VM_FAULT,
1372 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1373 object, offset, 0, 0, 0);
1374
1375 /*
1376 * take an extra ref so object won't die
1377 */
1378 vm_object_reference_locked(object);
1379 vm_fault_cleanup(object, first_m);
1380 counter(c_vm_fault_page_block_backoff_kernel++);
1381
1382 vm_object_lock(object);
1383 assert(object->ref_count > 0);
1384
1385 if (!object->pager_ready) {
1386 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1387
1388 vm_object_unlock(object);
1389 if (wait_result == THREAD_WAITING)
1390 wait_result = thread_block(THREAD_CONTINUE_NULL);
1391 vm_object_deallocate(object);
1392
1393 goto backoff;
1394 } else {
1395 vm_object_unlock(object);
1396 vm_object_deallocate(object);
1397 thread_interrupt_level(interruptible_state);
1398
1399 return (VM_FAULT_RETRY);
1400 }
1401 }
1402 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1403 /*
1404 * If there are too many outstanding page
1405 * requests pending on this external object, we
1406 * wait for them to be resolved now.
1407 */
1408 #if TRACEFAULTPAGE
1409 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1410 #endif
1411 if (m != VM_PAGE_NULL)
1412 VM_PAGE_FREE(m);
1413 /*
1414 * take an extra ref so object won't die
1415 */
1416 vm_object_reference_locked(object);
1417
1418 vm_fault_cleanup(object, first_m);
1419
1420 counter(c_vm_fault_page_block_backoff_kernel++);
1421
1422 vm_object_lock(object);
1423 assert(object->ref_count > 0);
1424
1425 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1426 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1427
1428 vm_object_unlock(object);
1429 wait_result = thread_block(THREAD_CONTINUE_NULL);
1430 vm_object_deallocate(object);
1431
1432 goto backoff;
1433 } else {
1434 vm_object_unlock(object);
1435 vm_object_deallocate(object);
1436 thread_interrupt_level(interruptible_state);
1437
1438 return (VM_FAULT_RETRY);
1439 }
1440 }
1441 if (m != VM_PAGE_NULL) {
1442 VM_PAGE_FREE(m);
1443 m = VM_PAGE_NULL;
1444 }
1445
1446 #if TRACEFAULTPAGE
1447 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1448 #endif
1449
1450 /*
1451 * It's possible someone called vm_object_destroy while we weren't
1452 * holding the object lock. If that has happened, then bail out
1453 * here.
1454 */
1455
1456 pager = object->pager;
1457
1458 if (pager == MEMORY_OBJECT_NULL) {
1459 vm_fault_cleanup(object, first_m);
1460 thread_interrupt_level(interruptible_state);
1461 return VM_FAULT_MEMORY_ERROR;
1462 }
1463
1464 /*
1465 * We have an absent page in place for the faulting offset,
1466 * so we can release the object lock.
1467 */
1468
1469 vm_object_unlock(object);
1470
1471 /*
1472 * If this object uses a copy_call strategy,
1473 * and we are interested in a copy of this object
1474 * (having gotten here only by following a
1475 * shadow chain), then tell the memory manager
1476 * via a flag added to the desired_access
1477 * parameter, so that it can detect a race
1478 * between our walking down the shadow chain
1479 * and its pushing pages up into a copy of
1480 * the object that it manages.
1481 */
1482 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1483 wants_copy_flag = VM_PROT_WANTS_COPY;
1484 else
1485 wants_copy_flag = VM_PROT_NONE;
1486
1487 XPR(XPR_VM_FAULT,
1488 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1489 object, offset, m,
1490 access_required | wants_copy_flag, 0);
1491
1492 if (object->copy == first_object) {
1493 /*
1494 * if we issue the memory_object_data_request in
1495 * this state, we are subject to a deadlock with
1496 * the underlying filesystem if it is trying to
1497 * shrink the file resulting in a push of pages
1498 * into the copy object... that push will stall
1499 * on the placeholder page, and if the pushing thread
1500 * is holding a lock that is required on the pagein
1501 * path (such as a truncate lock), we'll deadlock...
1502 * to avoid this potential deadlock, we throw away
1503 * our placeholder page before calling memory_object_data_request
1504 * and force this thread to retry the vm_fault_page after
1505 * we have issued the I/O. the second time through this path
1506 * we will find the page already in the cache (presumably still
1507 * busy waiting for the I/O to complete) and then complete
1508 * the fault w/o having to go through memory_object_data_request again
1509 */
1510 assert(first_m != VM_PAGE_NULL);
1511 assert(first_m->object == first_object);
1512
1513 vm_object_lock(first_object);
1514 VM_PAGE_FREE(first_m);
1515 vm_object_paging_end(first_object);
1516 vm_object_unlock(first_object);
1517
1518 first_m = VM_PAGE_NULL;
1519 force_fault_retry = TRUE;
1520
1521 vm_fault_page_forced_retry++;
1522 }
1523
1524 if (data_already_requested == TRUE) {
1525 orig_behavior = fault_info->behavior;
1526 orig_cluster_size = fault_info->cluster_size;
1527
1528 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1529 fault_info->cluster_size = PAGE_SIZE;
1530 }
1531 /*
1532 * Call the memory manager to retrieve the data.
1533 */
1534 rc = memory_object_data_request(
1535 pager,
1536 offset + object->paging_offset,
1537 PAGE_SIZE,
1538 access_required | wants_copy_flag,
1539 (memory_object_fault_info_t)fault_info);
1540
1541 if (data_already_requested == TRUE) {
1542 fault_info->behavior = orig_behavior;
1543 fault_info->cluster_size = orig_cluster_size;
1544 } else
1545 data_already_requested = TRUE;
1546
1547 #if TRACEFAULTPAGE
1548 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1549 #endif
1550 vm_object_lock(object);
1551
1552 if (rc != KERN_SUCCESS) {
1553
1554 vm_fault_cleanup(object, first_m);
1555 thread_interrupt_level(interruptible_state);
1556
1557 return ((rc == MACH_SEND_INTERRUPTED) ?
1558 VM_FAULT_INTERRUPTED :
1559 VM_FAULT_MEMORY_ERROR);
1560 } else {
1561 clock_sec_t tv_sec;
1562 clock_usec_t tv_usec;
1563
1564 clock_get_system_microtime(&tv_sec, &tv_usec);
1565 current_thread()->t_page_creation_time = tv_sec;
1566 current_thread()->t_page_creation_count = 0;
1567 }
1568 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1569
1570 vm_fault_cleanup(object, first_m);
1571 thread_interrupt_level(interruptible_state);
1572
1573 return (VM_FAULT_INTERRUPTED);
1574 }
1575 if (force_fault_retry == TRUE) {
1576
1577 vm_fault_cleanup(object, first_m);
1578 thread_interrupt_level(interruptible_state);
1579
1580 return (VM_FAULT_RETRY);
1581 }
1582 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1583 /*
1584 * No page here means that the object we
1585 * initially looked up was "physically
1586 * contiguous" (i.e. device memory). However,
1587 * with Virtual VRAM, the object might not
1588 * be backed by that device memory anymore,
1589 * so we're done here only if the object is
1590 * still "phys_contiguous".
1591 * Otherwise, if the object is no longer
1592 * "phys_contiguous", we need to retry the
1593 * page fault against the object's new backing
1594 * store (different memory object).
1595 */
1596 phys_contig_object:
1597 goto done;
1598 }
1599 /*
1600 * potentially a pagein fault
1601 * if we make it through the state checks
1602 * above, then we'll count it as such
1603 */
1604 my_fault = DBG_PAGEIN_FAULT;
1605
1606 /*
1607 * Retry with same object/offset, since new data may
1608 * be in a different page (i.e., m is meaningless at
1609 * this point).
1610 */
1611 continue;
1612 }
1613 dont_look_for_page:
1614 /*
1615 * We get here if the object has no pager, or an existence map
1616 * exists and indicates the page isn't present on the pager
1617 * or we're unwiring a page. If a pager exists, but there
1618 * is no existence map, then the m->absent case above handles
1619 * the ZF case when the pager can't provide the page
1620 */
1621 #if TRACEFAULTPAGE
1622 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1623 #endif
1624 if (object == first_object)
1625 first_m = m;
1626 else
1627 assert(m == VM_PAGE_NULL);
1628
1629 XPR(XPR_VM_FAULT,
1630 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1631 object, offset, m,
1632 object->shadow, 0);
1633
1634 next_object = object->shadow;
1635
1636 if (next_object == VM_OBJECT_NULL) {
1637 /*
1638 * we've hit the bottom of the shadow chain,
1639 * fill the page in the top object with zeros.
1640 */
1641 assert(!must_be_resident);
1642
1643 if (object != first_object) {
1644 vm_object_paging_end(object);
1645 vm_object_unlock(object);
1646
1647 object = first_object;
1648 offset = first_offset;
1649 vm_object_lock(object);
1650 }
1651 m = first_m;
1652 assert(m->object == object);
1653 first_m = VM_PAGE_NULL;
1654
1655 /*
1656 * check for any conditions that prevent
1657 * us from creating a new zero-fill page
1658 * vm_fault_check will do all of the
1659 * fault cleanup in the case of an error condition
1660 * including resetting the thread_interrupt_level
1661 */
1662 error = vm_fault_check(object, m, first_m, interruptible_state);
1663
1664 if (error != VM_FAULT_SUCCESS)
1665 return (error);
1666
1667 if (m == VM_PAGE_NULL) {
1668 m = vm_page_grab();
1669
1670 if (m == VM_PAGE_NULL) {
1671 vm_fault_cleanup(object, VM_PAGE_NULL);
1672 thread_interrupt_level(interruptible_state);
1673
1674 return (VM_FAULT_MEMORY_SHORTAGE);
1675 }
1676 vm_page_insert(m, object, offset);
1677 }
1678 my_fault = vm_fault_zero_page(m, no_zero_fill);
1679
1680 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1681 m->absent = TRUE;
1682 break;
1683
1684 } else {
1685 /*
1686 * Move on to the next object. Lock the next
1687 * object before unlocking the current one.
1688 */
1689 if ((object != first_object) || must_be_resident)
1690 vm_object_paging_end(object);
1691
1692 offset += object->vo_shadow_offset;
1693 fault_info->lo_offset += object->vo_shadow_offset;
1694 fault_info->hi_offset += object->vo_shadow_offset;
1695 access_required = VM_PROT_READ;
1696
1697 vm_object_lock(next_object);
1698 vm_object_unlock(object);
1699
1700 object = next_object;
1701 vm_object_paging_begin(object);
1702 }
1703 }
1704
1705 /*
1706 * PAGE HAS BEEN FOUND.
1707 *
1708 * This page (m) is:
1709 * busy, so that we can play with it;
1710 * not absent, so that nobody else will fill it;
1711 * possibly eligible for pageout;
1712 *
1713 * The top-level page (first_m) is:
1714 * VM_PAGE_NULL if the page was found in the
1715 * top-level object;
1716 * busy, not absent, and ineligible for pageout.
1717 *
1718 * The current object (object) is locked. A paging
1719 * reference is held for the current and top-level
1720 * objects.
1721 */
1722
1723 #if TRACEFAULTPAGE
1724 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1725 #endif
1726 #if EXTRA_ASSERTIONS
1727 assert(m->busy && !m->absent);
1728 assert((first_m == VM_PAGE_NULL) ||
1729 (first_m->busy && !first_m->absent &&
1730 !first_m->active && !first_m->inactive));
1731 #endif /* EXTRA_ASSERTIONS */
1732
1733 /*
1734 * ENCRYPTED SWAP:
1735 * If we found a page, we must have decrypted it before we
1736 * get here...
1737 */
1738 ASSERT_PAGE_DECRYPTED(m);
1739
1740 XPR(XPR_VM_FAULT,
1741 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1742 object, offset, m,
1743 first_object, first_m);
1744
1745 /*
1746 * If the page is being written, but isn't
1747 * already owned by the top-level object,
1748 * we have to copy it into a new page owned
1749 * by the top-level object.
1750 */
1751 if (object != first_object) {
1752
1753 #if TRACEFAULTPAGE
1754 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1755 #endif
1756 if (fault_type & VM_PROT_WRITE) {
1757 vm_page_t copy_m;
1758
1759 /*
1760 * We only really need to copy if we
1761 * want to write it.
1762 */
1763 assert(!must_be_resident);
1764
1765 /*
1766 * are we protecting the system from
1767 * backing store exhaustion? If so,
1768 * sleep unless we are privileged.
1769 */
1770 if (vm_backing_store_low) {
1771 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1772
1773 RELEASE_PAGE(m);
1774 vm_fault_cleanup(object, first_m);
1775
1776 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1777
1778 thread_block(THREAD_CONTINUE_NULL);
1779 thread_interrupt_level(interruptible_state);
1780
1781 return (VM_FAULT_RETRY);
1782 }
1783 }
1784 /*
1785 * If we try to collapse first_object at this
1786 * point, we may deadlock when we try to get
1787 * the lock on an intermediate object (since we
1788 * have the bottom object locked). We can't
1789 * unlock the bottom object, because the page
1790 * we found may move (by collapse) if we do.
1791 *
1792 * Instead, we first copy the page. Then, when
1793 * we have no more use for the bottom object,
1794 * we unlock it and try to collapse.
1795 *
1796 * Note that we copy the page even if we didn't
1797 * need to... that's the breaks.
1798 */
1799
1800 /*
1801 * Allocate a page for the copy
1802 */
1803 copy_m = vm_page_grab();
1804
1805 if (copy_m == VM_PAGE_NULL) {
1806 RELEASE_PAGE(m);
1807
1808 vm_fault_cleanup(object, first_m);
1809 thread_interrupt_level(interruptible_state);
1810
1811 return (VM_FAULT_MEMORY_SHORTAGE);
1812 }
1813 XPR(XPR_VM_FAULT,
1814 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1815 object, offset,
1816 m, copy_m, 0);
1817
1818 vm_page_copy(m, copy_m);
1819
1820 /*
1821 * If another map is truly sharing this
1822 * page with us, we have to flush all
1823 * uses of the original page, since we
1824 * can't distinguish those which want the
1825 * original from those which need the
1826 * new copy.
1827 *
1828 * XXXO If we know that only one map has
1829 * access to this page, then we could
1830 * avoid the pmap_disconnect() call.
1831 */
1832 if (m->pmapped)
1833 pmap_disconnect(m->phys_page);
1834
1835 assert(!m->cleaning);
1836
1837 /*
1838 * We no longer need the old page or object.
1839 */
1840 PAGE_WAKEUP_DONE(m);
1841 vm_object_paging_end(object);
1842 vm_object_unlock(object);
1843
1844 my_fault = DBG_COW_FAULT;
1845 VM_STAT_INCR(cow_faults);
1846 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1847 current_task()->cow_faults++;
1848
1849 object = first_object;
1850 offset = first_offset;
1851
1852 vm_object_lock(object);
1853 /*
1854 * get rid of the placeholder
1855 * page that we soldered in earlier
1856 */
1857 VM_PAGE_FREE(first_m);
1858 first_m = VM_PAGE_NULL;
1859
1860 /*
1861 * and replace it with the
1862 * page we just copied into
1863 */
1864 assert(copy_m->busy);
1865 vm_page_insert(copy_m, object, offset);
1866 SET_PAGE_DIRTY(copy_m, TRUE);
1867
1868 m = copy_m;
1869 /*
1870 * Now that we've gotten the copy out of the
1871 * way, let's try to collapse the top object.
1872 * But we have to play ugly games with
1873 * paging_in_progress to do that...
1874 */
1875 vm_object_paging_end(object);
1876 vm_object_collapse(object, offset, TRUE);
1877 vm_object_paging_begin(object);
1878
1879 } else
1880 *protection &= (~VM_PROT_WRITE);
1881 }
1882 /*
1883 * Now check whether the page needs to be pushed into the
1884 * copy object. The use of asymmetric copy on write for
1885 * shared temporary objects means that we may do two copies to
1886 * satisfy the fault; one above to get the page from a
1887 * shadowed object, and one here to push it into the copy.
1888 */
1889 try_failed_count = 0;
1890
1891 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1892 vm_object_offset_t copy_offset;
1893 vm_page_t copy_m;
1894
1895 #if TRACEFAULTPAGE
1896 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1897 #endif
1898 /*
1899 * If the page is being written, but hasn't been
1900 * copied to the copy-object, we have to copy it there.
1901 */
1902 if ((fault_type & VM_PROT_WRITE) == 0) {
1903 *protection &= ~VM_PROT_WRITE;
1904 break;
1905 }
1906
1907 /*
1908 * If the page was guaranteed to be resident,
1909 * we must have already performed the copy.
1910 */
1911 if (must_be_resident)
1912 break;
1913
1914 /*
1915 * Try to get the lock on the copy_object.
1916 */
1917 if (!vm_object_lock_try(copy_object)) {
1918
1919 vm_object_unlock(object);
1920 try_failed_count++;
1921
1922 mutex_pause(try_failed_count); /* wait a bit */
1923 vm_object_lock(object);
1924
1925 continue;
1926 }
1927 try_failed_count = 0;
1928
1929 /*
1930 * Make another reference to the copy-object,
1931 * to keep it from disappearing during the
1932 * copy.
1933 */
1934 vm_object_reference_locked(copy_object);
1935
1936 /*
1937 * Does the page exist in the copy?
1938 */
1939 copy_offset = first_offset - copy_object->vo_shadow_offset;
1940
1941 if (copy_object->vo_size <= copy_offset)
1942 /*
1943 * Copy object doesn't cover this page -- do nothing.
1944 */
1945 ;
1946 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1947 /*
1948 * Page currently exists in the copy object
1949 */
1950 if (copy_m->busy) {
1951 /*
1952 * If the page is being brought
1953 * in, wait for it and then retry.
1954 */
1955 RELEASE_PAGE(m);
1956
1957 /*
1958 * take an extra ref so object won't die
1959 */
1960 vm_object_reference_locked(copy_object);
1961 vm_object_unlock(copy_object);
1962 vm_fault_cleanup(object, first_m);
1963 counter(c_vm_fault_page_block_backoff_kernel++);
1964
1965 vm_object_lock(copy_object);
1966 assert(copy_object->ref_count > 0);
1967 VM_OBJ_RES_DECR(copy_object);
1968 vm_object_lock_assert_exclusive(copy_object);
1969 copy_object->ref_count--;
1970 assert(copy_object->ref_count > 0);
1971 copy_m = vm_page_lookup(copy_object, copy_offset);
1972 /*
1973 * ENCRYPTED SWAP:
1974 * it's OK if the "copy_m" page is encrypted,
1975 * because we're not moving it nor handling its
1976 * contents.
1977 */
1978 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1979 PAGE_ASSERT_WAIT(copy_m, interruptible);
1980
1981 vm_object_unlock(copy_object);
1982 wait_result = thread_block(THREAD_CONTINUE_NULL);
1983 vm_object_deallocate(copy_object);
1984
1985 goto backoff;
1986 } else {
1987 vm_object_unlock(copy_object);
1988 vm_object_deallocate(copy_object);
1989 thread_interrupt_level(interruptible_state);
1990
1991 return (VM_FAULT_RETRY);
1992 }
1993 }
1994 }
1995 else if (!PAGED_OUT(copy_object, copy_offset)) {
1996 /*
1997 * If PAGED_OUT is TRUE, then the page used to exist
1998 * in the copy-object, and has already been paged out.
1999 * We don't need to repeat this. If PAGED_OUT is
2000 * FALSE, then either we don't know (!pager_created,
2001 * for example) or it hasn't been paged out.
2002 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2003 * We must copy the page to the copy object.
2004 */
2005
2006 if (vm_backing_store_low) {
2007 /*
2008 * we are protecting the system from
2009 * backing store exhaustion, so
2010 * sleep unless we are privileged.
2011 */
2012 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2013 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2014
2015 RELEASE_PAGE(m);
2016 VM_OBJ_RES_DECR(copy_object);
2017 vm_object_lock_assert_exclusive(copy_object);
2018 copy_object->ref_count--;
2019 assert(copy_object->ref_count > 0);
2020
2021 vm_object_unlock(copy_object);
2022 vm_fault_cleanup(object, first_m);
2023 thread_block(THREAD_CONTINUE_NULL);
2024 thread_interrupt_level(interruptible_state);
2025
2026 return (VM_FAULT_RETRY);
2027 }
2028 }
2029 /*
2030 * Allocate a page for the copy
2031 */
2032 copy_m = vm_page_alloc(copy_object, copy_offset);
2033
2034 if (copy_m == VM_PAGE_NULL) {
2035 RELEASE_PAGE(m);
2036
2037 VM_OBJ_RES_DECR(copy_object);
2038 vm_object_lock_assert_exclusive(copy_object);
2039 copy_object->ref_count--;
2040 assert(copy_object->ref_count > 0);
2041
2042 vm_object_unlock(copy_object);
2043 vm_fault_cleanup(object, first_m);
2044 thread_interrupt_level(interruptible_state);
2045
2046 return (VM_FAULT_MEMORY_SHORTAGE);
2047 }
2048 /*
2049 * Must copy page into copy-object.
2050 */
2051 vm_page_copy(m, copy_m);
2052
2053 /*
2054 * If the old page was in use by any users
2055 * of the copy-object, it must be removed
2056 * from all pmaps. (We can't know which
2057 * pmaps use it.)
2058 */
2059 if (m->pmapped)
2060 pmap_disconnect(m->phys_page);
2061
2062 /*
2063 * If there's no pager (or it's known not to have this page),
2064 * just keep the dirty copy.  For internal objects, query the pager
2065 * first.  Otherwise, immediately page this one out using the "initialize" option.
2066 */
2067 if ((!copy_object->pager_created)
2068 #if MACH_PAGEMAP
2069 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2070 #endif
2071 ) {
2072
2073 vm_page_lockspin_queues();
2074 assert(!m->cleaning);
2075 vm_page_activate(copy_m);
2076 vm_page_unlock_queues();
2077
2078 SET_PAGE_DIRTY(copy_m, TRUE);
2079 PAGE_WAKEUP_DONE(copy_m);
2080
2081 } else if (copy_object->internal) {
2082 /*
2083 * For internal objects check with the pager to see
2084 * if the page already exists in the backing store.
2085 * If yes, then we can drop the copy page. If not,
2086 * then we'll activate it, mark it dirty and keep it
2087 * around.
2088 */
2089
2090 kern_return_t kr = KERN_SUCCESS;
2091
2092 memory_object_t copy_pager = copy_object->pager;
2093 assert(copy_pager != MEMORY_OBJECT_NULL);
2094 vm_object_paging_begin(copy_object);
2095
2096 vm_object_unlock(copy_object);
2097
2098 kr = memory_object_data_request(
2099 copy_pager,
2100 copy_offset + copy_object->paging_offset,
2101 0, /* Only query the pager. */
2102 VM_PROT_READ,
2103 NULL);
2104
2105 vm_object_lock(copy_object);
2106
2107 vm_object_paging_end(copy_object);
2108
2109 /*
2110 * Since we dropped the copy_object's lock,
2111 * check whether we'll have to deallocate
2112 * the hard way.
2113 */
2114 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2115 vm_object_unlock(copy_object);
2116 vm_object_deallocate(copy_object);
2117 vm_object_lock(object);
2118
2119 continue;
2120 }
2121 if (kr == KERN_SUCCESS) {
2122 /*
2123 * The pager has the page. We don't want to overwrite
2124 * that page by sending this one out to the backing store.
2125 * So we drop the copy page.
2126 */
2127 VM_PAGE_FREE(copy_m);
2128
2129 } else {
2130 /*
2131 * The pager doesn't have the page. We'll keep this one
2132 * around in the copy object. It might get sent out to
2133 * the backing store under memory pressure.
2134 */
2135 vm_page_lockspin_queues();
2136 assert(!m->cleaning);
2137 vm_page_activate(copy_m);
2138 vm_page_unlock_queues();
2139
2140 SET_PAGE_DIRTY(copy_m, TRUE);
2141 PAGE_WAKEUP_DONE(copy_m);
2142 }
2143 } else {
2144
2145 assert(copy_m->busy == TRUE);
2146 assert(!m->cleaning);
2147
2148 /*
2149 * dirty is protected by the object lock
2150 */
2151 SET_PAGE_DIRTY(copy_m, TRUE);
2152
2153 /*
2154 * The page is already ready for pageout:
2155 * not on pageout queues and busy.
2156 * Unlock everything except the
2157 * copy_object itself.
2158 */
2159 vm_object_unlock(object);
2160
2161 /*
2162 * Write the page to the copy-object,
2163 * flushing it from the kernel.
2164 */
2165 vm_pageout_initialize_page(copy_m);
2166
2167 /*
2168 * Since the pageout may have
2169 * temporarily dropped the
2170 * copy_object's lock, we
2171 * check whether we'll have
2172 * to deallocate the hard way.
2173 */
2174 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2175 vm_object_unlock(copy_object);
2176 vm_object_deallocate(copy_object);
2177 vm_object_lock(object);
2178
2179 continue;
2180 }
2181 /*
2182 * Pick back up the old object's
2183 * lock. [It is safe to do so,
2184 * since it must be deeper in the
2185 * object tree.]
2186 */
2187 vm_object_lock(object);
2188 }
2189
2190 /*
2191 * Because we're pushing a page upward
2192 * in the object tree, we must restart
2193 * any faults that are waiting here.
2194 * [Note that this is an expansion of
2195 * PAGE_WAKEUP that uses the THREAD_RESTART
2196 * wait result]. Can't turn off the page's
2197 * busy bit because we're not done with it.
2198 */
2199 if (m->wanted) {
2200 m->wanted = FALSE;
2201 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2202 }
2203 }
2204 /*
2205 * The reference count on copy_object must be
2206 * at least 2: one for our extra reference,
2207 * and at least one from the outside world
2208 * (we checked that when we last locked
2209 * copy_object).
2210 */
2211 vm_object_lock_assert_exclusive(copy_object);
2212 copy_object->ref_count--;
2213 assert(copy_object->ref_count > 0);
2214
2215 VM_OBJ_RES_DECR(copy_object);
2216 vm_object_unlock(copy_object);
2217
2218 break;
2219 }
2220
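/*
 * done: hand back the page we settled on (and the placeholder page
 * still held in the top-level object, if any) to the caller.
 */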
2221 done:
2222 *result_page = m;
2223 *top_page = first_m;
2224
2225 XPR(XPR_VM_FAULT,
2226 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2227 object, offset, m, first_m, 0);
2228
2229 if (m != VM_PAGE_NULL) {
2230 retval = VM_FAULT_SUCCESS;
2231 if (my_fault == DBG_PAGEIN_FAULT) {
2232
2233 VM_STAT_INCR(pageins);
2234 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2235 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2236 current_task()->pageins++;
2237
2238 if (m->object->internal) {
2239 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2240 my_fault = DBG_PAGEIND_FAULT;
2241 } else {
2242 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2243 my_fault = DBG_PAGEINV_FAULT;
2244 }
2245
2246 /*
2247 * evaluate access pattern and update state
2248 * vm_fault_deactivate_behind depends on the
2249 * state being up to date
2250 */
2251 vm_fault_is_sequential(object, offset, fault_info->behavior);
2252
2253 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2254 }
2255 if (type_of_fault)
2256 *type_of_fault = my_fault;
2257 } else {
2258 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2259 assert(first_m == VM_PAGE_NULL);
2260 assert(object == first_object);
2261 }
2262
2263 thread_interrupt_level(interruptible_state);
2264
2265 #if TRACEFAULTPAGE
2266 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2267 #endif
2268 return retval;
2269
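/*
 * backoff: we blocked above waiting for a busy page in the copy
 * object; tell the caller whether to retry the fault or give up.
 */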
2270 backoff:
2271 thread_interrupt_level(interruptible_state);
2272
2273 if (wait_result == THREAD_INTERRUPTED)
2274 return (VM_FAULT_INTERRUPTED);
2275 return (VM_FAULT_RETRY);
2276
2277 #undef RELEASE_PAGE
2278 }
2279
2280
2281
2282 /*
2283 * CODE SIGNING:
2284 * When soft faulting a page, we have to validate the page if:
2285 * 1. the page is being mapped in user space
2286 * 2. the page hasn't already been found to be "tainted"
2287 * 3. the page belongs to a code-signed object
2288 * 4. the page has not been validated yet or has been mapped for write.
2289 */
2290 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
2291 ((pmap) != kernel_pmap /*1*/ && \
2292 !(page)->cs_tainted /*2*/ && \
2293 (page)->object->code_signed /*3*/ && \
2294 (!(page)->cs_validated || (page)->wpmapped /*4*/))
2295
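/*
 * Illustrative use of the check above (a sketch only -- the real call
 * sites are vm_fault_enter() below and the fast path in vm_fault()):
 *
 *	if (VM_FAULT_NEED_CS_VALIDATION(pmap, m))
 *		vm_page_validate_cs(m);
 */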
2296
2297 /*
2298 * page queue lock must NOT be held
2299 * m->object must be locked
2300 *
2301 * NOTE: m->object could be locked "shared" only if we are called
2302 * from vm_fault() as part of a soft fault. If so, we must be
2303 * careful not to modify the VM object in any way that is not
2304 * legal under a shared lock...
2305 */
2306 unsigned long cs_enter_tainted_rejected = 0;
2307 unsigned long cs_enter_tainted_accepted = 0;
2308 kern_return_t
2309 vm_fault_enter(vm_page_t m,
2310 pmap_t pmap,
2311 vm_map_offset_t vaddr,
2312 vm_prot_t prot,
2313 vm_prot_t fault_type,
2314 boolean_t wired,
2315 boolean_t change_wiring,
2316 boolean_t no_cache,
2317 boolean_t cs_bypass,
2318 boolean_t *need_retry,
2319 int *type_of_fault)
2320 {
2321 kern_return_t kr, pe_result;
2322 boolean_t previously_pmapped = m->pmapped;
2323 boolean_t must_disconnect = 0;
2324 boolean_t map_is_switched, map_is_switch_protected;
2325
2326 vm_object_lock_assert_held(m->object);
2327 #if DEBUG
2328 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2329 #endif /* DEBUG */
2330
2331 if (m->phys_page == vm_page_guard_addr) {
2332 assert(m->fictitious);
2333 return KERN_SUCCESS;
2334 }
2335
2336 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2337
2338 vm_object_lock_assert_exclusive(m->object);
2339
2340 } else if ((fault_type & VM_PROT_WRITE) == 0) {
2341 /*
2342 * This is not a "write" fault, so we
2343 * might not have taken the object lock
2344 * exclusively and we might not be able
2345 * to update the "wpmapped" bit in
2346 * vm_fault_enter().
2347 * Let's just grant read access to
2348 * the page for now and we'll
2349 * soft-fault again if we need write
2350 * access later...
2351 */
2352 prot &= ~VM_PROT_WRITE;
2353 }
2354 if (m->pmapped == FALSE) {
2355
2356 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2357 /*
2358 * found it in the cache, but this
2359 * is the first fault-in of the page (m->pmapped == FALSE)
2360 * so it must have come in as part of
2361 * a cluster... account 1 pagein against it
2362 */
2363 VM_STAT_INCR(pageins);
2364 DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2365
2366 if (m->object->internal) {
2367 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2368 *type_of_fault = DBG_PAGEIND_FAULT;
2369 } else {
2370 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2371 *type_of_fault = DBG_PAGEINV_FAULT;
2372 }
2373
2374 current_task()->pageins++;
2375 }
2376 VM_PAGE_CONSUME_CLUSTERED(m);
2377
2378 }
2379
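/*
 * account this as an address-space fault for DTrace unless it's a
 * COW fault (those fire their own cow_fault probe); faults against
 * the kernel pmap also fire kernel_asflt
 */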
2380 if (*type_of_fault != DBG_COW_FAULT) {
2381 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2382
2383 if (pmap == kernel_pmap) {
2384 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2385 }
2386 }
2387
2388 /* Validate code signature if necessary. */
2389 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2390 vm_object_lock_assert_exclusive(m->object);
2391
2392 if (m->cs_validated) {
2393 vm_cs_revalidates++;
2394 }
2395
2396 /* VM map is locked, so 1 ref will remain on VM object -
2397 * so no harm if vm_page_validate_cs drops the object lock */
2398 vm_page_validate_cs(m);
2399 }
2400
2401 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
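/* Note: with the protection check commented out above, any cs_validated
 * page is treated as immutable here, regardless of the requested protections. */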
2402
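/*
 * map_is_switched: the fault is against a pmap other than the current
 * task's, but it is the map the current thread is running on (i.e. the
 * thread has temporarily switched maps).
 */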
2403 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2404 (pmap == vm_map_pmap(current_thread()->map)));
2405 map_is_switch_protected = current_thread()->map->switch_protect;
2406
2407 /* If the map is switched, and is switch-protected, we must protect
2408 * some pages from being write-faulted: immutable pages because by
2409 * definition they may not be written, and executable pages because that
2410 * would provide a way to inject unsigned code.
2411 * If the page is immutable, we can simply return. However, we can't
2412 * immediately determine whether a page is executable anywhere. But
2413 * we can disconnect it everywhere and remove the executable protection
2414 * from the current map. We do that below right before we do the
2415 * PMAP_ENTER.
2416 */
2417 if(!cs_enforcement_disable && map_is_switched &&
2418 map_is_switch_protected && page_immutable(m, prot) &&
2419 (prot & VM_PROT_WRITE))
2420 {
2421 return KERN_CODESIGN_ERROR;
2422 }
2423
2424 /* A page could be tainted, or pose a risk of being tainted later.
2425 * Check whether the receiving process wants it, and make it feel
2426 * the consequences (that happens in cs_invalid_page()).
2427 * For CS Enforcement, two other conditions will
2428 * cause that page to be tainted as well:
2429 * - pmapping an unsigned page executable - this means unsigned code;
2430 * - writeable mapping of a validated page - the content of that page
2431 * can be changed without the kernel noticing, therefore unsigned
2432 * code can be created
2433 */
2434 if (m->cs_tainted ||
2435 (( !cs_enforcement_disable && !cs_bypass ) &&
2436 (/* The page is unsigned and wants to be executable */
2437 (!m->cs_validated && (prot & VM_PROT_EXECUTE)) ||
2438 /* The page should be immutable, but is in danger of being modified
2439 * This is the case where we want policy from the code directory -
2440 * is the page immutable or not? For now we have to assume that
2441 * code pages will be immutable, data pages not.
2442 * We'll assume a page is a code page if it has a code directory
2443 * and we fault for execution.
2444 * That is good enough since if we faulted the code page for
2445 * writing in another map before, it is wpmapped; if we fault
2446 * it for writing in this map later, it will also be faulted for executing
2447 * at the same time; and if we fault for writing in another map
2448 * later, we will disconnect it from this pmap so we'll notice
2449 * the change.
2450 */
2451 (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2452 ))
2453 )
2454 {
2455 /* We will have a tainted page. Have to handle the special case
2456 * of a switched map now. If the map is not switched, standard
2457 * procedure applies - call cs_invalid_page().
2458 * If the map is switched, the real owner is invalid already.
2459 * There is no point in invalidating the switching process since
2460 * it will not be executing from the map. So we don't call
2461 * cs_invalid_page() in that case. */
2462 boolean_t reject_page;
2463 if(map_is_switched) {
2464 assert(pmap==vm_map_pmap(current_thread()->map));
2465 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2466 reject_page = FALSE;
2467 } else {
2468 reject_page = cs_invalid_page((addr64_t) vaddr);
2469 }
2470
2471 if (reject_page) {
2472 /* reject the tainted page: abort the page fault */
2473 kr = KERN_CODESIGN_ERROR;
2474 cs_enter_tainted_rejected++;
2475 } else {
2476 /* proceed with the tainted page */
2477 kr = KERN_SUCCESS;
2478 /* Page might have been tainted before or not; now it
2479 * definitively is. If the page wasn't tainted, we must
2480 * disconnect it from all pmaps later. */
2481 must_disconnect = !m->cs_tainted;
2482 m->cs_tainted = TRUE;
2483 cs_enter_tainted_accepted++;
2484 }
2485 if (cs_debug || kr != KERN_SUCCESS) {
2486 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2487 "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2488 (long long)vaddr, m, m->object, m->offset);
2489 }
2490
2491 } else {
2492 /* proceed with the valid page */
2493 kr = KERN_SUCCESS;
2494 }
2495
2496 /* If we have a KERN_SUCCESS from the previous checks, we either have
2497 * a good page, or a tainted page that has been accepted by the process.
2498 * In both cases the page will be entered into the pmap.
2499 * If the page is writeable, we need to disconnect it from other pmaps
2500 * now so those processes can take note.
2501 */
2502 if (kr == KERN_SUCCESS) {
2503 /*
2504 * NOTE: we may only hold the vm_object lock SHARED
2505 * at this point, but the update of pmapped is ok
2506 * since this is the ONLY bit updated behind the SHARED
2507 * lock... however, we need to figure out how to do an atomic
2508 * update on a bit field to make this less fragile... right
2509 * now I don't know how to coerce 'C' to give me the offset info
2510 * that's needed for an AtomicCompareAndSwap
2511 */
2512 m->pmapped = TRUE;
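/*
 * if this page belongs to a region that still needs to be "slid"
 * (shared-region sliding), apply the slide before the pmap_enter;
 * a failure here aborts the fault
 */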
2513 if(vm_page_is_slideable(m)) {
2514 boolean_t was_busy = m->busy;
2515 m->busy = TRUE;
2516 kr = vm_page_slide(m, 0);
2517 assert(m->busy);
2518 if(!was_busy) {
2519 PAGE_WAKEUP_DONE(m);
2520 }
2521 if (kr != KERN_SUCCESS) {
2522 /*
2523 * This page has not been slid correctly,
2524 * do not do the pmap_enter() !
2525 * Let vm_fault_enter() return the error
2526 * so the caller can fail the fault.
2527 */
2528 goto after_the_pmap_enter;
2529 }
2530 }
2531
2532 if (fault_type & VM_PROT_WRITE) {
2533
2534 if (m->wpmapped == FALSE) {
2535 vm_object_lock_assert_exclusive(m->object);
2536
2537 m->wpmapped = TRUE;
2538 }
2539 if (must_disconnect) {
2540 /*
2541 * We can only get here
2542 * because of the CSE logic
2543 */
2544 assert(cs_enforcement_disable == FALSE);
2545 pmap_disconnect(m->phys_page);
2546 /*
2547 * If we are faulting for a write, we can clear
2548 * the execute bit - that will ensure the page is
2549 * checked again before being executable, which
2550 * protects against a map switch.
2551 * This only happens the first time the page
2552 * gets tainted, so we won't get stuck here
2553 * to make an already writeable page executable.
2554 */
2555 if (!cs_bypass){
2556 prot &= ~VM_PROT_EXECUTE;
2557 }
2558 }
2559 }
2560
2561 /* Prevent a deadlock by not
2562 * holding the object lock if we need to wait for a page in
2563 * pmap_enter() - <rdar://problem/7138958> */
2564 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
2565 wired, PMAP_OPTIONS_NOWAIT, pe_result);
2566
2567 if(pe_result == KERN_RESOURCE_SHORTAGE) {
2568
2569 if (need_retry) {
2570 /*
2571 * this will be non-null in the case where we hold the lock
2572 * on the top-object in this chain... we can't just drop
2573 * the lock on the object we're inserting the page into
2574 * and recall the PMAP_ENTER since we can still cause
2575 * a deadlock if one of the critical paths tries to
2576 * acquire the lock on the top-object and we're blocked
2577 * in PMAP_ENTER waiting for memory... our only recourse
2578 * is to deal with it at a higher level where we can
2579 * drop both locks.
2580 */
2581 *need_retry = TRUE;
2582 vm_pmap_enter_retried++;
2583 goto after_the_pmap_enter;
2584 }
2585 /* The nonblocking version of pmap_enter did not succeed,
2586 * and we don't need to drop other locks and retry
2587 * at the level above us, so
2588 * use the blocking version instead.  This requires marking
2589 * the page busy and unlocking the object. */
2590 boolean_t was_busy = m->busy;
2591 m->busy = TRUE;
2592 vm_object_unlock(m->object);
2593
2594 PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
2595
2596 /* Take the object lock again. */
2597 vm_object_lock(m->object);
2598
2599 /* If the page was busy, someone else will wake it up.
2600 * Otherwise, we have to do it now. */
2601 assert(m->busy);
2602 if(!was_busy) {
2603 PAGE_WAKEUP_DONE(m);
2604 }
2605 vm_pmap_enter_blocked++;
2606 }
2607 }
2608
2609 after_the_pmap_enter:
2610 /*
2611 * Hold queues lock to manipulate
2612 * the page queues. Change wiring
2613 * case is obvious.
2614 */
2615 if (change_wiring) {
2616 vm_page_lockspin_queues();
2617
2618 if (wired) {
2619 if (kr == KERN_SUCCESS) {
2620 vm_page_wire(m);
2621 }
2622 } else {
2623 vm_page_unwire(m, TRUE);
2624 }
2625 vm_page_unlock_queues();
2626
2627 } else {
2628 if (kr != KERN_SUCCESS) {
2629 vm_page_lockspin_queues();
2630 vm_page_deactivate(m);
2631 vm_page_unlock_queues();
2632 } else {
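/*
 * the page isn't wired or throttled and either isn't on the
 * active/inactive queues, sits on the "cleaned" queue, or is being
 * mapped no_cache... decide which queue it should live on
 */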
2633 if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
2634
2635 if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2636 struct vpl *lq;
2637 uint32_t lid;
2638
2639 /*
2640 * we got a local queue to stuff this new page on...
2641 * it's safe to manipulate local and local_id at this point
2642 * since we're behind an exclusive object lock and the
2643 * page is not on any global queue.
2644 *
2645 * we'll use the current cpu number to select the queue
2646 * note that we don't need to disable preemption... we're
2647 * going to be behind the local queue's lock to do the real
2648 * work
2649 */
2650 lid = cpu_number();
2651
2652 lq = &vm_page_local_q[lid].vpl_un.vpl;
2653
2654 VPL_LOCK(&lq->vpl_lock);
2655
2656 queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2657 m->local = TRUE;
2658 m->local_id = lid;
2659 lq->vpl_count++;
2660
2661 VPL_UNLOCK(&lq->vpl_lock);
2662
2663 if (lq->vpl_count > vm_page_local_q_soft_limit) {
2664 /*
2665 * we're beyond the soft limit for the local queue
2666 * vm_page_reactivate_local will 'try' to take
2667 * the global page queue lock... if it can't that's
2668 * ok... we'll let the queue continue to grow up
2669 * to the hard limit... at that point we'll wait
2670 * for the lock... once we've got the lock, we'll
2671 * transfer all of the pages from the local queue
2672 * to the global active queue
2673 */
2674 vm_page_reactivate_local(lid, FALSE, FALSE);
2675 }
2676 return kr;
2677 }
2678
2679 vm_page_lockspin_queues();
2680 /*
2681 * test again now that we hold the page queue lock
2682 */
2683 if (!VM_PAGE_WIRED(m)) {
2684 if (m->clean_queue) {
2685 VM_PAGE_QUEUES_REMOVE(m);
2686
2687 vm_pageout_cleaned_reactivated++;
2688 vm_pageout_cleaned_fault_reactivated++;
2689 }
2690
2691 if ((!m->active && !m->inactive) || no_cache) {
2692 /*
2693 * If this is a no_cache mapping and the page has never been
2694 * mapped before or was previously a no_cache page, then we
2695 * want to leave pages in the speculative state so that they
2696 * can be readily recycled if free memory runs low. Otherwise
2697 * the page is activated as normal.
2698 */
2699
2700 if (no_cache && (!previously_pmapped || m->no_cache)) {
2701 m->no_cache = TRUE;
2702
2703 if (!m->speculative)
2704 vm_page_speculate(m, FALSE);
2705
2706 } else if (!m->active && !m->inactive) {
2707
2708 vm_page_activate(m);
2709 }
2710 }
2711 }
2712 vm_page_unlock_queues();
2713 }
2714 }
2715 }
2716 return kr;
2717 }
2718
2719
2720 /*
2721 * Routine: vm_fault
2722 * Purpose:
2723 * Handle page faults, including pseudo-faults
2724 * used to change the wiring status of pages.
2725 * Returns:
2726 * Explicit continuations have been removed.
2727 * Implementation:
2728 * vm_fault and vm_fault_page save mucho state
2729 * in the moral equivalent of a closure. The state
2730 * structure is allocated when first entering vm_fault
2731 * and deallocated when leaving vm_fault.
2732 */
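/*
 * Illustrative call (a sketch only, not an actual call site): a trap
 * handler resolving a user-space fault would do something along the
 * lines of
 *
 *	kr = vm_fault(map, vm_map_trunc_page(fault_addr), fault_type,
 *		      FALSE, THREAD_ABORTSAFE, NULL, 0);
 *
 * passing a NULL caller_pmap so the pmap is taken from the map itself.
 */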
2733
2734 extern int _map_enter_debug;
2735
2736 unsigned long vm_fault_collapse_total = 0;
2737 unsigned long vm_fault_collapse_skipped = 0;
2738
2739 kern_return_t
2740 vm_fault(
2741 vm_map_t map,
2742 vm_map_offset_t vaddr,
2743 vm_prot_t fault_type,
2744 boolean_t change_wiring,
2745 int interruptible,
2746 pmap_t caller_pmap,
2747 vm_map_offset_t caller_pmap_addr)
2748 {
2749 vm_map_version_t version; /* Map version for verification */
2750 boolean_t wired; /* Should mapping be wired down? */
2751 vm_object_t object; /* Top-level object */
2752 vm_object_offset_t offset; /* Top-level offset */
2753 vm_prot_t prot; /* Protection for mapping */
2754 vm_object_t old_copy_object; /* Saved copy object */
2755 vm_page_t result_page; /* Result of vm_fault_page */
2756 vm_page_t top_page; /* Placeholder page */
2757 kern_return_t kr;
2758
2759 vm_page_t m; /* Fast access to result_page */
2760 kern_return_t error_code;
2761 vm_object_t cur_object;
2762 vm_object_offset_t cur_offset;
2763 vm_page_t cur_m;
2764 vm_object_t new_object;
2765 int type_of_fault;
2766 pmap_t pmap;
2767 boolean_t interruptible_state;
2768 vm_map_t real_map = map;
2769 vm_map_t original_map = map;
2770 vm_prot_t original_fault_type;
2771 struct vm_object_fault_info fault_info;
2772 boolean_t need_collapse = FALSE;
2773 boolean_t need_retry = FALSE;
2774 int object_lock_type = 0;
2775 int cur_object_lock_type;
2776 vm_object_t top_object = VM_OBJECT_NULL;
2777 int throttle_delay;
2778
2779
2780 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2781 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2782 (int)((uint64_t)vaddr >> 32),
2783 (int)vaddr,
2784 (map == kernel_map),
2785 0,
2786 0);
2787
2788 if (get_preemption_level() != 0) {
2789 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2790 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2791 (int)((uint64_t)vaddr >> 32),
2792 (int)vaddr,
2793 KERN_FAILURE,
2794 0,
2795 0);
2796
2797 return (KERN_FAILURE);
2798 }
2799
2800 interruptible_state = thread_interrupt_level(interruptible);
2801
2802 VM_STAT_INCR(faults);
2803 current_task()->faults++;
2804 original_fault_type = fault_type;
2805
2806 if (fault_type & VM_PROT_WRITE)
2807 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2808 else
2809 object_lock_type = OBJECT_LOCK_SHARED;
2810
2811 cur_object_lock_type = OBJECT_LOCK_SHARED;
2812
2813 RetryFault:
2814 /*
2815 * assume we will hit a page in the cache
2816 * otherwise, explicitly override with
2817 * the real fault type once we determine it
2818 */
2819 type_of_fault = DBG_CACHE_HIT_FAULT;
2820
2821 /*
2822 * Find the backing store object and offset into
2823 * it to begin the search.
2824 */
2825 fault_type = original_fault_type;
2826 map = original_map;
2827 vm_map_lock_read(map);
2828
2829 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2830 object_lock_type, &version,
2831 &object, &offset, &prot, &wired,
2832 &fault_info,
2833 &real_map);
2834
2835 if (kr != KERN_SUCCESS) {
2836 vm_map_unlock_read(map);
2837 goto done;
2838 }
2839 pmap = real_map->pmap;
2840 fault_info.interruptible = interruptible;
2841 fault_info.stealth = FALSE;
2842 fault_info.io_sync = FALSE;
2843 fault_info.mark_zf_absent = FALSE;
2844 fault_info.batch_pmap_op = FALSE;
2845
2846 /*
2847 * If the page is wired, we must fault for the current protection
2848 * value, to avoid further faults.
2849 */
2850 if (wired) {
2851 fault_type = prot | VM_PROT_WRITE;
2852 /*
2853 * since we're treating this fault as a 'write',
2854 * we must hold the top object lock exclusively
2855 */
2856 if (object_lock_type == OBJECT_LOCK_SHARED) {
2857
2858 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2859
2860 if (vm_object_lock_upgrade(object) == FALSE) {
2861 /*
2862 * couldn't upgrade, so explicitly
2863 * take the lock exclusively
2864 */
2865 vm_object_lock(object);
2866 }
2867 }
2868 }
2869
2870 #if VM_FAULT_CLASSIFY
2871 /*
2872 * Temporary data gathering code
2873 */
2874 vm_fault_classify(object, offset, fault_type);
2875 #endif
2876 /*
2877 * Fast fault code. The basic idea is to do as much as
2878 * possible while holding the map lock and object locks.
2879 * Busy pages are not used until the object lock has to
2880 * be dropped to do something (copy, zero fill, pmap enter).
2881 * Similarly, paging references aren't acquired until that
2882 * point, and object references aren't used.
2883 *
2884 * If we can figure out what to do
2885 * (zero fill, copy on write, pmap enter) while holding
2886 * the locks, then it gets done. Otherwise, we give up,
2887 * and use the original fault path (which doesn't hold
2888 * the map lock, and relies on busy pages).
2889 * The give up cases include:
2890 * - Have to talk to pager.
2891 * - Page is busy, absent or in error.
2892 * - Pager has locked out desired access.
2893 * - Fault needs to be restarted.
2894 * - Have to push page into copy object.
2895 *
2896 * The code is an infinite loop that moves one level down
2897 * the shadow chain each time. cur_object and cur_offset
2898 * refer to the current object being examined. object and offset
2899 * are the original object from the map. The loop is at the
2900 * top level if and only if object and cur_object are the same.
2901 *
2902 * Invariants: Map lock is held throughout. Lock is held on
2903 * original object and cur_object (if different) when
2904 * continuing or exiting loop.
2905 *
2906 */
2907
2908
2909 /*
2910 * If this page is to be inserted in a copy delay object
2911 * for writing, and if the object has a copy, then the
2912 * copy delay strategy is implemented in the slow fault path.
2913 */
2914 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2915 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2916 goto handle_copy_delay;
2917
2918 cur_object = object;
2919 cur_offset = offset;
2920
2921 while (TRUE) {
2922 if (!cur_object->pager_created &&
2923 cur_object->phys_contiguous) /* superpage */
2924 break;
2925
2926 if (cur_object->blocked_access) {
2927 /*
2928 * Access to this VM object has been blocked.
2929 * Let the slow path handle it.
2930 */
2931 break;
2932 }
2933
2934 m = vm_page_lookup(cur_object, cur_offset);
2935
2936 if (m != VM_PAGE_NULL) {
2937 if (m->busy) {
2938 wait_result_t result;
2939
2940 /*
2941 * in order to do the PAGE_ASSERT_WAIT, we must
2942 * have object that 'm' belongs to locked exclusively
2943 */
2944 if (object != cur_object) {
2945 vm_object_unlock(object);
2946
2947 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2948
2949 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2950
2951 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2952 /*
2953 * couldn't upgrade so go do a full retry
2954 * immediately since we've already dropped
2955 * the top object lock associated with this page
2956 * and the current one got dropped due to the
2957 * failed upgrade... the state is no longer valid
2958 */
2959 vm_map_unlock_read(map);
2960 if (real_map != map)
2961 vm_map_unlock(real_map);
2962
2963 goto RetryFault;
2964 }
2965 }
2966 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2967
2968 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2969
2970 if (vm_object_lock_upgrade(object) == FALSE) {
2971 /*
2972 * couldn't upgrade, so explicitly take the lock
2973 * exclusively and go relookup the page since we
2974 * will have dropped the object lock and
2975 * a different thread could have inserted
2976 * a page at this offset
2977 * no need for a full retry since we're
2978 * at the top level of the object chain
2979 */
2980 vm_object_lock(object);
2981
2982 continue;
2983 }
2984 }
2985 vm_map_unlock_read(map);
2986 if (real_map != map)
2987 vm_map_unlock(real_map);
2988
2989 result = PAGE_ASSERT_WAIT(m, interruptible);
2990
2991 vm_object_unlock(cur_object);
2992
2993 if (result == THREAD_WAITING) {
2994 result = thread_block(THREAD_CONTINUE_NULL);
2995
2996 counter(c_vm_fault_page_block_busy_kernel++);
2997 }
2998 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2999 goto RetryFault;
3000
3001 kr = KERN_ABORTED;
3002 goto done;
3003 }
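/*
 * the page is headed to (or in) the laundry... steal it back from
 * the pageout machinery so this fault can proceed; that needs the
 * exclusive lock on the page's object, hence the upgrades below
 */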
3004 if (m->laundry) {
3005 if (object != cur_object) {
3006 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3007 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3008
3009 vm_object_unlock(object);
3010 vm_object_unlock(cur_object);
3011
3012 vm_map_unlock_read(map);
3013 if (real_map != map)
3014 vm_map_unlock(real_map);
3015
3016 goto RetryFault;
3017 }
3018
3019 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3020
3021 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3022
3023 if (vm_object_lock_upgrade(object) == FALSE) {
3024 /*
3025 * couldn't upgrade, so explicitly take the lock
3026 * exclusively and go relookup the page since we
3027 * will have dropped the object lock and
3028 * a different thread could have inserted
3029 * a page at this offset
3030 * no need for a full retry since we're
3031 * at the top level of the object chain
3032 */
3033 vm_object_lock(object);
3034
3035 continue;
3036 }
3037 }
3038 m->pageout = FALSE;
3039
3040 vm_pageout_steal_laundry(m, FALSE);
3041 }
3042
3043 if (m->phys_page == vm_page_guard_addr) {
3044 /*
3045 * Guard page: let the slow path deal with it
3046 */
3047 break;
3048 }
3049 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3050 /*
3051 * Unusual case... let the slow path deal with it
3052 */
3053 break;
3054 }
3055 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3056 if (object != cur_object)
3057 vm_object_unlock(object);
3058 vm_map_unlock_read(map);
3059 if (real_map != map)
3060 vm_map_unlock(real_map);
3061 vm_object_unlock(cur_object);
3062 kr = KERN_MEMORY_ERROR;
3063 goto done;
3064 }
3065
3066 if (m->encrypted) {
3067 /*
3068 * ENCRYPTED SWAP:
3069 * We've soft-faulted (because it's not in the page
3070 * table) on an encrypted page.
3071 * Keep the page "busy" so that no one messes with
3072 * it during the decryption.
3073 * Release the extra locks we're holding, keep only
3074 * the page's VM object lock.
3075 *
3076 * in order to set 'busy' on 'm', we must
3077 * have object that 'm' belongs to locked exclusively
3078 */
3079 if (object != cur_object) {
3080 vm_object_unlock(object);
3081
3082 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3083
3084 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3085
3086 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3087 /*
3088 * couldn't upgrade so go do a full retry
3089 * immediately since we've already dropped
3090 * the top object lock associated with this page
3091 * and the current one got dropped due to the
3092 * failed upgrade... the state is no longer valid
3093 */
3094 vm_map_unlock_read(map);
3095 if (real_map != map)
3096 vm_map_unlock(real_map);
3097
3098 goto RetryFault;
3099 }
3100 }
3101 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3102
3103 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3104
3105 if (vm_object_lock_upgrade(object) == FALSE) {
3106 /*
3107 * couldn't upgrade, so explicitly take the lock
3108 * exclusively and go relookup the page since we
3109 * will have dropped the object lock and
3110 * a different thread could have inserted
3111 * a page at this offset
3112 * no need for a full retry since we're
3113 * at the top level of the object chain
3114 */
3115 vm_object_lock(object);
3116
3117 continue;
3118 }
3119 }
3120 m->busy = TRUE;
3121
3122 vm_map_unlock_read(map);
3123 if (real_map != map)
3124 vm_map_unlock(real_map);
3125
3126 vm_page_decrypt(m, 0);
3127
3128 assert(m->busy);
3129 PAGE_WAKEUP_DONE(m);
3130
3131 vm_object_unlock(cur_object);
3132 /*
3133 * Retry from the top, in case anything
3134 * changed while we were decrypting...
3135 */
3136 goto RetryFault;
3137 }
3138 ASSERT_PAGE_DECRYPTED(m);
3139
3140 if(vm_page_is_slideable(m)) {
3141 /*
3142 * We might need to slide this page, and so,
3143 * we want to hold the VM object exclusively.
3144 */
3145 if (object != cur_object) {
3146 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3147 vm_object_unlock(object);
3148 vm_object_unlock(cur_object);
3149
3150 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3151
3152 vm_map_unlock_read(map);
3153 if (real_map != map)
3154 vm_map_unlock(real_map);
3155
3156 goto RetryFault;
3157 }
3158 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3159
3160 vm_object_unlock(object);
3161 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3162 vm_map_unlock_read(map);
3163 goto RetryFault;
3164 }
3165 }
3166
3167 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3168 upgrade_for_validation:
3169 /*
3170 * We might need to validate this page
3171 * against its code signature, so we
3172 * want to hold the VM object exclusively.
3173 */
3174 if (object != cur_object) {
3175 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3176 vm_object_unlock(object);
3177 vm_object_unlock(cur_object);
3178
3179 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3180
3181 vm_map_unlock_read(map);
3182 if (real_map != map)
3183 vm_map_unlock(real_map);
3184
3185 goto RetryFault;
3186 }
3187
3188 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3189
3190 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3191
3192 if (vm_object_lock_upgrade(object) == FALSE) {
3193 /*
3194 * couldn't upgrade, so explicitly take the lock
3195 * exclusively and go relookup the page since we
3196 * will have dropped the object lock and
3197 * a different thread could have inserted
3198 * a page at this offset
3199 * no need for a full retry since we're
3200 * at the top level of the object chain
3201 */
3202 vm_object_lock(object);
3203
3204 continue;
3205 }
3206 }
3207 }
3208 /*
3209 * Two cases of map in faults:
3210 * - At top level w/o copy object.
3211 * - Read fault anywhere.
3212 * --> must disallow write.
3213 */
3214
3215 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3216
3217 goto FastPmapEnter;
3218 }
3219
3220 if ((fault_type & VM_PROT_WRITE) == 0) {
3221
3222 if (object != cur_object) {
3223 /*
3224 * We still need to hold the top object
3225 * lock here to prevent a race between
3226 * a read fault (taking only "shared"
3227 * locks) and a write fault (taking
3228 * an "exclusive" lock on the top
3229 * object).
3230 * Otherwise, as soon as we release the
3231 * top lock, the write fault could
3232 * proceed and actually complete before
3233 * the read fault, and the copied page's
3234 * translation could then be overwritten
3235 * by the read fault's translation for
3236 * the original page.
3237 *
3238 * Let's just record what the top object
3239 * is and we'll release it later.
3240 */
3241 top_object = object;
3242
3243 /*
3244 * switch to the object that has the new page
3245 */
3246 object = cur_object;
3247 object_lock_type = cur_object_lock_type;
3248 }
3249 FastPmapEnter:
3250 /*
3251 * prepare for the pmap_enter...
3252 * object and map are both locked
3253 * m contains valid data
3254 * object == m->object
3255 * cur_object == NULL or it's been unlocked
3256 * no paging references on either object or cur_object
3257 */
3258 if (caller_pmap) {
3259 kr = vm_fault_enter(m,
3260 caller_pmap,
3261 caller_pmap_addr,
3262 prot,
3263 fault_type,
3264 wired,
3265 change_wiring,
3266 fault_info.no_cache,
3267 fault_info.cs_bypass,
3268 (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3269 &type_of_fault);
3270 } else {
3271 kr = vm_fault_enter(m,
3272 pmap,
3273 vaddr,
3274 prot,
3275 fault_type,
3276 wired,
3277 change_wiring,
3278 fault_info.no_cache,
3279 fault_info.cs_bypass,
3280 (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3281 &type_of_fault);
3282 }
3283
3284 if (top_object != VM_OBJECT_NULL) {
3285 /*
3286 * It's safe to drop the top object
3287 * now that we've done our
3288 * vm_fault_enter(). Any other fault
3289 * in progress for that virtual
3290 * address will either find our page
3291 * and translation or put in a new page
3292 * and translation.
3293 */
3294 vm_object_unlock(top_object);
3295 top_object = VM_OBJECT_NULL;
3296 }
3297
3298 if (need_collapse == TRUE)
3299 vm_object_collapse(object, offset, TRUE);
3300
3301 if (need_retry == FALSE &&
3302 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3303 /*
3304 * evaluate access pattern and update state
3305 * vm_fault_deactivate_behind depends on the
3306 * state being up to date
3307 */
3308 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3309
3310 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3311 }
3312 /*
3313 * That's it, clean up and return.
3314 */
3315 if (m->busy)
3316 PAGE_WAKEUP_DONE(m);
3317
3318 vm_object_unlock(object);
3319
3320 vm_map_unlock_read(map);
3321 if (real_map != map)
3322 vm_map_unlock(real_map);
3323
3324 if (need_retry == TRUE) {
3325 /*
3326 * vm_fault_enter couldn't complete the PMAP_ENTER...
3327 * at this point we don't hold any locks so it's safe
3328 * to ask the pmap layer to expand the page table to
3329 * accommodate this mapping... once expanded, we'll
3330 * re-drive the fault which should result in vm_fault_enter
3331 * being able to successfully enter the mapping this time around
3332 */
3333 (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER);
3334
3335 need_retry = FALSE;
3336 goto RetryFault;
3337 }
3338 goto done;
3339 }
3340 /*
3341 * COPY ON WRITE FAULT
3342 */
3343 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3344
3345 if ((throttle_delay = vm_page_throttled())) {
3346 /*
3347 * drop all of our locks...
3348 * wait until the free queue is
3349 * pumped back up and then
3350 * redrive the fault
3351 */
3352 if (object != cur_object)
3353 vm_object_unlock(cur_object);
3354 vm_object_unlock(object);
3355 vm_map_unlock_read(map);
3356 if (real_map != map)
3357 vm_map_unlock(real_map);
3358
3359 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3360
3361 delay(throttle_delay);
3362
3363 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3364 THREAD_UNINT :
3365 THREAD_ABORTSAFE))
3366 goto RetryFault;
3367 kr = KERN_ABORTED;
3368 goto done;
3369 }
3370 /*
3371 * If objects match, then
3372 * object->copy must not be NULL (else control
3373 * would be in the previous code block), and we
3374 * have a potential push into the copy object
3375 * which we can't cope with here.
3376 */
3377 if (cur_object == object) {
3378 /*
3379 * must take the slow path to
3380 * deal with the copy push
3381 */
3382 break;
3383 }
3384
3385 /*
3386 * This is now a shadow based copy on write
3387 * fault -- it requires a copy up the shadow
3388 * chain.
3389 */
3390
3391 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3392 VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3393 goto upgrade_for_validation;
3394 }
3395
3396 /*
3397 * Allocate a page in the original top level
3398 * object. Give up if allocate fails. Also
3399 * need to remember current page, as it's the
3400 * source of the copy.
3401 *
3402 * at this point we hold locks on both
3403 * object and cur_object... no need to take
3404 * paging refs or mark pages BUSY since
3405 * we don't drop either object lock until
3406 * the page has been copied and inserted
3407 */
3408 cur_m = m;
3409 m = vm_page_grab();
3410
3411 if (m == VM_PAGE_NULL) {
3412 /*
3413 * no free page currently available...
3414 * must take the slow path
3415 */
3416 break;
3417 }
3418 /*
3419 * Now do the copy (no need to mark the source page busy here)...
3420 *
3421 * NOTE: This code holds the map lock across
3422 * the page copy.
3423 */
3424 vm_page_copy(cur_m, m);
3425 vm_page_insert(m, object, offset);
3426 SET_PAGE_DIRTY(m, FALSE);
3427
3428 /*
3429 * Now cope with the source page and object
3430 */
3431 if (object->ref_count > 1 && cur_m->pmapped)
3432 pmap_disconnect(cur_m->phys_page);
3433
3434 need_collapse = TRUE;
3435
3436 if (!cur_object->internal &&
3437 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3438 /*
3439 * The object from which we've just
3440 * copied a page is most probably backed
3441 * by a vnode. We don't want to waste too
3442 * much time trying to collapse the VM objects
3443 * and create a bottleneck when several tasks
3444 * map the same file.
3445 */
3446 if (cur_object->copy == object) {
3447 /*
3448 * Shared mapping or no COW yet.
3449 * We can never collapse a copy
3450 * object into its backing object.
3451 */
3452 need_collapse = FALSE;
3453 } else if (cur_object->copy == object->shadow &&
3454 object->shadow->resident_page_count == 0) {
3455 /*
3456 * Shared mapping after a COW occurred.
3457 */
3458 need_collapse = FALSE;
3459 }
3460 }
3461 vm_object_unlock(cur_object);
3462
3463 if (need_collapse == FALSE)
3464 vm_fault_collapse_skipped++;
3465 vm_fault_collapse_total++;
3466
3467 type_of_fault = DBG_COW_FAULT;
3468 VM_STAT_INCR(cow_faults);
3469 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3470 current_task()->cow_faults++;
3471
3472 goto FastPmapEnter;
3473
3474 } else {
3475 /*
3476 * No page at cur_object, cur_offset... m == NULL
3477 */
3478 if (cur_object->pager_created) {
3479 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3480 /*
3481 * May have to talk to a pager...
3482 * take the slow path.
3483 */
3484 break;
3485 }
3486 /*
3487 * existence map present and indicates
3488 * that the pager doesn't have this page
3489 */
3490 }
3491 if (cur_object->shadow == VM_OBJECT_NULL) {
3492 /*
3493 * Zero fill fault. Page gets
3494 * inserted into the original object.
3495 */
3496 if (cur_object->shadow_severed ||
3497 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3498 {
3499 if (object != cur_object)
3500 vm_object_unlock(cur_object);
3501 vm_object_unlock(object);
3502
3503 vm_map_unlock_read(map);
3504 if (real_map != map)
3505 vm_map_unlock(real_map);
3506
3507 kr = KERN_MEMORY_ERROR;
3508 goto done;
3509 }
3510 if ((throttle_delay = vm_page_throttled())) {
3511 /*
3512 * drop all of our locks...
3513 * wait until the free queue is
3514 * pumped back up and then
3515 * redrive the fault
3516 */
3517 if (object != cur_object)
3518 vm_object_unlock(cur_object);
3519 vm_object_unlock(object);
3520 vm_map_unlock_read(map);
3521 if (real_map != map)
3522 vm_map_unlock(real_map);
3523
3524 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3525
3526 delay(throttle_delay);
3527
3528 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3529 THREAD_UNINT :
3530 THREAD_ABORTSAFE))
3531 goto RetryFault;
3532 kr = KERN_ABORTED;
3533 goto done;
3534 }
3535 if (vm_backing_store_low) {
3536 /*
3537 * we are protecting the system from
3538 * backing store exhaustion...
3539 * must take the slow path if we're
3540 * not privileged
3541 */
3542 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3543 break;
3544 }
3545 if (cur_object != object) {
3546 vm_object_unlock(cur_object);
3547
3548 cur_object = object;
3549 }
3550 if (object_lock_type == OBJECT_LOCK_SHARED) {
3551
3552 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3553
3554 if (vm_object_lock_upgrade(object) == FALSE) {
3555 /*
3556 * couldn't upgrade so do a full retry on the fault
3557 * since we dropped the object lock which
3558 * could allow another thread to insert
3559 * a page at this offset
3560 */
3561 vm_map_unlock_read(map);
3562 if (real_map != map)
3563 vm_map_unlock(real_map);
3564
3565 goto RetryFault;
3566 }
3567 }
3568 m = vm_page_alloc(object, offset);
3569
3570 if (m == VM_PAGE_NULL) {
3571 /*
3572 * no free page currently available...
3573 * must take the slow path
3574 */
3575 break;
3576 }
3577
3578 /*
3579 * Now zero fill page...
3580 * the page is probably going to
3581 * be written soon, so don't bother
3582 * to clear the modified bit
3583 *
3584 * NOTE: This code holds the map
3585 * lock across the zero fill.
3586 */
3587 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
3588
3589 goto FastPmapEnter;
3590 }
3591 /*
3592 * On to the next level in the shadow chain
3593 */
3594 cur_offset += cur_object->vo_shadow_offset;
3595 new_object = cur_object->shadow;
3596
3597 /*
3598 * take the new_object's lock with the indicated state
3599 */
3600 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3601 vm_object_lock_shared(new_object);
3602 else
3603 vm_object_lock(new_object);
3604
3605 if (cur_object != object)
3606 vm_object_unlock(cur_object);
3607
3608 cur_object = new_object;
3609
3610 continue;
3611 }
3612 }
3613 /*
3614 * Cleanup from fast fault failure. Drop any object
3615 * lock other than original and drop map lock.
3616 */
3617 if (object != cur_object)
3618 vm_object_unlock(cur_object);
3619
3620 /*
3621 * must own the object lock exclusively at this point
3622 */
3623 if (object_lock_type == OBJECT_LOCK_SHARED) {
3624 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3625
3626 if (vm_object_lock_upgrade(object) == FALSE) {
3627 /*
3628 * couldn't upgrade, so explicitly
3629 * take the lock exclusively
3630 * no need to retry the fault at this
3631 * point since "vm_fault_page" will
3632 * completely re-evaluate the state
3633 */
3634 vm_object_lock(object);
3635 }
3636 }
3637
3638 handle_copy_delay:
3639 vm_map_unlock_read(map);
3640 if (real_map != map)
3641 vm_map_unlock(real_map);
3642
3643 /*
3644 * Make a reference to this object to
3645 * prevent its disposal while we are messing with
3646 * it. Once we have the reference, the map is free
3647 * to be diddled. Since objects reference their
3648 * shadows (and copies), they will stay around as well.
3649 */
3650 vm_object_reference_locked(object);
3651 vm_object_paging_begin(object);
3652
3653 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
3654
3655 error_code = 0;
3656
3657 kr = vm_fault_page(object, offset, fault_type,
3658 (change_wiring && !wired),
3659 &prot, &result_page, &top_page,
3660 &type_of_fault,
3661 &error_code, map->no_zero_fill,
3662 FALSE, &fault_info);
3663
3664 /*
3665 * if kr != VM_FAULT_SUCCESS, then the paging reference
3666 * has been dropped and the object unlocked... the ref_count
3667 * is still held
3668 *
3669 * if kr == VM_FAULT_SUCCESS, then the paging reference
3670 * is still held along with the ref_count on the original object
3671 *
3672 * the object is returned locked with a paging reference
3673 *
3674 * if top_page != NULL, then it's BUSY and the
3675 * object it belongs to has a paging reference
3676 * but is returned unlocked
3677 */
3678 if (kr != VM_FAULT_SUCCESS &&
3679 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
3680 /*
3681 * we didn't succeed, lose the object reference immediately.
3682 */
3683 vm_object_deallocate(object);
3684
3685 /*
3686 * See why we failed, and take corrective action.
3687 */
3688 switch (kr) {
3689 case VM_FAULT_MEMORY_SHORTAGE:
3690 if (vm_page_wait((change_wiring) ?
3691 THREAD_UNINT :
3692 THREAD_ABORTSAFE))
3693 goto RetryFault;
3694 /*
3695 * fall thru
3696 */
3697 case VM_FAULT_INTERRUPTED:
3698 kr = KERN_ABORTED;
3699 goto done;
3700 case VM_FAULT_RETRY:
3701 goto RetryFault;
3702 case VM_FAULT_MEMORY_ERROR:
3703 if (error_code)
3704 kr = error_code;
3705 else
3706 kr = KERN_MEMORY_ERROR;
3707 goto done;
3708 default:
3709 panic("vm_fault: unexpected error 0x%x from "
3710 "vm_fault_page()\n", kr);
3711 }
3712 }
3713 m = result_page;
3714
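/*
 * when merely unwiring (change_wiring && !wired) vm_fault_page returns
 * no placeholder page; otherwise top_page is non-NULL exactly when the
 * result page came from an object further down the shadow chain
 */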
3715 if (m != VM_PAGE_NULL) {
3716 assert((change_wiring && !wired) ?
3717 (top_page == VM_PAGE_NULL) :
3718 ((top_page == VM_PAGE_NULL) == (m->object == object)));
3719 }
3720
3721 /*
3722 * What to do with the resulting page from vm_fault_page
3723 * if it doesn't get entered into the physical map:
3724 */
3725 #define RELEASE_PAGE(m) \
3726 MACRO_BEGIN \
3727 PAGE_WAKEUP_DONE(m); \
3728 if (!m->active && !m->inactive && !m->throttled) { \
3729 vm_page_lockspin_queues(); \
3730 if (!m->active && !m->inactive && !m->throttled) \
3731 vm_page_activate(m); \
3732 vm_page_unlock_queues(); \
3733 } \
3734 MACRO_END
3735
3736 /*
3737 * We must verify that the maps have not changed
3738 * since our last lookup.
3739 */
3740 if (m != VM_PAGE_NULL) {
3741 old_copy_object = m->object->copy;
3742 vm_object_unlock(m->object);
3743 } else {
3744 old_copy_object = VM_OBJECT_NULL;
3745 vm_object_unlock(object);
3746 }
3747
3748 /*
3749 * no object locks are held at this point
3750 */
3751 if ((map != original_map) || !vm_map_verify(map, &version)) {
3752 vm_object_t retry_object;
3753 vm_object_offset_t retry_offset;
3754 vm_prot_t retry_prot;
3755
3756 /*
3757 * To avoid trying to write_lock the map while another
3758 * thread has it read_locked (in vm_map_pageable), we
3759 * do not try for write permission. If the page is
3760 * still writable, we will get write permission. If it
3761 * is not, or has been marked needs_copy, we enter the
3762 * mapping without write permission, and will merely
3763 * take another fault.
3764 */
3765 map = original_map;
3766 vm_map_lock_read(map);
3767
3768 kr = vm_map_lookup_locked(&map, vaddr,
3769 fault_type & ~VM_PROT_WRITE,
3770 OBJECT_LOCK_EXCLUSIVE, &version,
3771 &retry_object, &retry_offset, &retry_prot,
3772 &wired,
3773 &fault_info,
3774 &real_map);
3775 pmap = real_map->pmap;
3776
3777 if (kr != KERN_SUCCESS) {
3778 vm_map_unlock_read(map);
3779
3780 if (m != VM_PAGE_NULL) {
3781 /*
3782 * retake the lock so that
3783 * we can drop the paging reference
3784 * in vm_fault_cleanup and do the
3785 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3786 */
3787 vm_object_lock(m->object);
3788
3789 RELEASE_PAGE(m);
3790
3791 vm_fault_cleanup(m->object, top_page);
3792 } else {
3793 /*
3794 * retake the lock so that
3795 * we can drop the paging reference
3796 * in vm_fault_cleanup
3797 */
3798 vm_object_lock(object);
3799
3800 vm_fault_cleanup(object, top_page);
3801 }
3802 vm_object_deallocate(object);
3803
3804 goto done;
3805 }
3806 vm_object_unlock(retry_object);
3807
3808 if ((retry_object != object) || (retry_offset != offset)) {
3809
3810 vm_map_unlock_read(map);
3811 if (real_map != map)
3812 vm_map_unlock(real_map);
3813
3814 if (m != VM_PAGE_NULL) {
3815 /*
3816 * retake the lock so that
3817 * we can drop the paging reference
3818 * in vm_fault_cleanup and do the
3819 * PAGE_WAKEUP_DONE in RELEASE_PAGE
3820 */
3821 vm_object_lock(m->object);
3822
3823 RELEASE_PAGE(m);
3824
3825 vm_fault_cleanup(m->object, top_page);
3826 } else {
3827 /*
3828 * retake the lock so that
3829 * we can drop the paging reference
3830 * in vm_fault_cleanup
3831 */
3832 vm_object_lock(object);
3833
3834 vm_fault_cleanup(object, top_page);
3835 }
3836 vm_object_deallocate(object);
3837
3838 goto RetryFault;
3839 }
3840 /*
3841 * Check whether the protection has changed or the object
3842 * has been copied while we left the map unlocked.
3843 */
3844 prot &= retry_prot;
3845 }
3846 if (m != VM_PAGE_NULL) {
3847 vm_object_lock(m->object);
3848
3849 if (m->object->copy != old_copy_object) {
3850 /*
3851 * The copy object changed while the top-level object
3852 * was unlocked, so take away write permission.
3853 */
3854 prot &= ~VM_PROT_WRITE;
3855 }
3856 } else
3857 vm_object_lock(object);
3858
3859 /*
3860 * If we want to wire down this page, but no longer have
3861 * adequate permissions, we must start all over.
3862 */
3863 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3864
3865 vm_map_verify_done(map, &version);
3866 if (real_map != map)
3867 vm_map_unlock(real_map);
3868
3869 if (m != VM_PAGE_NULL) {
3870 RELEASE_PAGE(m);
3871
3872 vm_fault_cleanup(m->object, top_page);
3873 } else
3874 vm_fault_cleanup(object, top_page);
3875
3876 vm_object_deallocate(object);
3877
3878 goto RetryFault;
3879 }
3880 if (m != VM_PAGE_NULL) {
3881 /*
3882 * Put this page into the physical map.
3883 * We had to do the unlock above because pmap_enter
3884 * may cause other faults. The page may be on
3885 * the pageout queues. If the pageout daemon comes
3886 * across the page, it will remove it from the queues.
3887 */
3888 if (caller_pmap) {
3889 kr = vm_fault_enter(m,
3890 caller_pmap,
3891 caller_pmap_addr,
3892 prot,
3893 fault_type,
3894 wired,
3895 change_wiring,
3896 fault_info.no_cache,
3897 fault_info.cs_bypass,
3898 NULL,
3899 &type_of_fault);
3900 } else {
3901 kr = vm_fault_enter(m,
3902 pmap,
3903 vaddr,
3904 prot,
3905 fault_type,
3906 wired,
3907 change_wiring,
3908 fault_info.no_cache,
3909 fault_info.cs_bypass,
3910 NULL,
3911 &type_of_fault);
3912 }
3913 if (kr != KERN_SUCCESS) {
3914 /* abort this page fault */
3915 vm_map_verify_done(map, &version);
3916 if (real_map != map)
3917 vm_map_unlock(real_map);
3918 PAGE_WAKEUP_DONE(m);
3919 vm_fault_cleanup(m->object, top_page);
3920 vm_object_deallocate(object);
3921 goto done;
3922 }
3923 } else {
3924
3925 vm_map_entry_t entry;
3926 vm_map_offset_t laddr;
3927 vm_map_offset_t ldelta, hdelta;
3928
3929 /*
3930 * do a pmap block mapping from the physical address
3931 * in the object
3932 */
3933
3934 #ifdef ppc
3935 /* While we do not worry about execution protection in */
3936 /* general, certain pages may have instruction execution */
3937 /* disallowed. We will check here, and if not allowed */
3938 /* to execute, we return with a protection failure. */
3939
3940 if ((fault_type & VM_PROT_EXECUTE) &&
3941 (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
3942
3943 vm_map_verify_done(map, &version);
3944
3945 if (real_map != map)
3946 vm_map_unlock(real_map);
3947
3948 vm_fault_cleanup(object, top_page);
3949 vm_object_deallocate(object);
3950
3951 kr = KERN_PROTECTION_FAILURE;
3952 goto done;
3953 }
3954 #endif /* ppc */
3955
3956 if (real_map != map)
3957 vm_map_unlock(real_map);
3958
3959 if (original_map != map) {
3960 vm_map_unlock_read(map);
3961 vm_map_lock_read(original_map);
3962 map = original_map;
3963 }
3964 real_map = map;
3965
3966 laddr = vaddr;
3967 hdelta = 0xFFFFF000;
3968 ldelta = 0xFFFFF000;
3969
3970 while (vm_map_lookup_entry(map, laddr, &entry)) {
3971 if (ldelta > (laddr - entry->vme_start))
3972 ldelta = laddr - entry->vme_start;
3973 if (hdelta > (entry->vme_end - laddr))
3974 hdelta = entry->vme_end - laddr;
3975 if (entry->is_sub_map) {
3976
3977 laddr = (laddr - entry->vme_start)
3978 + entry->offset;
3979 vm_map_lock_read(entry->object.sub_map);
3980
3981 if (map != real_map)
3982 vm_map_unlock_read(map);
3983 if (entry->use_pmap) {
3984 vm_map_unlock_read(real_map);
3985 real_map = entry->object.sub_map;
3986 }
3987 map = entry->object.sub_map;
3988
3989 } else {
3990 break;
3991 }
3992 }
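 /*
  * Worked example (illustrative numbers only): for a plain, non-submap
  * entry spanning [0x10000000, 0x11000000) and vaddr == 0x10400000, the
  * loop above leaves ldelta == 0x400000 and hdelta == 0xc00000, so the
  * block mapping set up below covers [vaddr - ldelta, vaddr + hdelta),
  * i.e. the whole entry; the 0xFFFFF000 initial values simply cap the
  * span when the entry is larger than that.
  */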
3993
3994 if (vm_map_lookup_entry(map, laddr, &entry) &&
3995 (entry->object.vm_object != NULL) &&
3996 (entry->object.vm_object == object)) {
3997
3998 int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
3999 if (caller_pmap) {
4000 /*
4001 * Set up a block mapped area
4002 */
4003 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4004 pmap_map_block(caller_pmap,
4005 (addr64_t)(caller_pmap_addr - ldelta),
4006 (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4007 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4008 (uint32_t)((ldelta + hdelta) >> 12), prot,
4009 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4010 } else {
4011 /*
4012 * Set up a block mapped area
4013 */
4014 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4015 pmap_map_block(real_map->pmap,
4016 (addr64_t)(vaddr - ldelta),
4017 (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4018 entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4019 (uint32_t)((ldelta + hdelta) >> 12), prot,
4020 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4021 }
4022 }
4023 }
4024
4025 /*
4026 * Unlock everything, and return
4027 */
4028 vm_map_verify_done(map, &version);
4029 if (real_map != map)
4030 vm_map_unlock(real_map);
4031
4032 if (m != VM_PAGE_NULL) {
4033 PAGE_WAKEUP_DONE(m);
4034
4035 vm_fault_cleanup(m->object, top_page);
4036 } else
4037 vm_fault_cleanup(object, top_page);
4038
4039 vm_object_deallocate(object);
4040
4041 #undef RELEASE_PAGE
4042
4043 kr = KERN_SUCCESS;
4044 done:
4045 thread_interrupt_level(interruptible_state);
4046
4047 throttle_lowpri_io(TRUE);
4048
4049 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4050 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4051 (int)((uint64_t)vaddr >> 32),
4052 (int)vaddr,
4053 kr,
4054 type_of_fault,
4055 0);
4056
4057 return (kr);
4058 }
4059
4060 /*
4061 * vm_fault_wire:
4062 *
4063 * Wire down a range of virtual addresses in a map.
4064 */
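 /*
  * Caller sketch (illustrative, not lifted from an actual call site):
  * a wiring path that has already marked the entry "in_transition" and
  * chosen the pmap to wire into would do something like
  *
  *	rc = vm_fault_wire(map, entry, map->pmap, entry->vme_start);
  *	if (rc != KERN_SUCCESS)
  *		... undo the wiring bookkeeping for this entry ...
  *
  * Using the map's own pmap and the entry's start address is only an
  * assumption for the sketch; callers may pass a different
  * pmap/pmap_addr pair (e.g. when submaps are involved).
  */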
4065 kern_return_t
4066 vm_fault_wire(
4067 vm_map_t map,
4068 vm_map_entry_t entry,
4069 pmap_t pmap,
4070 vm_map_offset_t pmap_addr)
4071 {
4072
4073 register vm_map_offset_t va;
4074 register vm_map_offset_t end_addr = entry->vme_end;
4075 register kern_return_t rc;
4076
4077 assert(entry->in_transition);
4078
4079 if ((entry->object.vm_object != NULL) &&
4080 !entry->is_sub_map &&
4081 entry->object.vm_object->phys_contiguous) {
4082 return KERN_SUCCESS;
4083 }
4084
4085 /*
4086 * Inform the physical mapping system that the
4087 * range of addresses may not fault, so that
4088 * page tables and such can be locked down as well.
4089 */
4090
4091 pmap_pageable(pmap, pmap_addr,
4092 pmap_addr + (end_addr - entry->vme_start), FALSE);
4093
4094 /*
4095 * We simulate a fault to get the page and enter it
4096 * in the physical map.
4097 */
4098
4099 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4100 if ((rc = vm_fault_wire_fast(
4101 map, va, entry, pmap,
4102 pmap_addr + (va - entry->vme_start)
4103 )) != KERN_SUCCESS) {
4104 rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
4105 (pmap == kernel_pmap) ?
4106 THREAD_UNINT : THREAD_ABORTSAFE,
4107 pmap, pmap_addr + (va - entry->vme_start));
4108 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4109 }
4110
4111 if (rc != KERN_SUCCESS) {
4112 struct vm_map_entry tmp_entry = *entry;
4113
4114 /* unwire wired pages */
4115 tmp_entry.vme_end = va;
4116 vm_fault_unwire(map,
4117 &tmp_entry, FALSE, pmap, pmap_addr);
4118
4119 return rc;
4120 }
4121 }
4122 return KERN_SUCCESS;
4123 }
4124
4125 /*
4126 * vm_fault_unwire:
4127 *
4128 * Unwire a range of virtual addresses in a map.
4129 */
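 /*
  * The "deallocate" flag selects between merely unwiring the pages
  * (deallocate == FALSE) and disconnecting and freeing them outright
  * (deallocate == TRUE), as can be seen in the loop below.  A caller
  * undoing a plain wire would typically look like (sketch only):
  *
  *	vm_fault_unwire(map, entry, FALSE, map->pmap, entry->vme_start);
  */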
4130 void
4131 vm_fault_unwire(
4132 vm_map_t map,
4133 vm_map_entry_t entry,
4134 boolean_t deallocate,
4135 pmap_t pmap,
4136 vm_map_offset_t pmap_addr)
4137 {
4138 register vm_map_offset_t va;
4139 register vm_map_offset_t end_addr = entry->vme_end;
4140 vm_object_t object;
4141 struct vm_object_fault_info fault_info;
4142
4143 object = (entry->is_sub_map)
4144 ? VM_OBJECT_NULL : entry->object.vm_object;
4145
4146 /*
4147 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4148 * do anything since such memory is wired by default. So we don't have
4149 * anything to undo here.
4150 */
4151
4152 if (object != VM_OBJECT_NULL && object->phys_contiguous)
4153 return;
4154
4155 fault_info.interruptible = THREAD_UNINT;
4156 fault_info.behavior = entry->behavior;
4157 fault_info.user_tag = entry->alias;
4158 fault_info.lo_offset = entry->offset;
4159 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4160 fault_info.no_cache = entry->no_cache;
4161 fault_info.stealth = TRUE;
4162 fault_info.io_sync = FALSE;
4163 fault_info.cs_bypass = FALSE;
4164 fault_info.mark_zf_absent = FALSE;
4165 fault_info.batch_pmap_op = FALSE;
4166
4167 /*
4168 * Since the pages are wired down, we must be able to
4169 * get their mappings from the physical map system.
4170 */
4171
4172 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4173
4174 if (object == VM_OBJECT_NULL) {
4175 if (pmap) {
4176 pmap_change_wiring(pmap,
4177 pmap_addr + (va - entry->vme_start), FALSE);
4178 }
4179 (void) vm_fault(map, va, VM_PROT_NONE,
4180 TRUE, THREAD_UNINT, pmap, pmap_addr);
4181 } else {
4182 vm_prot_t prot;
4183 vm_page_t result_page;
4184 vm_page_t top_page;
4185 vm_object_t result_object;
4186 vm_fault_return_t result;
4187
4188 if (end_addr - va > (vm_size_t) -1) {
4189 /* 32-bit overflow */
4190 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4191 } else {
4192 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4193 assert(fault_info.cluster_size == end_addr - va);
4194 }
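 /*
  * Note: (vm_size_t)(0 - PAGE_SIZE) relies on unsigned wrap-around and
  * yields the largest page-aligned value a vm_size_t can hold (e.g.
  * 0xFFFFF000 when vm_size_t is 32 bits), i.e. "as large a cluster as
  * can be expressed" when the remaining range does not fit.
  */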
4195
4196 do {
4197 prot = VM_PROT_NONE;
4198
4199 vm_object_lock(object);
4200 vm_object_paging_begin(object);
4201 XPR(XPR_VM_FAULT,
4202 "vm_fault_unwire -> vm_fault_page\n",
4203 0,0,0,0,0);
4204 result = vm_fault_page(
4205 object,
4206 entry->offset + (va - entry->vme_start),
4207 VM_PROT_NONE, TRUE,
4208 &prot, &result_page, &top_page,
4209 (int *)0,
4210 NULL, map->no_zero_fill,
4211 FALSE, &fault_info);
4212 } while (result == VM_FAULT_RETRY);
4213
4214 /*
4215 * If this was a mapping to a file on a device that has been forcibly
4216 * unmounted, then we won't get a page back from vm_fault_page(). Just
4217 * move on to the next one in case the remaining pages are mapped from
4218 * different objects. During a forced unmount, the object is terminated
4219 * so the alive flag will be false if this happens. A forced unmount will
4220 * occur when an external disk is unplugged before the user does an
4221 * eject, so we don't want to panic in that situation.
4222 */
4223
4224 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4225 continue;
4226
4227 if (result != VM_FAULT_SUCCESS)
4228 panic("vm_fault_unwire: failure");
4229
4230 result_object = result_page->object;
4231
4232 if (deallocate) {
4233 assert(result_page->phys_page !=
4234 vm_page_fictitious_addr);
4235 pmap_disconnect(result_page->phys_page);
4236 VM_PAGE_FREE(result_page);
4237 } else {
4238 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4239 pmap_change_wiring(pmap,
4240 pmap_addr + (va - entry->vme_start), FALSE);
4241
4242
4243 if (VM_PAGE_WIRED(result_page)) {
4244 vm_page_lockspin_queues();
4245 vm_page_unwire(result_page, TRUE);
4246 vm_page_unlock_queues();
4247 }
4248 if(entry->zero_wired_pages) {
4249 pmap_zero_page(result_page->phys_page);
4250 entry->zero_wired_pages = FALSE;
4251 }
4252
4253 PAGE_WAKEUP_DONE(result_page);
4254 }
4255 vm_fault_cleanup(result_object, top_page);
4256 }
4257 }
4258
4259 /*
4260 * Inform the physical mapping system that the range
4261 * of addresses may fault, so that page tables and
4262 * such may be unwired themselves.
4263 */
4264
4265 pmap_pageable(pmap, pmap_addr,
4266 pmap_addr + (end_addr - entry->vme_start), TRUE);
4267
4268 }
4269
4270 /*
4271 * vm_fault_wire_fast:
4272 *
4273 * Handle the common case of a wire-down page fault at the given address.
4274 * If successful, the page is inserted into the associated physical map.
4275 * The map entry is passed in to avoid the overhead of a map lookup.
4276 *
4277 * NOTE: the given address should be truncated to the
4278 * proper page address.
4279 *
4280 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
4281 * a standard error specifying why the fault is fatal is returned.
4282 *
4283 * The map in question must be referenced, and remains so.
4284 * Caller has a read lock on the map.
4285 *
4286 * This is a stripped version of vm_fault() for wiring pages. Anything
4287 * other than the common case will return KERN_FAILURE, and the caller
4288 * is expected to call vm_fault().
4289 */
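 /*
  * The expected calling pattern is the one used by vm_fault_wire()
  * above, sketched here:
  *
  *	if ((rc = vm_fault_wire_fast(map, va, entry, pmap, pmap_addr))
  *	    != KERN_SUCCESS)
  *		rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
  *			      interruptible, pmap, pmap_addr);
  *
  * i.e. KERN_FAILURE from the fast path is not an error, merely a
  * request to take the general path (the real loop above also picks
  * the interruptibility based on the pmap).
  */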
4290 kern_return_t
4291 vm_fault_wire_fast(
4292 __unused vm_map_t map,
4293 vm_map_offset_t va,
4294 vm_map_entry_t entry,
4295 pmap_t pmap,
4296 vm_map_offset_t pmap_addr)
4297 {
4298 vm_object_t object;
4299 vm_object_offset_t offset;
4300 register vm_page_t m;
4301 vm_prot_t prot;
4302 thread_t thread = current_thread();
4303 int type_of_fault;
4304 kern_return_t kr;
4305
4306 VM_STAT_INCR(faults);
4307
4308 if (thread != THREAD_NULL && thread->task != TASK_NULL)
4309 thread->task->faults++;
4310
4311 /*
4312 * Recovery actions
4313 */
4314
4315 #undef RELEASE_PAGE
4316 #define RELEASE_PAGE(m) { \
4317 PAGE_WAKEUP_DONE(m); \
4318 vm_page_lockspin_queues(); \
4319 vm_page_unwire(m, TRUE); \
4320 vm_page_unlock_queues(); \
4321 }
4322
4323
4324 #undef UNLOCK_THINGS
4325 #define UNLOCK_THINGS { \
4326 vm_object_paging_end(object); \
4327 vm_object_unlock(object); \
4328 }
4329
4330 #undef UNLOCK_AND_DEALLOCATE
4331 #define UNLOCK_AND_DEALLOCATE { \
4332 UNLOCK_THINGS; \
4333 vm_object_deallocate(object); \
4334 }
4335 /*
4336 * Give up and have caller do things the hard way.
4337 */
4338
4339 #define GIVE_UP { \
4340 UNLOCK_AND_DEALLOCATE; \
4341 return(KERN_FAILURE); \
4342 }
4343
4344
4345 /*
4346 * If this entry is not directly to a vm_object, bail out.
4347 */
4348 if (entry->is_sub_map)
4349 return(KERN_FAILURE);
4350
4351 /*
4352 * Find the backing store object and offset into it.
4353 */
4354
4355 object = entry->object.vm_object;
4356 offset = (va - entry->vme_start) + entry->offset;
4357 prot = entry->protection;
4358
4359 /*
4360 * Make a reference to this object to prevent its
4361 * disposal while we are messing with it.
4362 */
4363
4364 vm_object_lock(object);
4365 vm_object_reference_locked(object);
4366 vm_object_paging_begin(object);
4367
4368 /*
4369 * INVARIANTS (through entire routine):
4370 *
4371 * 1) At all times, we must either have the object
4372 * lock or a busy page in some object to prevent
4373 * some other thread from trying to bring in
4374 * the same page.
4375 *
4376 * 2) Once we have a busy page, we must remove it from
4377 * the pageout queues, so that the pageout daemon
4378 * will not grab it away.
4379 *
4380 */
4381
4382 /*
4383 * Look for page in top-level object. If it's not there or
4384 * there's something going on, give up.
4385 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4386 * decrypt the page before wiring it down.
4387 */
4388 m = vm_page_lookup(object, offset);
4389 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4390 (m->unusual && ( m->error || m->restart || m->absent))) {
4391
4392 GIVE_UP;
4393 }
4394 ASSERT_PAGE_DECRYPTED(m);
4395
4396 if (m->fictitious &&
4397 m->phys_page == vm_page_guard_addr) {
4398 /*
4399 * Guard pages are fictitious pages and are never
4400 * entered into a pmap, so let's say it's been wired...
4401 */
4402 kr = KERN_SUCCESS;
4403 goto done;
4404 }
4405
4406 /*
4407 * Wire the page down now. All bail outs beyond this
4408 * point must unwire the page.
4409 */
4410
4411 vm_page_lockspin_queues();
4412 vm_page_wire(m);
4413 vm_page_unlock_queues();
4414
4415 /*
4416 * Mark page busy for other threads.
4417 */
4418 assert(!m->busy);
4419 m->busy = TRUE;
4420 assert(!m->absent);
4421
4422 /*
4423 * Give up if the page is being written and there's a copy object
4424 */
4425 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4426 RELEASE_PAGE(m);
4427 GIVE_UP;
4428 }
4429
4430 /*
4431 * Put this page into the physical map.
4432 */
4433 type_of_fault = DBG_CACHE_HIT_FAULT;
4434 kr = vm_fault_enter(m,
4435 pmap,
4436 pmap_addr,
4437 prot,
4438 prot,
4439 TRUE,
4440 FALSE,
4441 FALSE,
4442 FALSE,
4443 NULL,
4444 &type_of_fault);
4445
4446 done:
4447 /*
4448 * Unlock everything, and return
4449 */
4450
4451 PAGE_WAKEUP_DONE(m);
4452 UNLOCK_AND_DEALLOCATE;
4453
4454 return kr;
4455
4456 }
4457
4458 /*
4459 * Routine: vm_fault_copy_cleanup
4460 * Purpose:
4461 * Release a page used by vm_fault_copy.
4462 */
4463
4464 void
4465 vm_fault_copy_cleanup(
4466 vm_page_t page,
4467 vm_page_t top_page)
4468 {
4469 vm_object_t object = page->object;
4470
4471 vm_object_lock(object);
4472 PAGE_WAKEUP_DONE(page);
4473 if (!page->active && !page->inactive && !page->throttled) {
4474 vm_page_lockspin_queues();
4475 if (!page->active && !page->inactive && !page->throttled)
4476 vm_page_activate(page);
4477 vm_page_unlock_queues();
4478 }
4479 vm_fault_cleanup(object, top_page);
4480 }
4481
4482 void
4483 vm_fault_copy_dst_cleanup(
4484 vm_page_t page)
4485 {
4486 vm_object_t object;
4487
4488 if (page != VM_PAGE_NULL) {
4489 object = page->object;
4490 vm_object_lock(object);
4491 vm_page_lockspin_queues();
4492 vm_page_unwire(page, TRUE);
4493 vm_page_unlock_queues();
4494 vm_object_paging_end(object);
4495 vm_object_unlock(object);
4496 }
4497 }
4498
4499 /*
4500 * Routine: vm_fault_copy
4501 *
4502 * Purpose:
4503 * Copy pages from one virtual memory object to another --
4504 * neither the source nor destination pages need be resident.
4505 *
4506 * Before actually copying a page, the version associated with
4507 * the destination address map will be verified.
4508 *
4509 * In/out conditions:
4510 * The caller must hold a reference, but not a lock, to
4511 * each of the source and destination objects and to the
4512 * destination map.
4513 *
4514 * Results:
4515 * Returns KERN_SUCCESS if no errors were encountered in
4516 * reading or writing the data. Returns KERN_INTERRUPTED if
4517 * the operation was interrupted (only possible if the
4518 * "interruptible" argument is asserted). Other return values
4519 * indicate a permanent error in copying the data.
4520 *
4521 * The actual amount of data copied will be returned in the
4522 * "copy_size" argument. In the event that the destination map
4523 * verification failed, this amount may be less than the amount
4524 * requested.
4525 */
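 /*
  * Caller sketch (illustrative only): a typical user performs a
  * vm_map_lookup() on dst_map to obtain "version" and the destination
  * object/offset, drops the map locks, and then copies:
  *
  *	copy_size = amount_to_copy;
  *	kr = vm_fault_copy(src_object, src_offset, &copy_size,
  *			   dst_object, dst_offset, dst_map,
  *			   &version, THREAD_UNINT);
  *	// on return, copy_size holds how much was actually copied;
  *	// it can be short if the map version check failed.
  */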
4526 kern_return_t
4527 vm_fault_copy(
4528 vm_object_t src_object,
4529 vm_object_offset_t src_offset,
4530 vm_map_size_t *copy_size, /* INOUT */
4531 vm_object_t dst_object,
4532 vm_object_offset_t dst_offset,
4533 vm_map_t dst_map,
4534 vm_map_version_t *dst_version,
4535 int interruptible)
4536 {
4537 vm_page_t result_page;
4538
4539 vm_page_t src_page;
4540 vm_page_t src_top_page;
4541 vm_prot_t src_prot;
4542
4543 vm_page_t dst_page;
4544 vm_page_t dst_top_page;
4545 vm_prot_t dst_prot;
4546
4547 vm_map_size_t amount_left;
4548 vm_object_t old_copy_object;
4549 kern_return_t error = 0;
4550 vm_fault_return_t result;
4551
4552 vm_map_size_t part_size;
4553 struct vm_object_fault_info fault_info_src;
4554 struct vm_object_fault_info fault_info_dst;
4555
4556 /*
4557 * In order not to confuse the clustered pageins, align
4558 * the different offsets on a page boundary.
4559 */
4560
4561 #define RETURN(x) \
4562 MACRO_BEGIN \
4563 *copy_size -= amount_left; \
4564 MACRO_RETURN(x); \
4565 MACRO_END
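 /*
  * RETURN(x) reports progress as well as status: the INOUT "copy_size"
  * is reduced by whatever is still left uncopied before x is returned,
  * so on exit it holds the number of bytes actually copied.
  */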
4566
4567 amount_left = *copy_size;
4568
4569 fault_info_src.interruptible = interruptible;
4570 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4571 fault_info_src.user_tag = 0;
4572 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4573 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4574 fault_info_src.no_cache = FALSE;
4575 fault_info_src.stealth = TRUE;
4576 fault_info_src.io_sync = FALSE;
4577 fault_info_src.cs_bypass = FALSE;
4578 fault_info_src.mark_zf_absent = FALSE;
4579 fault_info_src.batch_pmap_op = FALSE;
4580
4581 fault_info_dst.interruptible = interruptible;
4582 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4583 fault_info_dst.user_tag = 0;
4584 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4585 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4586 fault_info_dst.no_cache = FALSE;
4587 fault_info_dst.stealth = TRUE;
4588 fault_info_dst.io_sync = FALSE;
4589 fault_info_dst.cs_bypass = FALSE;
4590 fault_info_dst.mark_zf_absent = FALSE;
4591 fault_info_dst.batch_pmap_op = FALSE;
4592
4593 do { /* while (amount_left > 0) */
4594 /*
4595 * There may be a deadlock if both source and destination
4596 * pages are the same. To avoid this deadlock, the copy must
4597 * start by getting the destination page in order to apply
4598 * COW semantics if any.
4599 */
4600
4601 RetryDestinationFault: ;
4602
4603 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4604
4605 vm_object_lock(dst_object);
4606 vm_object_paging_begin(dst_object);
4607
4608 if (amount_left > (vm_size_t) -1) {
4609 /* 32-bit overflow */
4610 fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4611 } else {
4612 fault_info_dst.cluster_size = (vm_size_t) amount_left;
4613 assert(fault_info_dst.cluster_size == amount_left);
4614 }
4615
4616 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
4617 result = vm_fault_page(dst_object,
4618 vm_object_trunc_page(dst_offset),
4619 VM_PROT_WRITE|VM_PROT_READ,
4620 FALSE,
4621 &dst_prot, &dst_page, &dst_top_page,
4622 (int *)0,
4623 &error,
4624 dst_map->no_zero_fill,
4625 FALSE, &fault_info_dst);
4626 switch (result) {
4627 case VM_FAULT_SUCCESS:
4628 break;
4629 case VM_FAULT_RETRY:
4630 goto RetryDestinationFault;
4631 case VM_FAULT_MEMORY_SHORTAGE:
4632 if (vm_page_wait(interruptible))
4633 goto RetryDestinationFault;
4634 /* fall thru */
4635 case VM_FAULT_INTERRUPTED:
4636 RETURN(MACH_SEND_INTERRUPTED);
4637 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4638 /* success but no VM page: fail the copy */
4639 vm_object_paging_end(dst_object);
4640 vm_object_unlock(dst_object);
4641 /*FALLTHROUGH*/
4642 case VM_FAULT_MEMORY_ERROR:
4643 if (error)
4644 return (error);
4645 else
4646 return(KERN_MEMORY_ERROR);
4647 default:
4648 panic("vm_fault_copy: unexpected error 0x%x from "
4649 "vm_fault_page()\n", result);
4650 }
4651 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4652
4653 old_copy_object = dst_page->object->copy;
4654
4655 /*
4656 * There exists the possibility that the source and
4657 * destination page are the same. But we can't
4658 * easily determine that now. If they are the
4659 * same, the call to vm_fault_page() for the
4660 * destination page will deadlock. To prevent this we
4661 * wire the page so we can drop busy without having
4662 * the page daemon steal the page. We clean up the
4663 * top page but keep the paging reference on the object
4664 * holding the dest page so it doesn't go away.
4665 */
4666
4667 vm_page_lockspin_queues();
4668 vm_page_wire(dst_page);
4669 vm_page_unlock_queues();
4670 PAGE_WAKEUP_DONE(dst_page);
4671 vm_object_unlock(dst_page->object);
4672
4673 if (dst_top_page != VM_PAGE_NULL) {
4674 vm_object_lock(dst_object);
4675 VM_PAGE_FREE(dst_top_page);
4676 vm_object_paging_end(dst_object);
4677 vm_object_unlock(dst_object);
4678 }
4679
4680 RetrySourceFault: ;
4681
4682 if (src_object == VM_OBJECT_NULL) {
4683 /*
4684 * No source object. We will just
4685 * zero-fill the page in dst_object.
4686 */
4687 src_page = VM_PAGE_NULL;
4688 result_page = VM_PAGE_NULL;
4689 } else {
4690 vm_object_lock(src_object);
4691 src_page = vm_page_lookup(src_object,
4692 vm_object_trunc_page(src_offset));
4693 if (src_page == dst_page) {
4694 src_prot = dst_prot;
4695 result_page = VM_PAGE_NULL;
4696 } else {
4697 src_prot = VM_PROT_READ;
4698 vm_object_paging_begin(src_object);
4699
4700 if (amount_left > (vm_size_t) -1) {
4701 /* 32-bit overflow */
4702 fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4703 } else {
4704 fault_info_src.cluster_size = (vm_size_t) amount_left;
4705 assert(fault_info_src.cluster_size == amount_left);
4706 }
4707
4708 XPR(XPR_VM_FAULT,
4709 "vm_fault_copy(2) -> vm_fault_page\n",
4710 0,0,0,0,0);
4711 result = vm_fault_page(
4712 src_object,
4713 vm_object_trunc_page(src_offset),
4714 VM_PROT_READ, FALSE,
4715 &src_prot,
4716 &result_page, &src_top_page,
4717 (int *)0, &error, FALSE,
4718 FALSE, &fault_info_src);
4719
4720 switch (result) {
4721 case VM_FAULT_SUCCESS:
4722 break;
4723 case VM_FAULT_RETRY:
4724 goto RetrySourceFault;
4725 case VM_FAULT_MEMORY_SHORTAGE:
4726 if (vm_page_wait(interruptible))
4727 goto RetrySourceFault;
4728 /* fall thru */
4729 case VM_FAULT_INTERRUPTED:
4730 vm_fault_copy_dst_cleanup(dst_page);
4731 RETURN(MACH_SEND_INTERRUPTED);
4732 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4733 /* success but no VM page: fail */
4734 vm_object_paging_end(src_object);
4735 vm_object_unlock(src_object);
4736 /*FALLTHROUGH*/
4737 case VM_FAULT_MEMORY_ERROR:
4738 vm_fault_copy_dst_cleanup(dst_page);
4739 if (error)
4740 return (error);
4741 else
4742 return(KERN_MEMORY_ERROR);
4743 default:
4744 panic("vm_fault_copy(2): unexpected "
4745 "error 0x%x from "
4746 "vm_fault_page()\n", result);
4747 }
4748
4749
4750 assert((src_top_page == VM_PAGE_NULL) ==
4751 (result_page->object == src_object));
4752 }
4753 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4754 vm_object_unlock(result_page->object);
4755 }
4756
4757 if (!vm_map_verify(dst_map, dst_version)) {
4758 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4759 vm_fault_copy_cleanup(result_page, src_top_page);
4760 vm_fault_copy_dst_cleanup(dst_page);
4761 break;
4762 }
4763
4764 vm_object_lock(dst_page->object);
4765
4766 if (dst_page->object->copy != old_copy_object) {
4767 vm_object_unlock(dst_page->object);
4768 vm_map_verify_done(dst_map, dst_version);
4769 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4770 vm_fault_copy_cleanup(result_page, src_top_page);
4771 vm_fault_copy_dst_cleanup(dst_page);
4772 break;
4773 }
4774 vm_object_unlock(dst_page->object);
4775
4776 /*
4777 * Copy the page, and note that it is dirty
4778 * immediately.
4779 */
4780
4781 if (!page_aligned(src_offset) ||
4782 !page_aligned(dst_offset) ||
4783 !page_aligned(amount_left)) {
4784
4785 vm_object_offset_t src_po,
4786 dst_po;
4787
4788 src_po = src_offset - vm_object_trunc_page(src_offset);
4789 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4790
4791 if (dst_po > src_po) {
4792 part_size = PAGE_SIZE - dst_po;
4793 } else {
4794 part_size = PAGE_SIZE - src_po;
4795 }
4796 if (part_size > (amount_left)){
4797 part_size = amount_left;
4798 }
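 /*
  * Worked example (4K pages, illustrative offsets): src_offset ending
  * in 0x200 and dst_offset ending in 0x800 give src_po == 0x200 and
  * dst_po == 0x800; since dst_po > src_po, part_size starts out as
  * PAGE_SIZE - dst_po == 0x800 bytes, and is then clipped to
  * amount_left if less than that remains to be copied.
  */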
4799
4800 if (result_page == VM_PAGE_NULL) {
4801 assert((vm_offset_t) dst_po == dst_po);
4802 assert((vm_size_t) part_size == part_size);
4803 vm_page_part_zero_fill(dst_page,
4804 (vm_offset_t) dst_po,
4805 (vm_size_t) part_size);
4806 } else {
4807 assert((vm_offset_t) src_po == src_po);
4808 assert((vm_offset_t) dst_po == dst_po);
4809 assert((vm_size_t) part_size == part_size);
4810 vm_page_part_copy(result_page,
4811 (vm_offset_t) src_po,
4812 dst_page,
4813 (vm_offset_t) dst_po,
4814 (vm_size_t)part_size);
4815 if(!dst_page->dirty){
4816 vm_object_lock(dst_object);
4817 SET_PAGE_DIRTY(dst_page, TRUE);
4818 vm_object_unlock(dst_page->object);
4819 }
4820
4821 }
4822 } else {
4823 part_size = PAGE_SIZE;
4824
4825 if (result_page == VM_PAGE_NULL)
4826 vm_page_zero_fill(dst_page);
4827 else{
4828 vm_object_lock(result_page->object);
4829 vm_page_copy(result_page, dst_page);
4830 vm_object_unlock(result_page->object);
4831
4832 if(!dst_page->dirty){
4833 vm_object_lock(dst_object);
4834 SET_PAGE_DIRTY(dst_page, TRUE);
4835 vm_object_unlock(dst_page->object);
4836 }
4837 }
4838
4839 }
4840
4841 /*
4842 * Unlock everything, and return
4843 */
4844
4845 vm_map_verify_done(dst_map, dst_version);
4846
4847 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4848 vm_fault_copy_cleanup(result_page, src_top_page);
4849 vm_fault_copy_dst_cleanup(dst_page);
4850
4851 amount_left -= part_size;
4852 src_offset += part_size;
4853 dst_offset += part_size;
4854 } while (amount_left > 0);
4855
4856 RETURN(KERN_SUCCESS);
4857 #undef RETURN
4858
4859 /*NOTREACHED*/
4860 }
4861
4862 #if VM_FAULT_CLASSIFY
4863 /*
4864 * Temporary statistics gathering support.
4865 */
4866
4867 /*
4868 * Statistics arrays:
4869 */
4870 #define VM_FAULT_TYPES_MAX 5
4871 #define VM_FAULT_LEVEL_MAX 8
4872
4873 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4874
4875 #define VM_FAULT_TYPE_ZERO_FILL 0
4876 #define VM_FAULT_TYPE_MAP_IN 1
4877 #define VM_FAULT_TYPE_PAGER 2
4878 #define VM_FAULT_TYPE_COPY 3
4879 #define VM_FAULT_TYPE_OTHER 4
4880
4881
4882 void
4883 vm_fault_classify(vm_object_t object,
4884 vm_object_offset_t offset,
4885 vm_prot_t fault_type)
4886 {
4887 int type, level = 0;
4888 vm_page_t m;
4889
4890 while (TRUE) {
4891 m = vm_page_lookup(object, offset);
4892 if (m != VM_PAGE_NULL) {
4893 if (m->busy || m->error || m->restart || m->absent) {
4894 type = VM_FAULT_TYPE_OTHER;
4895 break;
4896 }
4897 if (((fault_type & VM_PROT_WRITE) == 0) ||
4898 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4899 type = VM_FAULT_TYPE_MAP_IN;
4900 break;
4901 }
4902 type = VM_FAULT_TYPE_COPY;
4903 break;
4904 }
4905 else {
4906 if (object->pager_created) {
4907 type = VM_FAULT_TYPE_PAGER;
4908 break;
4909 }
4910 if (object->shadow == VM_OBJECT_NULL) {
4911 type = VM_FAULT_TYPE_ZERO_FILL;
4912 break;
4913 }
4914
4915 offset += object->vo_shadow_offset;
4916 object = object->shadow;
4917 level++;
4918 continue;
4919 }
4920 }
4921
4922 if (level > VM_FAULT_LEVEL_MAX)
4923 level = VM_FAULT_LEVEL_MAX;
4924
4925 vm_fault_stats[type][level] += 1;
4926
4927 return;
4928 }
4929
4930 /* cleanup routine to call from debugger */
4931
4932 void
4933 vm_fault_classify_init(void)
4934 {
4935 int type, level;
4936
4937 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4938 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4939 vm_fault_stats[type][level] = 0;
4940 }
4941 }
4942
4943 return;
4944 }
4945 #endif /* VM_FAULT_CLASSIFY */
4946
4947
4948 extern int cs_validation;
4949
4950 void
4951 vm_page_validate_cs_mapped(
4952 vm_page_t page,
4953 const void *kaddr)
4954 {
4955 vm_object_t object;
4956 vm_object_offset_t offset;
4957 kern_return_t kr;
4958 memory_object_t pager;
4959 void *blobs;
4960 boolean_t validated, tainted;
4961
4962 assert(page->busy);
4963 vm_object_lock_assert_exclusive(page->object);
4964
4965 if (!cs_validation) {
4966 return;
4967 }
4968
4969 if (page->wpmapped && !page->cs_tainted) {
4970 /*
4971 * This page was mapped for "write" access sometime in the
4972 * past and could still be modifiable in the future.
4973 * Consider it tainted.
4974 * [ If the page was already found to be "tainted", no
4975 * need to re-validate. ]
4976 */
4977 page->cs_validated = TRUE;
4978 page->cs_tainted = TRUE;
4979 if (cs_debug) {
4980 printf("CODESIGNING: vm_page_validate_cs: "
4981 "page %p obj %p off 0x%llx "
4982 "was modified\n",
4983 page, page->object, page->offset);
4984 }
4985 vm_cs_validated_dirtied++;
4986 }
4987
4988 if (page->cs_validated) {
4989 return;
4990 }
4991
4992 vm_cs_validates++;
4993
4994 object = page->object;
4995 assert(object->code_signed);
4996 offset = page->offset;
4997
4998 if (!object->alive || object->terminating || object->pager == NULL) {
4999 /*
5000 * The object is terminating and we don't have its pager
5001 * so we can't validate the data...
5002 */
5003 return;
5004 }
5005 /*
5006 * Since we get here to validate a page that was brought in by
5007 * the pager, we know that this pager is all set up and ready
5008 * by now.
5009 */
5010 assert(!object->internal);
5011 assert(object->pager != NULL);
5012 assert(object->pager_ready);
5013
5014 pager = object->pager;
5015 assert(object->paging_in_progress);
5016 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5017 if (kr != KERN_SUCCESS) {
5018 blobs = NULL;
5019 }
5020
5021 /* verify the SHA1 hash for this page */
5022 validated = cs_validate_page(blobs,
5023 pager,
5024 offset + object->paging_offset,
5025 (const void *)kaddr,
5026 &tainted);
5027
5028 page->cs_validated = validated;
5029 if (validated) {
5030 page->cs_tainted = tainted;
5031 }
5032 }
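 /*
  * Caller sketch (assumed, not copied from a real call site): code that
  * already holds the object lock exclusively, owns the busy bit and has
  * the page mapped at "kaddr" in the kernel map could validate it with
  *
  *	vm_page_validate_cs_mapped(page, (const void *)kaddr);
  *	if (page->cs_validated && !page->cs_tainted)
  *		... treat the page as matching its code signature ...
  *
  * Callers without such a mapping should use vm_page_validate_cs()
  * below, which sets one up temporarily.
  */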
5033
5034 void
5035 vm_page_validate_cs(
5036 vm_page_t page)
5037 {
5038 vm_object_t object;
5039 vm_object_offset_t offset;
5040 vm_map_offset_t koffset;
5041 vm_map_size_t ksize;
5042 vm_offset_t kaddr;
5043 kern_return_t kr;
5044 boolean_t busy_page;
5045
5046 vm_object_lock_assert_held(page->object);
5047
5048 if (!cs_validation) {
5049 return;
5050 }
5051
5052 if (page->wpmapped && !page->cs_tainted) {
5053 vm_object_lock_assert_exclusive(page->object);
5054
5055 /*
5056 * This page was mapped for "write" access sometime in the
5057 * past and could still be modifiable in the future.
5058 * Consider it tainted.
5059 * [ If the page was already found to be "tainted", no
5060 * need to re-validate. ]
5061 */
5062 page->cs_validated = TRUE;
5063 page->cs_tainted = TRUE;
5064 if (cs_debug) {
5065 printf("CODESIGNING: vm_page_validate_cs: "
5066 "page %p obj %p off 0x%llx "
5067 "was modified\n",
5068 page, page->object, page->offset);
5069 }
5070 vm_cs_validated_dirtied++;
5071 }
5072
5073 if (page->cs_validated) {
5074 return;
5075 }
5076
5077 #if CHECK_CS_VALIDATION_BITMAP
5078 if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5079 page->cs_validated = TRUE;
5080 page->cs_tainted = FALSE;
5081 vm_cs_bitmap_validated++;
5082 return;
5083 }
5084 #endif
5085 vm_object_lock_assert_exclusive(page->object);
5086
5087 object = page->object;
5088 assert(object->code_signed);
5089 offset = page->offset;
5090
5091 busy_page = page->busy;
5092 if (!busy_page) {
5093 /* keep page busy while we map (and unlock) the VM object */
5094 page->busy = TRUE;
5095 }
5096
5097 /*
5098 * Take a paging reference on the VM object
5099 * to protect it from collapse or bypass,
5100 * and keep it from disappearing too.
5101 */
5102 vm_object_paging_begin(object);
5103
5104 /* map the page in the kernel address space */
5105 koffset = 0;
5106 ksize = PAGE_SIZE_64;
5107 kr = vm_paging_map_object(&koffset,
5108 page,
5109 object,
5110 offset,
5111 &ksize,
5112 VM_PROT_READ,
5113 FALSE); /* can't unlock object ! */
5114 if (kr != KERN_SUCCESS) {
5115 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5116 }
5117 kaddr = CAST_DOWN(vm_offset_t, koffset);
5118
5119 /* validate the mapped page */
5120 vm_page_validate_cs_mapped(page, (const void *) kaddr);
5121
5122 #if CHECK_CS_VALIDATION_BITMAP
5123 if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5124 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5125 }
5126 #endif
5127 assert(page->busy);
5128 assert(object == page->object);
5129 vm_object_lock_assert_exclusive(object);
5130
5131 if (!busy_page) {
5132 PAGE_WAKEUP_DONE(page);
5133 }
5134 if (koffset != 0) {
5135 /* unmap the map from the kernel address space */
5136 vm_paging_unmap_object(object, koffset, koffset + ksize);
5137 koffset = 0;
5138 ksize = 0;
5139 kaddr = 0;
5140 }
5141 vm_object_paging_end(object);
5142 }