osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75                                 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90
  91 #include <vm/vm_fault.h>
  92 #include <vm/vm_map.h>
  93 #include <vm/vm_object.h>
  94 #include <vm/vm_page.h>
  95 #include <vm/vm_kern.h>
  96 #include <vm/pmap.h>
  97 #include <vm/vm_pageout.h>
  98 #include <vm/vm_protos.h>
  99 #include <vm/vm_external.h>
 100 #include <vm/memory_object.h>
 101 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 102 #include <vm/vm_shared_region.h>
 103
 104 #define VM_FAULT_CLASSIFY       0       /* nonzero: declare the vm_fault_classify() prototypes further down */
 105
 106 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) nonzero: compile in the dbgTrace() calls guarded by #if TRACEFAULTPAGE */
 107
 108 int     vm_object_pagein_throttle = 16; /* NOTE(review): not referenced in this part of the file; meaning to be confirmed at its use site */
 109
 110 /*
 111  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 112  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 113  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 114  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 115  * keep the UI active so that the user has a chance to kill the offending task before the system
 116  * completely hangs.
 117  *
 118  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 119  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 120  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 121  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 122  */
 123
 124 boolean_t thread_is_io_throttled(void);         /* defined elsewhere; consulted by the throttle test below */
 125
 126 uint64_t vm_hard_throttle_threshold;            /* resident-size cutoff, computed in vm_fault_init() */
 127
 128 extern unsigned int dp_pages_free, dp_pages_reserve;
 129 /*
      * NEED_TO_HARD_THROTTLE_THIS_TASK() evaluates true when either:
      *   1. dp_pages_free + dp_pages_reserve is below 2000, the current task's
      *      resident size exceeds vm_hard_throttle_threshold, the task is not
      *      kernel_task, and VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)
      *      holds; or
      *   2. vm_page_free_count is below vm_page_throttle_limit, the current
      *      thread is I/O throttled (thread_is_io_throttled()), and the task's
      *      resident size exceeds vm_hard_throttle_threshold.
      * Note that the second clause does not exclude kernel_task.
      */
 130 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (((dp_pages_free + dp_pages_reserve < 2000) && \
 131                                                  (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
 132                                                  (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \
 133                                                  (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
 134                                                   (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))
 135
 136 /* Delays returned by vm_page_throttled() and passed to delay() by vm_fault_check(). */
 137 #define HARD_THROTTLE_DELAY     20000   /* 20000 us == 20 ms */
 138 #define SOFT_THROTTLE_DELAY     2000    /* 2000 us == 2 ms */
 139
 140
 141 extern int cs_debug;                    /* may be overridden by the "cs_debug" boot-arg in vm_fault_init() */
 142
 143 boolean_t current_thread_aborted(void); /* polled by vm_fault_check() after a throttle delay */
 144
 145 /* Forward declarations of internal routines. */
 146 extern kern_return_t vm_fault_wire_fast(
 147                                 vm_map_t        map,
 148                                 vm_map_offset_t va,
 149                                 vm_map_entry_t  entry,
 150                                 pmap_t          pmap,
 151                                 vm_map_offset_t pmap_addr);
 152
 153 extern void vm_fault_continue(void);
 154
 155 extern void vm_fault_copy_cleanup(
 156                                 vm_page_t       page,
 157                                 vm_page_t       top_page);
 158
 159 extern void vm_fault_copy_dst_cleanup(
 160                                 vm_page_t       page);
 161
 162 #if     VM_FAULT_CLASSIFY               /* prototypes exist only when classification is compiled in */
 163 extern void vm_fault_classify(vm_object_t       object,
 164                           vm_object_offset_t    offset,
 165                           vm_prot_t             fault_type);
 166
 167 extern void vm_fault_classify_init(void);
 168 #endif  /* VM_FAULT_CLASSIFY */
 169
 170 unsigned long vm_pmap_enter_blocked = 0;        /* NOTE(review): counter, not updated in the visible part of this file */
 171 unsigned long vm_pmap_enter_retried = 0;        /* NOTE(review): counter, not updated in the visible part of this file */
 172 /* Code-signing counters; NOTE(review): they are not updated in the visible part of this file. */
 173 unsigned long vm_cs_validates = 0;
 174 unsigned long vm_cs_revalidates = 0;
 175 unsigned long vm_cs_query_modified = 0;
 176 unsigned long vm_cs_validated_dirtied = 0;
 177 unsigned long vm_cs_bitmap_validated = 0;
 178 #if CONFIG_ENFORCE_SIGNED_CODE
 179 int cs_enforcement_disable=0;                   /* writable so vm_fault_init() can load the boot-arg into it */
 180 #else
 181 static const int cs_enforcement_disable=1;      /* fixed at 1 when CONFIG_ENFORCE_SIGNED_CODE is off */
 182 #endif  /* CONFIG_ENFORCE_SIGNED_CODE */
 183
 184 /*
 185  *      Routine:        vm_fault_init
 186  *      Purpose:
 187  *              Initialize our private data structures.
      *              In non-SECURE_KERNEL builds the "cs_enforcement_disable"
      *              (only with CONFIG_ENFORCE_SIGNED_CODE) and "cs_debug"
      *              boot-args are parsed into their variables; then
      *              vm_hard_throttle_threshold is computed from sane_size.
 188  */
 189 void
 190 vm_fault_init(void)
 191 {
 192 #if !SECURE_KERNEL
 193 #if CONFIG_ENFORCE_SIGNED_CODE
 194         PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
 195                            sizeof (cs_enforcement_disable));
 196 #endif  /* CONFIG_ENFORCE_SIGNED_CODE */
 197         PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
 198 #endif  /* !SECURE_KERNEL */
 199
 200         /*
 201          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 202          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 203          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 204          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 205          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 206          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
          *
          * Concretely the percentage is 35 minus the whole number of gigabytes in sane_size,
          * with the subtraction capped at 25: e.g. 4 GB -> 31%, 16 GB -> 19%, 25 GB or more -> 10%.
 207          */
 208
 209         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 210 }
 211
 212 /*
 213  *      Routine:        vm_fault_cleanup
 214  *      Purpose:
 215  *              Clean up the result of vm_fault_page.
 216  *      Results:
 217  *              The paging reference for "object" is released.
 218  *              "object" is unlocked.
 219  *              If "top_page" is not null,  "top_page" is
 220  *              freed and the paging reference for the object
 221  *              containing it is released.
 222  *
 223  *      In/out conditions:
 224  *              "object" must be locked.
      *              The object owning "top_page" is locked here only after
      *              "object" has been unlocked, and is unlocked again before
      *              returning, so this routine never holds both locks at once.
 225  */
 226 void
 227 vm_fault_cleanup(
 228         register vm_object_t    object,
 229         register vm_page_t      top_page)
 230 {
 231         vm_object_paging_end(object);           /* drop the paging reference while still locked */
 232         vm_object_unlock(object);
 233
 234         if (top_page != VM_PAGE_NULL) {
 235                 object = top_page->object;      /* "object" now names the page's owner */
 236
 237                 vm_object_lock(object);
 238                 VM_PAGE_FREE(top_page);
 239                 vm_object_paging_end(object);
 240                 vm_object_unlock(object);
 241         }
 242 }
 243
 244 #if     MACH_CLUSTER_STATS
 245 #define MAXCLUSTERPAGES 16      /* number of entries in cluster_stats_in[] */
 246 struct {
 247         unsigned long pages_in_cluster;
 248         unsigned long pages_at_higher_offsets;
 249         unsigned long pages_at_lower_offsets;
 250 } cluster_stats_in[MAXCLUSTERPAGES];
     /*
      * CLUSTER_STAT(clause) expands to "clause" when statistics are enabled and
      * to nothing otherwise.  The three helpers below bump one counter of
      * cluster_stats_in[x]; they do not range-check x and are defined only in
      * this branch, so uses of them need to be wrapped in CLUSTER_STAT().
      */
 251 #define CLUSTER_STAT(clause)    clause
 252 #define CLUSTER_STAT_HIGHER(x)  \
 253         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 254 #define CLUSTER_STAT_LOWER(x)   \
 255          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 256 #define CLUSTER_STAT_CLUSTER(x) \
 257         ((cluster_stats_in[(x)].pages_in_cluster)++)
 258 #else   /* MACH_CLUSTER_STATS */
 259 #define CLUSTER_STAT(clause)    /* statistics disabled: the clause is discarded */
 260 #endif  /* MACH_CLUSTER_STATS */
 261
 262 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)    /* true when the low bits under PAGE_SIZE_64 - 1 are clear; assumes PAGE_SIZE_64 is a power of two */
 263
 264
 265 boolean_t       vm_page_deactivate_behind = TRUE;       /* switch: FALSE makes vm_fault_deactivate_behind() return immediately */
 266 /*
 267  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
      * (both are page counts: the window is multiplied by PAGE_SIZE_64 and the
      * cluster by PAGE_SIZE where they are used in vm_fault_deactivate_behind())
 268  */
 269 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 270 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 271                                                                 /* we use it to size an array on the stack */
 272
 273 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;    /* run-time copy of the window, in pages */
 274
 275 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)    /* object->sequential stops advancing once it reaches +/- this value */
 276
 277 /*
 278  * vm_fault_is_sequential
 279  *
 280  * Determine if sequential access is in progress
 281  * in accordance with the behavior specified.
 282  * Update state to indicate current access pattern.
 283  *
      * State kept on the object:
      *   object->last_alloc  offset of the previous fault seen here
      *   object->sequential  signed run length, stepped by PAGE_SIZE:
      *                       positive for a run of ascending offsets,
      *                       negative for a run of descending offsets,
      *                       0 when no run is in progress.
      * VM_BEHAVIOR_RANDOM always resets the run, VM_BEHAVIOR_SEQUENTIAL only
      * tracks ascending runs, VM_BEHAVIOR_RSEQNTL only descending runs, and
      * every other behavior tracks both, restarting from 0 when the direction
      * flips.  The run stops advancing once it reaches +/- MAX_SEQUENTIAL_RUN.
      *
      * Nothing is returned.  If another thread changed object->sequential
      * first (the compare-and-swap fails), neither field is updated.
      *
 284  * object must have at least the shared lock held
 285  */
 286 static
 287 void
 288 vm_fault_is_sequential(
 289         vm_object_t             object,
 290         vm_object_offset_t      offset,
 291         vm_behavior_t           behavior)
 292 {
 293         vm_object_offset_t      last_alloc;
 294         int                     sequential;
 295         int                     orig_sequential;
 296
 297         last_alloc = object->last_alloc;
 298         sequential = object->sequential;
 299         orig_sequential = sequential;           /* remembered as the expected value for the compare-and-swap */
 300
 301         switch (behavior) {
 302         case VM_BEHAVIOR_RANDOM:
 303                 /*
 304                  * reset indicator of sequential behavior
 305                  */
 306                 sequential = 0;
 307                 break;
 308
 309         case VM_BEHAVIOR_SEQUENTIAL:
 310                 if (offset && last_alloc == offset - PAGE_SIZE_64) {    /* "offset &&" skips the subtraction's wrap at offset 0 */
 311                         /*
 312                          * advance indicator of sequential behavior
 313                          */
 314                         if (sequential < MAX_SEQUENTIAL_RUN)
 315                                 sequential += PAGE_SIZE;
 316                 } else {
 317                         /*
 318                          * reset indicator of sequential behavior
 319                          */
 320                         sequential = 0;
 321                 }
 322                 break;
 323
 324         case VM_BEHAVIOR_RSEQNTL:
 325                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 326                         /*
 327                          * advance indicator of sequential behavior
                          * (descending run, so the counter moves negative)
 328                          */
 329                         if (sequential > -MAX_SEQUENTIAL_RUN)
 330                                 sequential -= PAGE_SIZE;
 331                 } else {
 332                         /*
 333                          * reset indicator of sequential behavior
 334                          */
 335                         sequential = 0;
 336                 }
 337                 break;
 338
 339         case VM_BEHAVIOR_DEFAULT:
 340         default:
 341                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 342                         /*
 343                          * advance indicator of sequential behavior
 344                          */
 345                         if (sequential < 0)     /* was a descending run: start over */
 346                                 sequential = 0;
 347                         if (sequential < MAX_SEQUENTIAL_RUN)
 348                                 sequential += PAGE_SIZE;
 349
 350                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 351                         /*
 352                          * advance indicator of sequential behavior
 353                          */
 354                         if (sequential > 0)     /* was an ascending run: start over */
 355                                 sequential = 0;
 356                         if (sequential > -MAX_SEQUENTIAL_RUN)
 357                                 sequential -= PAGE_SIZE;
 358                 } else {
 359                         /*
 360                          * reset indicator of sequential behavior
 361                          */
 362                         sequential = 0;
 363                 }
 364                 break;
 365         }
 366         if (sequential != orig_sequential) {
 367                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 368                         /*
 369                          * if someone else has already updated object->sequential
 370                          * don't bother trying to update it or object->last_alloc
 371                          */
 372                         return;
 373                 }
 374         }
 375         /*
 376          * I'd like to do this with a OSCompareAndSwap64, but that
 377          * doesn't exist for PPC...  however, it shouldn't matter
 378          * that much... last_alloc is maintained so that we can determine
 379          * if a sequential access pattern is taking place... if only
 380          * one thread is banging on this object, no problem with the unprotected
 381          * update... if 2 or more threads are banging away, we run the risk of
 382          * someone seeing a mangled update... however, in the face of multiple
 383          * accesses, no sequential access pattern can develop anyway, so we
 384          * haven't lost any real info.
 385          */
 386         object->last_alloc = offset;
 387 }
 388
 389
 390 int vm_page_deactivate_behind_count = 0;        /* total pages deactivated by vm_fault_deactivate_behind() */
 391
 392 /*
 393  * vm_fault_deactivate_behind
 394  *
 395  * Determine if sequential access is in progress
 396  * in accordance with the behavior specified.  If
 397  * so, compute a potential page to deactivate and
 398  * deactivate it.
 399  *
      * The run length is read from object->sequential (maintained by
      * vm_fault_is_sequential()).  For VM_BEHAVIOR_SEQUENTIAL and
      * VM_BEHAVIOR_RSEQNTL a single page adjacent to "offset" is considered
      * once the run is at least one page long.  For the default behavior a
      * batch of up to VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER pages, starting
      * vm_default_behind pages away from "offset", is considered each time
      * the run length is a multiple of the cluster size and at least
      * vm_default_behind pages long.  VM_BEHAVIOR_RANDOM never deactivates.
      *
 400  * object must be locked.
 401  *
 402  * return TRUE if we actually deactivate a page
 403  */
 404 static
 405 boolean_t
 406 vm_fault_deactivate_behind(
 407         vm_object_t             object,
 408         vm_object_offset_t      offset,
 409         vm_behavior_t           behavior)
 410 {
 411         int             n;
 412         int             pages_in_run = 0;               /* number of candidates collected in page_run[] */
 413         int             max_pages_in_run = 0;           /* how many offsets to probe; 0 means do nothing */
 414         int             sequential_run;                 /* magnitude of object->sequential */
 415         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;   /* direction of the detected run */
 416         vm_object_offset_t      run_offset = 0;         /* added to "offset" to reach the first probed page */
 417         vm_object_offset_t      pg_offset = 0;          /* step between successive probed pages */
 418         vm_page_t       m;
 419         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 420
 421         pages_in_run = 0;
 422 #if TRACEFAULTPAGE
 423         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 424 #endif
 425
 426         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 427                 /*
 428                  * Do not deactivate pages from the kernel object: they
 429                  * are not intended to become pageable.
 430                  * or we've disabled the deactivate behind mechanism
 431                  */
 432                 return FALSE;
 433         }
 434         if ((sequential_run = object->sequential)) {
 435                   if (sequential_run < 0) {
 436                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 437                           sequential_run = 0 - sequential_run;  /* keep only the magnitude */
 438                   } else {
 439                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 440                   }
 441         }
 442         switch (behavior) {
 443         case VM_BEHAVIOR_RANDOM:
 444                 break;
 445         case VM_BEHAVIOR_SEQUENTIAL:
 446                 if (sequential_run >= (int)PAGE_SIZE) {
 447                         run_offset = 0 - PAGE_SIZE_64;  /* i.e. minus one page; relies on unsigned wrap-around when added to offset */
 448                         max_pages_in_run = 1;
 449                 }
 450                 break;
 451         case VM_BEHAVIOR_RSEQNTL:
 452                 if (sequential_run >= (int)PAGE_SIZE) {
 453                         run_offset = PAGE_SIZE_64;
 454                         max_pages_in_run = 1;
 455                 }
 456                 break;
 457         case VM_BEHAVIOR_DEFAULT:
 458         default:
 459         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 460
 461                 /*
 462                  * determine if the run of sequential access has been
 463                  * long enough on an object with default access behavior
 464                  * to consider it for deactivation
                  * (the modulo test lets this trigger only once per
                  * VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER pages of run)
 465                  */
 466                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 467                         /*
 468                          * the comparisons between offset and behind are done
 469                          * in this kind of odd fashion in order to prevent wrap around
 470                          * at the end points
 471                          */
 472                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 473                                 if (offset >= behind) {
 474                                         run_offset = 0 - behind;        /* start "behind" bytes below offset, walk upward */
 475                                         pg_offset = PAGE_SIZE_64;
 476                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 477                                 }
 478                         } else {
 479                                 if (offset < -behind) {
 480                                         run_offset = behind;            /* start "behind" bytes above offset, walk downward */
 481                                         pg_offset = 0 - PAGE_SIZE_64;
 482                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 483                                 }
 484                         }
 485                 }
 486                 break;
 487         }
 488         }
             /* pass 1: collect eligible resident pages and clear their pmap reference state */
 489         for (n = 0; n < max_pages_in_run; n++) {
 490                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 491
 492                 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
 493                         page_run[pages_in_run++] = m;
 494                         pmap_clear_reference(m->phys_page);
 495                 }
 496         }
             /* pass 2: deactivate the collected pages under a single hold of the page queues lock */
 497         if (pages_in_run) {
 498                 vm_page_lockspin_queues();
 499
 500                 for (n = 0; n < pages_in_run; n++) {
 501
 502                         m = page_run[n];
 503
 504                         vm_page_deactivate_internal(m, FALSE);
 505
 506                         vm_page_deactivate_behind_count++;
 507 #if TRACEFAULTPAGE
 508                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 509 #endif
 510                 }
 511                 vm_page_unlock_queues();
 512
 513                 return TRUE;
 514         }
 515         return FALSE;
 516 }
 517
 518 /*
      * vm_page_throttled
      *
      * Decide whether the current thread should be delayed before it is
      * allowed to create another page, and count this creation attempt in
      * thread->t_page_creation_count.
      *
      * Returns the delay to apply:
      *   0                     no throttling (always the case for threads with
      *                         TH_OPT_VMPRIV set, which are also not counted)
      *   HARD_THROTTLE_DELAY   NEED_TO_HARD_THROTTLE_THIS_TASK() is true
      *   SOFT_THROTTLE_DELAY   free pages are below vm_page_throttle_limit and
      *                         the thread has created more than
      *                         vm_page_creation_throttle pages, either within
      *                         the last 6 seconds or at an average rate of at
      *                         least vm_page_creation_throttle / 6 per second
      */
 519 static int
 520 vm_page_throttled(void)
 521 {
 522         clock_sec_t     elapsed_sec;
 523         clock_sec_t     tv_sec;
 524         clock_usec_t    tv_usec;
 525
 526         thread_t thread = current_thread();
 527
 528         if (thread->options & TH_OPT_VMPRIV)
 529                 return (0);
 530
 531         thread->t_page_creation_count++;
 532
 533         if (NEED_TO_HARD_THROTTLE_THIS_TASK())
 534                 return (HARD_THROTTLE_DELAY);
 535
 536         if (vm_page_free_count < vm_page_throttle_limit &&
 537             thread->t_page_creation_count > vm_page_creation_throttle) {
 538
 539                 clock_get_system_microtime(&tv_sec, &tv_usec);
 540
 541                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 542
                     /*
                      * the "elapsed_sec <= 6" test is evaluated first and
                      * short-circuits, so the division below is never reached
                      * with elapsed_sec == 0
                      */
 543                 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
 544
 545                         if (elapsed_sec >= 60) {
 546                                 /*
 547                                  * we'll reset our stats to give a well behaved app
 548                                  * that was unlucky enough to accumulate a bunch of pages
 549                                  * over a long period of time a chance to get out of
 550                                  * the throttled state... we reset the counter and timestamp
 551                                  * so that if it stays under the rate limit for the next second
 552                                  * it will be back in our good graces... if it exceeds it, it
 553                                  * will remain in the throttled state
 554                                  */
 555                                 thread->t_page_creation_time = tv_sec;
 556                                 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
 557                         }
 558                         ++vm_page_throttle_count;
 559
 560                         return (SOFT_THROTTLE_DELAY);
 561                 }
                     /* rate is acceptable: start a fresh measurement window */
 562                 thread->t_page_creation_time = tv_sec;
 563                 thread->t_page_creation_count = 0;
 564         }
 565         return (0);
 566 }
 567
 568
 569 /*
 570  * check for various conditions that would
 571  * prevent us from creating a ZF page...
 572  * cleanup is based on being called from vm_fault_page
 573  *
      * Returns:
      *   VM_FAULT_SUCCESS          none of the conditions apply; nothing has
      *                             been released or changed
      *   VM_FAULT_MEMORY_ERROR     object->shadow_severed is set or
      *                             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)
      *   VM_FAULT_RETRY            vm_backing_store_low is set and the task
      *                             lacks VM_BACKING_STORE_PRIV; returned after
      *                             blocking on &vm_backing_store_low
      *   VM_FAULT_MEMORY_SHORTAGE  vm_page_throttled() asked for a delay,
      *                             which has been taken
      *   VM_FAULT_INTERRUPTED      as above, but current_thread_aborted()
      *                             was true after the delay
      * On every return other than VM_FAULT_SUCCESS, "m" has been freed (if
      * not VM_PAGE_NULL), vm_fault_cleanup(object, first_m) has been called
      * and thread_interrupt_level(interruptible_state) has been applied.
      *
 574  * object must be locked
 575  * object == m->object
 576  */
 577 static vm_fault_return_t
 578 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
 579 {
 580         int throttle_delay;
 581
 582         if (object->shadow_severed ||
 583             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 584                 /*
 585                  * Either:
 586                  * 1. the shadow chain was severed,
 587                  * 2. the purgeable object is volatile or empty and is marked
 588                  *    to fault on access while volatile.
 589                  * Just have to return an error at this point
 590                  */
 591                 if (m != VM_PAGE_NULL)
 592                         VM_PAGE_FREE(m);
 593                 vm_fault_cleanup(object, first_m);
 594
 595                 thread_interrupt_level(interruptible_state);
 596
 597                 return (VM_FAULT_MEMORY_ERROR);
 598         }
 599         if (vm_backing_store_low) {
 600                 /*
 601                  * are we protecting the system from
 602                  * backing store exhaustion.  If so
 603                  * sleep unless we are privileged.
 604                  */
 605                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 606
 607                         if (m != VM_PAGE_NULL)
 608                                 VM_PAGE_FREE(m);
 609                         vm_fault_cleanup(object, first_m);
 610
                             /* the object lock was dropped by vm_fault_cleanup() before we block */
 611                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 612
 613                         thread_block(THREAD_CONTINUE_NULL);
 614                         thread_interrupt_level(interruptible_state);
 615
 616                         return (VM_FAULT_RETRY);
 617                 }
 618         }
 619         if ((throttle_delay = vm_page_throttled())) {
 620                 /*
 621                  * we're throttling zero-fills...
 622                  * treat this as if we couldn't grab a page
 623                  */
 624                 if (m != VM_PAGE_NULL)
 625                         VM_PAGE_FREE(m);
 626                 vm_fault_cleanup(object, first_m);
 627
 628                 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 629
 630                 delay(throttle_delay);          /* HARD_THROTTLE_DELAY or SOFT_THROTTLE_DELAY microseconds */
 631
 632                 if (current_thread_aborted()) {
 633                         thread_interrupt_level(interruptible_state);
 634                         return VM_FAULT_INTERRUPTED;
 635                 }
 636                 thread_interrupt_level(interruptible_state);
 637
 638                 return (VM_FAULT_MEMORY_SHORTAGE);
 639         }
 640         return (VM_FAULT_SUCCESS);
 641 }
 642
 643
 644 /*
 645  * do the work to zero fill a page and
 646  * inject it into the correct paging queue
 647  *
      * m             the page to prepare; its cs_validated and cs_tainted
      *               flags are cleared and pmapped is set
      * no_zero_fill  when TRUE the page contents are left as they are
      *
      * Returns the fault type for tracing: DBG_NZF_PAGE_FAULT when
      * no_zero_fill is TRUE, otherwise DBG_ZERO_FILL_FAULT.
      *
      * When dynamic paging is not enabled and the object's purgeable state
      * is DENY, NONVOLATILE or VOLATILE, the page is moved onto
      * vm_page_queue_throttled.
      *
 648  * m->object must be locked
 649  * page queue lock must NOT be held
 650  */
 651 static int
 652 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 653 {
 654         int my_fault = DBG_ZERO_FILL_FAULT;
 655
 656         /*
 657          * This is a zero-fill page fault...
 658          *
 659          * Checking the page lock is a waste of
 660          * time;  this page was absent, so
 661          * it can't be page locked by a pager.
 662          *
 663          * we also consider it undefined
 664          * with respect to instruction
 665          * execution.  i.e. it is the responsibility
 666          * of higher layers to call for an instruction
 667          * sync after changing the contents and before
 668          * sending a program into this area.  We
 669          * choose this approach for performance
 670          */
 671         m->pmapped = TRUE;
 672
 673         m->cs_validated = FALSE;
 674         m->cs_tainted = FALSE;
 675
 676         if (no_zero_fill == TRUE) {
 677                 my_fault = DBG_NZF_PAGE_FAULT;
 678         } else {
 679                 vm_page_zero_fill(m);
 680
 681                 VM_STAT_INCR(zero_fill_count);
 682                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 683         }
 684         assert(!m->laundry);
 685         assert(m->object != kernel_object);
 686         //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
 687
 688         if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
 689                 (m->object->purgable == VM_PURGABLE_DENY ||
 690                  m->object->purgable == VM_PURGABLE_NONVOLATILE ||
 691                  m->object->purgable == VM_PURGABLE_VOLATILE )) {
 692
 693                 vm_page_lockspin_queues();
 694
 695                 assert(!VM_PAGE_WIRED(m));
 696
 697                 /*
 698                  * can't be on the pageout queue since we don't
 699                  * have a pager to try and clean to
 700                  */
 701                 assert(!m->pageout_queue);
 702
 703                 VM_PAGE_QUEUES_REMOVE(m);       /* take it off whatever queue it is on before re-queueing */
 704
 705                 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 706                 m->throttled = TRUE;
 707                 vm_page_throttled_count++;
 708
 709                 vm_page_unlock_queues();
 710         }
 711         return (my_fault);
 712 }
 713
 714
 715 /*
 716  *      Routine:        vm_fault_page
 717  *      Purpose:
 718  *              Find the resident page for the virtual memory
 719  *              specified by the given virtual memory object
 720  *              and offset.
 721  *      Additional arguments:
 722  *              The required permissions for the page are given
 723  *              in "fault_type".  Desired permissions are included
 724  *              in "protection".
 725  *              fault_info is passed along to determine pagein cluster
 726  *              limits... it contains the expected reference pattern,
 727  *              cluster size if available, etc...
 728  *
 729  *              If the desired page is known to be resident (for
 730  *              example, because it was previously wired down), asserting
 731  *              the "unwiring" parameter will speed the search.
 732  *
 733  *              If the operation can be interrupted (by thread_abort
 734  *              or thread_terminate), then the "interruptible"
 735  *              parameter should be asserted.
 736  *
 737  *      Results:
 738  *              The page containing the proper data is returned
 739  *              in "result_page".
 740  *
 741  *      In/out conditions:
 742  *              The source object must be locked and referenced,
 743  *              and must donate one paging reference.  The reference
 744  *              is not affected.  The paging reference and lock are
 745  *              consumed.
 746  *
 747  *              If the call succeeds, the object in which "result_page"
 748  *              resides is left locked and holding a paging reference.
 749  *              If this is not the original object, a busy page in the
 750  *              original object is returned in "top_page", to prevent other
 751  *              callers from pursuing this same data, along with a paging
 752  *              reference for the original object.  The "top_page" should
 753  *              be destroyed when this guarantee is no longer required.
 754  *              The "result_page" is also left busy.  It is not removed
 755  *              from the pageout queues.
 756  *      Special Case:
 757  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 758  *              fault succeeded but there's no VM page (i.e. the VM object
 759  *              does not actually hold VM pages, but device memory or
 760  *              large pages).  The object is still locked and we still hold a
 761  *              paging_in_progress reference.
 762  */
 763 unsigned int vm_fault_page_blocked_access = 0; /* bumped by vm_fault_page() each time it finishes waiting for an object's "blocked_access" to clear */
 764 unsigned int vm_fault_page_forced_retry = 0; /* NOTE(review): presumably counts faults retried due to "force_fault_retry"; the update site is outside this part of the file -- confirm */
 765
 766 vm_fault_return_t
 767 vm_fault_page(
 768         /* Arguments: */
 769         vm_object_t     first_object,   /* Object to begin search */
 770         vm_object_offset_t first_offset,        /* Offset into object */
 771         vm_prot_t       fault_type,     /* What access is requested */
 772         boolean_t       must_be_resident,/* Must page be resident? */
 773         /* Modifies in place: */
 774         vm_prot_t       *protection,    /* Protection for mapping */
 775         /* Returns: */
 776         vm_page_t       *result_page,   /* Page found, if successful */
 777         vm_page_t       *top_page,      /* Page in top object, if
 778                                          * not result_page.  */
 779         int             *type_of_fault, /* if non-null, fill in with type of fault
 780                                          * COW, zero-fill, etc... returned in trace point */
 781         /* More arguments: */
 782         kern_return_t   *error_code,    /* code if page is in error */
 783         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 784 #if MACH_PAGEMAP
 785         boolean_t       data_supply,    /* treat as data_supply if
 786                                          * it is a write fault and a full
 787                                          * page is provided */
 788 #else
 789         __unused boolean_t data_supply,
 790 #endif
 791         vm_object_fault_info_t fault_info)
 792 {
 793         vm_page_t               m;
 794         vm_object_t             object;
 795         vm_object_offset_t      offset;
 796         vm_page_t               first_m;
 797         vm_object_t             next_object;
 798         vm_object_t             copy_object;
 799         boolean_t               look_for_page;
 800         boolean_t               force_fault_retry = FALSE;
 801         vm_prot_t               access_required = fault_type;
 802         vm_prot_t               wants_copy_flag;
 803         CLUSTER_STAT(int pages_at_higher_offsets;)
 804         CLUSTER_STAT(int pages_at_lower_offsets;)
 805         kern_return_t           wait_result;
 806         boolean_t               interruptible_state;
 807         boolean_t               data_already_requested = FALSE;
 808         vm_behavior_t           orig_behavior;
 809         vm_size_t               orig_cluster_size;
 810         vm_fault_return_t       error;
 811         int                     my_fault;
 812         uint32_t                try_failed_count;
 813         int                     interruptible; /* how may fault be interrupted? */
 814         memory_object_t         pager;
 815         vm_fault_return_t       retval;
 816
 817 /*
 818  * MACH page map - an optional optimization where a bit map is maintained
 819  * by the VM subsystem for internal objects to indicate which pages of
 820  * the object currently reside on backing store.  This existence map
 821  * duplicates information maintained by the vnode pager.  It is
 822  * created at the time of the first pageout against the object, i.e.
 823  * at the same time pager for the object is created.  The optimization
 824  * is designed to eliminate pager interaction overhead, if it is
 825  * 'known' that the page does not exist on backing store.
 826  *
 827  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 828  * either marked as paged out in the existence map for the object or no
 829  * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 830  * criteria in the decision to invoke the pager.   It is also used as one
 831  * of the criteria to terminate the scan for adjacent pages in a clustered
 832  * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 833  * permanent objects.  Note also that if the pager for an internal object
 834  * has not been created, the pager is not invoked regardless of the value
 835  * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 836  * for which a pager has been created.
 837  *
 838  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 839  * is marked as paged out in the existence map for the object.
 840  * PAGED_OUT() is used to determine if a page has already been pushed
 841  * into a copy object in order to avoid a redundant page out operation.
 842  */
 843 #if MACH_PAGEMAP
 844 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 845                         != VM_EXTERNAL_STATE_ABSENT)
 846 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 847                         == VM_EXTERNAL_STATE_EXISTS)
 848 #else
 849 #define MUST_ASK_PAGER(o, f) (TRUE)
 850 #define PAGED_OUT(o, f) (FALSE)
 851 #endif
 852
 853 /*
 854  *      Recovery actions
 855  */
 856 #define RELEASE_PAGE(m)                                 \
 857         MACRO_BEGIN                                     \
 858         PAGE_WAKEUP_DONE(m);                            \
 859         if (!m->active && !m->inactive && !m->throttled) {              \
 860                 vm_page_lockspin_queues();                              \
 861                 if (!m->active && !m->inactive && !m->throttled)        \
 862                         vm_page_activate(m);                            \
 863                 vm_page_unlock_queues();                                \
 864         }                                                               \
 865         MACRO_END
 866
 867 #if TRACEFAULTPAGE
 868         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 869 #endif
 870
 871         interruptible = fault_info->interruptible;
 872         interruptible_state = thread_interrupt_level(interruptible);
 873
 874         /*
 875          *      INVARIANTS (through entire routine):
 876          *
 877          *      1)      At all times, we must either have the object
 878          *              lock or a busy page in some object to prevent
 879          *              some other thread from trying to bring in
 880          *              the same page.
 881          *
 882          *              Note that we cannot hold any locks during the
 883          *              pager access or when waiting for memory, so
 884          *              we use a busy page then.
 885          *
 886          *      2)      To prevent another thread from racing us down the
 887          *              shadow chain and entering a new page in the top
 888          *              object before we do, we must keep a busy page in
 889          *              the top object while following the shadow chain.
 890          *
 891          *      3)      We must increment paging_in_progress on any object
 892          *              for which we have a busy page before dropping
 893          *              the object lock
 894          *
 895          *      4)      We leave busy pages on the pageout queues.
 896          *              If the pageout daemon comes across a busy page,
 897          *              it will remove the page from the pageout queues.
 898          */
 899
 900         object = first_object;
 901         offset = first_offset;
 902         first_m = VM_PAGE_NULL;
 903         access_required = fault_type;
 904
 905
 906         XPR(XPR_VM_FAULT,
 907                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 908                 object, offset, fault_type, *protection, 0);
 909
 910         /*
 911          * default type of fault
 912          */
 913         my_fault = DBG_CACHE_HIT_FAULT;
 914
 915         while (TRUE) {
 916 #if TRACEFAULTPAGE
 917                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 918 #endif
 919                 if (!object->alive) {
 920                         /*
 921                          * object is no longer valid
 922                          * clean up and return error
 923                          */
 924                         vm_fault_cleanup(object, first_m);
 925                         thread_interrupt_level(interruptible_state);
 926
 927                         return (VM_FAULT_MEMORY_ERROR);
 928                 }
 929
 930                 if (!object->pager_created && object->phys_contiguous) {
 931                         /*
 932                          * A physically-contiguous object without a pager:
 933                          * must be a "large page" object.  We do not deal
 934                          * with VM pages for this object.
 935                          */
 936                         m = VM_PAGE_NULL;
 937                         goto phys_contig_object;
 938                 }
 939
 940                 if (object->blocked_access) {
 941                         /*
 942                          * Access to this VM object has been blocked.
 943                          * Replace our "paging_in_progress" reference with
 944                          * an "activity_in_progress" reference and wait for
 945                          * access to be unblocked.
 946                          */
 947                         vm_object_activity_begin(object);
 948                         vm_object_paging_end(object);
 949                         while (object->blocked_access) {
 950                                 vm_object_sleep(object,
 951                                                 VM_OBJECT_EVENT_UNBLOCKED,
 952                                                 THREAD_UNINT);
 953                         }
 954                         vm_fault_page_blocked_access++;
 955                         vm_object_paging_begin(object);
 956                         vm_object_activity_end(object);
 957                 }
 958
 959                 /*
 960                  * See whether the page at 'offset' is resident
 961                  */
 962                 m = vm_page_lookup(object, offset);
 963 #if TRACEFAULTPAGE
 964                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 965 #endif
 966                 if (m != VM_PAGE_NULL) {
 967
 968                         if (m->busy) {
 969                                 /*
 970                                  * The page is being brought in,
 971                                  * wait for it and then retry.
 972                                  */
 973 #if TRACEFAULTPAGE
 974                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 975 #endif
 976                                 wait_result = PAGE_SLEEP(object, m, interruptible);
 977
 978                                 XPR(XPR_VM_FAULT,
 979                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
 980                                     object, offset,
 981                                     m, 0, 0);
 982                                 counter(c_vm_fault_page_block_busy_kernel++);
 983
 984                                 if (wait_result != THREAD_AWAKENED) {
 985                                         vm_fault_cleanup(object, first_m);
 986                                         thread_interrupt_level(interruptible_state);
 987
 988                                         if (wait_result == THREAD_RESTART)
 989                                                 return (VM_FAULT_RETRY);
 990                                         else
 991                                                 return (VM_FAULT_INTERRUPTED);
 992                                 }
 993                                 continue;
 994                         }
 995                         if (m->laundry) {
 996                                 m->pageout = FALSE;
 997
 998                                 if (!m->cleaning)
 999                                         vm_pageout_steal_laundry(m, FALSE);
1000                         }
1001                         if (m->phys_page == vm_page_guard_addr) {
1002                                 /*
1003                                  * Guard page: off limits !
1004                                  */
1005                                 if (fault_type == VM_PROT_NONE) {
1006                                         /*
1007                                          * The fault is not requesting any
1008                                          * access to the guard page, so it must
1009                                          * be just to wire or unwire it.
1010                                          * Let's pretend it succeeded...
1011                                          */
1012                                         m->busy = TRUE;
1013                                         *result_page = m;
1014                                         assert(first_m == VM_PAGE_NULL);
1015                                         *top_page = first_m;
1016                                         if (type_of_fault)
1017                                                 *type_of_fault = DBG_GUARD_FAULT;
1018                                         return VM_FAULT_SUCCESS;
1019                                 } else {
1020                                         /*
1021                                          * The fault requests access to the
1022                                          * guard page: let's deny that !
1023                                          */
1024                                         vm_fault_cleanup(object, first_m);
1025                                         thread_interrupt_level(interruptible_state);
1026                                         return VM_FAULT_MEMORY_ERROR;
1027                                 }
1028                         }
1029
1030                         if (m->error) {
1031                                 /*
1032                                  * The page is in error, give up now.
1033                                  */
1034 #if TRACEFAULTPAGE
1035                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1036 #endif
1037                                 if (error_code)
1038                                         *error_code = KERN_MEMORY_ERROR;
1039                                 VM_PAGE_FREE(m);
1040
1041                                 vm_fault_cleanup(object, first_m);
1042                                 thread_interrupt_level(interruptible_state);
1043
1044                                 return (VM_FAULT_MEMORY_ERROR);
1045                         }
1046                         if (m->restart) {
1047                                 /*
1048                                  * The pager wants us to restart
1049                                  * at the top of the chain,
1050                                  * typically because it has moved the
1051                                  * page to another pager, then do so.
1052                                  */
1053 #if TRACEFAULTPAGE
1054                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1055 #endif
1056                                 VM_PAGE_FREE(m);
1057
1058                                 vm_fault_cleanup(object, first_m);
1059                                 thread_interrupt_level(interruptible_state);
1060
1061                                 return (VM_FAULT_RETRY);
1062                         }
1063                         if (m->absent) {
1064                                 /*
1065                                  * The page isn't busy, but is absent,
1066                                  * therefore it's deemed "unavailable".
1067                                  *
1068                                  * Remove the non-existent page (unless it's
1069                                  * in the top object) and move on down to the
1070                                  * next object (if there is one).
1071                                  */
1072 #if TRACEFAULTPAGE
1073                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1074 #endif
1075                                 next_object = object->shadow;
1076
1077                                 if (next_object == VM_OBJECT_NULL) {
1078                                         /*
1079                                          * Absent page at bottom of shadow
1080                                          * chain; zero fill the page we left
1081                                          * busy in the first object, and free
1082                                          * the absent page.
1083                                          */
1084                                         assert(!must_be_resident);
1085
1086                                         /*
1087                                          * check for any conditions that prevent
1088                                          * us from creating a new zero-fill page
1089                                          * vm_fault_check will do all of the
1090                                          * fault cleanup in the case of an error condition
1091                                          * including resetting the thread_interrupt_level
1092                                          */
1093                                         error = vm_fault_check(object, m, first_m, interruptible_state);
1094
1095                                         if (error != VM_FAULT_SUCCESS)
1096                                                 return (error);
1097
1098                                         XPR(XPR_VM_FAULT,
1099                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1100                                                 object, offset,
1101                                                 m,
1102                                                 first_object, 0);
1103
1104                                         if (object != first_object) {
1105                                                 /*
1106                                                  * free the absent page we just found
1107                                                  */
1108                                                 VM_PAGE_FREE(m);
1109
1110                                                 /*
1111                                                  * drop reference and lock on current object
1112                                                  */
1113                                                 vm_object_paging_end(object);
1114                                                 vm_object_unlock(object);
1115
1116                                                 /*
1117                                                  * grab the original page we
1118                                                  * 'soldered' in place and
1119                                                  * retake lock on 'first_object'
1120                                                  */
1121                                                 m = first_m;
1122                                                 first_m = VM_PAGE_NULL;
1123
1124                                                 object = first_object;
1125                                                 offset = first_offset;
1126
1127                                                 vm_object_lock(object);
1128                                         } else {
1129                                                 /*
1130                                                  * we're going to use the absent page we just found
1131                                                  * so convert it to a 'busy' page
1132                                                  */
1133                                                 m->absent = FALSE;
1134                                                 m->busy = TRUE;
1135                                         }
1136                                         /*
1137                                          * zero-fill the page and put it on
1138                                          * the correct paging queue
1139                                          */
1140                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1141
1142                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1143                                                 m->absent = TRUE;
1144
1145                                         break;
1146                                 } else {
1147                                         if (must_be_resident)
1148                                                 vm_object_paging_end(object);
1149                                         else if (object != first_object) {
1150                                                 vm_object_paging_end(object);
1151                                                 VM_PAGE_FREE(m);
1152                                         } else {
1153                                                 first_m = m;
1154                                                 m->absent = FALSE;
1155                                                 m->busy = TRUE;
1156
1157                                                 vm_page_lockspin_queues();
1158
1159                                                 assert(!m->pageout_queue);
1160                                                 VM_PAGE_QUEUES_REMOVE(m);
1161
1162                                                 vm_page_unlock_queues();
1163                                         }
1164                                         XPR(XPR_VM_FAULT,
1165                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1166                                                 object, offset,
1167                                                 next_object,
1168                                                 offset+object->vo_shadow_offset,0);
1169
1170                                         offset += object->vo_shadow_offset;
1171                                         fault_info->lo_offset += object->vo_shadow_offset;
1172                                         fault_info->hi_offset += object->vo_shadow_offset;
1173                                         access_required = VM_PROT_READ;
1174
1175                                         vm_object_lock(next_object);
1176                                         vm_object_unlock(object);
1177                                         object = next_object;
1178                                         vm_object_paging_begin(object);
1179
1180                                         /*
1181                                          * reset to default type of fault
1182                                          */
1183                                         my_fault = DBG_CACHE_HIT_FAULT;
1184
1185                                         continue;
1186                                 }
1187                         }
1188                         if ((m->cleaning)
1189                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1190                             && (fault_type & VM_PROT_WRITE)) {
1191                                 /*
1192                                  * This is a copy-on-write fault that will
1193                                  * cause us to revoke access to this page, but
1194                                  * this page is in the process of being cleaned
1195                                  * in a clustered pageout. We must wait until
1196                                  * the cleaning operation completes before
1197                                  * revoking access to the original page,
1198                                  * otherwise we might attempt to remove a
1199                                  * wired mapping.
1200                                  */
1201 #if TRACEFAULTPAGE
1202                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1203 #endif
1204                                 XPR(XPR_VM_FAULT,
1205                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1206                                         object, offset,
1207                                         m, 0, 0);
1208                                 /*
1209                                  * take an extra ref so that object won't die
1210                                  */
1211                                 vm_object_reference_locked(object);
1212
1213                                 vm_fault_cleanup(object, first_m);
1214
1215                                 counter(c_vm_fault_page_block_backoff_kernel++);
1216                                 vm_object_lock(object);
1217                                 assert(object->ref_count > 0);
1218
1219                                 m = vm_page_lookup(object, offset);
1220
1221                                 if (m != VM_PAGE_NULL && m->cleaning) {
1222                                         PAGE_ASSERT_WAIT(m, interruptible);
1223
1224                                         vm_object_unlock(object);
1225                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1226                                         vm_object_deallocate(object);
1227
1228                                         goto backoff;
1229                                 } else {
1230                                         vm_object_unlock(object);
1231
1232                                         vm_object_deallocate(object);
1233                                         thread_interrupt_level(interruptible_state);
1234
1235                                         return (VM_FAULT_RETRY);
1236                                 }
1237                         }
1238                         if (type_of_fault == NULL && m->speculative &&
1239                             !(fault_info != NULL && fault_info->stealth)) {
1240                                 /*
1241                                  * If we were passed a non-NULL pointer for
1242                                  * "type_of_fault", then we came from
1243                                  * vm_fault... we'll let it deal with
1244                                  * this condition, since it
1245                                  * needs to see m->speculative to correctly
1246                                  * account the pageins, otherwise...
1247                                  * take it off the speculative queue, we'll
1248                                  * let the caller of vm_fault_page deal
1249                                  * with getting it onto the correct queue
1250                                  *
1251                                  * If the caller specified in fault_info that
1252                                  * it wants a "stealth" fault, we also leave
1253                                  * the page in the speculative queue.
1254                                  */
1255                                 vm_page_lockspin_queues();
1256                                 if (m->speculative)
1257                                         VM_PAGE_QUEUES_REMOVE(m);
1258                                 vm_page_unlock_queues();
1259                         }
1260
1261                         if (m->encrypted) {
1262                                 /*
1263                                  * ENCRYPTED SWAP:
1264                                  * the user needs access to a page that we
1265                                  * encrypted before paging it out.
1266                                  * Decrypt the page now.
1267                                  * Keep it busy to prevent anyone from
1268                                  * accessing it during the decryption.
1269                                  */
1270                                 m->busy = TRUE;
1271                                 vm_page_decrypt(m, 0);
1272                                 assert(object == m->object);
1273                                 assert(m->busy);
1274                                 PAGE_WAKEUP_DONE(m);
1275
1276                                 /*
1277                                  * Retry from the top, in case
1278                                  * something changed while we were
1279                                  * decrypting.
1280                                  */
1281                                 continue;
1282                         }
1283                         ASSERT_PAGE_DECRYPTED(m);
1284
1285                         if (m->object->code_signed) {
1286                                 /*
1287                                  * CODE SIGNING:
1288                                  * We just paged in a page from a signed
1289                                  * memory object but we don't need to
1290                                  * validate it now.  We'll validate it if and
1291                                  * when it gets mapped into a user address
1292                                  * space for the first time or when the page
1293                                  * gets copied to another object as a result
1294                                  * of a copy-on-write.
1295                                  */
1296                         }
1297
1298                         /*
1299                          * We mark the page busy and leave it on
1300                          * the pageout queues.  If the pageout
1301                          * daemon comes across it, then it will
1302                          * remove the page from the queue, but not the object
1303                          */
1304 #if TRACEFAULTPAGE
1305                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1306 #endif
1307                         XPR(XPR_VM_FAULT,
1308                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1309                                 object, offset, m, 0, 0);
1310                         assert(!m->busy);
1311                         assert(!m->absent);
1312
1313                         m->busy = TRUE;
1314                         break;
1315                 }
1316
1317
1318                 /*
1319                  * we get here when there is no page present in the object at
1320                  * the offset we're interested in... we'll allocate a page
1321                  * at this point if the pager associated with
1322                  * this object can provide the data or we're the top object...
1323                  * object is locked;  m == NULL
1324                  */
1325                 if (must_be_resident)
1326                         goto dont_look_for_page;
1327
1328                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1329
1330 #if TRACEFAULTPAGE
1331                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1332 #endif
1333                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1334                         /*
1335                          * Allocate a new page for this object/offset pair as a placeholder
1336                          */
1337                         m = vm_page_grab();
1338 #if TRACEFAULTPAGE
1339                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1340 #endif
1341                         if (m == VM_PAGE_NULL) {
1342
1343                                 vm_fault_cleanup(object, first_m);
1344                                 thread_interrupt_level(interruptible_state);
1345
1346                                 return (VM_FAULT_MEMORY_SHORTAGE);
1347                         }
1348
1349                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1350                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1351                         } else {
1352                                 vm_page_insert(m, object, offset);
1353                         }
1354                 }
1355                 if (look_for_page) {
1356                         kern_return_t   rc;
1357
1358                         /*
1359                          *      If the memory manager is not ready, we
1360                          *      cannot make requests.
1361                          */
1362                         if (!object->pager_ready) {
1363 #if TRACEFAULTPAGE
1364                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1365 #endif
1366                                 if (m != VM_PAGE_NULL)
1367                                         VM_PAGE_FREE(m);
1368
1369                                 XPR(XPR_VM_FAULT,
1370                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1371                                         object, offset, 0, 0, 0);
1372
1373                                 /*
1374                                  * take an extra ref so object won't die
1375                                  */
1376                                 vm_object_reference_locked(object);
1377                                 vm_fault_cleanup(object, first_m);
1378                                 counter(c_vm_fault_page_block_backoff_kernel++);
1379
1380                                 vm_object_lock(object);
1381                                 assert(object->ref_count > 0);
1382
1383                                 if (!object->pager_ready) {
1384                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1385
1386                                         vm_object_unlock(object);
1387                                         if (wait_result == THREAD_WAITING)
1388                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1389                                         vm_object_deallocate(object);
1390
1391                                         goto backoff;
1392                                 } else {
1393                                         vm_object_unlock(object);
1394                                         vm_object_deallocate(object);
1395                                         thread_interrupt_level(interruptible_state);
1396
1397                                         return (VM_FAULT_RETRY);
1398                                 }
1399                         }
1400                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1401                                 /*
1402                                  * If there are too many outstanding page
1403                                  * requests pending on this external object, we
1404                                  * wait for them to be resolved now.
1405                                  */
1406 #if TRACEFAULTPAGE
1407                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1408 #endif
1409                                 if (m != VM_PAGE_NULL)
1410                                         VM_PAGE_FREE(m);
1411                                 /*
1412                                  * take an extra ref so object won't die
1413                                  */
1414                                 vm_object_reference_locked(object);
1415
1416                                 vm_fault_cleanup(object, first_m);
1417
1418                                 counter(c_vm_fault_page_block_backoff_kernel++);
1419
1420                                 vm_object_lock(object);
1421                                 assert(object->ref_count > 0);
1422
1423                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1424                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1425
1426                                         vm_object_unlock(object);
1427                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1428                                         vm_object_deallocate(object);
1429
1430                                         goto backoff;
1431                                 } else {
1432                                         vm_object_unlock(object);
1433                                         vm_object_deallocate(object);
1434                                         thread_interrupt_level(interruptible_state);
1435
1436                                         return (VM_FAULT_RETRY);
1437                                 }
1438                         }
1439                         if (m != VM_PAGE_NULL) {
1440                                 VM_PAGE_FREE(m);
1441                                 m = VM_PAGE_NULL;
1442                         }
1443
1444 #if TRACEFAULTPAGE
1445                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1446 #endif
1447
1448                         /*
1449                          * It's possible someone called vm_object_destroy while we weren't
1450                          * holding the object lock.  If that has happened, then bail out
1451                          * here.
1452                          */
1453
1454                         pager = object->pager;
1455
1456                         if (pager == MEMORY_OBJECT_NULL) {
1457                                 vm_fault_cleanup(object, first_m);
1458                                 thread_interrupt_level(interruptible_state);
1459                                 return VM_FAULT_MEMORY_ERROR;
1460                         }
1461
1462                         /*
1463                          * We have an absent page in place for the faulting offset,
1464                          * so we can release the object lock.
1465                          */
1466
1467                         vm_object_unlock(object);
1468
1469                         /*
1470                          * If this object uses a copy_call strategy,
1471                          * and we are interested in a copy of this object
1472                          * (having gotten here only by following a
1473                          * shadow chain), then tell the memory manager
1474                          * via a flag added to the desired_access
1475                          * parameter, so that it can detect a race
1476                          * between our walking down the shadow chain
1477                          * and its pushing pages up into a copy of
1478                          * the object that it manages.
1479                          */
1480                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1481                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1482                         else
1483                                 wants_copy_flag = VM_PROT_NONE;
1484
1485                         XPR(XPR_VM_FAULT,
1486                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1487                                 object, offset, m,
1488                                 access_required | wants_copy_flag, 0);
1489
1490                         if (object->copy == first_object) {
1491                                 /*
1492                                  * if we issue the memory_object_data_request in
1493                                  * this state, we are subject to a deadlock with
1494                                  * the underlying filesystem if it is trying to
1495                                  * shrink the file resulting in a push of pages
1496                                  * into the copy object...  that push will stall
1497                                  * on the placeholder page, and if the pushing thread
1498                                  * is holding a lock that is required on the pagein
1499                                  * path (such as a truncate lock), we'll deadlock...
1500                                  * to avoid this potential deadlock, we throw away
1501                                  * our placeholder page before calling memory_object_data_request
1502                                  * and force this thread to retry the vm_fault_page after
1503                                  * we have issued the I/O.  the second time through this path
1504                                  * we will find the page already in the cache (presumably still
1505                                  * busy waiting for the I/O to complete) and then complete
1506                                  * the fault w/o having to go through memory_object_data_request again
1507                                  */
1508                                 assert(first_m != VM_PAGE_NULL);
1509                                 assert(first_m->object == first_object);
1510
1511                                 vm_object_lock(first_object);
1512                                 VM_PAGE_FREE(first_m);
1513                                 vm_object_paging_end(first_object);
1514                                 vm_object_unlock(first_object);
1515
1516                                 first_m = VM_PAGE_NULL;
1517                                 force_fault_retry = TRUE;
1518
1519                                 vm_fault_page_forced_retry++;
1520                         }
1521
1522                         if (data_already_requested == TRUE) {
1523                                 orig_behavior = fault_info->behavior;
1524                                 orig_cluster_size = fault_info->cluster_size;
1525
1526                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1527                                 fault_info->cluster_size = PAGE_SIZE;
1528                         }
1529                         /*
1530                          * Call the memory manager to retrieve the data.
1531                          */
1532                         rc = memory_object_data_request(
1533                                 pager,
1534                                 offset + object->paging_offset,
1535                                 PAGE_SIZE,
1536                                 access_required | wants_copy_flag,
1537                                 (memory_object_fault_info_t)fault_info);
1538
1539                         if (data_already_requested == TRUE) {
1540                                 fault_info->behavior = orig_behavior;
1541                                 fault_info->cluster_size = orig_cluster_size;
1542                         } else
1543                                 data_already_requested = TRUE;
1544
1545 #if TRACEFAULTPAGE
1546                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1547 #endif
1548                         vm_object_lock(object);
1549
1550                         if (rc != KERN_SUCCESS) {
1551
1552                                 vm_fault_cleanup(object, first_m);
1553                                 thread_interrupt_level(interruptible_state);
1554
1555                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1556                                         VM_FAULT_INTERRUPTED :
1557                                         VM_FAULT_MEMORY_ERROR);
1558                         } else {
1559                                 clock_sec_t     tv_sec;
1560                                 clock_usec_t    tv_usec;
1561
1562                                 clock_get_system_microtime(&tv_sec, &tv_usec);
1563                                 current_thread()->t_page_creation_time = tv_sec;
1564                                 current_thread()->t_page_creation_count = 0;
1565                         }
1566                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1567
1568                                 vm_fault_cleanup(object, first_m);
1569                                 thread_interrupt_level(interruptible_state);
1570
1571                                 return (VM_FAULT_INTERRUPTED);
1572                         }
1573                         if (force_fault_retry == TRUE) {
1574
1575                                 vm_fault_cleanup(object, first_m);
1576                                 thread_interrupt_level(interruptible_state);
1577
1578                                 return (VM_FAULT_RETRY);
1579                         }
1580                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1581                                 /*
1582                                  * No page here means that the object we
1583                                  * initially looked up was "physically
1584                                  * contiguous" (i.e. device memory).  However,
1585                                  * with Virtual VRAM, the object might not
1586                                  * be backed by that device memory anymore,
1587                                  * so we're done here only if the object is
1588                                  * still "phys_contiguous".
1589                                  * Otherwise, if the object is no longer
1590                                  * "phys_contiguous", we need to retry the
1591                                  * page fault against the object's new backing
1592                                  * store (different memory object).
1593                                  */
1594                         phys_contig_object:
1595                                 goto done;
1596                         }
1597                         /*
1598                          * potentially a pagein fault
1599                          * if we make it through the state checks
1600                          * above, then we'll count it as such
1601                          */
1602                         my_fault = DBG_PAGEIN_FAULT;
1603
1604                         /*
1605                          * Retry with same object/offset, since new data may
1606                          * be in a different page (i.e., m is meaningless at
1607                          * this point).
1608                          */
1609                         continue;
1610                 }
1611 dont_look_for_page:
1612                 /*
1613                  * We get here if the object has no pager, or an existence map
1614                  * exists and indicates the page isn't present on the pager
1615                  * or we're unwiring a page.  If a pager exists, but there
1616                  * is no existence map, then the m->absent case above handles
1617                  * the ZF case when the pager can't provide the page
1618                  */
1619 #if TRACEFAULTPAGE
1620                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1621 #endif
1622                 if (object == first_object)
1623                         first_m = m;
1624                 else
1625                         assert(m == VM_PAGE_NULL);
1626
1627                 XPR(XPR_VM_FAULT,
1628                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1629                         object, offset, m,
1630                         object->shadow, 0);
1631
1632                 next_object = object->shadow;
1633
1634                 if (next_object == VM_OBJECT_NULL) {
1635                         /*
1636                          * we've hit the bottom of the shadow chain,
1637                          * fill the page in the top object with zeros.
1638                          */
1639                         assert(!must_be_resident);
1640
1641                         if (object != first_object) {
1642                                 vm_object_paging_end(object);
1643                                 vm_object_unlock(object);
1644
1645                                 object = first_object;
1646                                 offset = first_offset;
1647                                 vm_object_lock(object);
1648                         }
1649                         m = first_m;
1650                         assert(m->object == object);
1651                         first_m = VM_PAGE_NULL;
1652
1653                         /*
1654                          * check for any conditions that prevent
1655                          * us from creating a new zero-fill page
1656                          * vm_fault_check will do all of the
1657                          * fault cleanup in the case of an error condition
1658                          * including resetting the thread_interrupt_level
1659                          */
1660                         error = vm_fault_check(object, m, first_m, interruptible_state);
1661
1662                         if (error != VM_FAULT_SUCCESS)
1663                                 return (error);
1664
1665                         if (m == VM_PAGE_NULL) {
1666                                 m = vm_page_grab();
1667
1668                                 if (m == VM_PAGE_NULL) {
1669                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1670                                         thread_interrupt_level(interruptible_state);
1671
1672                                         return (VM_FAULT_MEMORY_SHORTAGE);
1673                                 }
1674                                 vm_page_insert(m, object, offset);
1675                         }
1676                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1677
1678                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1679                                 m->absent = TRUE;
1680                         break;
1681
1682                 } else {
1683                         /*
1684                          * Move on to the next object.  Lock the next
1685                          * object before unlocking the current one.
1686                          */
1687                         if ((object != first_object) || must_be_resident)
1688                                 vm_object_paging_end(object);
1689
1690                         offset += object->vo_shadow_offset;
1691                         fault_info->lo_offset += object->vo_shadow_offset;
1692                         fault_info->hi_offset += object->vo_shadow_offset;
1693                         access_required = VM_PROT_READ;
1694
1695                         vm_object_lock(next_object);
1696                         vm_object_unlock(object);
1697
1698                         object = next_object;
1699                         vm_object_paging_begin(object);
1700                 }
1701         }
1702
1703         /*
1704          *      PAGE HAS BEEN FOUND.
1705          *
1706          *      This page (m) is:
1707          *              busy, so that we can play with it;
1708          *              not absent, so that nobody else will fill it;
1709          *              possibly eligible for pageout;
1710          *
1711          *      The top-level page (first_m) is:
1712          *              VM_PAGE_NULL if the page was found in the
1713          *               top-level object;
1714          *              busy, not absent, and ineligible for pageout.
1715          *
1716          *      The current object (object) is locked.  A paging
1717          *      reference is held for the current and top-level
1718          *      objects.
1719          */
1720
1721 #if TRACEFAULTPAGE
1722         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1723 #endif
1724 #if     EXTRA_ASSERTIONS
1725         assert(m->busy && !m->absent);
1726         assert((first_m == VM_PAGE_NULL) ||
1727                (first_m->busy && !first_m->absent &&
1728                 !first_m->active && !first_m->inactive));
1729 #endif  /* EXTRA_ASSERTIONS */
1730
1731         /*
1732          * ENCRYPTED SWAP:
1733          * If we found a page, we must have decrypted it before we
1734          * get here...
1735          */
1736         ASSERT_PAGE_DECRYPTED(m);
1737
1738         XPR(XPR_VM_FAULT,
1739             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1740                 object, offset, m,
1741                 first_object, first_m);
1742
1743         /*
1744          * If the page is being written, but isn't
1745          * already owned by the top-level object,
1746          * we have to copy it into a new page owned
1747          * by the top-level object.
1748          */
1749         if (object != first_object) {
1750
1751 #if TRACEFAULTPAGE
1752                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1753 #endif
1754                 if (fault_type & VM_PROT_WRITE) {
1755                         vm_page_t copy_m;
1756
1757                         /*
1758                          * We only really need to copy if we
1759                          * want to write it.
1760                          */
1761                         assert(!must_be_resident);
1762
1763                         /*
1764                          * are we protecting the system from
1765                          * backing store exhaustion?  If so,
1766                          * sleep unless we are privileged.
1767                          */
1768                         if (vm_backing_store_low) {
1769                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1770
1771                                         RELEASE_PAGE(m);
1772                                         vm_fault_cleanup(object, first_m);
1773
1774                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1775
1776                                         thread_block(THREAD_CONTINUE_NULL);
1777                                         thread_interrupt_level(interruptible_state);
1778
1779                                         return (VM_FAULT_RETRY);
1780                                 }
1781                         }
1782                         /*
1783                          * If we try to collapse first_object at this
1784                          * point, we may deadlock when we try to get
1785                          * the lock on an intermediate object (since we
1786                          * have the bottom object locked).  We can't
1787                          * unlock the bottom object, because the page
1788                          * we found may move (by collapse) if we do.
1789                          *
1790                          * Instead, we first copy the page.  Then, when
1791                          * we have no more use for the bottom object,
1792                          * we unlock it and try to collapse.
1793                          *
1794                          * Note that we copy the page even if we didn't
1795                          * need to... that's the breaks.
1796                          */
1797
1798                         /*
1799                          * Allocate a page for the copy
1800                          */
1801                         copy_m = vm_page_grab();
1802
1803                         if (copy_m == VM_PAGE_NULL) {
1804                                 RELEASE_PAGE(m);
1805
1806                                 vm_fault_cleanup(object, first_m);
1807                                 thread_interrupt_level(interruptible_state);
1808
1809                                 return (VM_FAULT_MEMORY_SHORTAGE);
1810                         }
1811                         XPR(XPR_VM_FAULT,
1812                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1813                                 object, offset,
1814                                 m, copy_m, 0);
1815
1816                         vm_page_copy(m, copy_m);
1817
1818                         /*
1819                          * If another map is truly sharing this
1820                          * page with us, we have to flush all
1821                          * uses of the original page, since we
1822                          * can't distinguish those which want the
1823                          * original from those which need the
1824                          * new copy.
1825                          *
1826                          * XXXO If we know that only one map has
1827                          * access to this page, then we could
1828                          * avoid the pmap_disconnect() call.
1829                          */
1830                         if (m->pmapped)
1831                                 pmap_disconnect(m->phys_page);
1832
1833                         assert(!m->cleaning);
1834
1835                         /*
1836                          * We no longer need the old page or object.
1837                          */
1838                         PAGE_WAKEUP_DONE(m);
1839                         vm_object_paging_end(object);
1840                         vm_object_unlock(object);
1841
1842                         my_fault = DBG_COW_FAULT;
1843                         VM_STAT_INCR(cow_faults);
1844                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1845                         current_task()->cow_faults++;
1846
1847                         object = first_object;
1848                         offset = first_offset;
1849
1850                         vm_object_lock(object);
1851                         /*
1852                          * get rid of the place holder
1853                          * page that we soldered in earlier
1854                          */
1855                         VM_PAGE_FREE(first_m);
1856                         first_m = VM_PAGE_NULL;
1857
1858                         /*
1859                          * and replace it with the
1860                          * page we just copied into
1861                          */
1862                         assert(copy_m->busy);
1863                         vm_page_insert(copy_m, object, offset);
1864                         SET_PAGE_DIRTY(copy_m, TRUE);
1865
1866                         m = copy_m;
1867                         /*
1868                          * Now that we've gotten the copy out of the
1869                          * way, let's try to collapse the top object.
1870                          * But we have to play ugly games with
1871                          * paging_in_progress to do that...
1872                          */
1873                         vm_object_paging_end(object);
1874                         vm_object_collapse(object, offset, TRUE);
1875                         vm_object_paging_begin(object);
1876
1877                 } else
1878                         *protection &= (~VM_PROT_WRITE);
1879         }
1880         /*
1881          * Now check whether the page needs to be pushed into the
1882          * copy object.  The use of asymmetric copy on write for
1883          * shared temporary objects means that we may do two copies to
1884          * satisfy the fault; one above to get the page from a
1885          * shadowed object, and one here to push it into the copy.
1886          */
1887         try_failed_count = 0;
1888
1889         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1890                 vm_object_offset_t      copy_offset;
1891                 vm_page_t               copy_m;
1892
1893 #if TRACEFAULTPAGE
1894                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
1895 #endif
1896                 /*
1897                  * If the page is being written, but hasn't been
1898                  * copied to the copy-object, we have to copy it there.
1899                  */
1900                 if ((fault_type & VM_PROT_WRITE) == 0) {
1901                         *protection &= ~VM_PROT_WRITE;
1902                         break;
1903                 }
1904
1905                 /*
1906                  * If the page was guaranteed to be resident,
1907                  * we must have already performed the copy.
1908                  */
1909                 if (must_be_resident)
1910                         break;
1911
1912                 /*
1913                  * Try to get the lock on the copy_object.
1914                  */
1915                 if (!vm_object_lock_try(copy_object)) {
1916
1917                         vm_object_unlock(object);
1918                         try_failed_count++;
1919
1920                         mutex_pause(try_failed_count);  /* wait a bit */
1921                         vm_object_lock(object);
1922
1923                         continue;
1924                 }
1925                 try_failed_count = 0;
1926
1927                 /*
1928                  * Make another reference to the copy-object,
1929                  * to keep it from disappearing during the
1930                  * copy.
1931                  */
1932                 vm_object_reference_locked(copy_object);
1933
1934                 /*
1935                  * Does the page exist in the copy?
1936                  */
1937                 copy_offset = first_offset - copy_object->vo_shadow_offset;
1938
1939                 if (copy_object->vo_size <= copy_offset)
1940                         /*
1941                          * Copy object doesn't cover this page -- do nothing.
1942                          */
1943                         ;
1944                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1945                         /*
1946                          * Page currently exists in the copy object
1947                          */
1948                         if (copy_m->busy) {
1949                                 /*
1950                                  * If the page is being brought
1951                                  * in, wait for it and then retry.
1952                                  */
1953                                 RELEASE_PAGE(m);
1954
1955                                 /*
1956                                  * take an extra ref so object won't die
1957                                  */
1958                                 vm_object_reference_locked(copy_object);
1959                                 vm_object_unlock(copy_object);
1960                                 vm_fault_cleanup(object, first_m);
1961                                 counter(c_vm_fault_page_block_backoff_kernel++);
1962
1963                                 vm_object_lock(copy_object);
1964                                 assert(copy_object->ref_count > 0);
1965                                 VM_OBJ_RES_DECR(copy_object);
1966                                 vm_object_lock_assert_exclusive(copy_object);
1967                                 copy_object->ref_count--;
1968                                 assert(copy_object->ref_count > 0);
1969                                 copy_m = vm_page_lookup(copy_object, copy_offset);
1970                                 /*
1971                                  * ENCRYPTED SWAP:
1972                                  * it's OK if the "copy_m" page is encrypted,
1973                                  * because we're not moving it nor handling its
1974                                  * contents.
1975                                  */
1976                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
1977                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
1978
1979                                         vm_object_unlock(copy_object);
1980                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1981                                         vm_object_deallocate(copy_object);
1982
1983                                         goto backoff;
1984                                 } else {
1985                                         vm_object_unlock(copy_object);
1986                                         vm_object_deallocate(copy_object);
1987                                         thread_interrupt_level(interruptible_state);
1988
1989                                         return (VM_FAULT_RETRY);
1990                                 }
1991                         }
1992                 }
1993                 else if (!PAGED_OUT(copy_object, copy_offset)) {
1994                         /*
1995                          * If PAGED_OUT is TRUE, then the page used to exist
1996                          * in the copy-object, and has already been paged out.
1997                          * We don't need to repeat this. If PAGED_OUT is
1998                          * FALSE, then either we don't know (!pager_created,
1999                          * for example) or it hasn't been paged out.
2000                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2001                          * We must copy the page to the copy object.
2002                          */
2003
2004                         if (vm_backing_store_low) {
2005                                 /*
2006                                  * we are protecting the system from
2007                                  * backing store exhaustion.  If so
2008                                  * sleep unless we are privileged.
2009                                  */
2010                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2011                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2012
2013                                         RELEASE_PAGE(m);
2014                                         VM_OBJ_RES_DECR(copy_object);
2015                                         vm_object_lock_assert_exclusive(copy_object);
2016                                         copy_object->ref_count--;
2017                                         assert(copy_object->ref_count > 0);
2018
2019                                         vm_object_unlock(copy_object);
2020                                         vm_fault_cleanup(object, first_m);
2021                                         thread_block(THREAD_CONTINUE_NULL);
2022                                         thread_interrupt_level(interruptible_state);
2023
2024                                         return (VM_FAULT_RETRY);
2025                                 }
2026                         }
2027                         /*
2028                          * Allocate a page for the copy
2029                          */
2030                         copy_m = vm_page_alloc(copy_object, copy_offset);
2031
2032                         if (copy_m == VM_PAGE_NULL) {
2033                                 RELEASE_PAGE(m);
2034
2035                                 VM_OBJ_RES_DECR(copy_object);
2036                                 vm_object_lock_assert_exclusive(copy_object);
2037                                 copy_object->ref_count--;
2038                                 assert(copy_object->ref_count > 0);
2039
2040                                 vm_object_unlock(copy_object);
2041                                 vm_fault_cleanup(object, first_m);
2042                                 thread_interrupt_level(interruptible_state);
2043
2044                                 return (VM_FAULT_MEMORY_SHORTAGE);
2045                         }
2046                         /*
2047                          * Must copy page into copy-object.
2048                          */
2049                         vm_page_copy(m, copy_m);
2050
2051                         /*
2052                          * If the old page was in use by any users
2053                          * of the copy-object, it must be removed
2054                          * from all pmaps.  (We can't know which
2055                          * pmaps use it.)
2056                          */
2057                         if (m->pmapped)
2058                                 pmap_disconnect(m->phys_page);
2059
2060                         /*
2061                          * If there's a pager, then immediately
2062                          * page out this page, using the "initialize"
2063                          * option.  Else, we use the copy.
2064                          */
2065                         if ((!copy_object->pager_created)
2066 #if MACH_PAGEMAP
2067                             || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2068 #endif
2069                             ) {
2070
2071                                 vm_page_lockspin_queues();
2072                                 assert(!m->cleaning);
2073                                 vm_page_activate(copy_m);
2074                                 vm_page_unlock_queues();
2075
2076                                 SET_PAGE_DIRTY(copy_m, TRUE);
2077                                 PAGE_WAKEUP_DONE(copy_m);
2078
2079                         } else if (copy_object->internal) {
2080                                 /*
2081                                  * For internal objects check with the pager to see
2082                                  * if the page already exists in the backing store.
2083                                  * If yes, then we can drop the copy page. If not,
2084                                  * then we'll activate it, mark it dirty and keep it
2085                                  * around.
2086                                  */
2087
2088                                 kern_return_t kr = KERN_SUCCESS;
2089
2090                                 memory_object_t copy_pager = copy_object->pager;
2091                                 assert(copy_pager != MEMORY_OBJECT_NULL);
2092                                 vm_object_paging_begin(copy_object);
2093
2094                                 vm_object_unlock(copy_object);
2095
2096                                 kr = memory_object_data_request(
2097                                         copy_pager,
2098                                         copy_offset + copy_object->paging_offset,
2099                                         0, /* Only query the pager. */
2100                                         VM_PROT_READ,
2101                                         NULL);
2102
2103                                 vm_object_lock(copy_object);
2104
2105                                 vm_object_paging_end(copy_object);
2106
2107                                 /*
2108                                  * Since we dropped the copy_object's lock,
2109                                  * check whether we'll have to deallocate
2110                                  * the hard way.
2111                                  */
2112                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2113                                         vm_object_unlock(copy_object);
2114                                         vm_object_deallocate(copy_object);
2115                                         vm_object_lock(object);
2116
2117                                         continue;
2118                                 }
2119                                 if (kr == KERN_SUCCESS) {
2120                                         /*
2121                                          * The pager has the page. We don't want to overwrite
2122                                          * that page by sending this one out to the backing store.
2123                                          * So we drop the copy page.
2124                                          */
2125                                         VM_PAGE_FREE(copy_m);
2126
2127                                 } else {
2128                                         /*
2129                                          * The pager doesn't have the page. We'll keep this one
2130                                          * around in the copy object. It might get sent out to
2131                                          * the backing store under memory pressure.
2132                                          */
2133                                         vm_page_lockspin_queues();
2134                                         assert(!m->cleaning);
2135                                         vm_page_activate(copy_m);
2136                                         vm_page_unlock_queues();
2137
2138                                         SET_PAGE_DIRTY(copy_m, TRUE);
2139                                         PAGE_WAKEUP_DONE(copy_m);
2140                                 }
2141                         } else {
2142
2143                                 assert(copy_m->busy == TRUE);
2144                                 assert(!m->cleaning);
2145
2146                                 /*
2147                                  * dirty is protected by the object lock
2148                                  */
2149                                 SET_PAGE_DIRTY(copy_m, TRUE);
2150
2151                                 /*
2152                                  * The page is already ready for pageout:
2153                                  * not on pageout queues and busy.
2154                                  * Unlock everything except the
2155                                  * copy_object itself.
2156                                  */
2157                                 vm_object_unlock(object);
2158
2159                                 /*
2160                                  * Write the page to the copy-object,
2161                                  * flushing it from the kernel.
2162                                  */
2163                                 vm_pageout_initialize_page(copy_m);
2164
2165                                 /*
2166                                  * Since the pageout may have
2167                                  * temporarily dropped the
2168                                  * copy_object's lock, we
2169                                  * check whether we'll have
2170                                  * to deallocate the hard way.
2171                                  */
2172                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2173                                         vm_object_unlock(copy_object);
2174                                         vm_object_deallocate(copy_object);
2175                                         vm_object_lock(object);
2176
2177                                         continue;
2178                                 }
2179                                 /*
2180                                  * Pick back up the old object's
2181                                  * lock.  [It is safe to do so,
2182                                  * since it must be deeper in the
2183                                  * object tree.]
2184                                  */
2185                                 vm_object_lock(object);
2186                         }
2187
2188                         /*
2189                          * Because we're pushing a page upward
2190                          * in the object tree, we must restart
2191                          * any faults that are waiting here.
2192                          * [Note that this is an expansion of
2193                          * PAGE_WAKEUP that uses the THREAD_RESTART
2194                          * wait result].  Can't turn off the page's
2195                          * busy bit because we're not done with it.
2196                          */
2197                         if (m->wanted) {
2198                                 m->wanted = FALSE;
2199                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2200                         }
2201                 }
2202                 /*
2203                  * The reference count on copy_object must be
2204                  * at least 2: one for our extra reference,
2205                  * and at least one from the outside world
2206                  * (we checked that when we last locked
2207                  * copy_object).
2208                  */
2209                 vm_object_lock_assert_exclusive(copy_object);
2210                 copy_object->ref_count--;
2211                 assert(copy_object->ref_count > 0);
2212
2213                 VM_OBJ_RES_DECR(copy_object);
2214                 vm_object_unlock(copy_object);
2215
2216                 break;
2217         }
2218
2219 done:
2220         *result_page = m;
2221         *top_page = first_m;
2222
2223         XPR(XPR_VM_FAULT,
2224                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2225                 object, offset, m, first_m, 0);
2226
2227         if (m != VM_PAGE_NULL) {
2228                 retval = VM_FAULT_SUCCESS;
2229                 if (my_fault == DBG_PAGEIN_FAULT) {
2230
2231                         VM_STAT_INCR(pageins);
2232                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2233                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2234                         current_task()->pageins++;
2235
2236                         if (m->object->internal) {
2237                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2238                                 my_fault = DBG_PAGEIND_FAULT;
2239                         } else {
2240                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2241                                 my_fault = DBG_PAGEINV_FAULT;
2242                         }
2243
2244                         /*
2245                          * evaluate access pattern and update state
2246                          * vm_fault_deactivate_behind depends on the
2247                          * state being up to date
2248                          */
2249                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2250
2251                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2252                 }
2253                 if (type_of_fault)
2254                         *type_of_fault = my_fault;
2255         } else {
2256                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2257                 assert(first_m == VM_PAGE_NULL);
2258                 assert(object == first_object);
2259         }
2260
2261         thread_interrupt_level(interruptible_state);
2262
2263 #if TRACEFAULTPAGE
2264         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2265 #endif
2266         return retval;
2267
2268 backoff:
2269         thread_interrupt_level(interruptible_state);
2270
2271         if (wait_result == THREAD_INTERRUPTED)
2272                 return (VM_FAULT_INTERRUPTED);
2273         return (VM_FAULT_RETRY);
2274
2275 #undef  RELEASE_PAGE
2276 }
2277
2278
2279
2280 /*
2281  * CODE SIGNING (macro: "page" is expanded several times, keep it pure):
2282  * When soft faulting a page, we have to validate the page if all of:
2283  * 1. the page is being mapped in user space (pmap != kernel_pmap)
2284  * 2. the page hasn't already been found to be "tainted"
2285  * 3. the page belongs to a code-signed object
2286  * 4. the page has not been validated yet or has been mapped for write.
2287  */
2288 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page)                         \
2289         ((pmap) != kernel_pmap /*1*/ &&                                 \
2290          !(page)->cs_tainted /*2*/ &&                                   \
2291          (page)->object->code_signed /*3*/ &&                           \
2292          (!(page)->cs_validated || (page)->wpmapped /*4*/))
2293
2294
2295 /*
2296  * page queue lock must NOT be held
2297  * m->object must be locked
2298  *
2299  * NOTE: m->object could be locked "shared" only if we are called
2300  * from vm_fault() as part of a soft fault.  If so, we must be
2301  * careful not to modify the VM object in any way that is not
2302  * legal under a shared lock...
2303  */
2304 unsigned long cs_enter_tainted_rejected = 0; /* vm_fault_enter(): tainted pages rejected (KERN_CODESIGN_ERROR) */
2305 unsigned long cs_enter_tainted_accepted = 0; /* vm_fault_enter(): tainted pages accepted and entered anyway */
2306 kern_return_t
2307 vm_fault_enter(vm_page_t m,
2308                pmap_t pmap,
2309                vm_map_offset_t vaddr,
2310                vm_prot_t prot,
2311                vm_prot_t fault_type,
2312                boolean_t wired,
2313                boolean_t change_wiring,
2314                boolean_t no_cache,
2315                boolean_t cs_bypass,
2316                boolean_t *need_retry,
2317                int *type_of_fault)
2318 {
2319         kern_return_t   kr, pe_result;
2320         boolean_t       previously_pmapped = m->pmapped;
2321         boolean_t       must_disconnect = 0;
2322         boolean_t       map_is_switched, map_is_switch_protected;
2323
2324         vm_object_lock_assert_held(m->object);
2325 #if DEBUG
2326         lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2327 #endif /* DEBUG */
2328
2329         if (m->phys_page == vm_page_guard_addr) {
2330                 assert(m->fictitious);
2331                 return KERN_SUCCESS;
2332         }
2333
2334         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2335
2336                 vm_object_lock_assert_exclusive(m->object);
2337
2338         } else if ((fault_type & VM_PROT_WRITE) == 0) {
2339                 /*
2340                  * This is not a "write" fault, so we
2341                  * might not have taken the object lock
2342                  * exclusively and we might not be able
2343                  * to update the "wpmapped" bit in
2344                  * vm_fault_enter().
2345                  * Let's just grant read access to
2346                  * the page for now and we'll
2347                  * soft-fault again if we need write
2348                  * access later...
2349                  */
2350                 prot &= ~VM_PROT_WRITE;
2351         }
2352         if (m->pmapped == FALSE) {
2353
2354                 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2355                         /*
2356                          * found it in the cache, but this
2357                          * is the first fault-in of the page (m->pmapped == FALSE)
2358                          * so it must have come in as part of
2359                          * a cluster... account 1 pagein against it
2360                          */
2361                         VM_STAT_INCR(pageins);
2362                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2363
2364                         if (m->object->internal) {
2365                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2366                                 *type_of_fault = DBG_PAGEIND_FAULT;
2367                         } else {
2368                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2369                                 *type_of_fault = DBG_PAGEINV_FAULT;
2370                         }
2371
2372                         current_task()->pageins++;
2373                 }
2374                 VM_PAGE_CONSUME_CLUSTERED(m);
2375
2376         }
2377
2378         if (*type_of_fault != DBG_COW_FAULT) {
2379                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2380
2381                 if (pmap == kernel_pmap) {
2382                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2383                 }
2384         }
2385
2386         /* Validate code signature if necessary. */
2387         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2388                 vm_object_lock_assert_exclusive(m->object);
2389
2390                 if (m->cs_validated) {
2391                         vm_cs_revalidates++;
2392                 }
2393
2394                 /* VM map is locked, so 1 ref will remain on VM object -
2395                  * so no harm if vm_page_validate_cs drops the object lock */
2396                 vm_page_validate_cs(m);
2397         }
2398
2399 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2400
2401         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2402                            (pmap == vm_map_pmap(current_thread()->map)));
2403         map_is_switch_protected = current_thread()->map->switch_protect;
2404
2405         /* If the map is switched, and is switch-protected, we must protect
2406          * some pages from being write-faulted: immutable pages because by
2407          * definition they may not be written, and executable pages because that
2408          * would provide a way to inject unsigned code.
2409          * If the page is immutable, we can simply return. However, we can't
2410          * immediately determine whether a page is executable anywhere. But,
2411          * we can disconnect it everywhere and remove the executable protection
2412          * from the current map. We do that below right before we do the
2413          * PMAP_ENTER.
2414          */
2415         if(!cs_enforcement_disable && map_is_switched &&
2416            map_is_switch_protected && page_immutable(m, prot) &&
2417            (prot & VM_PROT_WRITE))
2418         {
2419                 return KERN_CODESIGN_ERROR;
2420         }
2421
2422         /* A page could be tainted, or pose a risk of being tainted later.
2423          * Check whether the receiving process wants it, and make it feel
2424          * the consequences (that happens in cs_invalid_page()).
2425          * For CS Enforcement, two other conditions will
2426          * cause that page to be tainted as well:
2427          * - pmapping an unsigned page executable - this means unsigned code;
2428          * - writeable mapping of a validated page - the content of that page
2429          *   can be changed without the kernel noticing, therefore unsigned
2430          *   code can be created
2431          */
2432         if (m->cs_tainted ||
2433             (( !cs_enforcement_disable && !cs_bypass ) &&
2434              (/* The page is unsigned and wants to be executable */
2435               (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2436               /* The page should be immutable, but is in danger of being modified
2437                 * This is the case where we want policy from the code directory -
2438                 * is the page immutable or not? For now we have to assume that
2439                 * code pages will be immutable, data pages not.
2440                 * We'll assume a page is a code page if it has a code directory
2441                 * and we fault for execution.
2442                 * That is good enough since if we faulted the code page for
2443                 * writing in another map before, it is wpmapped; if we fault
2444                 * it for writing in this map later it will also be faulted for executing
2445                 * at the same time; and if we fault for writing in another map
2446                 * later, we will disconnect it from this pmap so we'll notice
2447                 * the change.
2448                 */
2449               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2450               ))
2451                 )
2452         {
2453                 /* We will have a tainted page. Have to handle the special case
2454                  * of a switched map now. If the map is not switched, standard
2455                  * procedure applies - call cs_invalid_page().
2456                  * If the map is switched, the real owner is invalid already.
2457                  * There is no point in invalidating the switching process since
2458                  * it will not be executing from the map. So we don't call
2459                  * cs_invalid_page() in that case. */
2460                 boolean_t reject_page;
2461                 if(map_is_switched) {
2462                         assert(pmap==vm_map_pmap(current_thread()->map));
2463                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2464                         reject_page = FALSE;
2465                 } else {
2466                         reject_page = cs_invalid_page((addr64_t) vaddr);
2467                 }
2468
2469                 if (reject_page) {
2470                         /* reject the tainted page: abort the page fault */
2471                         kr = KERN_CODESIGN_ERROR;
2472                         cs_enter_tainted_rejected++;
2473                 } else {
2474                         /* proceed with the tainted page */
2475                         kr = KERN_SUCCESS;
2476                         /* Page might have been tainted before or not; now it
2477                          * definitively is. If the page wasn't tainted, we must
2478                          * disconnect it from all pmaps later. */
2479                         must_disconnect = !m->cs_tainted;
2480                         m->cs_tainted = TRUE;
2481                         cs_enter_tainted_accepted++;
2482                 }
2483                 if (cs_debug || kr != KERN_SUCCESS) {
2484                         printf("CODESIGNING: vm_fault_enter(0x%llx): "
2485                                "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2486                                (long long)vaddr, m, m->object, m->offset);
2487                 }
2488
2489         } else {
2490                 /* proceed with the valid page */
2491                 kr = KERN_SUCCESS;
2492         }
2493
2494         /* If we have a KERN_SUCCESS from the previous checks, we either have
2495          * a good page, or a tainted page that has been accepted by the process.
2496          * In both cases the page will be entered into the pmap.
2497          * If the page is writeable, we need to disconnect it from other pmaps
2498          * now so those processes can take note.
2499          */
2500         if (kr == KERN_SUCCESS) {
2501                 /*
2502                  * NOTE: we may only hold the vm_object lock SHARED
2503                  * at this point, but the update of pmapped is ok
2504                  * since this is the ONLY bit updated behind the SHARED
2505                  * lock... however, we need to figure out how to do an atomic
2506                  * update on a bit field to make this less fragile... right
2507                  * now I don't know how to coerce 'C' to give me the offset info
2508                  * that's needed for an AtomicCompareAndSwap
2509                  */
2510                 m->pmapped = TRUE;
2511                 if(vm_page_is_slideable(m)) {
2512                         boolean_t was_busy = m->busy;
2513                         m->busy = TRUE;
2514                         kr = vm_page_slide(m, 0);
2515                         assert(m->busy);
2516                         if(!was_busy) {
2517                                 PAGE_WAKEUP_DONE(m);
2518                         }
2519                         if (kr != KERN_SUCCESS) {
2520                                 /*
2521                                  * This page has not been slid correctly,
2522                                  * do not do the pmap_enter() !
2523                                  * Let vm_fault_enter() return the error
2524                                  * so the caller can fail the fault.
2525                                  */
2526                                 goto after_the_pmap_enter;
2527                         }
2528                 }
2529
2530                 if (fault_type & VM_PROT_WRITE) {
2531
2532                         if (m->wpmapped == FALSE) {
2533                                 vm_object_lock_assert_exclusive(m->object);
2534
2535                                 m->wpmapped = TRUE;
2536                         }
2537                         if (must_disconnect) {
2538                                 /*
2539                                  * We can only get here
2540                                  * because of the CSE logic
2541                                  */
2542                                 assert(cs_enforcement_disable == FALSE);
2543                                 pmap_disconnect(m->phys_page);
2544                                 /*
2545                                  * If we are faulting for a write, we can clear
2546                                  * the execute bit - that will ensure the page is
2547                                  * checked again before being executable, which
2548                                  * protects against a map switch.
2549                                  * This only happens the first time the page
2550                                  * gets tainted, so we won't get stuck here
2551                                  * to make an already writeable page executable.
2552                                  */
2553                                 if (!cs_bypass){
2554                                         prot &= ~VM_PROT_EXECUTE;
2555                                 }
2556                         }
2557                 }
2558
2559                 /* Prevent a deadlock by not
2560                  * holding the object lock if we need to wait for a page in
2561                  * pmap_enter() - <rdar://problem/7138958> */
2562                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
2563                                   wired, PMAP_OPTIONS_NOWAIT, pe_result);
2564
2565                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
2566
2567                         if (need_retry) {
2568                                 /*
2569                                  * this will be non-null in the case where we hold the lock
2570                                  * on the top-object in this chain... we can't just drop
2571                                  * the lock on the object we're inserting the page into
2572                                  * and recall the PMAP_ENTER since we can still cause
2573                                  * a deadlock if one of the critical paths tries to
2574                                  * acquire the lock on the top-object and we're blocked
2575                                  * in PMAP_ENTER waiting for memory... our only recourse
2576                                  * is to deal with it at a higher level where we can
2577                                  * drop both locks.
2578                                  */
2579                                 *need_retry = TRUE;
2580                                 vm_pmap_enter_retried++;
2581                                 goto after_the_pmap_enter;
2582                         }
2583                         /* The nonblocking version of pmap_enter did not succeed.
2584                          * and we don't need to drop other locks and retry
2585                          * at the level above us, so
2586                          * use the blocking version instead. Requires marking
2587                          * the page busy and unlocking the object */
2588                         boolean_t was_busy = m->busy;
2589                         m->busy = TRUE;
2590                         vm_object_unlock(m->object);
2591
2592                         PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
2593
2594                         /* Take the object lock again. */
2595                         vm_object_lock(m->object);
2596
2597                         /* If the page was busy, someone else will wake it up.
2598                          * Otherwise, we have to do it now. */
2599                         assert(m->busy);
2600                         if(!was_busy) {
2601                                 PAGE_WAKEUP_DONE(m);
2602                         }
2603                         vm_pmap_enter_blocked++;
2604                 }
2605         }
2606
2607 after_the_pmap_enter:
2608         /*
2609          * Hold queues lock to manipulate
2610          * the page queues.  Change wiring
2611          * case is obvious.
2612          */
2613         if (change_wiring) {
2614                 vm_page_lockspin_queues();
2615
2616                 if (wired) {
2617                         if (kr == KERN_SUCCESS) {
2618                                 vm_page_wire(m);
2619                         }
2620                 } else {
2621                         vm_page_unwire(m, TRUE);
2622                 }
2623                 vm_page_unlock_queues();
2624
2625         } else {
2626                 if (kr != KERN_SUCCESS) {
2627                         vm_page_lockspin_queues();
2628                         vm_page_deactivate(m);
2629                         vm_page_unlock_queues();
2630                 } else {
2631                         if (((!m->active && !m->inactive) || m->clean_queue || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
2632
2633                                 if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2634                                         struct vpl      *lq;
2635                                         uint32_t        lid;
2636
2637                                         /*
2638                                          * we got a local queue to stuff this new page on...
2639                                          * it's safe to manipulate local and local_id at this point
2640                                          * since we're behind an exclusive object lock and the
2641                                          * page is not on any global queue.
2642                                          *
2643                                          * we'll use the current cpu number to select the queue
2644                                          * note that we don't need to disable preemption... we're
2645                                          * going to be behind the local queue's lock to do the real
2646                                          * work
2647                                          */
2648                                         lid = cpu_number();
2649
2650                                         lq = &vm_page_local_q[lid].vpl_un.vpl;
2651
2652                                         VPL_LOCK(&lq->vpl_lock);
2653
2654                                         queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2655                                         m->local = TRUE;
2656                                         m->local_id = lid;
2657                                         lq->vpl_count++;
2658
2659                                         VPL_UNLOCK(&lq->vpl_lock);
2660
2661                                         if (lq->vpl_count > vm_page_local_q_soft_limit) {
2662                                                 /*
2663                                                  * we're beyond the soft limit for the local queue
2664                                                  * vm_page_reactivate_local will 'try' to take
2665                                                  * the global page queue lock... if it can't that's
2666                                                  * ok... we'll let the queue continue to grow up
2667                                                  * to the hard limit... at that point we'll wait
2668                                                  * for the lock... once we've got the lock, we'll
2669                                                  * transfer all of the pages from the local queue
2670                                                  * to the global active queue
2671                                                  */
2672                                                 vm_page_reactivate_local(lid, FALSE, FALSE);
2673                                         }
2674                                         return kr;
2675                                 }
2676
2677                                 vm_page_lockspin_queues();
2678                                 /*
2679                                  * test again now that we hold the page queue lock
2680                                  */
2681                                 if (!VM_PAGE_WIRED(m)) {
2682                                         if (m->clean_queue) {
2683                                                 VM_PAGE_QUEUES_REMOVE(m);
2684
2685                                                 vm_pageout_cleaned_reactivated++;
2686                                                 vm_pageout_cleaned_fault_reactivated++;
2687                                         }
2688
2689                                         if ((!m->active && !m->inactive) || no_cache) {
2690                                                 /*
2691                                                  * If this is a no_cache mapping and the page has never been
2692                                                  * mapped before or was previously a no_cache page, then we
2693                                                  * want to leave pages in the speculative state so that they
2694                                                  * can be readily recycled if free memory runs low.  Otherwise
2695                                                  * the page is activated as normal.
2696                                                  */
2697
2698                                                 if (no_cache && (!previously_pmapped || m->no_cache)) {
2699                                                         m->no_cache = TRUE;
2700
2701                                                         if (!m->speculative)
2702                                                                 vm_page_speculate(m, FALSE);
2703
2704                                                 } else if (!m->active && !m->inactive) {
2705
2706                                                         vm_page_activate(m);
2707                                                 }
2708                                         }
2709                                 }
2710                                 vm_page_unlock_queues();
2711                         }
2712                 }
2713         }
2714         return kr;
2715 }
2716
2717
2718 /*
2719  *      Routine:        vm_fault
2720  *      Purpose:
2721  *              Handle page faults, including pseudo-faults
2722  *              used to change the wiring status of pages.
2723  *      Returns:
2724  *              A kern_return_t status.  Explicit continuations have been removed.
2725  *      Implementation:
2726  *              vm_fault and vm_fault_page save mucho state
2727  *              in the moral equivalent of a closure.  The state
2728  *              structure is allocated when first entering vm_fault
2729  *              and deallocated when leaving vm_fault.
2730  */
2731
2732 extern int _map_enter_debug;    /* declaration only; NOTE(review): definition and users are outside this section -- confirm it is still needed */
2733
2734 unsigned long vm_fault_collapse_total = 0;      /* NOTE(review): looks like a statistics counter; where it is incremented is not visible here -- confirm */
2735 unsigned long vm_fault_collapse_skipped = 0;    /* NOTE(review): presumably counts skipped collapses out of the total above -- confirm */
2736
2737 kern_return_t
2738 vm_fault(
2739         vm_map_t        map,
2740         vm_map_offset_t vaddr,
2741         vm_prot_t       fault_type,
2742         boolean_t       change_wiring,
2743         int             interruptible,
2744         pmap_t          caller_pmap,
2745         vm_map_offset_t caller_pmap_addr)
2746 {
2747         vm_map_version_t        version;        /* Map version for verificiation */
2748         boolean_t               wired;          /* Should mapping be wired down? */
2749         vm_object_t             object;         /* Top-level object */
2750         vm_object_offset_t      offset;         /* Top-level offset */
2751         vm_prot_t               prot;           /* Protection for mapping */
2752         vm_object_t             old_copy_object; /* Saved copy object */
2753         vm_page_t               result_page;    /* Result of vm_fault_page */
2754         vm_page_t               top_page;       /* Placeholder page */
2755         kern_return_t           kr;
2756
2757         vm_page_t               m;      /* Fast access to result_page */
2758         kern_return_t           error_code;
2759         vm_object_t             cur_object;
2760         vm_object_offset_t      cur_offset;
2761         vm_page_t               cur_m;
2762         vm_object_t             new_object;
2763         int                     type_of_fault;
2764         pmap_t                  pmap;
2765         boolean_t               interruptible_state;
2766         vm_map_t                real_map = map;
2767         vm_map_t                original_map = map;
2768         vm_prot_t               original_fault_type;
2769         struct vm_object_fault_info fault_info;
2770         boolean_t               need_collapse = FALSE;
2771         boolean_t               need_retry = FALSE;
2772         int                     object_lock_type = 0;
2773         int                     cur_object_lock_type;
2774         vm_object_t             top_object = VM_OBJECT_NULL;
2775         int                     throttle_delay;
2776
2777
2778         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2779                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2780                               (int)((uint64_t)vaddr >> 32),
2781                               (int)vaddr,
2782                               (map == kernel_map),
2783                               0,
2784                               0);
2785
2786         if (get_preemption_level() != 0) {
2787                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
2788                                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2789                                       (int)((uint64_t)vaddr >> 32),
2790                                       (int)vaddr,
2791                                       KERN_FAILURE,
2792                                       0,
2793                                       0);
2794
2795                 return (KERN_FAILURE);
2796         }
2797
2798         interruptible_state = thread_interrupt_level(interruptible);
2799
2800         VM_STAT_INCR(faults);
2801         current_task()->faults++;
2802         original_fault_type = fault_type;
2803
2804         if (fault_type & VM_PROT_WRITE)
2805                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2806         else
2807                 object_lock_type = OBJECT_LOCK_SHARED;
2808
2809         cur_object_lock_type = OBJECT_LOCK_SHARED;
2810
2811 RetryFault:
2812         /*
2813          * assume we will hit a page in the cache
2814          * otherwise, explicitly override with
2815          * the real fault type once we determine it
2816          */
2817         type_of_fault = DBG_CACHE_HIT_FAULT;
2818
2819         /*
2820          *      Find the backing store object and offset into
2821          *      it to begin the search.
2822          */
2823         fault_type = original_fault_type;
2824         map = original_map;
2825         vm_map_lock_read(map);
2826
2827         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2828                                   object_lock_type, &version,
2829                                   &object, &offset, &prot, &wired,
2830                                   &fault_info,
2831                                   &real_map);
2832
2833         if (kr != KERN_SUCCESS) {
2834                 vm_map_unlock_read(map);
2835                 goto done;
2836         }
2837         pmap = real_map->pmap;
2838         fault_info.interruptible = interruptible;
2839         fault_info.stealth = FALSE;
2840         fault_info.io_sync = FALSE;
2841         fault_info.mark_zf_absent = FALSE;
2842         fault_info.batch_pmap_op = FALSE;
2843
2844         /*
2845          * If the page is wired, we must fault for the current protection
2846          * value, to avoid further faults.
2847          */
2848         if (wired) {
2849                 fault_type = prot | VM_PROT_WRITE;
2850                 /*
2851                  * since we're treating this fault as a 'write'
2852                  * we must hold the top object lock exclusively
2853                  */
2854                 if (object_lock_type == OBJECT_LOCK_SHARED) {
2855
2856                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2857
2858                         if (vm_object_lock_upgrade(object) == FALSE) {
2859                                 /*
2860                                  * couldn't upgrade, so explicitly
2861                                  * take the lock exclusively
2862                                  */
2863                                 vm_object_lock(object);
2864                         }
2865                 }
2866         }
2867
2868 #if     VM_FAULT_CLASSIFY
2869         /*
2870          *      Temporary data gathering code
2871          */
2872         vm_fault_classify(object, offset, fault_type);
2873 #endif
2874         /*
2875          *      Fast fault code.  The basic idea is to do as much as
2876          *      possible while holding the map lock and object locks.
2877          *      Busy pages are not used until the object lock has to
2878          *      be dropped to do something (copy, zero fill, pmap enter).
2879          *      Similarly, paging references aren't acquired until that
2880          *      point, and object references aren't used.
2881          *
2882          *      If we can figure out what to do
2883          *      (zero fill, copy on write, pmap enter) while holding
2884          *      the locks, then it gets done.  Otherwise, we give up,
2885          *      and use the original fault path (which doesn't hold
2886          *      the map lock, and relies on busy pages).
2887          *      The give up cases include:
2888          *              - Have to talk to pager.
2889          *              - Page is busy, absent or in error.
2890          *              - Pager has locked out desired access.
2891          *              - Fault needs to be restarted.
2892          *              - Have to push page into copy object.
2893          *
2894          *      The code is an infinite loop that moves one level down
2895          *      the shadow chain each time.  cur_object and cur_offset
2896          *      refer to the current object being examined. object and offset
2897          *      are the original object from the map.  The loop is at the
2898          *      top level if and only if object and cur_object are the same.
2899          *
2900          *      Invariants:  Map lock is held throughout.  Lock is held on
2901          *              original object and cur_object (if different) when
2902          *              continuing or exiting loop.
2903          *
2904          */
2905
2906
2907         /*
2908          * If this page is to be inserted in a copy delay object
2909          * for writing, and if the object has a copy, then the
2910          * copy delay strategy is implemented in the slow fault page.
2911          */
2912         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2913             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2914                 goto handle_copy_delay;
2915
2916         cur_object = object;
2917         cur_offset = offset;
2918
2919         while (TRUE) {
2920                 if (!cur_object->pager_created &&
2921                     cur_object->phys_contiguous) /* superpage */
2922                         break;
2923
2924                 if (cur_object->blocked_access) {
2925                         /*
2926                          * Access to this VM object has been blocked.
2927                          * Let the slow path handle it.
2928                          */
2929                         break;
2930                 }
2931
2932                 m = vm_page_lookup(cur_object, cur_offset);
2933
2934                 if (m != VM_PAGE_NULL) {
2935                         if (m->busy) {
2936                                 wait_result_t   result;
2937
2938                                 /*
2939                                  * in order to do the PAGE_ASSERT_WAIT, we must
2940                                  * have object that 'm' belongs to locked exclusively
2941                                  */
2942                                 if (object != cur_object) {
2943                                         vm_object_unlock(object);
2944
2945                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2946
2947                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2948
2949                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2950                                                         /*
2951                                                          * couldn't upgrade so go do a full retry
2952                                                          * immediately since we've already dropped
2953                                                          * the top object lock associated with this page
2954                                                          * and the current one got dropped due to the
2955                                                          * failed upgrade... the state is no longer valid
2956                                                          */
2957                                                         vm_map_unlock_read(map);
2958                                                         if (real_map != map)
2959                                                                 vm_map_unlock(real_map);
2960
2961                                                         goto RetryFault;
2962                                                 }
2963                                         }
2964                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2965
2966                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2967
2968                                         if (vm_object_lock_upgrade(object) == FALSE) {
2969                                                 /*
2970                                                  * couldn't upgrade, so explicitly take the lock
2971                                                  * exclusively and go relookup the page since we
2972                                                  * will have dropped the object lock and
2973                                                  * a different thread could have inserted
2974                                                  * a page at this offset
2975                                                  * no need for a full retry since we're
2976                                                  * at the top level of the object chain
2977                                                  */
2978                                                 vm_object_lock(object);
2979
2980                                                 continue;
2981                                         }
2982                                 }
2983                                 vm_map_unlock_read(map);
2984                                 if (real_map != map)
2985                                         vm_map_unlock(real_map);
2986
2987                                 result = PAGE_ASSERT_WAIT(m, interruptible);
2988
2989                                 vm_object_unlock(cur_object);
2990
2991                                 if (result == THREAD_WAITING) {
2992                                         result = thread_block(THREAD_CONTINUE_NULL);
2993
2994                                         counter(c_vm_fault_page_block_busy_kernel++);
2995                                 }
2996                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2997                                         goto RetryFault;
2998
2999                                 kr = KERN_ABORTED;
3000                                 goto done;
3001                         }
3002                         if (m->laundry) {
3003                                 if (object != cur_object) {
3004                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3005                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3006
3007                                                 vm_object_unlock(object);
3008                                                 vm_object_unlock(cur_object);
3009
3010                                                 vm_map_unlock_read(map);
3011                                                 if (real_map != map)
3012                                                         vm_map_unlock(real_map);
3013
3014                                                 goto RetryFault;
3015                                         }
3016
3017                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3018
3019                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3020
3021                                         if (vm_object_lock_upgrade(object) == FALSE) {
3022                                                 /*
3023                                                  * couldn't upgrade, so explicitly take the lock
3024                                                  * exclusively and go relookup the page since we
3025                                                  * will have dropped the object lock and
3026                                                  * a different thread could have inserted
3027                                                  * a page at this offset
3028                                                  * no need for a full retry since we're
3029                                                  * at the top level of the object chain
3030                                                  */
3031                                                 vm_object_lock(object);
3032
3033                                                 continue;
3034                                         }
3035                                 }
3036                                 m->pageout = FALSE;
3037
3038                                 vm_pageout_steal_laundry(m, FALSE);
3039                         }
3040
3041                         if (m->phys_page == vm_page_guard_addr) {
3042                                 /*
3043                                  * Guard page: let the slow path deal with it
3044                                  */
3045                                 break;
3046                         }
3047                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3048                                 /*
3049                                  * Unusual case... let the slow path deal with it
3050                                  */
3051                                 break;
3052                         }
3053                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3054                                 if (object != cur_object)
3055                                         vm_object_unlock(object);
3056                                 vm_map_unlock_read(map);
3057                                 if (real_map != map)
3058                                         vm_map_unlock(real_map);
3059                                 vm_object_unlock(cur_object);
3060                                 kr = KERN_MEMORY_ERROR;
3061                                 goto done;
3062                         }
3063
3064                         if (m->encrypted) {
3065                                 /*
3066                                  * ENCRYPTED SWAP:
3067                                  * We've soft-faulted (because it's not in the page
3068                                  * table) on an encrypted page.
3069                                  * Keep the page "busy" so that no one messes with
3070                                  * it during the decryption.
3071                                  * Release the extra locks we're holding, keep only
3072                                  * the page's VM object lock.
3073                                  *
3074                                  * in order to set 'busy' on 'm', we must
3075                                  * have object that 'm' belongs to locked exclusively
3076                                  */
3077                                 if (object != cur_object) {
3078                                         vm_object_unlock(object);
3079
3080                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3081
3082                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3083
3084                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3085                                                         /*
3086                                                          * couldn't upgrade so go do a full retry
3087                                                          * immediately since we've already dropped
3088                                                          * the top object lock associated with this page
3089                                                          * and the current one got dropped due to the
3090                                                          * failed upgrade... the state is no longer valid
3091                                                          */
3092                                                         vm_map_unlock_read(map);
3093                                                         if (real_map != map)
3094                                                                 vm_map_unlock(real_map);
3095
3096                                                         goto RetryFault;
3097                                                 }
3098                                         }
3099                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3100
3101                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3102
3103                                         if (vm_object_lock_upgrade(object) == FALSE) {
3104                                                 /*
3105                                                  * couldn't upgrade, so explicitly take the lock
3106                                                  * exclusively and go relookup the page since we
3107                                                  * will have dropped the object lock and
3108                                                  * a different thread could have inserted
3109                                                  * a page at this offset
3110                                                  * no need for a full retry since we're
3111                                                  * at the top level of the object chain
3112                                                  */
3113                                                 vm_object_lock(object);
3114
3115                                                 continue;
3116                                         }
3117                                 }
3118                                 m->busy = TRUE;
3119
3120                                 vm_map_unlock_read(map);
3121                                 if (real_map != map)
3122                                         vm_map_unlock(real_map);
3123
3124                                 vm_page_decrypt(m, 0);
3125
3126                                 assert(m->busy);
3127                                 PAGE_WAKEUP_DONE(m);
3128
3129                                 vm_object_unlock(cur_object);
3130                                 /*
3131                                  * Retry from the top, in case anything
3132                                  * changed while we were decrypting...
3133                                  */
3134                                 goto RetryFault;
3135                         }
3136                         ASSERT_PAGE_DECRYPTED(m);
3137
3138                         if(vm_page_is_slideable(m)) {
3139                                 /*
3140                                  * We might need to slide this page, and so,
3141                                  * we want to hold the VM object exclusively.
3142                                  */
3143                                 if (object != cur_object) {
3144                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3145                                                 vm_object_unlock(object);
3146                                                 vm_object_unlock(cur_object);
3147
3148                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3149
3150                                                 vm_map_unlock_read(map);
3151                                                 if (real_map != map)
3152                                                         vm_map_unlock(real_map);
3153
3154                                                 goto RetryFault;
3155                                         }
3156                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3157
3158                                         vm_object_unlock(object);
3159                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3160                                         vm_map_unlock_read(map);
3161                                         goto RetryFault;
3162                                 }
3163                         }
3164
3165                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3166 upgrade_for_validation:
3167                                 /*
3168                                  * We might need to validate this page
3169                                  * against its code signature, so we
3170                                  * want to hold the VM object exclusively.
3171                                  */
3172                                 if (object != cur_object) {
3173                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3174                                                 vm_object_unlock(object);
3175                                                 vm_object_unlock(cur_object);
3176
3177                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3178
3179                                                 vm_map_unlock_read(map);
3180                                                 if (real_map != map)
3181                                                         vm_map_unlock(real_map);
3182
3183                                                 goto RetryFault;
3184                                         }
3185
3186                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3187
3188                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3189
3190                                         if (vm_object_lock_upgrade(object) == FALSE) {
3191                                                 /*
3192                                                  * couldn't upgrade, so explicitly take the lock
3193                                                  * exclusively and go relookup the page since we
3194                                                  * will have dropped the object lock and
3195                                                  * a different thread could have inserted
3196                                                  * a page at this offset
3197                                                  * no need for a full retry since we're
3198                                                  * at the top level of the object chain
3199                                                  */
3200                                                 vm_object_lock(object);
3201
3202                                                 continue;
3203                                         }
3204                                 }
3205                         }
3206                         /*
3207                          *      Two cases of map in faults:
3208                          *          - At top level w/o copy object.
3209                          *          - Read fault anywhere.
3210                          *              --> must disallow write.
3211                          */
3212
3213                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3214
3215                                 goto FastPmapEnter;
3216                         }
3217
3218                         if ((fault_type & VM_PROT_WRITE) == 0) {
3219
3220                                 if (object != cur_object) {
3221                                         /*
3222                                          * We still need to hold the top object
3223                                          * lock here to prevent a race between
3224                                          * a read fault (taking only "shared"
3225                                          * locks) and a write fault (taking
3226                                          * an "exclusive" lock on the top
3227                                          * object).
3228                                          * Otherwise, as soon as we release the
3229                                          * top lock, the write fault could
3230                                          * proceed and actually complete before
3231                                          * the read fault, and the copied page's
3232                                          * translation could then be overwritten
3233                                          * by the read fault's translation for
3234                                          * the original page.
3235                                          *
3236                                          * Let's just record what the top object
3237                                          * is and we'll release it later.
3238                                          */
3239                                         top_object = object;
3240
3241                                         /*
3242                                          * switch to the object that has the new page
3243                                          */
3244                                         object = cur_object;
3245                                         object_lock_type = cur_object_lock_type;
3246                                 }
3247 FastPmapEnter:
3248                                 /*
3249                                  * prepare for the pmap_enter...
3250                                  * object and map are both locked
3251                                  * m contains valid data
3252                                  * object == m->object
3253                                  * cur_object == NULL or it's been unlocked
3254                                  * no paging references on either object or cur_object
3255                                  */
3256                                 if (caller_pmap) {
3257                                         kr = vm_fault_enter(m,
3258                                                             caller_pmap,
3259                                                             caller_pmap_addr,
3260                                                             prot,
3261                                                             fault_type,
3262                                                             wired,
3263                                                             change_wiring,
3264                                                             fault_info.no_cache,
3265                                                             fault_info.cs_bypass,
3266                                                             (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3267                                                             &type_of_fault);
3268                                 } else {
3269                                         kr = vm_fault_enter(m,
3270                                                             pmap,
3271                                                             vaddr,
3272                                                             prot,
3273                                                             fault_type,
3274                                                             wired,
3275                                                             change_wiring,
3276                                                             fault_info.no_cache,
3277                                                             fault_info.cs_bypass,
3278                                                             (top_object != VM_OBJECT_NULL ? &need_retry : NULL),
3279                                                             &type_of_fault);
3280                                 }
3281
3282                                 if (top_object != VM_OBJECT_NULL) {
3283                                         /*
3284                                          * It's safe to drop the top object
3285                                          * now that we've done our
3286                                          * vm_fault_enter().  Any other fault
3287                                          * in progress for that virtual
3288                                          * address will either find our page
3289                                          * and translation or put in a new page
3290                                          * and translation.
3291                                          */
3292                                         vm_object_unlock(top_object);
3293                                         top_object = VM_OBJECT_NULL;
3294                                 }
3295
3296                                 if (need_collapse == TRUE)
3297                                         vm_object_collapse(object, offset, TRUE);
3298
3299                                 if (need_retry == FALSE &&
3300                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3301                                         /*
3302                                          * evaluate access pattern and update state
3303                                          * vm_fault_deactivate_behind depends on the
3304                                          * state being up to date
3305                                          */
3306                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3307
3308                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3309                                 }
3310                                 /*
3311                                  * That's it, clean up and return.
3312                                  */
3313                                 if (m->busy)
3314                                         PAGE_WAKEUP_DONE(m);
3315
3316                                 vm_object_unlock(object);
3317
3318                                 vm_map_unlock_read(map);
3319                                 if (real_map != map)
3320                                         vm_map_unlock(real_map);
3321
3322                                 if (need_retry == TRUE) {
3323                                         /*
3324                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
3325                                          * at this point we don't hold any locks so it's safe
3326                                          * to ask the pmap layer to expand the page table to
3327                                          * accommodate this mapping... once expanded, we'll
3328                                          * re-drive the fault which should result in vm_fault_enter
3329                                          * being able to successfully enter the mapping this time around
3330                                          */
3331                                         (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER);
3332
3333                                         need_retry = FALSE;
3334                                         goto RetryFault;
3335                                 }
3336                                 goto done;
3337                         }
3338                         /*
3339                          * COPY ON WRITE FAULT
3340                          */
3341                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3342
3343                         if ((throttle_delay = vm_page_throttled())) {
3344                                 /*
3345                                  * drop all of our locks...
3346                                  * wait until the free queue is
3347                                  * pumped back up and then
3348                                  * redrive the fault
3349                                  */
3350                                 if (object != cur_object)
3351                                         vm_object_unlock(cur_object);
3352                                 vm_object_unlock(object);
3353                                 vm_map_unlock_read(map);
3354                                 if (real_map != map)
3355                                         vm_map_unlock(real_map);
3356
3357                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3358
3359                                 delay(throttle_delay);
3360
3361                                 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3362                                                  THREAD_UNINT :
3363                                                  THREAD_ABORTSAFE))
3364                                         goto RetryFault;
3365                                 kr = KERN_ABORTED;
3366                                 goto done;
3367                         }
3368                         /*
3369                          * If objects match, then
3370                          * object->copy must not be NULL (else control
3371                          * would be in previous code block), and we
3372                          * have a potential push into the copy object
3373                          * with which we can't cope here.
3374                          */
3375                         if (cur_object == object) {
3376                                 /*
3377                                  * must take the slow path to
3378                                  * deal with the copy push
3379                                  */
3380                                 break;
3381                         }
3382
3383                         /*
3384                          * This is now a shadow based copy on write
3385                          * fault -- it requires a copy up the shadow
3386                          * chain.
3387                          */
3388
3389                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3390                             VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3391                                 goto upgrade_for_validation;
3392                         }
3393
3394                         /*
3395                          * Allocate a page in the original top level
3396                          * object. Give up if allocate fails.  Also
3397                          * need to remember current page, as it's the
3398                          * source of the copy.
3399                          *
3400                          * at this point we hold locks on both
3401                          * object and cur_object... no need to take
3402                          * paging refs or mark pages BUSY since
3403                          * we don't drop either object lock until
3404                          * the page has been copied and inserted
3405                          */
3406                         cur_m = m;
3407                         m = vm_page_grab();
3408
3409                         if (m == VM_PAGE_NULL) {
3410                                 /*
3411                                  * no free page currently available...
3412                                  * must take the slow path
3413                                  */
3414                                 break;
3415                         }
3416                         /*
3417                          * Now do the copy.  The source page is not marked busy (both object locks are held)...
3418                          *
3419                          *      NOTE: This code holds the map lock across
3420                          *      the page copy.
3421                          */
3422                         vm_page_copy(cur_m, m);
3423                         vm_page_insert(m, object, offset);
3424                         SET_PAGE_DIRTY(m, FALSE);
3425
3426                         /*
3427                          * Now cope with the source page and object
3428                          */
3429                         if (object->ref_count > 1 && cur_m->pmapped)
3430                                 pmap_disconnect(cur_m->phys_page);
3431
3432                         need_collapse = TRUE;
3433
3434                         if (!cur_object->internal &&
3435                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3436                                 /*
3437                                  * The object from which we've just
3438                                  * copied a page is most probably backed
3439                                  * by a vnode.  We don't want to waste too
3440                                  * much time trying to collapse the VM objects
3441                                  * and create a bottleneck when several tasks
3442                                  * map the same file.
3443                                  */
3444                                 if (cur_object->copy == object) {
3445                                         /*
3446                                          * Shared mapping or no COW yet.
3447                                          * We can never collapse a copy
3448                                          * object into its backing object.
3449                                          */
3450                                         need_collapse = FALSE;
3451                                 } else if (cur_object->copy == object->shadow &&
3452                                            object->shadow->resident_page_count == 0) {
3453                                         /*
3454                                          * Shared mapping after a COW occurred.
3455                                          */
3456                                         need_collapse = FALSE;
3457                                 }
3458                         }
3459                         vm_object_unlock(cur_object);
3460
3461                         if (need_collapse == FALSE)
3462                                 vm_fault_collapse_skipped++;
3463                         vm_fault_collapse_total++;
3464
3465                         type_of_fault = DBG_COW_FAULT;
3466                         VM_STAT_INCR(cow_faults);
3467                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3468                         current_task()->cow_faults++;
3469
3470                         goto FastPmapEnter;
3471
3472                 } else {
3473                         /*
3474                          * No page at cur_object, cur_offset... m == NULL
3475                          */
3476                         if (cur_object->pager_created) {
3477                                 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3478                                         /*
3479                                          * May have to talk to a pager...
3480                                          * take the slow path.
3481                                          */
3482                                         break;
3483                                 }
3484                                 /*
3485                                  * existence map present and indicates
3486                                  * that the pager doesn't have this page
3487                                  */
3488                         }
3489                         if (cur_object->shadow == VM_OBJECT_NULL) {
3490                                 /*
3491                                  * Zero fill fault.  Page gets
3492                                  * inserted into the original object.
3493                                  */
3494                                 if (cur_object->shadow_severed ||
3495                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3496                                 {
3497                                         if (object != cur_object)
3498                                                 vm_object_unlock(cur_object);
3499                                         vm_object_unlock(object);
3500
3501                                         vm_map_unlock_read(map);
3502                                         if (real_map != map)
3503                                                 vm_map_unlock(real_map);
3504
3505                                         kr = KERN_MEMORY_ERROR;
3506                                         goto done;
3507                                 }
3508                                 if ((throttle_delay = vm_page_throttled())) {
3509                                         /*
3510                                          * drop all of our locks...
3511                                          * wait until the free queue is
3512                                          * pumped back up and then
3513                                          * redrive the fault
3514                                          */
3515                                         if (object != cur_object)
3516                                                 vm_object_unlock(cur_object);
3517                                         vm_object_unlock(object);
3518                                         vm_map_unlock_read(map);
3519                                         if (real_map != map)
3520                                                 vm_map_unlock(real_map);
3521
3522                                         VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3523
3524                                         delay(throttle_delay);
3525
3526                                         if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3527                                                          THREAD_UNINT :
3528                                                          THREAD_ABORTSAFE))
3529                                                 goto RetryFault;
3530                                         kr = KERN_ABORTED;
3531                                         goto done;
3532                                 }
3533                                 if (vm_backing_store_low) {
3534                                         /*
3535                                          * we are protecting the system from
3536                                          * backing store exhaustion...
3537                                          * must take the slow path if we're
3538                                          * not privileged
3539                                          */
3540                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3541                                                 break;
3542                                 }
3543                                 if (cur_object != object) {
3544                                         vm_object_unlock(cur_object);
3545
3546                                         cur_object = object;
3547                                 }
3548                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3549
3550                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3551
3552                                         if (vm_object_lock_upgrade(object) == FALSE) {
3553                                                 /*
3554                                                  * couldn't upgrade so do a full retry on the fault
3555                                                  * since we dropped the object lock which
3556                                                  * could allow another thread to insert
3557                                                  * a page at this offset
3558                                                  */
3559                                                 vm_map_unlock_read(map);
3560                                                 if (real_map != map)
3561                                                         vm_map_unlock(real_map);
3562
3563                                                 goto RetryFault;
3564                                         }
3565                                 }
3566                                 m = vm_page_alloc(object, offset);
3567
3568                                 if (m == VM_PAGE_NULL) {
3569                                         /*
3570                                          * no free page currently available...
3571                                          * must take the slow path
3572                                          */
3573                                         break;
3574                                 }
3575
3576                                 /*
3577                                  * Now zero fill page...
3578                                  * the page is probably going to
3579                                  * be written soon, so don't bother
3580                                  * to clear the modified bit
3581                                  *
3582                                  *   NOTE: This code holds the map
3583                                  *   lock across the zero fill.
3584                                  */
3585                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
3586
3587                                 goto FastPmapEnter;
3588                         }
3589                         /*
3590                          * On to the next level in the shadow chain
3591                          */
3592                         cur_offset += cur_object->vo_shadow_offset;
3593                         new_object = cur_object->shadow;
3594
3595                         /*
3596                          * take the new_object's lock with the indicated state
3597                          */
3598                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3599                                 vm_object_lock_shared(new_object);
3600                         else
3601                                 vm_object_lock(new_object);
3602
3603                         if (cur_object != object)
3604                                 vm_object_unlock(cur_object);
3605
3606                         cur_object = new_object;
3607
3608                         continue;
3609                 }
3610         }
3611         /*
3612          * Cleanup from fast fault failure.  Drop any object
3613          * lock other than original and drop map lock.
3614          */
3615         if (object != cur_object)
3616                 vm_object_unlock(cur_object);
3617
3618         /*
3619          * must own the object lock exclusively at this point
3620          */
3621         if (object_lock_type == OBJECT_LOCK_SHARED) {
3622                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3623
3624                 if (vm_object_lock_upgrade(object) == FALSE) {
3625                         /*
3626                          * couldn't upgrade, so explicitly
3627                          * take the lock exclusively
3628                          * no need to retry the fault at this
3629                          * point since "vm_fault_page" will
3630                          * completely re-evaluate the state
3631                          */
3632                         vm_object_lock(object);
3633                 }
3634         }
3635
3636 handle_copy_delay:
3637         vm_map_unlock_read(map);
3638         if (real_map != map)
3639                 vm_map_unlock(real_map);
3640
3641         /*
3642          * Make a reference to this object to
3643          * prevent its disposal while we are messing with
3644          * it.  Once we have the reference, the map is free
3645          * to be diddled.  Since objects reference their
3646          * shadows (and copies), they will stay around as well.
3647          */
3648         vm_object_reference_locked(object);
3649         vm_object_paging_begin(object);
3650
3651         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
3652
3653         error_code = 0;
3654
3655         kr = vm_fault_page(object, offset, fault_type,
3656                            (change_wiring && !wired),
3657                            &prot, &result_page, &top_page,
3658                            &type_of_fault,
3659                            &error_code, map->no_zero_fill,
3660                            FALSE, &fault_info);
3661
3662         /*
3663          * if kr != VM_FAULT_SUCCESS, then the paging reference
3664          * has been dropped and the object unlocked... the ref_count
3665          * is still held
3666          *
3667          * if kr == VM_FAULT_SUCCESS, then the paging reference
3668          * is still held along with the ref_count on the original object
3669          *
3670          *      the object is returned locked with a paging reference
3671          *
3672          *      if top_page != NULL, then it's BUSY and the
3673          *      object it belongs to has a paging reference
3674          *      but is returned unlocked
3675          */
3676         if (kr != VM_FAULT_SUCCESS &&
3677             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
3678                 /*
3679                  * we didn't succeed, lose the object reference immediately.
3680                  */
3681                 vm_object_deallocate(object);
3682
3683                 /*
3684                  * See why we failed, and take corrective action.
3685                  */
3686                 switch (kr) {
3687                 case VM_FAULT_MEMORY_SHORTAGE:
3688                         if (vm_page_wait((change_wiring) ?
3689                                          THREAD_UNINT :
3690                                          THREAD_ABORTSAFE))
3691                                 goto RetryFault;
3692                         /*
3693                          * fall thru
3694                          */
3695                 case VM_FAULT_INTERRUPTED:
3696                         kr = KERN_ABORTED;
3697                         goto done;
3698                 case VM_FAULT_RETRY:
3699                         goto RetryFault;
3700                 case VM_FAULT_MEMORY_ERROR:
3701                         if (error_code)
3702                                 kr = error_code;
3703                         else
3704                                 kr = KERN_MEMORY_ERROR;
3705                         goto done;
3706                 default:
3707                         panic("vm_fault: unexpected error 0x%x from "
3708                               "vm_fault_page()\n", kr);
3709                 }
3710         }
3711         m = result_page;
3712
3713         if (m != VM_PAGE_NULL) {
3714                 assert((change_wiring && !wired) ?
3715                     (top_page == VM_PAGE_NULL) :
3716                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
3717         }
3718
3719         /*
3720          * RELEASE_PAGE(m): for a page returned by vm_fault_page that will not be entered
3721          * into the physical map -- PAGE_WAKEUP_DONE it, then activate it unless it is already
3722          * active, inactive or throttled.  m is evaluated several times: no side effects. */
3723 #define RELEASE_PAGE(m)                                 \
3724         MACRO_BEGIN                                     \
3725         PAGE_WAKEUP_DONE(m);                            \
3726         if (!(m)->active && !(m)->inactive && !(m)->throttled) {        \
3727                 vm_page_lockspin_queues();      /* re-test the flags under the queues lock */ \
3728                 if (!(m)->active && !(m)->inactive && !(m)->throttled)  \
3729                         vm_page_activate(m);                            \
3730                 vm_page_unlock_queues();                                \
3731         }                                                               \
3732         MACRO_END
3733
3734         /*
3735          * We must verify that the maps have not changed
3736          * since our last lookup.
3737          */
3738         if (m != VM_PAGE_NULL) {
3739                 old_copy_object = m->object->copy;
3740                 vm_object_unlock(m->object);
3741         } else {
3742                 old_copy_object = VM_OBJECT_NULL;
3743                 vm_object_unlock(object);
3744         }
3745
3746         /*
3747          * no object locks are held at this point
3748          */
3749         if ((map != original_map) || !vm_map_verify(map, &version)) {
3750                 vm_object_t             retry_object;
3751                 vm_object_offset_t      retry_offset;
3752                 vm_prot_t               retry_prot;
3753
3754                 /*
3755                  * To avoid trying to write_lock the map while another
3756                  * thread has it read_locked (in vm_map_pageable), we
3757                  * do not try for write permission.  If the page is
3758                  * still writable, we will get write permission.  If it
3759                  * is not, or has been marked needs_copy, we enter the
3760                  * mapping without write permission, and will merely
3761                  * take another fault.
3762                  */
3763                 map = original_map;
3764                 vm_map_lock_read(map);
3765
3766                 kr = vm_map_lookup_locked(&map, vaddr,
3767                                           fault_type & ~VM_PROT_WRITE,
3768                                           OBJECT_LOCK_EXCLUSIVE, &version,
3769                                           &retry_object, &retry_offset, &retry_prot,
3770                                           &wired,
3771                                           &fault_info,
3772                                           &real_map);
3773                 pmap = real_map->pmap;
3774
3775                 if (kr != KERN_SUCCESS) {
3776                         vm_map_unlock_read(map);
3777
3778                         if (m != VM_PAGE_NULL) {
3779                                 /*
3780                                  * retake the lock so that
3781                                  * we can drop the paging reference
3782                                  * in vm_fault_cleanup and do the
3783                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
3784                                  */
3785                                 vm_object_lock(m->object);
3786
3787                                 RELEASE_PAGE(m);
3788
3789                                 vm_fault_cleanup(m->object, top_page);
3790                         } else {
3791                                 /*
3792                                  * retake the lock so that
3793                                  * we can drop the paging reference
3794                                  * in vm_fault_cleanup
3795                                  */
3796                                 vm_object_lock(object);
3797
3798                                 vm_fault_cleanup(object, top_page);
3799                         }
3800                         vm_object_deallocate(object);
3801
3802                         goto done;
3803                 }
3804                 vm_object_unlock(retry_object);
3805
3806                 if ((retry_object != object) || (retry_offset != offset)) {
3807
3808                         vm_map_unlock_read(map);
3809                         if (real_map != map)
3810                                 vm_map_unlock(real_map);
3811
3812                         if (m != VM_PAGE_NULL) {
3813                                 /*
3814                                  * retake the lock so that
3815                                  * we can drop the paging reference
3816                                  * in vm_fault_cleanup and do the
3817                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
3818                                  */
3819                                 vm_object_lock(m->object);
3820
3821                                 RELEASE_PAGE(m);
3822
3823                                 vm_fault_cleanup(m->object, top_page);
3824                         } else {
3825                                 /*
3826                                  * retake the lock so that
3827                                  * we can drop the paging reference
3828                                  * in vm_fault_cleanup
3829                                  */
3830                                 vm_object_lock(object);
3831
3832                                 vm_fault_cleanup(object, top_page);
3833                         }
3834                         vm_object_deallocate(object);
3835
3836                         goto RetryFault;
3837                 }
3838                 /*
3839                  * Check whether the protection has changed or the object
3840                  * has been copied while we left the map unlocked.
3841                  */
3842                 prot &= retry_prot;
3843         }
3844         if (m != VM_PAGE_NULL) {
3845                 vm_object_lock(m->object);
3846
3847                 if (m->object->copy != old_copy_object) {
3848                         /*
3849                          * The copy object changed while the top-level object
3850                          * was unlocked, so take away write permission.
3851                          */
3852                         prot &= ~VM_PROT_WRITE;
3853                 }
3854         } else
3855                 vm_object_lock(object);
3856
3857         /*
3858          * If we want to wire down this page, but no longer have
3859          * adequate permissions, we must start all over.
3860          */
3861         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3862
3863                 vm_map_verify_done(map, &version);
3864                 if (real_map != map)
3865                         vm_map_unlock(real_map);
3866
3867                 if (m != VM_PAGE_NULL) {
3868                         RELEASE_PAGE(m);
3869
3870                         vm_fault_cleanup(m->object, top_page);
3871                 } else
3872                         vm_fault_cleanup(object, top_page);
3873
3874                 vm_object_deallocate(object);
3875
3876                 goto RetryFault;
3877         }
3878         if (m != VM_PAGE_NULL) {
3879                 /*
3880                  * Put this page into the physical map.
3881                  * We had to do the unlock above because pmap_enter
3882                  * may cause other faults.  The page may be on
3883                  * the pageout queues.  If the pageout daemon comes
3884                  * across the page, it will remove it from the queues.
3885                  */
3886                 if (caller_pmap) {
3887                         kr = vm_fault_enter(m,
3888                                             caller_pmap,
3889                                             caller_pmap_addr,
3890                                             prot,
3891                                             fault_type,
3892                                             wired,
3893                                             change_wiring,
3894                                             fault_info.no_cache,
3895                                             fault_info.cs_bypass,
3896                                             NULL,
3897                                             &type_of_fault);
3898                 } else {
3899                         kr = vm_fault_enter(m,
3900                                             pmap,
3901                                             vaddr,
3902                                             prot,
3903                                             fault_type,
3904                                             wired,
3905                                             change_wiring,
3906                                             fault_info.no_cache,
3907                                             fault_info.cs_bypass,
3908                                             NULL,
3909                                             &type_of_fault);
3910                 }
3911                 if (kr != KERN_SUCCESS) {
3912                         /* abort this page fault */
3913                         vm_map_verify_done(map, &version);
3914                         if (real_map != map)
3915                                 vm_map_unlock(real_map);
3916                         PAGE_WAKEUP_DONE(m);
3917                         vm_fault_cleanup(m->object, top_page);
3918                         vm_object_deallocate(object);
3919                         goto done;
3920                 }
3921         } else {
3922
3923                 vm_map_entry_t          entry;
3924                 vm_map_offset_t         laddr;
3925                 vm_map_offset_t         ldelta, hdelta;
3926
3927                 /*
3928                  * do a pmap block mapping from the physical address
3929                  * in the object
3930                  */
3931
3932 #ifdef ppc
3933                 /* While we do not worry about execution protection in   */
3934                 /* general, certain pages may have instruction execution */
3935                 /* disallowed.  We will check here, and if not allowed   */
3936                 /* to execute, we return with a protection failure.      */
3937
3938                 if ((fault_type & VM_PROT_EXECUTE) &&
3939                         (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
3940
3941                         vm_map_verify_done(map, &version);
3942
3943                         if (real_map != map)
3944                                 vm_map_unlock(real_map);
3945
3946                         vm_fault_cleanup(object, top_page);
3947                         vm_object_deallocate(object);
3948
3949                         kr = KERN_PROTECTION_FAILURE;
3950                         goto done;
3951                 }
3952 #endif  /* ppc */
3953
3954                 if (real_map != map)
3955                         vm_map_unlock(real_map);
3956
3957                 if (original_map != map) {
3958                         vm_map_unlock_read(map);
3959                         vm_map_lock_read(original_map);
3960                         map = original_map;
3961                 }
3962                 real_map = map;
3963
3964                 laddr = vaddr;
3965                 hdelta = 0xFFFFF000;
3966                 ldelta = 0xFFFFF000;
3967
3968                 while (vm_map_lookup_entry(map, laddr, &entry)) {
3969                         if (ldelta > (laddr - entry->vme_start))
3970                                 ldelta = laddr - entry->vme_start;
3971                         if (hdelta > (entry->vme_end - laddr))
3972                                 hdelta = entry->vme_end - laddr;
3973                         if (entry->is_sub_map) {
3974
3975                                 laddr = (laddr - entry->vme_start)
3976                                                         + entry->offset;
3977                                 vm_map_lock_read(entry->object.sub_map);
3978
3979                                 if (map != real_map)
3980                                         vm_map_unlock_read(map);
3981                                 if (entry->use_pmap) {
3982                                         vm_map_unlock_read(real_map);
3983                                         real_map = entry->object.sub_map;
3984                                 }
3985                                 map = entry->object.sub_map;
3986
3987                         } else {
3988                                 break;
3989                         }
3990                 }
3991
3992                 if (vm_map_lookup_entry(map, laddr, &entry) &&
3993                                         (entry->object.vm_object != NULL) &&
3994                                         (entry->object.vm_object == object)) {
3995
3996                         int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
3997                         if (caller_pmap) {
3998                                 /*
3999                                  * Set up a block mapped area
4000                                  */
4001                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4002                                 pmap_map_block(caller_pmap,
4003                                                (addr64_t)(caller_pmap_addr - ldelta),
4004                                                (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4005                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4006                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
4007                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4008                         } else {
4009                                 /*
4010                                  * Set up a block mapped area
4011                                  */
4012                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4013                                 pmap_map_block(real_map->pmap,
4014                                                (addr64_t)(vaddr - ldelta),
4015                                                (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4016                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4017                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
4018                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4019                         }
4020                 }
4021         }
4022
4023         /*
4024          * Unlock everything, and return
4025          */
4026         vm_map_verify_done(map, &version);
4027         if (real_map != map)
4028                 vm_map_unlock(real_map);
4029
4030         if (m != VM_PAGE_NULL) {
4031                 PAGE_WAKEUP_DONE(m);
4032
4033                 vm_fault_cleanup(m->object, top_page);
4034         } else
4035                 vm_fault_cleanup(object, top_page);
4036
4037         vm_object_deallocate(object);
4038
4039 #undef  RELEASE_PAGE
4040
4041         kr = KERN_SUCCESS;
4042 done:
4043         thread_interrupt_level(interruptible_state);
4044
4045         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4046                               (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4047                               (int)((uint64_t)vaddr >> 32),
4048                               (int)vaddr,
4049                               kr,
4050                               type_of_fault,
4051                               0);
4052
4053         return (kr);
4054 }
4055
4056 /*
4057  *      vm_fault_wire:
4058  *
4059  *      Wire down a range of virtual addresses in a map.
4060  */
4061 kern_return_t
4062 vm_fault_wire(
4063         vm_map_t        map,
4064         vm_map_entry_t  entry,
4065         pmap_t          pmap,
4066         vm_map_offset_t pmap_addr)
4067 {
4068
4069         register vm_map_offset_t        va;
4070         register vm_map_offset_t        end_addr = entry->vme_end;
4071         register kern_return_t  rc;
4072
4073         assert(entry->in_transition);
4074
4075         if ((entry->object.vm_object != NULL) &&
4076                         !entry->is_sub_map &&
4077                         entry->object.vm_object->phys_contiguous) {
4078                 return KERN_SUCCESS;
4079         }
4080
4081         /*
4082          *      Inform the physical mapping system that the
4083          *      range of addresses may not fault, so that
4084          *      page tables and such can be locked down as well.
4085          */
4086
4087         pmap_pageable(pmap, pmap_addr,
4088                 pmap_addr + (end_addr - entry->vme_start), FALSE);
4089
4090         /*
4091          *      We simulate a fault to get the page and enter it
4092          *      in the physical map.
4093          */
4094
4095         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4096                 if ((rc = vm_fault_wire_fast(
4097                         map, va, entry, pmap,
4098                         pmap_addr + (va - entry->vme_start)
4099                         )) != KERN_SUCCESS) {
4100                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
4101                                 (pmap == kernel_pmap) ?
4102                                         THREAD_UNINT : THREAD_ABORTSAFE,
4103                                 pmap, pmap_addr + (va - entry->vme_start));
4104                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4105                 }
4106
4107                 if (rc != KERN_SUCCESS) {
4108                         struct vm_map_entry     tmp_entry = *entry;
4109
4110                         /* unwire wired pages */
4111                         tmp_entry.vme_end = va;
4112                         vm_fault_unwire(map,
4113                                 &tmp_entry, FALSE, pmap, pmap_addr);
4114
4115                         return rc;
4116                 }
4117         }
4118         return KERN_SUCCESS;
4119 }
4120
4121 /*
4122  *      vm_fault_unwire:
4123  *
4124  *      Unwire a range of virtual addresses in a map.
4125  */
4126 void
4127 vm_fault_unwire(
4128         vm_map_t        map,
4129         vm_map_entry_t  entry,
4130         boolean_t       deallocate,
4131         pmap_t          pmap,
4132         vm_map_offset_t pmap_addr)
4133 {
4134         register vm_map_offset_t        va;
4135         register vm_map_offset_t        end_addr = entry->vme_end;
4136         vm_object_t             object;
4137         struct vm_object_fault_info fault_info;
4138
4139         object = (entry->is_sub_map)
4140                         ? VM_OBJECT_NULL : entry->object.vm_object;
4141
4142         /*
4143          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4144          * do anything since such memory is wired by default.  So we don't have
4145          * anything to undo here.
4146          */
4147
4148         if (object != VM_OBJECT_NULL && object->phys_contiguous)
4149                 return;
4150
4151         fault_info.interruptible = THREAD_UNINT;
4152         fault_info.behavior = entry->behavior;
4153         fault_info.user_tag = entry->alias;
4154         fault_info.lo_offset = entry->offset;
4155         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4156         fault_info.no_cache = entry->no_cache;
4157         fault_info.stealth = TRUE;
4158         fault_info.io_sync = FALSE;
4159         fault_info.cs_bypass = FALSE;
4160         fault_info.mark_zf_absent = FALSE;
4161         fault_info.batch_pmap_op = FALSE;
4162
4163         /*
4164          *      Since the pages are wired down, we must be able to
4165          *      get their mappings from the physical map system.
4166          */
4167
4168         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4169
4170                 if (object == VM_OBJECT_NULL) {
4171                         if (pmap) {
4172                                 pmap_change_wiring(pmap,
4173                                                    pmap_addr + (va - entry->vme_start), FALSE);
4174                         }
4175                         (void) vm_fault(map, va, VM_PROT_NONE,
4176                                         TRUE, THREAD_UNINT, pmap, pmap_addr);
4177                 } else {
4178                         vm_prot_t       prot;
4179                         vm_page_t       result_page;
4180                         vm_page_t       top_page;
4181                         vm_object_t     result_object;
4182                         vm_fault_return_t result;
4183
4184                         if (end_addr - va > (vm_size_t) -1) {
4185                                 /* 32-bit overflow */
4186                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4187                         } else {
4188                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4189                                 assert(fault_info.cluster_size == end_addr - va);
4190                         }
4191
4192                         do {
4193                                 prot = VM_PROT_NONE;
4194
4195                                 vm_object_lock(object);
4196                                 vm_object_paging_begin(object);
4197                                 XPR(XPR_VM_FAULT,
4198                                         "vm_fault_unwire -> vm_fault_page\n",
4199                                         0,0,0,0,0);
4200                                 result = vm_fault_page(
4201                                         object,
4202                                         entry->offset + (va - entry->vme_start),
4203                                         VM_PROT_NONE, TRUE,
4204                                         &prot, &result_page, &top_page,
4205                                         (int *)0,
4206                                         NULL, map->no_zero_fill,
4207                                         FALSE, &fault_info);
4208                         } while (result == VM_FAULT_RETRY);
4209
4210                         /*
4211                          * If this was a mapping to a file on a device that has been forcibly
4212                          * unmounted, then we won't get a page back from vm_fault_page().  Just
4213                          * move on to the next one in case the remaining pages are mapped from
4214                          * different objects.  During a forced unmount, the object is terminated
4215                          * so the alive flag will be false if this happens.  A forced unmount will
4216                          * will occur when an external disk is unplugged before the user does an
4217                          * eject, so we don't want to panic in that situation.
4218                          */
4219
4220                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4221                                 continue;
4222
4223                         if (result != VM_FAULT_SUCCESS)
4224                                 panic("vm_fault_unwire: failure");
4225
4226                         result_object = result_page->object;
4227
4228                         if (deallocate) {
4229                                 assert(result_page->phys_page !=
4230                                        vm_page_fictitious_addr);
4231                                 pmap_disconnect(result_page->phys_page);
4232                                 VM_PAGE_FREE(result_page);
4233                         } else {
4234                                 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4235                                         pmap_change_wiring(pmap,
4236                                             pmap_addr + (va - entry->vme_start), FALSE);
4237
4238
4239                                 if (VM_PAGE_WIRED(result_page)) {
4240                                         vm_page_lockspin_queues();
4241                                         vm_page_unwire(result_page, TRUE);
4242                                         vm_page_unlock_queues();
4243                                 }
4244                                 if(entry->zero_wired_pages) {
4245                                         pmap_zero_page(result_page->phys_page);
4246                                         entry->zero_wired_pages = FALSE;
4247                                 }
4248
4249                                 PAGE_WAKEUP_DONE(result_page);
4250                         }
4251                         vm_fault_cleanup(result_object, top_page);
4252                 }
4253         }
4254
4255         /*
4256          *      Inform the physical mapping system that the range
4257          *      of addresses may fault, so that page tables and
4258          *      such may be unwired themselves.
4259          */
4260
4261         pmap_pageable(pmap, pmap_addr,
4262                 pmap_addr + (end_addr - entry->vme_start), TRUE);
4263
4264 }
4265
4266 /*
4267  *      vm_fault_wire_fast:
4268  *
4269  *      Handle common case of a wire down page fault at the given address.
4270  *      If successful, the page is inserted into the associated physical map.
4271  *      The map entry is passed in to avoid the overhead of a map lookup.
4272  *
4273  *      NOTE: the given address should be truncated to the
4274  *      proper page address.
4275  *
4276  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
4277  *      a standard error specifying why the fault is fatal is returned.
4278  *
4279  *      The map in question must be referenced, and remains so.
4280  *      Caller has a read lock on the map.
4281  *
4282  *      This is a stripped version of vm_fault() for wiring pages.  Anything
4283  *      other than the common case will return KERN_FAILURE, and the caller
4284  *      is expected to call vm_fault().
4285  */
4286 kern_return_t
4287 vm_fault_wire_fast(
4288         __unused vm_map_t       map,
4289         vm_map_offset_t va,
4290         vm_map_entry_t  entry,
4291         pmap_t                  pmap,
4292         vm_map_offset_t pmap_addr)
4293 {
4294         vm_object_t             object;
4295         vm_object_offset_t      offset;
4296         register vm_page_t      m;
4297         vm_prot_t               prot;
4298         thread_t                thread = current_thread();
4299         int                     type_of_fault;
4300         kern_return_t           kr;
4301
4302         VM_STAT_INCR(faults);
4303
4304         if (thread != THREAD_NULL && thread->task != TASK_NULL)
4305           thread->task->faults++;
4306
4307 /*
4308  *      Recovery actions
4309  */
4310
4311 #undef  RELEASE_PAGE
4312 #define RELEASE_PAGE(m) {                               \
4313         PAGE_WAKEUP_DONE(m);                            \
4314         vm_page_lockspin_queues();                      \
4315         vm_page_unwire(m, TRUE);                        \
4316         vm_page_unlock_queues();                        \
4317 }
4318
4319
4320 #undef  UNLOCK_THINGS
4321 #define UNLOCK_THINGS   {                               \
4322         vm_object_paging_end(object);                      \
4323         vm_object_unlock(object);                          \
4324 }
4325
4326 #undef  UNLOCK_AND_DEALLOCATE
4327 #define UNLOCK_AND_DEALLOCATE   {                       \
4328         UNLOCK_THINGS;                                  \
4329         vm_object_deallocate(object);                   \
4330 }
4331 /*
4332  *      Give up and have caller do things the hard way.
4333  */
4334
4335 #define GIVE_UP {                                       \
4336         UNLOCK_AND_DEALLOCATE;                          \
4337         return(KERN_FAILURE);                           \
4338 }
4339
4340
4341         /*
4342          *      If this entry is not directly to a vm_object, bail out.
4343          */
4344         if (entry->is_sub_map)
4345                 return(KERN_FAILURE);
4346
4347         /*
4348          *      Find the backing store object and offset into it.
4349          */
4350
4351         object = entry->object.vm_object;
4352         offset = (va - entry->vme_start) + entry->offset;
4353         prot = entry->protection;
4354
4355         /*
4356          *      Make a reference to this object to prevent its
4357          *      disposal while we are messing with it.
4358          */
4359
4360         vm_object_lock(object);
4361         vm_object_reference_locked(object);
4362         vm_object_paging_begin(object);
4363
4364         /*
4365          *      INVARIANTS (through entire routine):
4366          *
4367          *      1)      At all times, we must either have the object
4368          *              lock or a busy page in some object to prevent
4369          *              some other thread from trying to bring in
4370          *              the same page.
4371          *
4372          *      2)      Once we have a busy page, we must remove it from
4373          *              the pageout queues, so that the pageout daemon
4374          *              will not grab it away.
4375          *
4376          */
4377
4378         /*
4379          *      Look for page in top-level object.  If it's not there or
4380          *      there's something going on, give up.
4381          * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4382          * decrypt the page before wiring it down.
4383          */
4384         m = vm_page_lookup(object, offset);
4385         if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4386             (m->unusual && ( m->error || m->restart || m->absent))) {
4387
4388                 GIVE_UP;
4389         }
4390         ASSERT_PAGE_DECRYPTED(m);
4391
4392         if (m->fictitious &&
4393             m->phys_page == vm_page_guard_addr) {
4394                 /*
4395                  * Guard pages are fictitious pages and are never
4396                  * entered into a pmap, so let's say it's been wired...
4397                  */
4398                 kr = KERN_SUCCESS;
4399                 goto done;
4400         }
4401
4402         /*
4403          *      Wire the page down now.  All bail outs beyond this
4404          *      point must unwire the page.
4405          */
4406
4407         vm_page_lockspin_queues();
4408         vm_page_wire(m);
4409         vm_page_unlock_queues();
4410
4411         /*
4412          *      Mark page busy for other threads.
4413          */
4414         assert(!m->busy);
4415         m->busy = TRUE;
4416         assert(!m->absent);
4417
4418         /*
4419          *      Give up if the page is being written and there's a copy object
4420          */
4421         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4422                 RELEASE_PAGE(m);
4423                 GIVE_UP;
4424         }
4425
4426         /*
4427          *      Put this page into the physical map.
4428          */
4429         type_of_fault = DBG_CACHE_HIT_FAULT;
4430         kr = vm_fault_enter(m,
4431                             pmap,
4432                             pmap_addr,
4433                             prot,
4434                             prot,
4435                             TRUE,
4436                             FALSE,
4437                             FALSE,
4438                             FALSE,
4439                             NULL,
4440                             &type_of_fault);
4441
4442 done:
4443         /*
4444          *      Unlock everything, and return
4445          */
4446
4447         PAGE_WAKEUP_DONE(m);
4448         UNLOCK_AND_DEALLOCATE;
4449
4450         return kr;
4451
4452 }
4453
4454 /*
4455  *      Routine:        vm_fault_copy_cleanup
4456  *      Purpose:
4457  *              Release a page used by vm_fault_copy.
4458  */
4459
4460 void
4461 vm_fault_copy_cleanup(
4462         vm_page_t       page,
4463         vm_page_t       top_page)
4464 {
4465         vm_object_t     object = page->object;
4466
4467         vm_object_lock(object);
4468         PAGE_WAKEUP_DONE(page);
4469         if (!page->active && !page->inactive && !page->throttled) {
4470                 vm_page_lockspin_queues();
4471                 if (!page->active && !page->inactive && !page->throttled)
4472                         vm_page_activate(page);
4473                 vm_page_unlock_queues();
4474         }
4475         vm_fault_cleanup(object, top_page);
4476 }
4477
4478 void
4479 vm_fault_copy_dst_cleanup(
4480         vm_page_t       page)
4481 {
4482         vm_object_t     object;
4483
4484         if (page != VM_PAGE_NULL) {
4485                 object = page->object;
4486                 vm_object_lock(object);
4487                 vm_page_lockspin_queues();
4488                 vm_page_unwire(page, TRUE);
4489                 vm_page_unlock_queues();
4490                 vm_object_paging_end(object);
4491                 vm_object_unlock(object);
4492         }
4493 }
4494
4495 /*
4496  *      Routine:        vm_fault_copy
4497  *
4498  *      Purpose:
4499  *              Copy pages from one virtual memory object to another --
4500  *              neither the source nor destination pages need be resident.
4501  *
4502  *              Before actually copying a page, the version associated with
4503  *              the destination address map will be verified.
4504  *
4505  *      In/out conditions:
4506  *              The caller must hold a reference, but not a lock, to
4507  *              each of the source and destination objects and to the
4508  *              destination map.
4509  *
4510  *      Results:
4511  *              Returns KERN_SUCCESS if no errors were encountered in
4512  *              reading or writing the data.  Returns KERN_INTERRUPTED if
4513  *              the operation was interrupted (only possible if the
4514  *              "interruptible" argument is asserted).  Other return values
4515  *              indicate a permanent error in copying the data.
4516  *
4517  *              The actual amount of data copied will be returned in the
4518  *              "copy_size" argument.  In the event that the destination map
4519  *              verification failed, this amount may be less than the amount
4520  *              requested.
4521  */
4522 kern_return_t
4523 vm_fault_copy(
4524         vm_object_t             src_object,
4525         vm_object_offset_t      src_offset,
4526         vm_map_size_t           *copy_size,             /* INOUT */
4527         vm_object_t             dst_object,
4528         vm_object_offset_t      dst_offset,
4529         vm_map_t                dst_map,
4530         vm_map_version_t         *dst_version,
4531         int                     interruptible)
4532 {
4533         vm_page_t               result_page;
4534
4535         vm_page_t               src_page;
4536         vm_page_t               src_top_page;
4537         vm_prot_t               src_prot;
4538
4539         vm_page_t               dst_page;
4540         vm_page_t               dst_top_page;
4541         vm_prot_t               dst_prot;
4542
4543         vm_map_size_t           amount_left;
4544         vm_object_t             old_copy_object;
4545         kern_return_t           error = 0;
4546         vm_fault_return_t       result;
4547
4548         vm_map_size_t           part_size;
4549         struct vm_object_fault_info fault_info_src;
4550         struct vm_object_fault_info fault_info_dst;
4551
4552         /*
4553          * In order not to confuse the clustered pageins, align
4554          * the different offsets on a page boundary.
4555          */
4556
4557 #define RETURN(x)                                       \
4558         MACRO_BEGIN                                     \
4559         *copy_size -= amount_left;                      \
4560         MACRO_RETURN(x);                                \
4561         MACRO_END
4562
4563         amount_left = *copy_size;
4564
4565         fault_info_src.interruptible = interruptible;
4566         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4567         fault_info_src.user_tag  = 0;
4568         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4569         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4570         fault_info_src.no_cache   = FALSE;
4571         fault_info_src.stealth = TRUE;
4572         fault_info_src.io_sync = FALSE;
4573         fault_info_src.cs_bypass = FALSE;
4574         fault_info_src.mark_zf_absent = FALSE;
4575         fault_info_src.batch_pmap_op = FALSE;
4576
4577         fault_info_dst.interruptible = interruptible;
4578         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4579         fault_info_dst.user_tag  = 0;
4580         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4581         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4582         fault_info_dst.no_cache   = FALSE;
4583         fault_info_dst.stealth = TRUE;
4584         fault_info_dst.io_sync = FALSE;
4585         fault_info_dst.cs_bypass = FALSE;
4586         fault_info_dst.mark_zf_absent = FALSE;
4587         fault_info_dst.batch_pmap_op = FALSE;
4588
4589         do { /* while (amount_left > 0) */
4590                 /*
4591                  * There may be a deadlock if both source and destination
4592                  * pages are the same. To avoid this deadlock, the copy must
4593                  * start by getting the destination page in order to apply
4594                  * COW semantics if any.
4595                  */
4596
4597         RetryDestinationFault: ;
4598
4599                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4600
4601                 vm_object_lock(dst_object);
4602                 vm_object_paging_begin(dst_object);
4603
4604                 if (amount_left > (vm_size_t) -1) {
4605                         /* 32-bit overflow */
4606                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4607                 } else {
4608                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
4609                         assert(fault_info_dst.cluster_size == amount_left);
4610                 }
4611
4612                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
4613                 result = vm_fault_page(dst_object,
4614                                        vm_object_trunc_page(dst_offset),
4615                                        VM_PROT_WRITE|VM_PROT_READ,
4616                                        FALSE,
4617                                        &dst_prot, &dst_page, &dst_top_page,
4618                                        (int *)0,
4619                                        &error,
4620                                        dst_map->no_zero_fill,
4621                                        FALSE, &fault_info_dst);
4622                 switch (result) {
4623                 case VM_FAULT_SUCCESS:
4624                         break;
4625                 case VM_FAULT_RETRY:
4626                         goto RetryDestinationFault;
4627                 case VM_FAULT_MEMORY_SHORTAGE:
4628                         if (vm_page_wait(interruptible))
4629                                 goto RetryDestinationFault;
4630                         /* fall thru */
4631                 case VM_FAULT_INTERRUPTED:
4632                         RETURN(MACH_SEND_INTERRUPTED);
4633                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4634                         /* success but no VM page: fail the copy */
4635                         vm_object_paging_end(dst_object);
4636                         vm_object_unlock(dst_object);
4637                         /*FALLTHROUGH*/
4638                 case VM_FAULT_MEMORY_ERROR:
4639                         if (error)
4640                                 return (error);
4641                         else
4642                                 return(KERN_MEMORY_ERROR);
4643                 default:
4644                         panic("vm_fault_copy: unexpected error 0x%x from "
4645                               "vm_fault_page()\n", result);
4646                 }
4647                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4648
4649                 old_copy_object = dst_page->object->copy;
4650
4651                 /*
4652                  * There exists the possiblity that the source and
4653                  * destination page are the same.  But we can't
4654                  * easily determine that now.  If they are the
4655                  * same, the call to vm_fault_page() for the
4656                  * destination page will deadlock.  To prevent this we
4657                  * wire the page so we can drop busy without having
4658                  * the page daemon steal the page.  We clean up the
4659                  * top page  but keep the paging reference on the object
4660                  * holding the dest page so it doesn't go away.
4661                  */
4662
4663                 vm_page_lockspin_queues();
4664                 vm_page_wire(dst_page);
4665                 vm_page_unlock_queues();
4666                 PAGE_WAKEUP_DONE(dst_page);
4667                 vm_object_unlock(dst_page->object);
4668
4669                 if (dst_top_page != VM_PAGE_NULL) {
4670                         vm_object_lock(dst_object);
4671                         VM_PAGE_FREE(dst_top_page);
4672                         vm_object_paging_end(dst_object);
4673                         vm_object_unlock(dst_object);
4674                 }
4675
4676         RetrySourceFault: ;
4677
4678                 if (src_object == VM_OBJECT_NULL) {
4679                         /*
4680                          *      No source object.  We will just
4681                          *      zero-fill the page in dst_object.
4682                          */
4683                         src_page = VM_PAGE_NULL;
4684                         result_page = VM_PAGE_NULL;
4685                 } else {
4686                         vm_object_lock(src_object);
4687                         src_page = vm_page_lookup(src_object,
4688                                                   vm_object_trunc_page(src_offset));
4689                         if (src_page == dst_page) {
4690                                 src_prot = dst_prot;
4691                                 result_page = VM_PAGE_NULL;
4692                         } else {
4693                                 src_prot = VM_PROT_READ;
4694                                 vm_object_paging_begin(src_object);
4695
4696                                 if (amount_left > (vm_size_t) -1) {
4697                                         /* 32-bit overflow */
4698                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4699                                 } else {
4700                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
4701                                         assert(fault_info_src.cluster_size == amount_left);
4702                                 }
4703
4704                                 XPR(XPR_VM_FAULT,
4705                                         "vm_fault_copy(2) -> vm_fault_page\n",
4706                                         0,0,0,0,0);
4707                                 result = vm_fault_page(
4708                                         src_object,
4709                                         vm_object_trunc_page(src_offset),
4710                                         VM_PROT_READ, FALSE,
4711                                         &src_prot,
4712                                         &result_page, &src_top_page,
4713                                         (int *)0, &error, FALSE,
4714                                         FALSE, &fault_info_src);
4715
4716                                 switch (result) {
4717                                 case VM_FAULT_SUCCESS:
4718                                         break;
4719                                 case VM_FAULT_RETRY:
4720                                         goto RetrySourceFault;
4721                                 case VM_FAULT_MEMORY_SHORTAGE:
4722                                         if (vm_page_wait(interruptible))
4723                                                 goto RetrySourceFault;
4724                                         /* fall thru */
4725                                 case VM_FAULT_INTERRUPTED:
4726                                         vm_fault_copy_dst_cleanup(dst_page);
4727                                         RETURN(MACH_SEND_INTERRUPTED);
4728                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4729                                         /* success but no VM page: fail */
4730                                         vm_object_paging_end(src_object);
4731                                         vm_object_unlock(src_object);
4732                                         /*FALLTHROUGH*/
4733                                 case VM_FAULT_MEMORY_ERROR:
4734                                         vm_fault_copy_dst_cleanup(dst_page);
4735                                         if (error)
4736                                                 return (error);
4737                                         else
4738                                                 return(KERN_MEMORY_ERROR);
4739                                 default:
4740                                         panic("vm_fault_copy(2): unexpected "
4741                                               "error 0x%x from "
4742                                               "vm_fault_page()\n", result);
4743                                 }
4744
4745
4746                                 assert((src_top_page == VM_PAGE_NULL) ==
4747                                        (result_page->object == src_object));
4748                         }
4749                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4750                         vm_object_unlock(result_page->object);
4751                 }
4752
4753                 if (!vm_map_verify(dst_map, dst_version)) {
4754                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
4755                                 vm_fault_copy_cleanup(result_page, src_top_page);
4756                         vm_fault_copy_dst_cleanup(dst_page);
4757                         break;
4758                 }
4759
4760                 vm_object_lock(dst_page->object);
4761
4762                 if (dst_page->object->copy != old_copy_object) {
4763                         vm_object_unlock(dst_page->object);
4764                         vm_map_verify_done(dst_map, dst_version);
4765                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
4766                                 vm_fault_copy_cleanup(result_page, src_top_page);
4767                         vm_fault_copy_dst_cleanup(dst_page);
4768                         break;
4769                 }
4770                 vm_object_unlock(dst_page->object);
4771
4772                 /*
4773                  *      Copy the page, and note that it is dirty
4774                  *      immediately.
4775                  */
4776
4777                 if (!page_aligned(src_offset) ||
4778                         !page_aligned(dst_offset) ||
4779                         !page_aligned(amount_left)) {
4780
4781                         vm_object_offset_t      src_po,
4782                                                 dst_po;
4783
4784                         src_po = src_offset - vm_object_trunc_page(src_offset);
4785                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4786
4787                         if (dst_po > src_po) {
4788                                 part_size = PAGE_SIZE - dst_po;
4789                         } else {
4790                                 part_size = PAGE_SIZE - src_po;
4791                         }
4792                         if (part_size > (amount_left)){
4793                                 part_size = amount_left;
4794                         }
4795
4796                         if (result_page == VM_PAGE_NULL) {
4797                                 assert((vm_offset_t) dst_po == dst_po);
4798                                 assert((vm_size_t) part_size == part_size);
4799                                 vm_page_part_zero_fill(dst_page,
4800                                                        (vm_offset_t) dst_po,
4801                                                        (vm_size_t) part_size);
4802                         } else {
4803                                 assert((vm_offset_t) src_po == src_po);
4804                                 assert((vm_offset_t) dst_po == dst_po);
4805                                 assert((vm_size_t) part_size == part_size);
4806                                 vm_page_part_copy(result_page,
4807                                                   (vm_offset_t) src_po,
4808                                                   dst_page,
4809                                                   (vm_offset_t) dst_po,
4810                                                   (vm_size_t)part_size);
4811                                 if(!dst_page->dirty){
4812                                         vm_object_lock(dst_object);
4813                                         SET_PAGE_DIRTY(dst_page, TRUE);
4814                                         vm_object_unlock(dst_page->object);
4815                                 }
4816
4817                         }
4818                 } else {
4819                         part_size = PAGE_SIZE;
4820
4821                         if (result_page == VM_PAGE_NULL)
4822                                 vm_page_zero_fill(dst_page);
4823                         else{
4824                                 vm_object_lock(result_page->object);
4825                                 vm_page_copy(result_page, dst_page);
4826                                 vm_object_unlock(result_page->object);
4827
4828                                 if(!dst_page->dirty){
4829                                         vm_object_lock(dst_object);
4830                                         SET_PAGE_DIRTY(dst_page, TRUE);
4831                                         vm_object_unlock(dst_page->object);
4832                                 }
4833                         }
4834
4835                 }
4836
4837                 /*
4838                  *      Unlock everything, and return
4839                  */
4840
4841                 vm_map_verify_done(dst_map, dst_version);
4842
4843                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4844                         vm_fault_copy_cleanup(result_page, src_top_page);
4845                 vm_fault_copy_dst_cleanup(dst_page);
4846
4847                 amount_left -= part_size;
4848                 src_offset += part_size;
4849                 dst_offset += part_size;
4850         } while (amount_left > 0);
4851
4852         RETURN(KERN_SUCCESS);
4853 #undef  RETURN
4854
4855         /*NOTREACHED*/
4856 }
4857
4858 #if     VM_FAULT_CLASSIFY
4859 /*
4860  *      Temporary statistics gathering support.
4861  */
4862
4863 /*
4864  *      Statistics arrays:
      *
      *      vm_fault_stats[type][level] counts the faults classified by
      *      vm_fault_classify(): "type" is one of the VM_FAULT_TYPE_*
      *      values below and "level" is the number of shadow links that
      *      were followed before the fault could be classified.
4865  */
4866 #define VM_FAULT_TYPES_MAX      5       /* number of VM_FAULT_TYPE_* values */
4867 #define VM_FAULT_LEVEL_MAX      8       /* number of "level" columns */
4868
4869 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4870
     /* first index into vm_fault_stats[][] */
4871 #define VM_FAULT_TYPE_ZERO_FILL 0       /* not resident, no pager created, no shadow */
4872 #define VM_FAULT_TYPE_MAP_IN    1       /* resident; read fault, or top object without a copy object */
4873 #define VM_FAULT_TYPE_PAGER     2       /* not resident, object has a pager created */
4874 #define VM_FAULT_TYPE_COPY      3       /* resident; any other write fault */
4875 #define VM_FAULT_TYPE_OTHER     4       /* resident but busy, error, restart or absent */
4876
4877
     /*
      *      Routine:        vm_fault_classify
      *
      *      Purpose:
      *              Statistics helper: work out how a fault of "fault_type"
      *              at "offset" in "object" would be resolved and bump the
      *              matching vm_fault_stats[type][level] counter.
      *
      *              Starting at "object", look the page up at each level of
      *              the shadow chain:
      *                - resident but busy/error/restart/absent: TYPE_OTHER
      *                - resident, and either not a write fault or found in
      *                  the top object with no copy object:     TYPE_MAP_IN
      *                - any other resident page:                TYPE_COPY
      *                - not resident, pager created:            TYPE_PAGER
      *                - not resident, no shadow:                TYPE_ZERO_FILL
      *                - otherwise descend into the shadow, adding
      *                  vo_shadow_offset to the offset.
      *
      *              "level" is the number of shadow links followed.  Depths
      *              of VM_FAULT_LEVEL_MAX or more are all counted in the last
      *              column, VM_FAULT_LEVEL_MAX - 1, which keeps the index
      *              inside vm_fault_stats[][].
      */
4878 void
4879 vm_fault_classify(vm_object_t           object,
4880                   vm_object_offset_t    offset,
4881                   vm_prot_t             fault_type)
4882 {
4883         int             type, level = 0;
4884         vm_page_t       m;
4885
             /* every exit from this loop is a "break" that has set "type" */
4886         while (TRUE) {
4887                 m = vm_page_lookup(object, offset);
4888                 if (m != VM_PAGE_NULL) {
4889                         if (m->busy || m->error || m->restart || m->absent) {
4890                                 type = VM_FAULT_TYPE_OTHER;
4891                                 break;
4892                         }
4893                         if (((fault_type & VM_PROT_WRITE) == 0) ||
4894                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4895                                 type = VM_FAULT_TYPE_MAP_IN;
4896                                 break;
4897                         }
4898                         type = VM_FAULT_TYPE_COPY;
4899                         break;
4900                 }
4901                 else {
4902                         if (object->pager_created) {
4903                                 type = VM_FAULT_TYPE_PAGER;
4904                                 break;
4905                         }
4906                         if (object->shadow == VM_OBJECT_NULL) {
4907                                 type = VM_FAULT_TYPE_ZERO_FILL;
4908                                 break;
4909                         }
4910
                             /* not here: retry one level down the shadow chain */
4911                         offset += object->vo_shadow_offset;
4912                         object = object->shadow;
4913                         level++;
4914                         continue;
4915                 }
4916         }
4917
             /*
              * vm_fault_stats[][] has VM_FAULT_LEVEL_MAX columns, so the last
              * valid index is VM_FAULT_LEVEL_MAX - 1; fold deeper chains into
              * that column instead of indexing past the end of the row.
              */
4918         if (level >= VM_FAULT_LEVEL_MAX)
4919                 level = VM_FAULT_LEVEL_MAX - 1;
4920
4921         vm_fault_stats[type][level] += 1;
4922
4923         return;
4924 }
4925
4926 /* Debugger-callable reset: clears every vm_fault_stats[][] counter. */
4927
4928 void
4929 vm_fault_classify_init(void)
4930 {
4931         int row = 0;
4932
4933         while (row < VM_FAULT_TYPES_MAX) {
4934                 int col = 0;
4935
4936                 while (col < VM_FAULT_LEVEL_MAX)
4937                         vm_fault_stats[row][col++] = 0;
4938                 row++;
4939         }
4940 }
4941 #endif  /* VM_FAULT_CLASSIFY */
4942
4943
     /*
      * Defined elsewhere.  When it is zero, vm_page_validate_cs_mapped()
      * and vm_page_validate_cs() return without examining the page.
      */
4944 extern int cs_validation;
4945
     /*
      *      Routine:        vm_page_validate_cs_mapped
      *
      *      Purpose:
      *              Update the code-signing state (cs_validated / cs_tainted)
      *              of "page", whose contents the caller has made readable
      *              at "kaddr".
      *
      *      In/out conditions:
      *              The page must be busy and its object locked exclusively
      *              (both are asserted).
      *
      *      Results:
      *              None returned.  On return the page may have been marked
      *              validated and/or tainted; it is left unchanged when
      *              cs_validation is off, when it was already validated, or
      *              when the object is not alive, is terminating, or has no
      *              pager.
      */
4946 void
4947 vm_page_validate_cs_mapped(
4948         vm_page_t       page,
4949         const void      *kaddr)
4950 {
4951         vm_object_t             object;
4952         vm_object_offset_t      offset;
4953         kern_return_t           kr;
4954         memory_object_t         pager;
4955         void                    *blobs;
4956         boolean_t               validated, tainted;
4957
4958         assert(page->busy);
4959         vm_object_lock_assert_exclusive(page->object);
4960
4961         if (!cs_validation) {
4962                 return;
4963         }
4964
4965         if (page->wpmapped && !page->cs_tainted) {
4966                 /*
4967                  * This page was mapped for "write" access sometime in the
4968                  * past and could still be modifiable in the future.
4969                  * Consider it tainted.
4970                  * [ If the page was already found to be "tainted", no
4971                  * need to re-validate. ]
4972                  */
4973                 page->cs_validated = TRUE;
4974                 page->cs_tainted = TRUE;
4975                 if (cs_debug) {
4976                         printf("CODESIGNING: vm_page_validate_cs: "
4977                                "page %p obj %p off 0x%llx "
4978                                "was modified\n",
4979                                page, page->object, page->offset);
4980                 }
4981                 vm_cs_validated_dirtied++;
4982         }
4983
             /* already validated (possibly by the block just above) */
4984         if (page->cs_validated) {
4985                 return;
4986         }
4987
4988         vm_cs_validates++;
4989
4990         object = page->object;
4991         assert(object->code_signed);
4992         offset = page->offset;
4993
4994         if (!object->alive || object->terminating || object->pager == NULL) {
4995                 /*
4996                  * The object is terminating and we don't have its pager
4997                  * so we can't validate the data...
4998                  */
4999                 return;
5000         }
5001         /*
5002          * Since we get here to validate a page that was brought in by
5003          * the pager, we know that this pager is all setup and ready
5004          * by now.
5005          */
5006         assert(!object->internal);
5007         assert(object->pager != NULL);
5008         assert(object->pager_ready);
5009
5010         pager = object->pager;
5011         assert(object->paging_in_progress);
             /* a failed blob lookup is passed on to the validator as NULL */
5012         kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5013         if (kr != KERN_SUCCESS) {
5014                 blobs = NULL;
5015         }
5016
5017         /* verify the SHA1 hash for this page */
5018         validated = cs_validate_page(blobs,
5019                                      pager,
5020                                      offset + object->paging_offset,
5021                                      (const void *)kaddr,
5022                                      &tainted);
5023
             /* cs_tainted is only updated when cs_validate_page() returned TRUE */
5024         page->cs_validated = validated;
5025         if (validated) {
5026                 page->cs_tainted = tainted;
5027         }
5028 }
5029
     /*
      *      Routine:        vm_page_validate_cs
      *
      *      Purpose:
      *              Update the code-signing state of "page" when the caller
      *              has no kernel mapping of it: map the page read-only into
      *              the kernel with vm_paging_map_object(), let
      *              vm_page_validate_cs_mapped() do the work, then unmap it.
      *
      *      In/out conditions:
      *              The page's object must be locked (asserted).  The paths
      *              that mark the page tainted or go on to map it assert
      *              that the lock is held exclusively.  The object lock is
      *              not released while the page is mapped.
      *
      *      Results:
      *              None returned.  Panics if the page cannot be mapped.
      *              A page that was not busy on entry is made busy for the
      *              duration and woken up again before returning.
      */
5030 void
5031 vm_page_validate_cs(
5032         vm_page_t       page)
5033 {
5034         vm_object_t             object;
5035         vm_object_offset_t      offset;
5036         vm_map_offset_t         koffset;
5037         vm_map_size_t           ksize;
5038         vm_offset_t             kaddr;
5039         kern_return_t           kr;
5040         boolean_t               busy_page;
5041
5042         vm_object_lock_assert_held(page->object);
5043
5044         if (!cs_validation) {
5045                 return;
5046         }
5047
5048         if (page->wpmapped && !page->cs_tainted) {
5049                 vm_object_lock_assert_exclusive(page->object);
5050
5051                 /*
5052                  * This page was mapped for "write" access sometime in the
5053                  * past and could still be modifiable in the future.
5054                  * Consider it tainted.
5055                  * [ If the page was already found to be "tainted", no
5056                  * need to re-validate. ]
5057                  */
5058                 page->cs_validated = TRUE;
5059                 page->cs_tainted = TRUE;
5060                 if (cs_debug) {
5061                         printf("CODESIGNING: vm_page_validate_cs: "
5062                                "page %p obj %p off 0x%llx "
5063                                "was modified\n",
5064                                page, page->object, page->offset);
5065                 }
5066                 vm_cs_validated_dirtied++;
5067         }
5068
             /* already validated (possibly by the block just above) */
5069         if (page->cs_validated) {
5070                 return;
5071         }
5072
5073 #if CHECK_CS_VALIDATION_BITMAP
             /*
              * Shortcut: if the pager's validation bitmap check succeeds for
              * this page, mark it validated and untainted without mapping it.
              * NOTE(review): this branch writes page bits before the
              * exclusive-lock assertion below -- confirm that is intended.
              */
5074         if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5075                 page->cs_validated = TRUE;
5076                 page->cs_tainted = FALSE;
5077                 vm_cs_bitmap_validated++;
5078                 return;
5079         }
5080 #endif
5081         vm_object_lock_assert_exclusive(page->object);
5082
5083         object = page->object;
5084         assert(object->code_signed);
5085         offset = page->offset;
5086
             /* remember whether the caller already owned the busy bit */
5087         busy_page = page->busy;
5088         if (!busy_page) {
5089                 /* keep page busy while we map (and unlock) the VM object */
5090                 page->busy = TRUE;
5091         }
5092
5093         /*
5094          * Take a paging reference on the VM object
5095          * to protect it from collapse or bypass,
5096          * and keep it from disappearing too.
5097          */
5098         vm_object_paging_begin(object);
5099
5100         /* map the page in the kernel address space */
5101         koffset = 0;
5102         ksize = PAGE_SIZE_64;
5103         kr = vm_paging_map_object(&koffset,
5104                                   page,
5105                                   object,
5106                                   offset,
5107                                   &ksize,
5108                                   VM_PROT_READ,
5109                                   FALSE); /* can't unlock object ! */
5110         if (kr != KERN_SUCCESS) {
5111                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5112         }
5113         kaddr = CAST_DOWN(vm_offset_t, koffset);
5114
5115         /* validate the mapped page */
5116         vm_page_validate_cs_mapped(page, (const void *) kaddr);
5117
5118 #if CHECK_CS_VALIDATION_BITMAP
             /* record a clean result (validated, not tainted) in the bitmap */
5119         if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5120                 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5121         }
5122 #endif
5123         assert(page->busy);
5124         assert(object == page->object);
5125         vm_object_lock_assert_exclusive(object);
5126
             /* only clear the busy bit if this routine was the one to set it */
5127         if (!busy_page) {
5128                 PAGE_WAKEUP_DONE(page);
5129         }
5130         if (koffset != 0) {
5131                 /* unmap the map from the kernel address space */
5132                 vm_paging_unmap_object(object, koffset, koffset + ksize);
5133                 koffset = 0;
5134                 ksize = 0;
5135                 kaddr = 0;
5136         }
             /* drop the paging reference taken before mapping */
5137         vm_object_paging_end(object);
5138 }