osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <mach_kdb.h>
  68 #include <libkern/OSAtomic.h>
  69
  70 #include <mach/mach_types.h>
  71 #include <mach/kern_return.h>
  72 #include <mach/message.h>       /* for error codes */
  73 #include <mach/vm_param.h>
  74 #include <mach/vm_behavior.h>
  75 #include <mach/memory_object.h>
  76                                 /* For memory_object_data_{request,unlock} */
  77 #include <mach/sdt.h>
  78
  79 #include <kern/kern_types.h>
  80 #include <kern/host_statistics.h>
  81 #include <kern/counters.h>
  82 #include <kern/task.h>
  83 #include <kern/thread.h>
  84 #include <kern/sched_prim.h>
  85 #include <kern/host.h>
  86 #include <kern/xpr.h>
  87 #include <kern/mach_param.h>
  88 #include <kern/macro_help.h>
  89 #include <kern/zalloc.h>
  90 #include <kern/misc_protos.h>
  91
  92 #include <vm/vm_fault.h>
  93 #include <vm/vm_map.h>
  94 #include <vm/vm_object.h>
  95 #include <vm/vm_page.h>
  96 #include <vm/vm_kern.h>
  97 #include <vm/pmap.h>
  98 #include <vm/vm_pageout.h>
  99 #include <vm/vm_protos.h>
 100 #include <vm/vm_external.h>
 101 #include <vm/memory_object.h>
 102 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 103 #include <vm/vm_shared_region.h>
 104
 105 #define VM_FAULT_CLASSIFY       0
 106
 107 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 108
 109 int     vm_object_pagein_throttle = 16;
 110
 111 /*
 112  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 113  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 114  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 115  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 116  * keep the UI active so that the user has a chance to kill the offending task before the system
 117  * completely hangs.
 118  *
 119  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 120  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 121  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 122  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 123  */
 124
 125 boolean_t thread_is_io_throttled(void);
 126
 127 uint64_t vm_hard_throttle_threshold;
 128
 129 extern unsigned int dp_pages_free, dp_pages_reserve;
 130
 131 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (((dp_pages_free + dp_pages_reserve < 2000) && \
 132                                                  (get_task_resident_size(current_task()) > vm_hard_throttle_threshold) && \
 133                                                  (current_task() != kernel_task) && VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) || \
 134                                                  (vm_page_free_count < vm_page_throttle_limit && thread_is_io_throttled() && \
 135                                                   (get_task_resident_size(current_task()) > vm_hard_throttle_threshold)))
 136
 137
 138 #define HARD_THROTTLE_DELAY     20000   /* 20000 us == 20 ms */
 139 #define SOFT_THROTTLE_DELAY     2000    /* 2000 us == 2 ms */
 140
 141
 142 extern int cs_debug;
 143
 144 #if     MACH_KDB
 145 extern struct db_watchpoint *db_watchpoint_list;
 146 #endif  /* MACH_KDB */
 147
 148 boolean_t current_thread_aborted(void);
 149
 150 /* Forward declarations of internal routines. */
 151 extern kern_return_t vm_fault_wire_fast(
 152                                 vm_map_t        map,
 153                                 vm_map_offset_t va,
 154                                 vm_map_entry_t  entry,
 155                                 pmap_t          pmap,
 156                                 vm_map_offset_t pmap_addr);
 157
 158 extern void vm_fault_continue(void);
 159
 160 extern void vm_fault_copy_cleanup(
 161                                 vm_page_t       page,
 162                                 vm_page_t       top_page);
 163
 164 extern void vm_fault_copy_dst_cleanup(
 165                                 vm_page_t       page);
 166
 167 #if     VM_FAULT_CLASSIFY
 168 extern void vm_fault_classify(vm_object_t       object,
 169                           vm_object_offset_t    offset,
 170                           vm_prot_t             fault_type);
 171
 172 extern void vm_fault_classify_init(void);
 173 #endif
 174
 175 unsigned long vm_pmap_enter_blocked = 0;
 176
 177 unsigned long vm_cs_validates = 0;
 178 unsigned long vm_cs_revalidates = 0;
 179 unsigned long vm_cs_query_modified = 0;
 180 unsigned long vm_cs_validated_dirtied = 0;
 181 unsigned long vm_cs_bitmap_validated = 0;
 182 #if CONFIG_ENFORCE_SIGNED_CODE
 183 int cs_enforcement_disable=0;
 184 #else
 185 static const int cs_enforcement_disable=1;
 186 #endif
 187
 188 /*
 189  *      Routine:        vm_fault_init
 190  *      Purpose:
 191  *              Initialize our private data structures.
 192  */
 193 void
 194 vm_fault_init(void)
 195 {
 196 #if !SECURE_KERNEL
 197 #if CONFIG_ENFORCE_SIGNED_CODE
 198         PE_parse_boot_argn("cs_enforcement_disable", &cs_enforcement_disable,
 199                            sizeof (cs_enforcement_disable));
 200 #endif
 201         PE_parse_boot_argn("cs_debug", &cs_debug, sizeof (cs_debug));
 202 #endif
 203
 204         /*
 205          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 206          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 207          * the amount of memory.  The pertange runs between 10% and 35%.  We use 35% for small memory systems
 208          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 209          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 210          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 211          */
 212
 213         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 214 }
 215
 216 /*
 217  *      Routine:        vm_fault_cleanup
 218  *      Purpose:
 219  *              Clean up the result of vm_fault_page.
 220  *      Results:
 221  *              The paging reference for "object" is released.
 222  *              "object" is unlocked.
 223  *              If "top_page" is not null,  "top_page" is
 224  *              freed and the paging reference for the object
 225  *              containing it is released.
 226  *
 227  *      In/out conditions:
 228  *              "object" must be locked.
 229  */
 230 void
 231 vm_fault_cleanup(
 232         register vm_object_t    object,
 233         register vm_page_t      top_page)
 234 {
 235         vm_object_paging_end(object);
 236         vm_object_unlock(object);
 237
 238         if (top_page != VM_PAGE_NULL) {
 239                 object = top_page->object;
 240
 241                 vm_object_lock(object);
 242                 VM_PAGE_FREE(top_page);
 243                 vm_object_paging_end(object);
 244                 vm_object_unlock(object);
 245         }
 246 }
 247
 248 #if     MACH_CLUSTER_STATS
 249 #define MAXCLUSTERPAGES 16
 250 struct {
 251         unsigned long pages_in_cluster;
 252         unsigned long pages_at_higher_offsets;
 253         unsigned long pages_at_lower_offsets;
 254 } cluster_stats_in[MAXCLUSTERPAGES];
 255 #define CLUSTER_STAT(clause)    clause
 256 #define CLUSTER_STAT_HIGHER(x)  \
 257         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 258 #define CLUSTER_STAT_LOWER(x)   \
 259          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 260 #define CLUSTER_STAT_CLUSTER(x) \
 261         ((cluster_stats_in[(x)].pages_in_cluster)++)
 262 #else   /* MACH_CLUSTER_STATS */
 263 #define CLUSTER_STAT(clause)
 264 #endif  /* MACH_CLUSTER_STATS */
 265
 266 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 267
 268
 269 boolean_t       vm_page_deactivate_behind = TRUE;
 270 /*
 271  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 272  */
 273 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 274 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 275                                                                 /* we use it to size an array on the stack */
 276
 277 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 278
 279 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 280
 281 /*
 282  * vm_page_is_sequential
 283  *
 284  * Determine if sequential access is in progress
 285  * in accordance with the behavior specified.
 286  * Update state to indicate current access pattern.
 287  *
 288  * object must have at least the shared lock held
 289  */
 290 static
 291 void
 292 vm_fault_is_sequential(
 293         vm_object_t             object,
 294         vm_object_offset_t      offset,
 295         vm_behavior_t           behavior)
 296 {
 297         vm_object_offset_t      last_alloc;
 298         int                     sequential;
 299         int                     orig_sequential;
 300
 301         last_alloc = object->last_alloc;
 302         sequential = object->sequential;
 303         orig_sequential = sequential;
 304
 305         switch (behavior) {
 306         case VM_BEHAVIOR_RANDOM:
 307                 /*
 308                  * reset indicator of sequential behavior
 309                  */
 310                 sequential = 0;
 311                 break;
 312
 313         case VM_BEHAVIOR_SEQUENTIAL:
 314                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 315                         /*
 316                          * advance indicator of sequential behavior
 317                          */
 318                         if (sequential < MAX_SEQUENTIAL_RUN)
 319                                 sequential += PAGE_SIZE;
 320                 } else {
 321                         /*
 322                          * reset indicator of sequential behavior
 323                          */
 324                         sequential = 0;
 325                 }
 326                 break;
 327
 328         case VM_BEHAVIOR_RSEQNTL:
 329                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 330                         /*
 331                          * advance indicator of sequential behavior
 332                          */
 333                         if (sequential > -MAX_SEQUENTIAL_RUN)
 334                                 sequential -= PAGE_SIZE;
 335                 } else {
 336                         /*
 337                          * reset indicator of sequential behavior
 338                          */
 339                         sequential = 0;
 340                 }
 341                 break;
 342
 343         case VM_BEHAVIOR_DEFAULT:
 344         default:
 345                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 346                         /*
 347                          * advance indicator of sequential behavior
 348                          */
 349                         if (sequential < 0)
 350                                 sequential = 0;
 351                         if (sequential < MAX_SEQUENTIAL_RUN)
 352                                 sequential += PAGE_SIZE;
 353
 354                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 355                         /*
 356                          * advance indicator of sequential behavior
 357                          */
 358                         if (sequential > 0)
 359                                 sequential = 0;
 360                         if (sequential > -MAX_SEQUENTIAL_RUN)
 361                                 sequential -= PAGE_SIZE;
 362                 } else {
 363                         /*
 364                          * reset indicator of sequential behavior
 365                          */
 366                         sequential = 0;
 367                 }
 368                 break;
 369         }
 370         if (sequential != orig_sequential) {
 371                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 372                         /*
 373                          * if someone else has already updated object->sequential
 374                          * don't bother trying to update it or object->last_alloc
 375                          */
 376                         return;
 377                 }
 378         }
 379         /*
 380          * I'd like to do this with a OSCompareAndSwap64, but that
 381          * doesn't exist for PPC...  however, it shouldn't matter
 382          * that much... last_alloc is maintained so that we can determine
 383          * if a sequential access pattern is taking place... if only
 384          * one thread is banging on this object, no problem with the unprotected
 385          * update... if 2 or more threads are banging away, we run the risk of
 386          * someone seeing a mangled update... however, in the face of multiple
 387          * accesses, no sequential access pattern can develop anyway, so we
 388          * haven't lost any real info.
 389          */
 390         object->last_alloc = offset;
 391 }
 392
 393
 394 int vm_page_deactivate_behind_count = 0;
 395
 396 /*
 397  * vm_page_deactivate_behind
 398  *
 399  * Determine if sequential access is in progress
 400  * in accordance with the behavior specified.  If
 401  * so, compute a potential page to deactivate and
 402  * deactivate it.
 403  *
 404  * object must be locked.
 405  *
 406  * return TRUE if we actually deactivate a page
 407  */
 408 static
 409 boolean_t
 410 vm_fault_deactivate_behind(
 411         vm_object_t             object,
 412         vm_object_offset_t      offset,
 413         vm_behavior_t           behavior)
 414 {
 415         int             n;
 416         int             pages_in_run = 0;
 417         int             max_pages_in_run = 0;
 418         int             sequential_run;
 419         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 420         vm_object_offset_t      run_offset = 0;
 421         vm_object_offset_t      pg_offset = 0;
 422         vm_page_t       m;
 423         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 424
 425         pages_in_run = 0;
 426 #if TRACEFAULTPAGE
 427         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 428 #endif
 429
 430         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 431                 /*
 432                  * Do not deactivate pages from the kernel object: they
 433                  * are not intended to become pageable.
 434                  * or we've disabled the deactivate behind mechanism
 435                  */
 436                 return FALSE;
 437         }
 438         if ((sequential_run = object->sequential)) {
 439                   if (sequential_run < 0) {
 440                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 441                           sequential_run = 0 - sequential_run;
 442                   } else {
 443                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 444                   }
 445         }
 446         switch (behavior) {
 447         case VM_BEHAVIOR_RANDOM:
 448                 break;
 449         case VM_BEHAVIOR_SEQUENTIAL:
 450                 if (sequential_run >= (int)PAGE_SIZE) {
 451                         run_offset = 0 - PAGE_SIZE_64;
 452                         max_pages_in_run = 1;
 453                 }
 454                 break;
 455         case VM_BEHAVIOR_RSEQNTL:
 456                 if (sequential_run >= (int)PAGE_SIZE) {
 457                         run_offset = PAGE_SIZE_64;
 458                         max_pages_in_run = 1;
 459                 }
 460                 break;
 461         case VM_BEHAVIOR_DEFAULT:
 462         default:
 463         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 464
 465                 /*
 466                  * determine if the run of sequential accesss has been
 467                  * long enough on an object with default access behavior
 468                  * to consider it for deactivation
 469                  */
 470                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 471                         /*
 472                          * the comparisons between offset and behind are done
 473                          * in this kind of odd fashion in order to prevent wrap around
 474                          * at the end points
 475                          */
 476                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 477                                 if (offset >= behind) {
 478                                         run_offset = 0 - behind;
 479                                         pg_offset = PAGE_SIZE_64;
 480                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 481                                 }
 482                         } else {
 483                                 if (offset < -behind) {
 484                                         run_offset = behind;
 485                                         pg_offset = 0 - PAGE_SIZE_64;
 486                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 487                                 }
 488                         }
 489                 }
 490                 break;
 491         }
 492         }
 493         for (n = 0; n < max_pages_in_run; n++) {
 494                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 495
 496                 if (m && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
 497                         page_run[pages_in_run++] = m;
 498                         pmap_clear_reference(m->phys_page);
 499                 }
 500         }
 501         if (pages_in_run) {
 502                 vm_page_lockspin_queues();
 503
 504                 for (n = 0; n < pages_in_run; n++) {
 505
 506                         m = page_run[n];
 507
 508                         vm_page_deactivate_internal(m, FALSE);
 509
 510                         vm_page_deactivate_behind_count++;
 511 #if TRACEFAULTPAGE
 512                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 513 #endif
 514                 }
 515                 vm_page_unlock_queues();
 516
 517                 return TRUE;
 518         }
 519         return FALSE;
 520 }
 521
 522
 523 static int
 524 vm_page_throttled(void)
 525 {
 526         clock_sec_t     elapsed_sec;
 527         clock_sec_t     tv_sec;
 528         clock_usec_t    tv_usec;
 529
 530         thread_t thread = current_thread();
 531
 532         if (thread->options & TH_OPT_VMPRIV)
 533                 return (0);
 534
 535         thread->t_page_creation_count++;
 536
 537         if (NEED_TO_HARD_THROTTLE_THIS_TASK())
 538                 return (HARD_THROTTLE_DELAY);
 539
 540         if (vm_page_free_count < vm_page_throttle_limit &&
 541             thread->t_page_creation_count > vm_page_creation_throttle) {
 542
 543                 clock_get_system_microtime(&tv_sec, &tv_usec);
 544
 545                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 546
 547                 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
 548
 549                         if (elapsed_sec >= 60) {
 550                                 /*
 551                                  * we'll reset our stats to give a well behaved app
 552                                  * that was unlucky enough to accumulate a bunch of pages
 553                                  * over a long period of time a chance to get out of
 554                                  * the throttled state... we reset the counter and timestamp
 555                                  * so that if it stays under the rate limit for the next second
 556                                  * it will be back in our good graces... if it exceeds it, it
 557                                  * will remain in the throttled state
 558                                  */
 559                                 thread->t_page_creation_time = tv_sec;
 560                                 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
 561                         }
 562                         ++vm_page_throttle_count;
 563
 564                         return (SOFT_THROTTLE_DELAY);
 565                 }
 566                 thread->t_page_creation_time = tv_sec;
 567                 thread->t_page_creation_count = 0;
 568         }
 569         return (0);
 570 }
 571
 572
 573 /*
 574  * check for various conditions that would
 575  * prevent us from creating a ZF page...
 576  * cleanup is based on being called from vm_fault_page
 577  *
 578  * object must be locked
 579  * object == m->object
 580  */
 581 static vm_fault_return_t
 582 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
 583 {
 584         int throttle_delay;
 585
 586         if (object->shadow_severed ||
 587             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 588                 /*
 589                  * Either:
 590                  * 1. the shadow chain was severed,
 591                  * 2. the purgeable object is volatile or empty and is marked
 592                  *    to fault on access while volatile.
 593                  * Just have to return an error at this point
 594                  */
 595                 if (m != VM_PAGE_NULL)
 596                         VM_PAGE_FREE(m);
 597                 vm_fault_cleanup(object, first_m);
 598
 599                 thread_interrupt_level(interruptible_state);
 600
 601                 return (VM_FAULT_MEMORY_ERROR);
 602         }
 603         if (vm_backing_store_low) {
 604                 /*
 605                  * are we protecting the system from
 606                  * backing store exhaustion.  If so
 607                  * sleep unless we are privileged.
 608                  */
 609                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 610
 611                         if (m != VM_PAGE_NULL)
 612                                 VM_PAGE_FREE(m);
 613                         vm_fault_cleanup(object, first_m);
 614
 615                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 616
 617                         thread_block(THREAD_CONTINUE_NULL);
 618                         thread_interrupt_level(interruptible_state);
 619
 620                         return (VM_FAULT_RETRY);
 621                 }
 622         }
 623         if ((throttle_delay = vm_page_throttled())) {
 624                 /*
 625                  * we're throttling zero-fills...
 626                  * treat this as if we couldn't grab a page
 627                  */
 628                 if (m != VM_PAGE_NULL)
 629                         VM_PAGE_FREE(m);
 630                 vm_fault_cleanup(object, first_m);
 631
 632                 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 633
 634                 delay(throttle_delay);
 635
 636                 if (current_thread_aborted()) {
 637                         thread_interrupt_level(interruptible_state);
 638                         return VM_FAULT_INTERRUPTED;
 639                 }
 640                 thread_interrupt_level(interruptible_state);
 641
 642                 return (VM_FAULT_MEMORY_SHORTAGE);
 643         }
 644         return (VM_FAULT_SUCCESS);
 645 }
 646
 647
 648 /*
 649  * do the work to zero fill a page and
 650  * inject it into the correct paging queue
 651  *
 652  * m->object must be locked
 653  * page queue lock must NOT be held
 654  */
 655 static int
 656 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 657 {
 658         int my_fault = DBG_ZERO_FILL_FAULT;
 659
 660         /*
 661          * This is is a zero-fill page fault...
 662          *
 663          * Checking the page lock is a waste of
 664          * time;  this page was absent, so
 665          * it can't be page locked by a pager.
 666          *
 667          * we also consider it undefined
 668          * with respect to instruction
 669          * execution.  i.e. it is the responsibility
 670          * of higher layers to call for an instruction
 671          * sync after changing the contents and before
 672          * sending a program into this area.  We
 673          * choose this approach for performance
 674          */
 675         m->pmapped = TRUE;
 676
 677         m->cs_validated = FALSE;
 678         m->cs_tainted = FALSE;
 679
 680         if (no_zero_fill == TRUE) {
 681                 my_fault = DBG_NZF_PAGE_FAULT;
 682         } else {
 683                 vm_page_zero_fill(m);
 684
 685                 VM_STAT_INCR(zero_fill_count);
 686                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 687         }
 688         assert(!m->laundry);
 689         assert(m->object != kernel_object);
 690         //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
 691
 692         if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
 693                 (m->object->purgable == VM_PURGABLE_DENY ||
 694                  m->object->purgable == VM_PURGABLE_NONVOLATILE ||
 695                  m->object->purgable == VM_PURGABLE_VOLATILE )) {
 696
 697                 vm_page_lockspin_queues();
 698
 699                 assert(!VM_PAGE_WIRED(m));
 700
 701                 VM_PAGE_QUEUES_REMOVE(m);
 702
 703                 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 704                 m->throttled = TRUE;
 705                 vm_page_throttled_count++;
 706
 707                 vm_page_unlock_queues();
 708         } else {
 709                 if (current_thread()->t_page_creation_count > vm_page_creation_throttle) {
 710                         m->zero_fill = TRUE;
 711                         VM_ZF_COUNT_INCR();
 712                 }
 713         }
 714         return (my_fault);
 715 }
 716
 717
 718 /*
 719  *      Routine:        vm_fault_page
 720  *      Purpose:
 721  *              Find the resident page for the virtual memory
 722  *              specified by the given virtual memory object
 723  *              and offset.
 724  *      Additional arguments:
 725  *              The required permissions for the page is given
 726  *              in "fault_type".  Desired permissions are included
 727  *              in "protection".
 728  *              fault_info is passed along to determine pagein cluster
 729  *              limits... it contains the expected reference pattern,
 730  *              cluster size if available, etc...
 731  *
 732  *              If the desired page is known to be resident (for
 733  *              example, because it was previously wired down), asserting
 734  *              the "unwiring" parameter will speed the search.
 735  *
 736  *              If the operation can be interrupted (by thread_abort
 737  *              or thread_terminate), then the "interruptible"
 738  *              parameter should be asserted.
 739  *
 740  *      Results:
 741  *              The page containing the proper data is returned
 742  *              in "result_page".
 743  *
 744  *      In/out conditions:
 745  *              The source object must be locked and referenced,
 746  *              and must donate one paging reference.  The reference
 747  *              is not affected.  The paging reference and lock are
 748  *              consumed.
 749  *
 750  *              If the call succeeds, the object in which "result_page"
 751  *              resides is left locked and holding a paging reference.
 752  *              If this is not the original object, a busy page in the
 753  *              original object is returned in "top_page", to prevent other
 754  *              callers from pursuing this same data, along with a paging
 755  *              reference for the original object.  The "top_page" should
 756  *              be destroyed when this guarantee is no longer required.
 757  *              The "result_page" is also left busy.  It is not removed
 758  *              from the pageout queues.
 759  *      Special Case:
 760  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 761  *              fault succeeded but there's no VM page (i.e. the VM object
 762  *              does not actually hold VM pages, but device memory or
 763  *              large pages).  The object is still locked and we still hold a
 764  *              paging_in_progress reference.
 765  */
 766 unsigned int vm_fault_page_blocked_access = 0;
 767
 768 vm_fault_return_t
 769 vm_fault_page(
 770         /* Arguments: */
 771         vm_object_t     first_object,   /* Object to begin search */
 772         vm_object_offset_t first_offset,        /* Offset into object */
 773         vm_prot_t       fault_type,     /* What access is requested */
 774         boolean_t       must_be_resident,/* Must page be resident? */
 775         /* Modifies in place: */
 776         vm_prot_t       *protection,    /* Protection for mapping */
 777         /* Returns: */
 778         vm_page_t       *result_page,   /* Page found, if successful */
 779         vm_page_t       *top_page,      /* Page in top object, if
 780                                          * not result_page.  */
 781         int             *type_of_fault, /* if non-null, fill in with type of fault
 782                                          * COW, zero-fill, etc... returned in trace point */
 783         /* More arguments: */
 784         kern_return_t   *error_code,    /* code if page is in error */
 785         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 786 #if MACH_PAGEMAP
 787         boolean_t       data_supply,    /* treat as data_supply if
 788                                          * it is a write fault and a full
 789                                          * page is provided */
 790 #else
 791         __unused boolean_t data_supply,
 792 #endif
 793         vm_object_fault_info_t fault_info)
 794 {
 795         vm_page_t               m;
 796         vm_object_t             object;
 797         vm_object_offset_t      offset;
 798         vm_page_t               first_m;
 799         vm_object_t             next_object;
 800         vm_object_t             copy_object;
 801         boolean_t               look_for_page;
 802         vm_prot_t               access_required = fault_type;
 803         vm_prot_t               wants_copy_flag;
 804         CLUSTER_STAT(int pages_at_higher_offsets;)
 805         CLUSTER_STAT(int pages_at_lower_offsets;)
 806         kern_return_t           wait_result;
 807         boolean_t               interruptible_state;
 808         vm_fault_return_t       error;
 809         int                     my_fault;
 810         uint32_t                try_failed_count;
 811         int                     interruptible; /* how may fault be interrupted? */
 812         memory_object_t         pager;
 813         vm_fault_return_t       retval;
 814
 815 /*
 816  * MACH page map - an optional optimization where a bit map is maintained
 817  * by the VM subsystem for internal objects to indicate which pages of
 818  * the object currently reside on backing store.  This existence map
 819  * duplicates information maintained by the vnode pager.  It is
 820  * created at the time of the first pageout against the object, i.e.
 821  * at the same time pager for the object is created.  The optimization
 822  * is designed to eliminate pager interaction overhead, if it is
 823  * 'known' that the page does not exist on backing store.
 824  *
 825  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 826  * either marked as paged out in the existence map for the object or no
 827  * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 828  * criteria in the decision to invoke the pager.   It is also used as one
 829  * of the criteria to terminate the scan for adjacent pages in a clustered
 830  * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 831  * permanent objects.  Note also that if the pager for an internal object
 832  * has not been created, the pager is not invoked regardless of the value
 833  * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 834  * for which a pager has been created.
 835  *
 836  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 837  * is marked as paged out in the existence map for the object.  PAGED_OUT()
 838  * PAGED_OUT() is used to determine if a page has already been pushed
 839  * into a copy object in order to avoid a redundant page out operation.
 840  */
 841 #if MACH_PAGEMAP
 842 #define MUST_ASK_PAGER(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 843                         != VM_EXTERNAL_STATE_ABSENT)
 844 #define PAGED_OUT(o, f) (vm_external_state_get((o)->existence_map, (f)) \
 845                         == VM_EXTERNAL_STATE_EXISTS)
 846 #else
 847 #define MUST_ASK_PAGER(o, f) (TRUE)
 848 #define PAGED_OUT(o, f) (FALSE)
 849 #endif
 850
 851 /*
 852  *      Recovery actions
 853  */
 854 #define RELEASE_PAGE(m)                                 \
 855         MACRO_BEGIN                                     \
 856         PAGE_WAKEUP_DONE(m);                            \
 857         if (!m->active && !m->inactive && !m->throttled) {              \
 858                 vm_page_lockspin_queues();                              \
 859                 if (!m->active && !m->inactive && !m->throttled)        \
 860                         vm_page_activate(m);                            \
 861                 vm_page_unlock_queues();                                \
 862         }                                                               \
 863         MACRO_END
 864
 865 #if TRACEFAULTPAGE
 866         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 867 #endif
 868
 869
 870 #if     MACH_KDB
 871                 /*
 872                  *      If there are watchpoints set, then
 873                  *      we don't want to give away write permission
 874                  *      on a read fault.  Make the task write fault,
 875                  *      so that the watchpoint code notices the access.
 876                  */
 877             if (db_watchpoint_list) {
 878                 /*
 879                  *      If we aren't asking for write permission,
 880                  *      then don't give it away.  We're using write
 881                  *      faults to set the dirty bit.
 882                  */
 883                 if (!(fault_type & VM_PROT_WRITE))
 884                         *protection &= ~VM_PROT_WRITE;
 885         }
 886 #endif  /* MACH_KDB */
 887
 888         interruptible = fault_info->interruptible;
 889         interruptible_state = thread_interrupt_level(interruptible);
 890
 891         /*
 892          *      INVARIANTS (through entire routine):
 893          *
 894          *      1)      At all times, we must either have the object
 895          *              lock or a busy page in some object to prevent
 896          *              some other thread from trying to bring in
 897          *              the same page.
 898          *
 899          *              Note that we cannot hold any locks during the
 900          *              pager access or when waiting for memory, so
 901          *              we use a busy page then.
 902          *
 903          *      2)      To prevent another thread from racing us down the
 904          *              shadow chain and entering a new page in the top
 905          *              object before we do, we must keep a busy page in
 906          *              the top object while following the shadow chain.
 907          *
 908          *      3)      We must increment paging_in_progress on any object
 909          *              for which we have a busy page before dropping
 910          *              the object lock
 911          *
 912          *      4)      We leave busy pages on the pageout queues.
 913          *              If the pageout daemon comes across a busy page,
 914          *              it will remove the page from the pageout queues.
 915          */
 916
 917         object = first_object;
 918         offset = first_offset;
 919         first_m = VM_PAGE_NULL;
 920         access_required = fault_type;
 921
 922
 923         XPR(XPR_VM_FAULT,
 924                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 925                 object, offset, fault_type, *protection, 0);
 926
 927         /*
 928          * default type of fault
 929          */
 930         my_fault = DBG_CACHE_HIT_FAULT;
 931
 932         while (TRUE) {
 933 #if TRACEFAULTPAGE
 934                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 935 #endif
 936                 if (!object->alive) {
 937                         /*
 938                          * object is no longer valid
 939                          * clean up and return error
 940                          */
 941                         vm_fault_cleanup(object, first_m);
 942                         thread_interrupt_level(interruptible_state);
 943
 944                         return (VM_FAULT_MEMORY_ERROR);
 945                 }
 946
 947                 if (!object->pager_created && object->phys_contiguous) {
 948                         /*
 949                          * A physically-contiguous object without a pager:
 950                          * must be a "large page" object.  We do not deal
 951                          * with VM pages for this object.
 952                          */
 953                         m = VM_PAGE_NULL;
 954                         goto phys_contig_object;
 955                 }
 956
 957                 if (object->blocked_access) {
 958                         /*
 959                          * Access to this VM object has been blocked.
 960                          * Replace our "paging_in_progress" reference with
 961                          * a "activity_in_progress" reference and wait for
 962                          * access to be unblocked.
 963                          */
 964                         vm_object_activity_begin(object);
 965                         vm_object_paging_end(object);
 966                         while (object->blocked_access) {
 967                                 vm_object_sleep(object,
 968                                                 VM_OBJECT_EVENT_UNBLOCKED,
 969                                                 THREAD_UNINT);
 970                         }
 971                         vm_fault_page_blocked_access++;
 972                         vm_object_paging_begin(object);
 973                         vm_object_activity_end(object);
 974                 }
 975
 976                 /*
 977                  * See whether the page at 'offset' is resident
 978                  */
 979                 m = vm_page_lookup(object, offset);
 980 #if TRACEFAULTPAGE
 981                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
 982 #endif
 983                 if (m != VM_PAGE_NULL) {
 984
 985                         if (m->busy) {
 986                                 /*
 987                                  * The page is being brought in,
 988                                  * wait for it and then retry.
 989                                  *
 990                                  * A possible optimization: if the page
 991                                  * is known to be resident, we can ignore
 992                                  * pages that are absent (regardless of
 993                                  * whether they're busy).
 994                                  */
 995 #if TRACEFAULTPAGE
 996                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
 997 #endif
 998                                 if (m->list_req_pending) {
 999                                         /*
1000                                          * "list_req_pending" means that the
1001                                          * page has been marked for a page-in
1002                                          * or page-out operation but hasn't been
1003                                          * grabbed yet.
1004                                          * Since whoever marked it
1005                                          * "list_req_pending" might now be
1006                                          * making its way through other layers
1007                                          * of code and possibly blocked on locks
1008                                          * that we might be holding, we can't
1009                                          * just block on a "busy" and
1010                                          * "list_req_pending" page or we might
1011                                          * deadlock with that other thread.
1012                                          *
1013                                          * [ For pages backed by a file on an
1014                                          * HFS volume, we might deadlock with
1015                                          * the HFS truncate lock, for example:
1016                                          * A: starts a pageout or pagein
1017                                          * operation and marks a page "busy",
1018                                          * "list_req_pending" and either
1019                                          * "pageout", "cleaning" or "absent".
1020                                          * A: makes its way through the
1021                                          * memory object (vnode) code.
1022                                          * B: starts from the memory object
1023                                          * side, via a write() on a file, for
1024                                          * example.
1025                                          * B: grabs some filesystem locks.
1026                                          * B: attempts to grab the same page for
1027                                          * its I/O.
1028                                          * B: blocks here because the page is
1029                                          * "busy".
1030                                          * A: attempts to grab the filesystem
1031                                          * lock we're holding.
1032                                          * And we have a deadlock... ]
1033                                          *
1034                                          * Since the page hasn't been claimed
1035                                          * by the other thread yet, it's fair
1036                                          * for us to grab here.
1037                                          */
1038                                         if (m->absent) {
1039                                                 /*
1040                                                  * The page needs to be paged
1041                                                  * in.  We can do it here but we
1042                                                  * need to get rid of "m", the
1043                                                  * place holder page inserted by
1044                                                  * another thread who is also
1045                                                  * trying to page it in.  When
1046                                                  * that thread resumes, it will
1047                                                  * either wait for our page to
1048                                                  * arrive or it will find it
1049                                                  * already there.
1050                                                  */
1051                                                 VM_PAGE_FREE(m);
1052
1053                                                 /*
1054                                                  * Retry the fault.  We'll find
1055                                                  * that the page is not resident
1056                                                  * and initiate a page-in again.
1057                                                  */
1058                                                 continue;
1059                                         }
1060                                         if (m->pageout || m->cleaning) {
1061                                                 /*
1062                                                  * This page has been selected
1063                                                  * for a page-out but we want
1064                                                  * to bring it in.  Let's just
1065                                                  * cancel the page-out...
1066                                                  */
1067                                                 vm_pageout_queue_steal(m, FALSE);
1068                                                 /*
1069                                                  * ... and clear "busy" and
1070                                                  * wake up any waiters...
1071                                                  */
1072                                                 PAGE_WAKEUP_DONE(m);
1073                                                 /*
1074                                                  * ... and continue with the
1075                                                  * "fault" handling.
1076                                                  */
1077                                         }
1078                                 } else {
1079                                         wait_result = PAGE_SLEEP(object, m, interruptible);
1080                                         XPR(XPR_VM_FAULT,
1081                                             "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1082                                                 object, offset,
1083                                                 m, 0, 0);
1084                                         counter(c_vm_fault_page_block_busy_kernel++);
1085
1086                                         if (wait_result != THREAD_AWAKENED) {
1087                                                 vm_fault_cleanup(object, first_m);
1088                                                 thread_interrupt_level(interruptible_state);
1089
1090                                                 if (wait_result == THREAD_RESTART)
1091                                                         return (VM_FAULT_RETRY);
1092                                                 else
1093                                                         return (VM_FAULT_INTERRUPTED);
1094                                         }
1095                                         continue;
1096                                 }
1097                         }
1098
1099                         if (m->phys_page == vm_page_guard_addr) {
1100                                 /*
1101                                  * Guard page: off limits !
1102                                  */
1103                                 if (fault_type == VM_PROT_NONE) {
1104                                         /*
1105                                          * The fault is not requesting any
1106                                          * access to the guard page, so it must
1107                                          * be just to wire or unwire it.
1108                                          * Let's pretend it succeeded...
1109                                          */
1110                                         m->busy = TRUE;
1111                                         *result_page = m;
1112                                         assert(first_m == VM_PAGE_NULL);
1113                                         *top_page = first_m;
1114                                         if (type_of_fault)
1115                                                 *type_of_fault = DBG_GUARD_FAULT;
1116                                         return VM_FAULT_SUCCESS;
1117                                 } else {
1118                                         /*
1119                                          * The fault requests access to the
1120                                          * guard page: let's deny that !
1121                                          */
1122                                         vm_fault_cleanup(object, first_m);
1123                                         thread_interrupt_level(interruptible_state);
1124                                         return VM_FAULT_MEMORY_ERROR;
1125                                 }
1126                         }
1127
1128                         if (m->error) {
1129                                 /*
1130                                  * The page is in error, give up now.
1131                                  */
1132 #if TRACEFAULTPAGE
1133                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1134 #endif
1135                                 if (error_code)
1136                                         *error_code = KERN_MEMORY_ERROR;
1137                                 VM_PAGE_FREE(m);
1138
1139                                 vm_fault_cleanup(object, first_m);
1140                                 thread_interrupt_level(interruptible_state);
1141
1142                                 return (VM_FAULT_MEMORY_ERROR);
1143                         }
1144                         if (m->restart) {
1145                                 /*
1146                                  * The pager wants us to restart
1147                                  * at the top of the chain,
1148                                  * typically because it has moved the
1149                                  * page to another pager, then do so.
1150                                  */
1151 #if TRACEFAULTPAGE
1152                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1153 #endif
1154                                 VM_PAGE_FREE(m);
1155
1156                                 vm_fault_cleanup(object, first_m);
1157                                 thread_interrupt_level(interruptible_state);
1158
1159                                 return (VM_FAULT_RETRY);
1160                         }
1161                         if (m->absent) {
1162                                 /*
1163                                  * The page isn't busy, but is absent,
1164                                  * therefore it's deemed "unavailable".
1165                                  *
1166                                  * Remove the non-existent page (unless it's
1167                                  * in the top object) and move on down to the
1168                                  * next object (if there is one).
1169                                  */
1170 #if TRACEFAULTPAGE
1171                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1172 #endif
1173                                 next_object = object->shadow;
1174
1175                                 if (next_object == VM_OBJECT_NULL) {
1176                                         /*
1177                                          * Absent page at bottom of shadow
1178                                          * chain; zero fill the page we left
1179                                          * busy in the first object, and free
1180                                          * the absent page.
1181                                          */
1182                                         assert(!must_be_resident);
1183
1184                                         /*
1185                                          * check for any conditions that prevent
1186                                          * us from creating a new zero-fill page
1187                                          * vm_fault_check will do all of the
1188                                          * fault cleanup in the case of an error condition
1189                                          * including resetting the thread_interrupt_level
1190                                          */
1191                                         error = vm_fault_check(object, m, first_m, interruptible_state);
1192
1193                                         if (error != VM_FAULT_SUCCESS)
1194                                                 return (error);
1195
1196                                         XPR(XPR_VM_FAULT,
1197                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1198                                                 object, offset,
1199                                                 m,
1200                                                 first_object, 0);
1201
1202                                         if (object != first_object) {
1203                                                 /*
1204                                                  * free the absent page we just found
1205                                                  */
1206                                                 VM_PAGE_FREE(m);
1207
1208                                                 /*
1209                                                  * drop reference and lock on current object
1210                                                  */
1211                                                 vm_object_paging_end(object);
1212                                                 vm_object_unlock(object);
1213
1214                                                 /*
1215                                                  * grab the original page we
1216                                                  * 'soldered' in place and
1217                                                  * retake lock on 'first_object'
1218                                                  */
1219                                                 m = first_m;
1220                                                 first_m = VM_PAGE_NULL;
1221
1222                                                 object = first_object;
1223                                                 offset = first_offset;
1224
1225                                                 vm_object_lock(object);
1226                                         } else {
1227                                                 /*
1228                                                  * we're going to use the absent page we just found
1229                                                  * so convert it to a 'busy' page
1230                                                  */
1231                                                 m->absent = FALSE;
1232                                                 m->busy = TRUE;
1233                                         }
1234                                         /*
1235                                          * zero-fill the page and put it on
1236                                          * the correct paging queue
1237                                          */
1238                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1239
1240                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1241                                                 m->absent = TRUE;
1242
1243                                         break;
1244                                 } else {
1245                                         if (must_be_resident)
1246                                                 vm_object_paging_end(object);
1247                                         else if (object != first_object) {
1248                                                 vm_object_paging_end(object);
1249                                                 VM_PAGE_FREE(m);
1250                                         } else {
1251                                                 first_m = m;
1252                                                 m->absent = FALSE;
1253                                                 m->busy = TRUE;
1254
1255                                                 vm_page_lockspin_queues();
1256                                                 VM_PAGE_QUEUES_REMOVE(m);
1257                                                 vm_page_unlock_queues();
1258                                         }
1259                                         XPR(XPR_VM_FAULT,
1260                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1261                                                 object, offset,
1262                                                 next_object,
1263                                                 offset+object->vo_shadow_offset,0);
1264
1265                                         offset += object->vo_shadow_offset;
1266                                         fault_info->lo_offset += object->vo_shadow_offset;
1267                                         fault_info->hi_offset += object->vo_shadow_offset;
1268                                         access_required = VM_PROT_READ;
1269
1270                                         vm_object_lock(next_object);
1271                                         vm_object_unlock(object);
1272                                         object = next_object;
1273                                         vm_object_paging_begin(object);
1274
1275                                         /*
1276                                          * reset to default type of fault
1277                                          */
1278                                         my_fault = DBG_CACHE_HIT_FAULT;
1279
1280                                         continue;
1281                                 }
1282                         }
1283                         if ((m->cleaning)
1284                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1285                             && (fault_type & VM_PROT_WRITE)) {
1286                                 /*
1287                                  * This is a copy-on-write fault that will
1288                                  * cause us to revoke access to this page, but
1289                                  * this page is in the process of being cleaned
1290                                  * in a clustered pageout. We must wait until
1291                                  * the cleaning operation completes before
1292                                  * revoking access to the original page,
1293                                  * otherwise we might attempt to remove a
1294                                  * wired mapping.
1295                                  */
1296 #if TRACEFAULTPAGE
1297                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1298 #endif
1299                                 XPR(XPR_VM_FAULT,
1300                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1301                                         object, offset,
1302                                         m, 0, 0);
1303                                 /*
1304                                  * take an extra ref so that object won't die
1305                                  */
1306                                 vm_object_reference_locked(object);
1307
1308                                 vm_fault_cleanup(object, first_m);
1309
1310                                 counter(c_vm_fault_page_block_backoff_kernel++);
1311                                 vm_object_lock(object);
1312                                 assert(object->ref_count > 0);
1313
1314                                 m = vm_page_lookup(object, offset);
1315
1316                                 if (m != VM_PAGE_NULL && m->cleaning) {
1317                                         PAGE_ASSERT_WAIT(m, interruptible);
1318
1319                                         vm_object_unlock(object);
1320                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1321                                         vm_object_deallocate(object);
1322
1323                                         goto backoff;
1324                                 } else {
1325                                         vm_object_unlock(object);
1326
1327                                         vm_object_deallocate(object);
1328                                         thread_interrupt_level(interruptible_state);
1329
1330                                         return (VM_FAULT_RETRY);
1331                                 }
1332                         }
1333                         if (type_of_fault == NULL && m->speculative &&
1334                             !(fault_info != NULL && fault_info->stealth)) {
1335                                 /*
1336                                  * If we were passed a non-NULL pointer for
1337                                  * "type_of_fault", than we came from
1338                                  * vm_fault... we'll let it deal with
1339                                  * this condition, since it
1340                                  * needs to see m->speculative to correctly
1341                                  * account the pageins, otherwise...
1342                                  * take it off the speculative queue, we'll
1343                                  * let the caller of vm_fault_page deal
1344                                  * with getting it onto the correct queue
1345                                  *
1346                                  * If the caller specified in fault_info that
1347                                  * it wants a "stealth" fault, we also leave
1348                                  * the page in the speculative queue.
1349                                  */
1350                                 vm_page_lockspin_queues();
1351                                 VM_PAGE_QUEUES_REMOVE(m);
1352                                 vm_page_unlock_queues();
1353                         }
1354
1355                         if (m->encrypted) {
1356                                 /*
1357                                  * ENCRYPTED SWAP:
1358                                  * the user needs access to a page that we
1359                                  * encrypted before paging it out.
1360                                  * Decrypt the page now.
1361                                  * Keep it busy to prevent anyone from
1362                                  * accessing it during the decryption.
1363                                  */
1364                                 m->busy = TRUE;
1365                                 vm_page_decrypt(m, 0);
1366                                 assert(object == m->object);
1367                                 assert(m->busy);
1368                                 PAGE_WAKEUP_DONE(m);
1369
1370                                 /*
1371                                  * Retry from the top, in case
1372                                  * something changed while we were
1373                                  * decrypting.
1374                                  */
1375                                 continue;
1376                         }
1377                         ASSERT_PAGE_DECRYPTED(m);
1378
1379                         if (m->object->code_signed) {
1380                                 /*
1381                                  * CODE SIGNING:
1382                                  * We just paged in a page from a signed
1383                                  * memory object but we don't need to
1384                                  * validate it now.  We'll validate it if
1385                                  * when it gets mapped into a user address
1386                                  * space for the first time or when the page
1387                                  * gets copied to another object as a result
1388                                  * of a copy-on-write.
1389                                  */
1390                         }
1391
1392                         /*
1393                          * We mark the page busy and leave it on
1394                          * the pageout queues.  If the pageout
1395                          * deamon comes across it, then it will
1396                          * remove the page from the queue, but not the object
1397                          */
1398 #if TRACEFAULTPAGE
1399                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1400 #endif
1401                         XPR(XPR_VM_FAULT,
1402                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1403                                 object, offset, m, 0, 0);
1404                         assert(!m->busy);
1405                         assert(!m->absent);
1406
1407                         m->busy = TRUE;
1408                         break;
1409                 }
1410
1411
1412                 /*
1413                  * we get here when there is no page present in the object at
1414                  * the offset we're interested in... we'll allocate a page
1415                  * at this point if the pager associated with
1416                  * this object can provide the data or we're the top object...
1417                  * object is locked;  m == NULL
1418                  */
1419                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset) == TRUE) && !data_supply);
1420
1421 #if TRACEFAULTPAGE
1422                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1423 #endif
1424                 if ((look_for_page || (object == first_object)) && !must_be_resident && !object->phys_contiguous) {
1425                         /*
1426                          * Allocate a new page for this object/offset pair
1427                          */
1428                         m = vm_page_grab();
1429 #if TRACEFAULTPAGE
1430                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1431 #endif
1432                         if (m == VM_PAGE_NULL) {
1433
1434                                 vm_fault_cleanup(object, first_m);
1435                                 thread_interrupt_level(interruptible_state);
1436
1437                                 return (VM_FAULT_MEMORY_SHORTAGE);
1438                         }
1439                         vm_page_insert(m, object, offset);
1440                 }
1441                 if (look_for_page && !must_be_resident) {
1442                         kern_return_t   rc;
1443
1444                         /*
1445                          *      If the memory manager is not ready, we
1446                          *      cannot make requests.
1447                          */
1448                         if (!object->pager_ready) {
1449 #if TRACEFAULTPAGE
1450                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1451 #endif
1452                                 if (m != VM_PAGE_NULL)
1453                                         VM_PAGE_FREE(m);
1454
1455                                 XPR(XPR_VM_FAULT,
1456                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1457                                         object, offset, 0, 0, 0);
1458
1459                                 /*
1460                                  * take an extra ref so object won't die
1461                                  */
1462                                 vm_object_reference_locked(object);
1463                                 vm_fault_cleanup(object, first_m);
1464                                 counter(c_vm_fault_page_block_backoff_kernel++);
1465
1466                                 vm_object_lock(object);
1467                                 assert(object->ref_count > 0);
1468
1469                                 if (!object->pager_ready) {
1470                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1471
1472                                         vm_object_unlock(object);
1473                                         if (wait_result == THREAD_WAITING)
1474                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1475                                         vm_object_deallocate(object);
1476
1477                                         goto backoff;
1478                                 } else {
1479                                         vm_object_unlock(object);
1480                                         vm_object_deallocate(object);
1481                                         thread_interrupt_level(interruptible_state);
1482
1483                                         return (VM_FAULT_RETRY);
1484                                 }
1485                         }
1486                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1487                                 /*
1488                                  * If there are too many outstanding page
1489                                  * requests pending on this external object, we
1490                                  * wait for them to be resolved now.
1491                                  */
1492 #if TRACEFAULTPAGE
1493                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1494 #endif
1495                                 if (m != VM_PAGE_NULL)
1496                                         VM_PAGE_FREE(m);
1497                                 /*
1498                                  * take an extra ref so object won't die
1499                                  */
1500                                 vm_object_reference_locked(object);
1501
1502                                 vm_fault_cleanup(object, first_m);
1503
1504                                 counter(c_vm_fault_page_block_backoff_kernel++);
1505
1506                                 vm_object_lock(object);
1507                                 assert(object->ref_count > 0);
1508
1509                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1510                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1511
1512                                         vm_object_unlock(object);
1513                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1514                                         vm_object_deallocate(object);
1515
1516                                         goto backoff;
1517                                 } else {
1518                                         vm_object_unlock(object);
1519                                         vm_object_deallocate(object);
1520                                         thread_interrupt_level(interruptible_state);
1521
1522                                         return (VM_FAULT_RETRY);
1523                                 }
1524                         }
1525                         if (m != VM_PAGE_NULL) {
1526                                 /*
1527                                  * Indicate that the page is waiting for data
1528                                  * from the memory manager.
1529                                  */
1530                                 m->list_req_pending = TRUE;
1531                                 m->absent = TRUE;
1532                         }
1533
1534 #if TRACEFAULTPAGE
1535                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1536 #endif
1537
1538                         /*
1539                          * It's possible someone called vm_object_destroy while we weren't
1540                          * holding the object lock.  If that has happened, then bail out
1541                          * here.
1542                          */
1543
1544                         pager = object->pager;
1545
1546                         if (pager == MEMORY_OBJECT_NULL) {
1547                                 vm_fault_cleanup(object, first_m);
1548                                 thread_interrupt_level(interruptible_state);
1549                                 return VM_FAULT_MEMORY_ERROR;
1550                         }
1551
1552                         /*
1553                          * We have an absent page in place for the faulting offset,
1554                          * so we can release the object lock.
1555                          */
1556
1557                         vm_object_unlock(object);
1558
1559                         /*
1560                          * If this object uses a copy_call strategy,
1561                          * and we are interested in a copy of this object
1562                          * (having gotten here only by following a
1563                          * shadow chain), then tell the memory manager
1564                          * via a flag added to the desired_access
1565                          * parameter, so that it can detect a race
1566                          * between our walking down the shadow chain
1567                          * and its pushing pages up into a copy of
1568                          * the object that it manages.
1569                          */
1570                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1571                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1572                         else
1573                                 wants_copy_flag = VM_PROT_NONE;
1574
1575                         XPR(XPR_VM_FAULT,
1576                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1577                                 object, offset, m,
1578                                 access_required | wants_copy_flag, 0);
1579
1580                         /*
1581                          * Call the memory manager to retrieve the data.
1582                          */
1583                         rc = memory_object_data_request(
1584                                 pager,
1585                                 offset + object->paging_offset,
1586                                 PAGE_SIZE,
1587                                 access_required | wants_copy_flag,
1588                                 (memory_object_fault_info_t)fault_info);
1589
1590 #if TRACEFAULTPAGE
1591                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1592 #endif
1593                         vm_object_lock(object);
1594
1595                         if (rc != KERN_SUCCESS) {
1596
1597                                 vm_fault_cleanup(object, first_m);
1598                                 thread_interrupt_level(interruptible_state);
1599
1600                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1601                                         VM_FAULT_INTERRUPTED :
1602                                         VM_FAULT_MEMORY_ERROR);
1603                         } else {
1604                                 clock_sec_t     tv_sec;
1605                                 clock_usec_t    tv_usec;
1606
1607                                 clock_get_system_microtime(&tv_sec, &tv_usec);
1608                                 current_thread()->t_page_creation_time = tv_sec;
1609                                 current_thread()->t_page_creation_count = 0;
1610                         }
1611                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1612
1613                                 vm_fault_cleanup(object, first_m);
1614                                 thread_interrupt_level(interruptible_state);
1615
1616                                 return (VM_FAULT_INTERRUPTED);
1617                         }
1618                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1619                                 /*
1620                                  * No page here means that the object we
1621                                  * initially looked up was "physically
1622                                  * contiguous" (i.e. device memory).  However,
1623                                  * with Virtual VRAM, the object might not
1624                                  * be backed by that device memory anymore,
1625                                  * so we're done here only if the object is
1626                                  * still "phys_contiguous".
1627                                  * Otherwise, if the object is no longer
1628                                  * "phys_contiguous", we need to retry the
1629                                  * page fault against the object's new backing
1630                                  * store (different memory object).
1631                                  */
1632                         phys_contig_object:
1633                                 goto done;
1634                         }
1635                         /*
1636                          * potentially a pagein fault
1637                          * if we make it through the state checks
1638                          * above, than we'll count it as such
1639                          */
1640                         my_fault = DBG_PAGEIN_FAULT;
1641
1642                         /*
1643                          * Retry with same object/offset, since new data may
1644                          * be in a different page (i.e., m is meaningless at
1645                          * this point).
1646                          */
1647                         continue;
1648                 }
1649
1650                 /*
1651                  * We get here if the object has no pager, or an existence map
1652                  * exists and indicates the page isn't present on the pager
1653                  * or we're unwiring a page.  If a pager exists, but there
1654                  * is no existence map, then the m->absent case above handles
1655                  * the ZF case when the pager can't provide the page
1656                  */
1657 #if TRACEFAULTPAGE
1658                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1659 #endif
1660                 if (object == first_object)
1661                         first_m = m;
1662                 else
1663                         assert(m == VM_PAGE_NULL);
1664
1665                 XPR(XPR_VM_FAULT,
1666                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1667                         object, offset, m,
1668                         object->shadow, 0);
1669
1670                 next_object = object->shadow;
1671
1672                 if (next_object == VM_OBJECT_NULL) {
1673                         /*
1674                          * we've hit the bottom of the shadown chain,
1675                          * fill the page in the top object with zeros.
1676                          */
1677                         assert(!must_be_resident);
1678
1679                         if (object != first_object) {
1680                                 vm_object_paging_end(object);
1681                                 vm_object_unlock(object);
1682
1683                                 object = first_object;
1684                                 offset = first_offset;
1685                                 vm_object_lock(object);
1686                         }
1687                         m = first_m;
1688                         assert(m->object == object);
1689                         first_m = VM_PAGE_NULL;
1690
1691                         /*
1692                          * check for any conditions that prevent
1693                          * us from creating a new zero-fill page
1694                          * vm_fault_check will do all of the
1695                          * fault cleanup in the case of an error condition
1696                          * including resetting the thread_interrupt_level
1697                          */
1698                         error = vm_fault_check(object, m, first_m, interruptible_state);
1699
1700                         if (error != VM_FAULT_SUCCESS)
1701                                 return (error);
1702
1703                         if (m == VM_PAGE_NULL) {
1704                                 m = vm_page_grab();
1705
1706                                 if (m == VM_PAGE_NULL) {
1707                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1708                                         thread_interrupt_level(interruptible_state);
1709
1710                                         return (VM_FAULT_MEMORY_SHORTAGE);
1711                                 }
1712                                 vm_page_insert(m, object, offset);
1713                         }
1714                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1715
1716                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1717                                 m->absent = TRUE;
1718                         break;
1719
1720                 } else {
1721                         /*
1722                          * Move on to the next object.  Lock the next
1723                          * object before unlocking the current one.
1724                          */
1725                         if ((object != first_object) || must_be_resident)
1726                                 vm_object_paging_end(object);
1727
1728                         offset += object->vo_shadow_offset;
1729                         fault_info->lo_offset += object->vo_shadow_offset;
1730                         fault_info->hi_offset += object->vo_shadow_offset;
1731                         access_required = VM_PROT_READ;
1732
1733                         vm_object_lock(next_object);
1734                         vm_object_unlock(object);
1735
1736                         object = next_object;
1737                         vm_object_paging_begin(object);
1738                 }
1739         }
1740
1741         /*
1742          *      PAGE HAS BEEN FOUND.
1743          *
1744          *      This page (m) is:
1745          *              busy, so that we can play with it;
1746          *              not absent, so that nobody else will fill it;
1747          *              possibly eligible for pageout;
1748          *
1749          *      The top-level page (first_m) is:
1750          *              VM_PAGE_NULL if the page was found in the
1751          *               top-level object;
1752          *              busy, not absent, and ineligible for pageout.
1753          *
1754          *      The current object (object) is locked.  A paging
1755          *      reference is held for the current and top-level
1756          *      objects.
1757          */
1758
1759 #if TRACEFAULTPAGE
1760         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1761 #endif
1762 #if     EXTRA_ASSERTIONS
1763         assert(m->busy && !m->absent);
1764         assert((first_m == VM_PAGE_NULL) ||
1765                (first_m->busy && !first_m->absent &&
1766                 !first_m->active && !first_m->inactive));
1767 #endif  /* EXTRA_ASSERTIONS */
1768
1769         /*
1770          * ENCRYPTED SWAP:
1771          * If we found a page, we must have decrypted it before we
1772          * get here...
1773          */
1774         ASSERT_PAGE_DECRYPTED(m);
1775
1776         XPR(XPR_VM_FAULT,
1777             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1778                 object, offset, m,
1779                 first_object, first_m);
1780
1781         /*
1782          * If the page is being written, but isn't
1783          * already owned by the top-level object,
1784          * we have to copy it into a new page owned
1785          * by the top-level object.
1786          */
1787         if (object != first_object) {
1788
1789 #if TRACEFAULTPAGE
1790                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1791 #endif
1792                 if (fault_type & VM_PROT_WRITE) {
1793                         vm_page_t copy_m;
1794
1795                         /*
1796                          * We only really need to copy if we
1797                          * want to write it.
1798                          */
1799                         assert(!must_be_resident);
1800
1801                         /*
1802                          * are we protecting the system from
1803                          * backing store exhaustion.  If so
1804                          * sleep unless we are privileged.
1805                          */
1806                         if (vm_backing_store_low) {
1807                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1808
1809                                         RELEASE_PAGE(m);
1810                                         vm_fault_cleanup(object, first_m);
1811
1812                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1813
1814                                         thread_block(THREAD_CONTINUE_NULL);
1815                                         thread_interrupt_level(interruptible_state);
1816
1817                                         return (VM_FAULT_RETRY);
1818                                 }
1819                         }
1820                         /*
1821                          * If we try to collapse first_object at this
1822                          * point, we may deadlock when we try to get
1823                          * the lock on an intermediate object (since we
1824                          * have the bottom object locked).  We can't
1825                          * unlock the bottom object, because the page
1826                          * we found may move (by collapse) if we do.
1827                          *
1828                          * Instead, we first copy the page.  Then, when
1829                          * we have no more use for the bottom object,
1830                          * we unlock it and try to collapse.
1831                          *
1832                          * Note that we copy the page even if we didn't
1833                          * need to... that's the breaks.
1834                          */
1835
1836                         /*
1837                          * Allocate a page for the copy
1838                          */
1839                         copy_m = vm_page_grab();
1840
1841                         if (copy_m == VM_PAGE_NULL) {
1842                                 RELEASE_PAGE(m);
1843
1844                                 vm_fault_cleanup(object, first_m);
1845                                 thread_interrupt_level(interruptible_state);
1846
1847                                 return (VM_FAULT_MEMORY_SHORTAGE);
1848                         }
1849                         XPR(XPR_VM_FAULT,
1850                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1851                                 object, offset,
1852                                 m, copy_m, 0);
1853
1854                         vm_page_copy(m, copy_m);
1855
1856                         /*
1857                          * If another map is truly sharing this
1858                          * page with us, we have to flush all
1859                          * uses of the original page, since we
1860                          * can't distinguish those which want the
1861                          * original from those which need the
1862                          * new copy.
1863                          *
1864                          * XXXO If we know that only one map has
1865                          * access to this page, then we could
1866                          * avoid the pmap_disconnect() call.
1867                          */
1868                         if (m->pmapped)
1869                                 pmap_disconnect(m->phys_page);
1870
1871                         assert(!m->cleaning);
1872
1873                         /*
1874                          * We no longer need the old page or object.
1875                          */
1876                         PAGE_WAKEUP_DONE(m);
1877                         vm_object_paging_end(object);
1878                         vm_object_unlock(object);
1879
1880                         my_fault = DBG_COW_FAULT;
1881                         VM_STAT_INCR(cow_faults);
1882                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1883                         current_task()->cow_faults++;
1884
1885                         object = first_object;
1886                         offset = first_offset;
1887
1888                         vm_object_lock(object);
1889                         /*
1890                          * get rid of the place holder
1891                          * page that we soldered in earlier
1892                          */
1893                         VM_PAGE_FREE(first_m);
1894                         first_m = VM_PAGE_NULL;
1895
1896                         /*
1897                          * and replace it with the
1898                          * page we just copied into
1899                          */
1900                         assert(copy_m->busy);
1901                         vm_page_insert(copy_m, object, offset);
1902                         copy_m->dirty = TRUE;
1903
1904                         m = copy_m;
1905                         /*
1906                          * Now that we've gotten the copy out of the
1907                          * way, let's try to collapse the top object.
1908                          * But we have to play ugly games with
1909                          * paging_in_progress to do that...
1910                          */
1911                         vm_object_paging_end(object);
1912                         vm_object_collapse(object, offset, TRUE);
1913                         vm_object_paging_begin(object);
1914
1915                 } else
1916                         *protection &= (~VM_PROT_WRITE);
1917         }
1918         /*
1919          * Now check whether the page needs to be pushed into the
1920          * copy object.  The use of asymmetric copy on write for
1921          * shared temporary objects means that we may do two copies to
1922          * satisfy the fault; one above to get the page from a
1923          * shadowed object, and one here to push it into the copy.
1924          */
1925         try_failed_count = 0;
1926
1927         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
1928                 vm_object_offset_t      copy_offset;
1929                 vm_page_t               copy_m;
1930
1931 #if TRACEFAULTPAGE
1932                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
1933 #endif
1934                 /*
1935                  * If the page is being written, but hasn't been
1936                  * copied to the copy-object, we have to copy it there.
1937                  */
1938                 if ((fault_type & VM_PROT_WRITE) == 0) {
1939                         *protection &= ~VM_PROT_WRITE;
1940                         break;
1941                 }
1942
1943                 /*
1944                  * If the page was guaranteed to be resident,
1945                  * we must have already performed the copy.
1946                  */
1947                 if (must_be_resident)
1948                         break;
1949
1950                 /*
1951                  * Try to get the lock on the copy_object.
1952                  */
1953                 if (!vm_object_lock_try(copy_object)) {
1954
1955                         vm_object_unlock(object);
1956                         try_failed_count++;
1957
1958                         mutex_pause(try_failed_count);  /* wait a bit */
1959                         vm_object_lock(object);
1960
1961                         continue;
1962                 }
1963                 try_failed_count = 0;
1964
1965                 /*
1966                  * Make another reference to the copy-object,
1967                  * to keep it from disappearing during the
1968                  * copy.
1969                  */
1970                 vm_object_reference_locked(copy_object);
1971
1972                 /*
1973                  * Does the page exist in the copy?
1974                  */
1975                 copy_offset = first_offset - copy_object->vo_shadow_offset;
1976
1977                 if (copy_object->vo_size <= copy_offset)
1978                         /*
1979                          * Copy object doesn't cover this page -- do nothing.
1980                          */
1981                         ;
1982                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
1983                         /*
1984                          * Page currently exists in the copy object
1985                          */
1986                         if (copy_m->busy) {
1987                                 /*
1988                                  * If the page is being brought
1989                                  * in, wait for it and then retry.
1990                                  */
1991                                 RELEASE_PAGE(m);
1992
1993                                 /*
1994                                  * take an extra ref so object won't die
1995                                  */
1996                                 vm_object_reference_locked(copy_object);
1997                                 vm_object_unlock(copy_object);
1998                                 vm_fault_cleanup(object, first_m);
1999                                 counter(c_vm_fault_page_block_backoff_kernel++);
2000
2001                                 vm_object_lock(copy_object);
2002                                 assert(copy_object->ref_count > 0);
2003                                 VM_OBJ_RES_DECR(copy_object);
2004                                 vm_object_lock_assert_exclusive(copy_object);
2005                                 copy_object->ref_count--;
2006                                 assert(copy_object->ref_count > 0);
2007                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2008                                 /*
2009                                  * ENCRYPTED SWAP:
2010                                  * it's OK if the "copy_m" page is encrypted,
2011                                  * because we're not moving it nor handling its
2012                                  * contents.
2013                                  */
2014                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2015                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2016
2017                                         vm_object_unlock(copy_object);
2018                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2019                                         vm_object_deallocate(copy_object);
2020
2021                                         goto backoff;
2022                                 } else {
2023                                         vm_object_unlock(copy_object);
2024                                         vm_object_deallocate(copy_object);
2025                                         thread_interrupt_level(interruptible_state);
2026
2027                                         return (VM_FAULT_RETRY);
2028                                 }
2029                         }
2030                 }
2031                 else if (!PAGED_OUT(copy_object, copy_offset)) {
2032                         /*
2033                          * If PAGED_OUT is TRUE, then the page used to exist
2034                          * in the copy-object, and has already been paged out.
2035                          * We don't need to repeat this. If PAGED_OUT is
2036                          * FALSE, then either we don't know (!pager_created,
2037                          * for example) or it hasn't been paged out.
2038                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2039                          * We must copy the page to the copy object.
2040                          */
2041
2042                         if (vm_backing_store_low) {
2043                                 /*
2044                                  * we are protecting the system from
2045                                  * backing store exhaustion.  If so
2046                                  * sleep unless we are privileged.
2047                                  */
2048                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2049                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2050
2051                                         RELEASE_PAGE(m);
2052                                         VM_OBJ_RES_DECR(copy_object);
2053                                         vm_object_lock_assert_exclusive(copy_object);
2054                                         copy_object->ref_count--;
2055                                         assert(copy_object->ref_count > 0);
2056
2057                                         vm_object_unlock(copy_object);
2058                                         vm_fault_cleanup(object, first_m);
2059                                         thread_block(THREAD_CONTINUE_NULL);
2060                                         thread_interrupt_level(interruptible_state);
2061
2062                                         return (VM_FAULT_RETRY);
2063                                 }
2064                         }
2065                         /*
2066                          * Allocate a page for the copy
2067                          */
2068                         copy_m = vm_page_alloc(copy_object, copy_offset);
2069
2070                         if (copy_m == VM_PAGE_NULL) {
2071                                 RELEASE_PAGE(m);
2072
2073                                 VM_OBJ_RES_DECR(copy_object);
2074                                 vm_object_lock_assert_exclusive(copy_object);
2075                                 copy_object->ref_count--;
2076                                 assert(copy_object->ref_count > 0);
2077
2078                                 vm_object_unlock(copy_object);
2079                                 vm_fault_cleanup(object, first_m);
2080                                 thread_interrupt_level(interruptible_state);
2081
2082                                 return (VM_FAULT_MEMORY_SHORTAGE);
2083                         }
2084                         /*
2085                          * Must copy page into copy-object.
2086                          */
2087                         vm_page_copy(m, copy_m);
2088
2089                         /*
2090                          * If the old page was in use by any users
2091                          * of the copy-object, it must be removed
2092                          * from all pmaps.  (We can't know which
2093                          * pmaps use it.)
2094                          */
2095                         if (m->pmapped)
2096                                 pmap_disconnect(m->phys_page);
2097
2098                         /*
2099                          * If there's a pager, then immediately
2100                          * page out this page, using the "initialize"
2101                          * option.  Else, we use the copy.
2102                          */
2103                         if ((!copy_object->pager_created)
2104 #if MACH_PAGEMAP
2105                             || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2106 #endif
2107                             ) {
2108
2109                                 vm_page_lockspin_queues();
2110                                 assert(!m->cleaning);
2111                                 vm_page_activate(copy_m);
2112                                 vm_page_unlock_queues();
2113
2114                                 copy_m->dirty = TRUE;
2115                                 PAGE_WAKEUP_DONE(copy_m);
2116                         }
2117                         else {
2118                                 assert(copy_m->busy == TRUE);
2119                                 assert(!m->cleaning);
2120
2121                                 /*
2122                                  * dirty is protected by the object lock
2123                                  */
2124                                 copy_m->dirty = TRUE;
2125
2126                                 /*
2127                                  * The page is already ready for pageout:
2128                                  * not on pageout queues and busy.
2129                                  * Unlock everything except the
2130                                  * copy_object itself.
2131                                  */
2132                                 vm_object_unlock(object);
2133
2134                                 /*
2135                                  * Write the page to the copy-object,
2136                                  * flushing it from the kernel.
2137                                  */
2138                                 vm_pageout_initialize_page(copy_m);
2139
2140                                 /*
2141                                  * Since the pageout may have
2142                                  * temporarily dropped the
2143                                  * copy_object's lock, we
2144                                  * check whether we'll have
2145                                  * to deallocate the hard way.
2146                                  */
2147                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2148                                         vm_object_unlock(copy_object);
2149                                         vm_object_deallocate(copy_object);
2150                                         vm_object_lock(object);
2151
2152                                         continue;
2153                                 }
2154                                 /*
2155                                  * Pick back up the old object's
2156                                  * lock.  [It is safe to do so,
2157                                  * since it must be deeper in the
2158                                  * object tree.]
2159                                  */
2160                                 vm_object_lock(object);
2161                         }
2162                         /*
2163                          * Because we're pushing a page upward
2164                          * in the object tree, we must restart
2165                          * any faults that are waiting here.
2166                          * [Note that this is an expansion of
2167                          * PAGE_WAKEUP that uses the THREAD_RESTART
2168                          * wait result].  Can't turn off the page's
2169                          * busy bit because we're not done with it.
2170                          */
2171                         if (m->wanted) {
2172                                 m->wanted = FALSE;
2173                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2174                         }
2175                 }
2176                 /*
2177                  * The reference count on copy_object must be
2178                  * at least 2: one for our extra reference,
2179                  * and at least one from the outside world
2180                  * (we checked that when we last locked
2181                  * copy_object).
2182                  */
2183                 vm_object_lock_assert_exclusive(copy_object);
2184                 copy_object->ref_count--;
2185                 assert(copy_object->ref_count > 0);
2186
2187                 VM_OBJ_RES_DECR(copy_object);
2188                 vm_object_unlock(copy_object);
2189
2190                 break;
2191         }
2192
2193 done:
2194         *result_page = m;
2195         *top_page = first_m;
2196
2197         XPR(XPR_VM_FAULT,
2198                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2199                 object, offset, m, first_m, 0);
2200
2201         if (m != VM_PAGE_NULL) {
2202                 retval = VM_FAULT_SUCCESS;
2203                 if (my_fault == DBG_PAGEIN_FAULT) {
2204
2205                         VM_STAT_INCR(pageins);
2206                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2207                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2208                         current_task()->pageins++;
2209
2210                         if (m->object->internal) {
2211                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2212                                 my_fault = DBG_PAGEIND_FAULT;
2213                         } else {
2214                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2215                                 my_fault = DBG_PAGEINV_FAULT;
2216                         }
2217
2218                         /*
2219                          * evaluate access pattern and update state
2220                          * vm_fault_deactivate_behind depends on the
2221                          * state being up to date
2222                          */
2223                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2224
2225                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2226                 }
2227                 if (type_of_fault)
2228                         *type_of_fault = my_fault;
2229         } else {
2230                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2231                 assert(first_m == VM_PAGE_NULL);
2232                 assert(object == first_object);
2233         }
2234
2235         thread_interrupt_level(interruptible_state);
2236
2237 #if TRACEFAULTPAGE
2238         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2239 #endif
2240         return retval;
2241
2242 backoff:
2243         thread_interrupt_level(interruptible_state);
2244
2245         if (wait_result == THREAD_INTERRUPTED)
2246                 return (VM_FAULT_INTERRUPTED);
2247         return (VM_FAULT_RETRY);
2248
2249 #undef  RELEASE_PAGE
2250 }
2251
2252
2253
2254 /*
2255  * CODE SIGNING:
2256  * When soft faulting a page, we have to validate the page if:
2257  * 1. the page is being mapped in user space
2258  * 2. the page hasn't already been found to be "tainted"
2259  * 3. the page belongs to a code-signed object
2260  * 4. the page has not been validated yet or has been mapped for write.
2261  */
2262 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page)                         \
2263         ((pmap) != kernel_pmap /*1*/ &&                                 \
2264          !(page)->cs_tainted /*2*/ &&                                   \
2265          (page)->object->code_signed /*3*/ &&                           \
2266          (!(page)->cs_validated || (page)->wpmapped /*4*/))
2267
2268
2269 /*
2270  * page queue lock must NOT be held
2271  * m->object must be locked
2272  *
2273  * NOTE: m->object could be locked "shared" only if we are called
2274  * from vm_fault() as part of a soft fault.  If so, we must be
2275  * careful not to modify the VM object in any way that is not
2276  * legal under a shared lock...
2277  */
2278 unsigned long cs_enter_tainted_rejected = 0;
2279 unsigned long cs_enter_tainted_accepted = 0;
2280 kern_return_t
2281 vm_fault_enter(vm_page_t m,
2282                pmap_t pmap,
2283                vm_map_offset_t vaddr,
2284                vm_prot_t prot,
2285                vm_prot_t fault_type,
2286                boolean_t wired,
2287                boolean_t change_wiring,
2288                boolean_t no_cache,
2289                boolean_t cs_bypass,
2290                int *type_of_fault)
2291 {
2292         kern_return_t   kr, pe_result;
2293         boolean_t       previously_pmapped = m->pmapped;
2294         boolean_t       must_disconnect = 0;
2295         boolean_t       map_is_switched, map_is_switch_protected;
2296
2297         vm_object_lock_assert_held(m->object);
2298 #if DEBUG
2299         lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2300 #endif /* DEBUG */
2301
2302         if (m->phys_page == vm_page_guard_addr) {
2303                 assert(m->fictitious);
2304                 return KERN_SUCCESS;
2305         }
2306
2307         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2308
2309                 vm_object_lock_assert_exclusive(m->object);
2310
2311         } else if ((fault_type & VM_PROT_WRITE) == 0) {
2312                 /*
2313                  * This is not a "write" fault, so we
2314                  * might not have taken the object lock
2315                  * exclusively and we might not be able
2316                  * to update the "wpmapped" bit in
2317                  * vm_fault_enter().
2318                  * Let's just grant read access to
2319                  * the page for now and we'll
2320                  * soft-fault again if we need write
2321                  * access later...
2322                  */
2323                 prot &= ~VM_PROT_WRITE;
2324         }
2325         if (m->pmapped == FALSE) {
2326
2327                 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2328                         /*
2329                          * found it in the cache, but this
2330                          * is the first fault-in of the page (m->pmapped == FALSE)
2331                          * so it must have come in as part of
2332                          * a cluster... account 1 pagein against it
2333                          */
2334                         VM_STAT_INCR(pageins);
2335                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2336
2337                         if (m->object->internal) {
2338                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2339                                 *type_of_fault = DBG_PAGEIND_FAULT;
2340                         } else {
2341                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2342                                 *type_of_fault = DBG_PAGEINV_FAULT;
2343                         }
2344
2345                         current_task()->pageins++;
2346                 }
2347                 VM_PAGE_CONSUME_CLUSTERED(m);
2348
2349         }
2350
2351         if (*type_of_fault != DBG_COW_FAULT) {
2352                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2353
2354                 if (pmap == kernel_pmap) {
2355                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2356                 }
2357         }
2358
2359         /* Validate code signature if necessary. */
2360         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2361                 vm_object_lock_assert_exclusive(m->object);
2362
2363                 if (m->cs_validated) {
2364                         vm_cs_revalidates++;
2365                 }
2366
2367                 /* VM map is locked, so 1 ref will remain on VM object -
2368                  * so no harm if vm_page_validate_cs drops the object lock */
2369                 vm_page_validate_cs(m);
2370         }
2371
2372 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2373
2374         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2375                            (pmap == vm_map_pmap(current_thread()->map)));
2376         map_is_switch_protected = current_thread()->map->switch_protect;
2377
2378         /* If the map is switched, and is switch-protected, we must protect
2379          * some pages from being write-faulted: immutable pages because by
2380          * definition they may not be written, and executable pages because that
2381          * would provide a way to inject unsigned code.
2382          * If the page is immutable, we can simply return. However, we can't
2383          * immediately determine whether a page is executable anywhere. But,
2384          * we can disconnect it everywhere and remove the executable protection
2385          * from the current map. We do that below right before we do the
2386          * PMAP_ENTER.
2387          */
2388         if(!cs_enforcement_disable && map_is_switched &&
2389            map_is_switch_protected && page_immutable(m, prot) &&
2390            (prot & VM_PROT_WRITE))
2391         {
2392                 return KERN_CODESIGN_ERROR;
2393         }
2394
2395         /* A page could be tainted, or pose a risk of being tainted later.
2396          * Check whether the receiving process wants it, and make it feel
2397          * the consequences (that hapens in cs_invalid_page()).
2398          * For CS Enforcement, two other conditions will
2399          * cause that page to be tainted as well:
2400          * - pmapping an unsigned page executable - this means unsigned code;
2401          * - writeable mapping of a validated page - the content of that page
2402          *   can be changed without the kernel noticing, therefore unsigned
2403          *   code can be created
2404          */
2405         if (m->cs_tainted ||
2406             (( !cs_enforcement_disable && !cs_bypass ) &&
2407              (/* The page is unsigned and wants to be executable */
2408               (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2409               /* The page should be immutable, but is in danger of being modified
2410                 * This is the case where we want policy from the code directory -
2411                 * is the page immutable or not? For now we have to assume that
2412                 * code pages will be immutable, data pages not.
2413                 * We'll assume a page is a code page if it has a code directory
2414                 * and we fault for execution.
2415                 * That is good enough since if we faulted the code page for
2416                 * writing in another map before, it is wpmapped; if we fault
2417                 * it for writing in this map later it will also be faulted for executing
2418                 * at the same time; and if we fault for writing in another map
2419                 * later, we will disconnect it from this pmap so we'll notice
2420                 * the change.
2421                 */
2422               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2423               ))
2424                 )
2425         {
2426                 /* We will have a tainted page. Have to handle the special case
2427                  * of a switched map now. If the map is not switched, standard
2428                  * procedure applies - call cs_invalid_page().
2429                  * If the map is switched, the real owner is invalid already.
2430                  * There is no point in invalidating the switching process since
2431                  * it will not be executing from the map. So we don't call
2432                  * cs_invalid_page() in that case. */
2433                 boolean_t reject_page;
2434                 if(map_is_switched) {
2435                         assert(pmap==vm_map_pmap(current_thread()->map));
2436                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2437                         reject_page = FALSE;
2438                 } else {
2439                         reject_page = cs_invalid_page((addr64_t) vaddr);
2440                 }
2441
2442                 if (reject_page) {
2443                         /* reject the tainted page: abort the page fault */
2444                         kr = KERN_CODESIGN_ERROR;
2445                         cs_enter_tainted_rejected++;
2446                 } else {
2447                         /* proceed with the tainted page */
2448                         kr = KERN_SUCCESS;
2449                         /* Page might have been tainted before or not; now it
2450                          * definitively is. If the page wasn't tainted, we must
2451                          * disconnect it from all pmaps later. */
2452                         must_disconnect = !m->cs_tainted;
2453                         m->cs_tainted = TRUE;
2454                         cs_enter_tainted_accepted++;
2455                 }
2456                 if (cs_debug || kr != KERN_SUCCESS) {
2457                         printf("CODESIGNING: vm_fault_enter(0x%llx): "
2458                                "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2459                                (long long)vaddr, m, m->object, m->offset);
2460                 }
2461
2462         } else {
2463                 /* proceed with the valid page */
2464                 kr = KERN_SUCCESS;
2465         }
2466
2467         /* If we have a KERN_SUCCESS from the previous checks, we either have
2468          * a good page, or a tainted page that has been accepted by the process.
2469          * In both cases the page will be entered into the pmap.
2470          * If the page is writeable, we need to disconnect it from other pmaps
2471          * now so those processes can take note.
2472          */
2473         if (kr == KERN_SUCCESS) {
2474                 /*
2475                  * NOTE: we may only hold the vm_object lock SHARED
2476                  * at this point, but the update of pmapped is ok
2477                  * since this is the ONLY bit updated behind the SHARED
2478                  * lock... however, we need to figure out how to do an atomic
2479                  * update on a bit field to make this less fragile... right
2480                  * now I don't know how to coerce 'C' to give me the offset info
2481                  * that's needed for an AtomicCompareAndSwap
2482                  */
2483                 m->pmapped = TRUE;
2484                 if(vm_page_is_slideable(m)) {
2485                         boolean_t was_busy = m->busy;
2486                         m->busy = TRUE;
2487                         kr = vm_page_slide(m, 0);
2488                         assert(m->busy);
2489                         if(!was_busy) {
2490                                 PAGE_WAKEUP_DONE(m);
2491                         }
2492                         if (kr != KERN_SUCCESS) {
2493                                 /*
2494                                  * This page has not been slid correctly,
2495                                  * do not do the pmap_enter() !
2496                                  * Let vm_fault_enter() return the error
2497                                  * so the caller can fail the fault.
2498                                  */
2499                                 goto after_the_pmap_enter;
2500                         }
2501                 }
2502
2503                 if (fault_type & VM_PROT_WRITE) {
2504
2505                         if (m->wpmapped == FALSE) {
2506                                 vm_object_lock_assert_exclusive(m->object);
2507
2508                                 m->wpmapped = TRUE;
2509                         }
2510                         if (must_disconnect) {
2511                                 /*
2512                                  * We can only get here
2513                                  * because of the CSE logic
2514                                  */
2515                                 assert(cs_enforcement_disable == FALSE);
2516                                 pmap_disconnect(m->phys_page);
2517                                 /*
2518                                  * If we are faulting for a write, we can clear
2519                                  * the execute bit - that will ensure the page is
2520                                  * checked again before being executable, which
2521                                  * protects against a map switch.
2522                                  * This only happens the first time the page
2523                                  * gets tainted, so we won't get stuck here
2524                                  * to make an already writeable page executable.
2525                                  */
2526                                 if (!cs_bypass){
2527                                         prot &= ~VM_PROT_EXECUTE;
2528                                 }
2529                         }
2530                 }
2531
2532                 /* Prevent a deadlock by not
2533                  * holding the object lock if we need to wait for a page in
2534                  * pmap_enter() - <rdar://problem/7138958> */
2535                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, 0,
2536                                   wired, PMAP_OPTIONS_NOWAIT, pe_result);
2537
2538                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
2539                         /* The nonblocking version of pmap_enter did not succeed.
2540                          * Use the blocking version instead. Requires marking
2541                          * the page busy and unlocking the object */
2542                         boolean_t was_busy = m->busy;
2543                         m->busy = TRUE;
2544                         vm_object_unlock(m->object);
2545
2546                         PMAP_ENTER(pmap, vaddr, m, prot, 0, wired);
2547
2548                         /* Take the object lock again. */
2549                         vm_object_lock(m->object);
2550
2551                         /* If the page was busy, someone else will wake it up.
2552                          * Otherwise, we have to do it now. */
2553                         assert(m->busy);
2554                         if(!was_busy) {
2555                                 PAGE_WAKEUP_DONE(m);
2556                         }
2557                         vm_pmap_enter_blocked++;
2558                 }
2559         }
2560
2561 after_the_pmap_enter:
2562         /*
2563          * Hold queues lock to manipulate
2564          * the page queues.  Change wiring
2565          * case is obvious.
2566          */
2567         if (change_wiring) {
2568                 vm_page_lockspin_queues();
2569
2570                 if (wired) {
2571                         if (kr == KERN_SUCCESS) {
2572                                 vm_page_wire(m);
2573                         }
2574                 } else {
2575                         vm_page_unwire(m, TRUE);
2576                 }
2577                 vm_page_unlock_queues();
2578
2579         } else {
2580                 if (kr != KERN_SUCCESS) {
2581                         vm_page_lockspin_queues();
2582                         vm_page_deactivate(m);
2583                         vm_page_unlock_queues();
2584                 } else {
2585                         if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m) && !m->throttled) {
2586
2587                                 if ( vm_page_local_q && !no_cache && (*type_of_fault == DBG_COW_FAULT || *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2588                                         struct vpl      *lq;
2589                                         uint32_t        lid;
2590
2591                                         /*
2592                                          * we got a local queue to stuff this new page on...
2593                                          * its safe to manipulate local and local_id at this point
2594                                          * since we're behind an exclusive object lock and the
2595                                          * page is not on any global queue.
2596                                          *
2597                                          * we'll use the current cpu number to select the queue
2598                                          * note that we don't need to disable preemption... we're
2599                                          * going to behind the local queue's lock to do the real
2600                                          * work
2601                                          */
2602                                         lid = cpu_number();
2603
2604                                         lq = &vm_page_local_q[lid].vpl_un.vpl;
2605
2606                                         VPL_LOCK(&lq->vpl_lock);
2607
2608                                         queue_enter(&lq->vpl_queue, m, vm_page_t, pageq);
2609                                         m->local = TRUE;
2610                                         m->local_id = lid;
2611                                         lq->vpl_count++;
2612
2613                                         VPL_UNLOCK(&lq->vpl_lock);
2614
2615                                         if (lq->vpl_count > vm_page_local_q_soft_limit) {
2616                                                 /*
2617                                                  * we're beyond the soft limit for the local queue
2618                                                  * vm_page_reactivate_local will 'try' to take
2619                                                  * the global page queue lock... if it can't that's
2620                                                  * ok... we'll let the queue continue to grow up
2621                                                  * to the hard limit... at that point we'll wait
2622                                                  * for the lock... once we've got the lock, we'll
2623                                                  * transfer all of the pages from the local queue
2624                                                  * to the global active queue
2625                                                  */
2626                                                 vm_page_reactivate_local(lid, FALSE, FALSE);
2627                                         }
2628                                         return kr;
2629                                 }
2630
2631                                 vm_page_lockspin_queues();
2632                                 /*
2633                                  * test again now that we hold the page queue lock
2634                                  */
2635                                 if (((!m->active && !m->inactive) || no_cache) && !VM_PAGE_WIRED(m)) {
2636
2637                                         /*
2638                                          * If this is a no_cache mapping and the page has never been
2639                                          * mapped before or was previously a no_cache page, then we
2640                                          * want to leave pages in the speculative state so that they
2641                                          * can be readily recycled if free memory runs low.  Otherwise
2642                                          * the page is activated as normal.
2643                                          */
2644
2645                                         if (no_cache && (!previously_pmapped || m->no_cache)) {
2646                                                 m->no_cache = TRUE;
2647
2648                                                 if (!m->speculative)
2649                                                         vm_page_speculate(m, FALSE);
2650
2651                                         } else if (!m->active && !m->inactive)
2652                                                 vm_page_activate(m);
2653
2654                                 }
2655
2656                                 vm_page_unlock_queues();
2657                         }
2658                 }
2659         }
2660         return kr;
2661 }
2662
2663
2664 /*
2665  *      Routine:        vm_fault
2666  *      Purpose:
2667  *              Handle page faults, including pseudo-faults
2668  *              used to change the wiring status of pages.
2669  *      Returns:
2670  *              Explicit continuations have been removed.
2671  *      Implementation:
2672  *              vm_fault and vm_fault_page save mucho state
2673  *              in the moral equivalent of a closure.  The state
2674  *              structure is allocated when first entering vm_fault
2675  *              and deallocated when leaving vm_fault.
2676  */
2677
2678 extern int _map_enter_debug;
2679
2680 unsigned long vm_fault_collapse_total = 0;
2681 unsigned long vm_fault_collapse_skipped = 0;
2682
2683 kern_return_t
2684 vm_fault(
2685         vm_map_t        map,
2686         vm_map_offset_t vaddr,
2687         vm_prot_t       fault_type,
2688         boolean_t       change_wiring,
2689         int             interruptible,
2690         pmap_t          caller_pmap,
2691         vm_map_offset_t caller_pmap_addr)
2692 {
2693         vm_map_version_t        version;        /* Map version for verificiation */
2694         boolean_t               wired;          /* Should mapping be wired down? */
2695         vm_object_t             object;         /* Top-level object */
2696         vm_object_offset_t      offset;         /* Top-level offset */
2697         vm_prot_t               prot;           /* Protection for mapping */
2698         vm_object_t             old_copy_object; /* Saved copy object */
2699         vm_page_t               result_page;    /* Result of vm_fault_page */
2700         vm_page_t               top_page;       /* Placeholder page */
2701         kern_return_t           kr;
2702
2703         vm_page_t               m;      /* Fast access to result_page */
2704         kern_return_t           error_code;
2705         vm_object_t             cur_object;
2706         vm_object_offset_t      cur_offset;
2707         vm_page_t               cur_m;
2708         vm_object_t             new_object;
2709         int                     type_of_fault;
2710         pmap_t                  pmap;
2711         boolean_t               interruptible_state;
2712         vm_map_t                real_map = map;
2713         vm_map_t                original_map = map;
2714         vm_prot_t               original_fault_type;
2715         struct vm_object_fault_info fault_info;
2716         boolean_t               need_collapse = FALSE;
2717         int                     object_lock_type = 0;
2718         int                     cur_object_lock_type;
2719         vm_object_t             top_object = VM_OBJECT_NULL;
2720         int                     throttle_delay;
2721
2722
2723         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
2724                               (int)((uint64_t)vaddr >> 32),
2725                               (int)vaddr,
2726                               (map == kernel_map),
2727                               0,
2728                               0);
2729
2730         if (get_preemption_level() != 0) {
2731                 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
2732                                       (int)((uint64_t)vaddr >> 32),
2733                                       (int)vaddr,
2734                                       KERN_FAILURE,
2735                                       0,
2736                                       0);
2737
2738                 return (KERN_FAILURE);
2739         }
2740
2741         interruptible_state = thread_interrupt_level(interruptible);
2742
2743         VM_STAT_INCR(faults);
2744         current_task()->faults++;
2745         original_fault_type = fault_type;
2746
2747         if (fault_type & VM_PROT_WRITE)
2748                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2749         else
2750                 object_lock_type = OBJECT_LOCK_SHARED;
2751
2752         cur_object_lock_type = OBJECT_LOCK_SHARED;
2753
2754 RetryFault:
2755         /*
2756          * assume we will hit a page in the cache
2757          * otherwise, explicitly override with
2758          * the real fault type once we determine it
2759          */
2760         type_of_fault = DBG_CACHE_HIT_FAULT;
2761
2762         /*
2763          *      Find the backing store object and offset into
2764          *      it to begin the search.
2765          */
2766         fault_type = original_fault_type;
2767         map = original_map;
2768         vm_map_lock_read(map);
2769
2770         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
2771                                   object_lock_type, &version,
2772                                   &object, &offset, &prot, &wired,
2773                                   &fault_info,
2774                                   &real_map);
2775
2776         if (kr != KERN_SUCCESS) {
2777                 vm_map_unlock_read(map);
2778                 goto done;
2779         }
2780         pmap = real_map->pmap;
2781         fault_info.interruptible = interruptible;
2782         fault_info.stealth = FALSE;
2783         fault_info.io_sync = FALSE;
2784         fault_info.mark_zf_absent = FALSE;
2785
2786         /*
2787          * If the page is wired, we must fault for the current protection
2788          * value, to avoid further faults.
2789          */
2790         if (wired) {
2791                 fault_type = prot | VM_PROT_WRITE;
2792                 /*
2793                  * since we're treating this fault as a 'write'
2794                  * we must hold the top object lock exclusively
2795                  */
2796                 if (object_lock_type == OBJECT_LOCK_SHARED) {
2797
2798                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2799
2800                         if (vm_object_lock_upgrade(object) == FALSE) {
2801                                 /*
2802                                  * couldn't upgrade, so explictly
2803                                  * take the lock exclusively
2804                                  */
2805                                 vm_object_lock(object);
2806                         }
2807                 }
2808         }
2809
2810 #if     VM_FAULT_CLASSIFY
2811         /*
2812          *      Temporary data gathering code
2813          */
2814         vm_fault_classify(object, offset, fault_type);
2815 #endif
2816         /*
2817          *      Fast fault code.  The basic idea is to do as much as
2818          *      possible while holding the map lock and object locks.
2819          *      Busy pages are not used until the object lock has to
2820          *      be dropped to do something (copy, zero fill, pmap enter).
2821          *      Similarly, paging references aren't acquired until that
2822          *      point, and object references aren't used.
2823          *
2824          *      If we can figure out what to do
2825          *      (zero fill, copy on write, pmap enter) while holding
2826          *      the locks, then it gets done.  Otherwise, we give up,
2827          *      and use the original fault path (which doesn't hold
2828          *      the map lock, and relies on busy pages).
2829          *      The give up cases include:
2830          *              - Have to talk to pager.
2831          *              - Page is busy, absent or in error.
2832          *              - Pager has locked out desired access.
2833          *              - Fault needs to be restarted.
2834          *              - Have to push page into copy object.
2835          *
2836          *      The code is an infinite loop that moves one level down
2837          *      the shadow chain each time.  cur_object and cur_offset
2838          *      refer to the current object being examined. object and offset
2839          *      are the original object from the map.  The loop is at the
2840          *      top level if and only if object and cur_object are the same.
2841          *
2842          *      Invariants:  Map lock is held throughout.  Lock is held on
2843          *              original object and cur_object (if different) when
2844          *              continuing or exiting loop.
2845          *
2846          */
2847
2848
2849         /*
2850          * If this page is to be inserted in a copy delay object
2851          * for writing, and if the object has a copy, then the
2852          * copy delay strategy is implemented in the slow fault page.
2853          */
2854         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
2855             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
2856                 goto handle_copy_delay;
2857
2858         cur_object = object;
2859         cur_offset = offset;
2860
2861         while (TRUE) {
2862                 if (!cur_object->pager_created &&
2863                     cur_object->phys_contiguous) /* superpage */
2864                         break;
2865
2866                 if (cur_object->blocked_access) {
2867                         /*
2868                          * Access to this VM object has been blocked.
2869                          * Let the slow path handle it.
2870                          */
2871                         break;
2872                 }
2873
2874                 m = vm_page_lookup(cur_object, cur_offset);
2875
2876                 if (m != VM_PAGE_NULL) {
2877                         if (m->busy) {
2878                                 wait_result_t   result;
2879
2880                                 /*
2881                                  * in order to do the PAGE_ASSERT_WAIT, we must
2882                                  * have object that 'm' belongs to locked exclusively
2883                                  */
2884                                 if (object != cur_object) {
2885                                         vm_object_unlock(object);
2886
2887                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2888
2889                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2890
2891                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2892                                                         /*
2893                                                          * couldn't upgrade so go do a full retry
2894                                                          * immediately since we've already dropped
2895                                                          * the top object lock associated with this page
2896                                                          * and the current one got dropped due to the
2897                                                          * failed upgrade... the state is no longer valid
2898                                                          */
2899                                                         vm_map_unlock_read(map);
2900                                                         if (real_map != map)
2901                                                                 vm_map_unlock(real_map);
2902
2903                                                         goto RetryFault;
2904                                                 }
2905                                         }
2906                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
2907
2908                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2909
2910                                         if (vm_object_lock_upgrade(object) == FALSE) {
2911                                                 /*
2912                                                  * couldn't upgrade, so explictly take the lock
2913                                                  * exclusively and go relookup the page since we
2914                                                  * will have dropped the object lock and
2915                                                  * a different thread could have inserted
2916                                                  * a page at this offset
2917                                                  * no need for a full retry since we're
2918                                                  * at the top level of the object chain
2919                                                  */
2920                                                 vm_object_lock(object);
2921
2922                                                 continue;
2923                                         }
2924                                 }
2925                                 vm_map_unlock_read(map);
2926                                 if (real_map != map)
2927                                         vm_map_unlock(real_map);
2928
2929                                 result = PAGE_ASSERT_WAIT(m, interruptible);
2930
2931                                 vm_object_unlock(cur_object);
2932
2933                                 if (result == THREAD_WAITING) {
2934                                         result = thread_block(THREAD_CONTINUE_NULL);
2935
2936                                         counter(c_vm_fault_page_block_busy_kernel++);
2937                                 }
2938                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
2939                                         goto RetryFault;
2940
2941                                 kr = KERN_ABORTED;
2942                                 goto done;
2943                         }
2944                         if (m->phys_page == vm_page_guard_addr) {
2945                                 /*
2946                                  * Guard page: let the slow path deal with it
2947                                  */
2948                                 break;
2949                         }
2950                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
2951                                 /*
2952                                  * Unusual case... let the slow path deal with it
2953                                  */
2954                                 break;
2955                         }
2956                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
2957                                 if (object != cur_object)
2958                                         vm_object_unlock(object);
2959                                 vm_map_unlock_read(map);
2960                                 if (real_map != map)
2961                                         vm_map_unlock(real_map);
2962                                 vm_object_unlock(cur_object);
2963                                 kr = KERN_MEMORY_ERROR;
2964                                 goto done;
2965                         }
2966
2967                         if (m->encrypted) {
2968                                 /*
2969                                  * ENCRYPTED SWAP:
2970                                  * We've soft-faulted (because it's not in the page
2971                                  * table) on an encrypted page.
2972                                  * Keep the page "busy" so that no one messes with
2973                                  * it during the decryption.
2974                                  * Release the extra locks we're holding, keep only
2975                                  * the page's VM object lock.
2976                                  *
2977                                  * in order to set 'busy' on 'm', we must
2978                                  * have object that 'm' belongs to locked exclusively
2979                                  */
2980                                 if (object != cur_object) {
2981                                         vm_object_unlock(object);
2982
2983                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
2984
2985                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2986
2987                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
2988                                                         /*
2989                                                          * couldn't upgrade so go do a full retry
2990                                                          * immediately since we've already dropped
2991                                                          * the top object lock associated with this page
2992                                                          * and the current one got dropped due to the
2993                                                          * failed upgrade... the state is no longer valid
2994                                                          */
2995                                                         vm_map_unlock_read(map);
2996                                                         if (real_map != map)
2997                                                                 vm_map_unlock(real_map);
2998
2999                                                         goto RetryFault;
3000                                                 }
3001                                         }
3002                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3003
3004                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3005
3006                                         if (vm_object_lock_upgrade(object) == FALSE) {
3007                                                 /*
3008                                                  * couldn't upgrade, so explictly take the lock
3009                                                  * exclusively and go relookup the page since we
3010                                                  * will have dropped the object lock and
3011                                                  * a different thread could have inserted
3012                                                  * a page at this offset
3013                                                  * no need for a full retry since we're
3014                                                  * at the top level of the object chain
3015                                                  */
3016                                                 vm_object_lock(object);
3017
3018                                                 continue;
3019                                         }
3020                                 }
3021                                 m->busy = TRUE;
3022
3023                                 vm_map_unlock_read(map);
3024                                 if (real_map != map)
3025                                         vm_map_unlock(real_map);
3026
3027                                 vm_page_decrypt(m, 0);
3028
3029                                 assert(m->busy);
3030                                 PAGE_WAKEUP_DONE(m);
3031
3032                                 vm_object_unlock(cur_object);
3033                                 /*
3034                                  * Retry from the top, in case anything
3035                                  * changed while we were decrypting...
3036                                  */
3037                                 goto RetryFault;
3038                         }
3039                         ASSERT_PAGE_DECRYPTED(m);
3040
3041                         if(vm_page_is_slideable(m)) {
3042                                 /*
3043                                  * We might need to slide this page, and so,
3044                                  * we want to hold the VM object exclusively.
3045                                  */
3046                                 if (object != cur_object) {
3047                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3048                                                 vm_object_unlock(object);
3049                                                 vm_object_unlock(cur_object);
3050
3051                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3052
3053                                                 vm_map_unlock_read(map);
3054                                                 if (real_map != map)
3055                                                         vm_map_unlock(real_map);
3056
3057                                                 goto RetryFault;
3058                                         }
3059                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3060
3061                                         vm_object_unlock(object);
3062                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3063                                         vm_map_unlock_read(map);
3064                                         goto RetryFault;
3065                                 }
3066                         }
3067
3068                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3069 upgrade_for_validation:
3070                                 /*
3071                                  * We might need to validate this page
3072                                  * against its code signature, so we
3073                                  * want to hold the VM object exclusively.
3074                                  */
3075                                 if (object != cur_object) {
3076                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3077                                                 vm_object_unlock(object);
3078                                                 vm_object_unlock(cur_object);
3079
3080                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3081
3082                                                 vm_map_unlock_read(map);
3083                                                 if (real_map != map)
3084                                                         vm_map_unlock(real_map);
3085
3086                                                 goto RetryFault;
3087                                         }
3088
3089                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3090
3091                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3092
3093                                         if (vm_object_lock_upgrade(object) == FALSE) {
3094                                                 /*
3095                                                  * couldn't upgrade, so explictly take the lock
3096                                                  * exclusively and go relookup the page since we
3097                                                  * will have dropped the object lock and
3098                                                  * a different thread could have inserted
3099                                                  * a page at this offset
3100                                                  * no need for a full retry since we're
3101                                                  * at the top level of the object chain
3102                                                  */
3103                                                 vm_object_lock(object);
3104
3105                                                 continue;
3106                                         }
3107                                 }
3108                         }
3109                         /*
3110                          *      Two cases of map in faults:
3111                          *          - At top level w/o copy object.
3112                          *          - Read fault anywhere.
3113                          *              --> must disallow write.
3114                          */
3115
3116                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3117
3118                                 goto FastPmapEnter;
3119                         }
3120
3121                         if ((fault_type & VM_PROT_WRITE) == 0) {
3122
3123                                 if (object != cur_object) {
3124                                         /*
3125                                          * We still need to hold the top object
3126                                          * lock here to prevent a race between
3127                                          * a read fault (taking only "shared"
3128                                          * locks) and a write fault (taking
3129                                          * an "exclusive" lock on the top
3130                                          * object.
3131                                          * Otherwise, as soon as we release the
3132                                          * top lock, the write fault could
3133                                          * proceed and actually complete before
3134                                          * the read fault, and the copied page's
3135                                          * translation could then be overwritten
3136                                          * by the read fault's translation for
3137                                          * the original page.
3138                                          *
3139                                          * Let's just record what the top object
3140                                          * is and we'll release it later.
3141                                          */
3142                                         top_object = object;
3143
3144                                         /*
3145                                          * switch to the object that has the new page
3146                                          */
3147                                         object = cur_object;
3148                                         object_lock_type = cur_object_lock_type;
3149                                 }
3150 FastPmapEnter:
3151                                 /*
3152                                  * prepare for the pmap_enter...
3153                                  * object and map are both locked
3154                                  * m contains valid data
3155                                  * object == m->object
3156                                  * cur_object == NULL or it's been unlocked
3157                                  * no paging references on either object or cur_object
3158                                  */
3159                                 if (caller_pmap) {
3160                                         kr = vm_fault_enter(m,
3161                                                             caller_pmap,
3162                                                             caller_pmap_addr,
3163                                                             prot,
3164                                                             fault_type,
3165                                                             wired,
3166                                                             change_wiring,
3167                                                             fault_info.no_cache,
3168                                                             fault_info.cs_bypass,
3169                                                             &type_of_fault);
3170                                 } else {
3171                                         kr = vm_fault_enter(m,
3172                                                             pmap,
3173                                                             vaddr,
3174                                                             prot,
3175                                                             fault_type,
3176                                                             wired,
3177                                                             change_wiring,
3178                                                             fault_info.no_cache,
3179                                                             fault_info.cs_bypass,
3180                                                             &type_of_fault);
3181                                 }
3182
3183                                 if (top_object != VM_OBJECT_NULL) {
3184                                         /*
3185                                          * It's safe to drop the top object
3186                                          * now that we've done our
3187                                          * vm_fault_enter().  Any other fault
3188                                          * in progress for that virtual
3189                                          * address will either find our page
3190                                          * and translation or put in a new page
3191                                          * and translation.
3192                                          */
3193                                         vm_object_unlock(top_object);
3194                                         top_object = VM_OBJECT_NULL;
3195                                 }
3196
3197                                 if (need_collapse == TRUE)
3198                                         vm_object_collapse(object, offset, TRUE);
3199
3200                                 if (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT) {
3201                                         /*
3202                                          * evaluate access pattern and update state
3203                                          * vm_fault_deactivate_behind depends on the
3204                                          * state being up to date
3205                                          */
3206                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3207
3208                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3209                                 }
3210                                 /*
3211                                  * That's it, clean up and return.
3212                                  */
3213                                 if (m->busy)
3214                                         PAGE_WAKEUP_DONE(m);
3215
3216                                 vm_object_unlock(object);
3217
3218                                 vm_map_unlock_read(map);
3219                                 if (real_map != map)
3220                                         vm_map_unlock(real_map);
3221
3222                                 goto done;
3223                         }
3224                         /*
3225                          * COPY ON WRITE FAULT
3226                          */
3227                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3228
3229                         if ((throttle_delay = vm_page_throttled())) {
3230                                 /*
3231                                  * drop all of our locks...
3232                                  * wait until the free queue is
3233                                  * pumped back up and then
3234                                  * redrive the fault
3235                                  */
3236                                 if (object != cur_object)
3237                                         vm_object_unlock(cur_object);
3238                                 vm_object_unlock(object);
3239                                 vm_map_unlock_read(map);
3240                                 if (real_map != map)
3241                                         vm_map_unlock(real_map);
3242
3243                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3244
3245                                 delay(throttle_delay);
3246
3247                                 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3248                                                  THREAD_UNINT :
3249                                                  THREAD_ABORTSAFE))
3250                                         goto RetryFault;
3251                                 kr = KERN_ABORTED;
3252                                 goto done;
3253                         }
3254                         /*
3255                          * If objects match, then
3256                          * object->copy must not be NULL (else control
3257                          * would be in previous code block), and we
3258                          * have a potential push into the copy object
3259                          * with which we can't cope with here.
3260                          */
3261                         if (cur_object == object) {
3262                                 /*
3263                                  * must take the slow path to
3264                                  * deal with the copy push
3265                                  */
3266                                 break;
3267                         }
3268
3269                         /*
3270                          * This is now a shadow based copy on write
3271                          * fault -- it requires a copy up the shadow
3272                          * chain.
3273                          */
3274
3275                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3276                             VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3277                                 goto upgrade_for_validation;
3278                         }
3279
3280                         /*
3281                          * Allocate a page in the original top level
3282                          * object. Give up if allocate fails.  Also
3283                          * need to remember current page, as it's the
3284                          * source of the copy.
3285                          *
3286                          * at this point we hold locks on both
3287                          * object and cur_object... no need to take
3288                          * paging refs or mark pages BUSY since
3289                          * we don't drop either object lock until
3290                          * the page has been copied and inserted
3291                          */
3292                         cur_m = m;
3293                         m = vm_page_grab();
3294
3295                         if (m == VM_PAGE_NULL) {
3296                                 /*
3297                                  * no free page currently available...
3298                                  * must take the slow path
3299                                  */
3300                                 break;
3301                         }
3302                         /*
3303                          * Now do the copy.  Mark the source page busy...
3304                          *
3305                          *      NOTE: This code holds the map lock across
3306                          *      the page copy.
3307                          */
3308                         vm_page_copy(cur_m, m);
3309                         vm_page_insert(m, object, offset);
3310                         m->dirty = TRUE;
3311
3312                         /*
3313                          * Now cope with the source page and object
3314                          */
3315                         if (object->ref_count > 1 && cur_m->pmapped)
3316                                 pmap_disconnect(cur_m->phys_page);
3317
3318                         need_collapse = TRUE;
3319
3320                         if (!cur_object->internal &&
3321                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3322                                 /*
3323                                  * The object from which we've just
3324                                  * copied a page is most probably backed
3325                                  * by a vnode.  We don't want to waste too
3326                                  * much time trying to collapse the VM objects
3327                                  * and create a bottleneck when several tasks
3328                                  * map the same file.
3329                                  */
3330                                 if (cur_object->copy == object) {
3331                                         /*
3332                                          * Shared mapping or no COW yet.
3333                                          * We can never collapse a copy
3334                                          * object into its backing object.
3335                                          */
3336                                         need_collapse = FALSE;
3337                                 } else if (cur_object->copy == object->shadow &&
3338                                            object->shadow->resident_page_count == 0) {
3339                                         /*
3340                                          * Shared mapping after a COW occurred.
3341                                          */
3342                                         need_collapse = FALSE;
3343                                 }
3344                         }
3345                         vm_object_unlock(cur_object);
3346
3347                         if (need_collapse == FALSE)
3348                                 vm_fault_collapse_skipped++;
3349                         vm_fault_collapse_total++;
3350
3351                         type_of_fault = DBG_COW_FAULT;
3352                         VM_STAT_INCR(cow_faults);
3353                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3354                         current_task()->cow_faults++;
3355
3356                         goto FastPmapEnter;
3357
3358                 } else {
3359                         /*
3360                          * No page at cur_object, cur_offset... m == NULL
3361                          */
3362                         if (cur_object->pager_created) {
3363                                 if (MUST_ASK_PAGER(cur_object, cur_offset) == TRUE) {
3364                                         /*
3365                                          * May have to talk to a pager...
3366                                          * take the slow path.
3367                                          */
3368                                         break;
3369                                 }
3370                                 /*
3371                                  * existence map present and indicates
3372                                  * that the pager doesn't have this page
3373                                  */
3374                         }
3375                         if (cur_object->shadow == VM_OBJECT_NULL) {
3376                                 /*
3377                                  * Zero fill fault.  Page gets
3378                                  * inserted into the original object.
3379                                  */
3380                                 if (cur_object->shadow_severed ||
3381                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
3382                                 {
3383                                         if (object != cur_object)
3384                                                 vm_object_unlock(cur_object);
3385                                         vm_object_unlock(object);
3386
3387                                         vm_map_unlock_read(map);
3388                                         if (real_map != map)
3389                                                 vm_map_unlock(real_map);
3390
3391                                         kr = KERN_MEMORY_ERROR;
3392                                         goto done;
3393                                 }
3394                                 if ((throttle_delay = vm_page_throttled())) {
3395                                         /*
3396                                          * drop all of our locks...
3397                                          * wait until the free queue is
3398                                          * pumped back up and then
3399                                          * redrive the fault
3400                                          */
3401                                         if (object != cur_object)
3402                                                 vm_object_unlock(cur_object);
3403                                         vm_object_unlock(object);
3404                                         vm_map_unlock_read(map);
3405                                         if (real_map != map)
3406                                                 vm_map_unlock(real_map);
3407
3408                                         VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3409
3410                                         delay(throttle_delay);
3411
3412                                         if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3413                                                          THREAD_UNINT :
3414                                                          THREAD_ABORTSAFE))
3415                                                 goto RetryFault;
3416                                         kr = KERN_ABORTED;
3417                                         goto done;
3418                                 }
3419                                 if (vm_backing_store_low) {
3420                                         /*
3421                                          * we are protecting the system from
3422                                          * backing store exhaustion...
3423                                          * must take the slow path if we're
3424                                          * not privileged
3425                                          */
3426                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
3427                                                 break;
3428                                 }
3429                                 if (cur_object != object) {
3430                                         vm_object_unlock(cur_object);
3431
3432                                         cur_object = object;
3433                                 }
3434                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3435
3436                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3437
3438                                         if (vm_object_lock_upgrade(object) == FALSE) {
3439                                                 /*
3440                                                  * couldn't upgrade so do a full retry on the fault
3441                                                  * since we dropped the object lock which
3442                                                  * could allow another thread to insert
3443                                                  * a page at this offset
3444                                                  */
3445                                                 vm_map_unlock_read(map);
3446                                                 if (real_map != map)
3447                                                         vm_map_unlock(real_map);
3448
3449                                                 goto RetryFault;
3450                                         }
3451                                 }
3452                                 m = vm_page_alloc(object, offset);
3453
3454                                 if (m == VM_PAGE_NULL) {
3455                                         /*
3456                                          * no free page currently available...
3457                                          * must take the slow path
3458                                          */
3459                                         break;
3460                                 }
3461
3462                                 /*
3463                                  * Now zero fill page...
3464                                  * the page is probably going to
3465                                  * be written soon, so don't bother
3466                                  * to clear the modified bit
3467                                  *
3468                                  *   NOTE: This code holds the map
3469                                  *   lock across the zero fill.
3470                                  */
3471                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
3472
3473                                 goto FastPmapEnter;
3474                         }
3475                         /*
3476                          * On to the next level in the shadow chain
3477                          */
3478                         cur_offset += cur_object->vo_shadow_offset;
3479                         new_object = cur_object->shadow;
3480
3481                         /*
3482                          * take the new_object's lock with the indicated state
3483                          */
3484                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
3485                                 vm_object_lock_shared(new_object);
3486                         else
3487                                 vm_object_lock(new_object);
3488
3489                         if (cur_object != object)
3490                                 vm_object_unlock(cur_object);
3491
3492                         cur_object = new_object;
3493
3494                         continue;
3495                 }
3496         }
3497         /*
3498          * Cleanup from fast fault failure.  Drop any object
3499          * lock other than original and drop map lock.
3500          */
3501         if (object != cur_object)
3502                 vm_object_unlock(cur_object);
3503
3504         /*
3505          * must own the object lock exclusively at this point
3506          */
3507         if (object_lock_type == OBJECT_LOCK_SHARED) {
3508                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3509
3510                 if (vm_object_lock_upgrade(object) == FALSE) {
3511                         /*
3512                          * couldn't upgrade, so explictly
3513                          * take the lock exclusively
3514                          * no need to retry the fault at this
3515                          * point since "vm_fault_page" will
3516                          * completely re-evaluate the state
3517                          */
3518                         vm_object_lock(object);
3519                 }
3520         }
3521
3522 handle_copy_delay:
3523         vm_map_unlock_read(map);
3524         if (real_map != map)
3525                 vm_map_unlock(real_map);
3526
3527         /*
3528          * Make a reference to this object to
3529          * prevent its disposal while we are messing with
3530          * it.  Once we have the reference, the map is free
3531          * to be diddled.  Since objects reference their
3532          * shadows (and copies), they will stay around as well.
3533          */
3534         vm_object_reference_locked(object);
3535         vm_object_paging_begin(object);
3536
3537         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
3538
3539         error_code = 0;
3540
3541         kr = vm_fault_page(object, offset, fault_type,
3542                            (change_wiring && !wired),
3543                            &prot, &result_page, &top_page,
3544                            &type_of_fault,
3545                            &error_code, map->no_zero_fill,
3546                            FALSE, &fault_info);
3547
3548         /*
3549          * if kr != VM_FAULT_SUCCESS, then the paging reference
3550          * has been dropped and the object unlocked... the ref_count
3551          * is still held
3552          *
3553          * if kr == VM_FAULT_SUCCESS, then the paging reference
3554          * is still held along with the ref_count on the original object
3555          *
3556          *      the object is returned locked with a paging reference
3557          *
3558          *      if top_page != NULL, then it's BUSY and the
3559          *      object it belongs to has a paging reference
3560          *      but is returned unlocked
3561          */
3562         if (kr != VM_FAULT_SUCCESS &&
3563             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
3564                 /*
3565                  * we didn't succeed, lose the object reference immediately.
3566                  */
3567                 vm_object_deallocate(object);
3568
3569                 /*
3570                  * See why we failed, and take corrective action.
3571                  */
3572                 switch (kr) {
3573                 case VM_FAULT_MEMORY_SHORTAGE:
3574                         if (vm_page_wait((change_wiring) ?
3575                                          THREAD_UNINT :
3576                                          THREAD_ABORTSAFE))
3577                                 goto RetryFault;
3578                         /*
3579                          * fall thru
3580                          */
3581                 case VM_FAULT_INTERRUPTED:
3582                         kr = KERN_ABORTED;
3583                         goto done;
3584                 case VM_FAULT_RETRY:
3585                         goto RetryFault;
3586                 case VM_FAULT_MEMORY_ERROR:
3587                         if (error_code)
3588                                 kr = error_code;
3589                         else
3590                                 kr = KERN_MEMORY_ERROR;
3591                         goto done;
3592                 default:
3593                         panic("vm_fault: unexpected error 0x%x from "
3594                               "vm_fault_page()\n", kr);
3595                 }
3596         }
3597         m = result_page;
3598
3599         if (m != VM_PAGE_NULL) {
3600                 assert((change_wiring && !wired) ?
3601                     (top_page == VM_PAGE_NULL) :
3602                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
3603         }
3604
3605         /*
3606          * What to do with the resulting page from vm_fault_page
3607          * if it doesn't get entered into the physical map:
3608          */
3609 #define RELEASE_PAGE(m)                                 \
3610         MACRO_BEGIN                                     \
3611         PAGE_WAKEUP_DONE(m);                            \
3612         if (!m->active && !m->inactive && !m->throttled) {              \
3613                 vm_page_lockspin_queues();                              \
3614                 if (!m->active && !m->inactive && !m->throttled)        \
3615                         vm_page_activate(m);                            \
3616                 vm_page_unlock_queues();                                \
3617         }                                                               \
3618         MACRO_END
3619
3620         /*
3621          * We must verify that the maps have not changed
3622          * since our last lookup.
3623          */
3624         if (m != VM_PAGE_NULL) {
3625                 old_copy_object = m->object->copy;
3626                 vm_object_unlock(m->object);
3627         } else {
3628                 old_copy_object = VM_OBJECT_NULL;
3629                 vm_object_unlock(object);
3630         }
3631
3632         /*
3633          * no object locks are held at this point
3634          */
3635         if ((map != original_map) || !vm_map_verify(map, &version)) {
3636                 vm_object_t             retry_object;
3637                 vm_object_offset_t      retry_offset;
3638                 vm_prot_t               retry_prot;
3639
3640                 /*
3641                  * To avoid trying to write_lock the map while another
3642                  * thread has it read_locked (in vm_map_pageable), we
3643                  * do not try for write permission.  If the page is
3644                  * still writable, we will get write permission.  If it
3645                  * is not, or has been marked needs_copy, we enter the
3646                  * mapping without write permission, and will merely
3647                  * take another fault.
3648                  */
3649                 map = original_map;
3650                 vm_map_lock_read(map);
3651
3652                 kr = vm_map_lookup_locked(&map, vaddr,
3653                                           fault_type & ~VM_PROT_WRITE,
3654                                           OBJECT_LOCK_EXCLUSIVE, &version,
3655                                           &retry_object, &retry_offset, &retry_prot,
3656                                           &wired,
3657                                           &fault_info,
3658                                           &real_map);
3659                 pmap = real_map->pmap;
3660
3661                 if (kr != KERN_SUCCESS) {
3662                         vm_map_unlock_read(map);
3663
3664                         if (m != VM_PAGE_NULL) {
3665                                 /*
3666                                  * retake the lock so that
3667                                  * we can drop the paging reference
3668                                  * in vm_fault_cleanup and do the
3669                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
3670                                  */
3671                                 vm_object_lock(m->object);
3672
3673                                 RELEASE_PAGE(m);
3674
3675                                 vm_fault_cleanup(m->object, top_page);
3676                         } else {
3677                                 /*
3678                                  * retake the lock so that
3679                                  * we can drop the paging reference
3680                                  * in vm_fault_cleanup
3681                                  */
3682                                 vm_object_lock(object);
3683
3684                                 vm_fault_cleanup(object, top_page);
3685                         }
3686                         vm_object_deallocate(object);
3687
3688                         goto done;
3689                 }
3690                 vm_object_unlock(retry_object);
3691
3692                 if ((retry_object != object) || (retry_offset != offset)) {
3693
3694                         vm_map_unlock_read(map);
3695                         if (real_map != map)
3696                                 vm_map_unlock(real_map);
3697
3698                         if (m != VM_PAGE_NULL) {
3699                                 /*
3700                                  * retake the lock so that
3701                                  * we can drop the paging reference
3702                                  * in vm_fault_cleanup and do the
3703                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
3704                                  */
3705                                 vm_object_lock(m->object);
3706
3707                                 RELEASE_PAGE(m);
3708
3709                                 vm_fault_cleanup(m->object, top_page);
3710                         } else {
3711                                 /*
3712                                  * retake the lock so that
3713                                  * we can drop the paging reference
3714                                  * in vm_fault_cleanup
3715                                  */
3716                                 vm_object_lock(object);
3717
3718                                 vm_fault_cleanup(object, top_page);
3719                         }
3720                         vm_object_deallocate(object);
3721
3722                         goto RetryFault;
3723                 }
3724                 /*
3725                  * Check whether the protection has changed or the object
3726                  * has been copied while we left the map unlocked.
3727                  */
3728                 prot &= retry_prot;
3729         }
3730         if (m != VM_PAGE_NULL) {
3731                 vm_object_lock(m->object);
3732
3733                 if (m->object->copy != old_copy_object) {
3734                         /*
3735                          * The copy object changed while the top-level object
3736                          * was unlocked, so take away write permission.
3737                          */
3738                         prot &= ~VM_PROT_WRITE;
3739                 }
3740         } else
3741                 vm_object_lock(object);
3742
3743         /*
3744          * If we want to wire down this page, but no longer have
3745          * adequate permissions, we must start all over.
3746          */
3747         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
3748
3749                 vm_map_verify_done(map, &version);
3750                 if (real_map != map)
3751                         vm_map_unlock(real_map);
3752
3753                 if (m != VM_PAGE_NULL) {
3754                         RELEASE_PAGE(m);
3755
3756                         vm_fault_cleanup(m->object, top_page);
3757                 } else
3758                         vm_fault_cleanup(object, top_page);
3759
3760                 vm_object_deallocate(object);
3761
3762                 goto RetryFault;
3763         }
3764         if (m != VM_PAGE_NULL) {
3765                 /*
3766                  * Put this page into the physical map.
3767                  * We had to do the unlock above because pmap_enter
3768                  * may cause other faults.  The page may be on
3769                  * the pageout queues.  If the pageout daemon comes
3770                  * across the page, it will remove it from the queues.
3771                  */
3772                 if (caller_pmap) {
3773                         kr = vm_fault_enter(m,
3774                                             caller_pmap,
3775                                             caller_pmap_addr,
3776                                             prot,
3777                                             fault_type,
3778                                             wired,
3779                                             change_wiring,
3780                                             fault_info.no_cache,
3781                                             fault_info.cs_bypass,
3782                                             &type_of_fault);
3783                 } else {
3784                         kr = vm_fault_enter(m,
3785                                             pmap,
3786                                             vaddr,
3787                                             prot,
3788                                             fault_type,
3789                                             wired,
3790                                             change_wiring,
3791                                             fault_info.no_cache,
3792                                             fault_info.cs_bypass,
3793                                             &type_of_fault);
3794                 }
3795                 if (kr != KERN_SUCCESS) {
3796                         /* abort this page fault */
3797                         vm_map_verify_done(map, &version);
3798                         if (real_map != map)
3799                                 vm_map_unlock(real_map);
3800                         PAGE_WAKEUP_DONE(m);
3801                         vm_fault_cleanup(m->object, top_page);
3802                         vm_object_deallocate(object);
3803                         goto done;
3804                 }
3805         } else {
3806
3807                 vm_map_entry_t          entry;
3808                 vm_map_offset_t         laddr;
3809                 vm_map_offset_t         ldelta, hdelta;
3810
3811                 /*
3812                  * do a pmap block mapping from the physical address
3813                  * in the object
3814                  */
3815
3816 #ifdef ppc
3817                 /* While we do not worry about execution protection in   */
3818                 /* general, certian pages may have instruction execution */
3819                 /* disallowed.  We will check here, and if not allowed   */
3820                 /* to execute, we return with a protection failure.      */
3821
3822                 if ((fault_type & VM_PROT_EXECUTE) &&
3823                         (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
3824
3825                         vm_map_verify_done(map, &version);
3826
3827                         if (real_map != map)
3828                                 vm_map_unlock(real_map);
3829
3830                         vm_fault_cleanup(object, top_page);
3831                         vm_object_deallocate(object);
3832
3833                         kr = KERN_PROTECTION_FAILURE;
3834                         goto done;
3835                 }
3836 #endif  /* ppc */
3837
3838                 if (real_map != map)
3839                         vm_map_unlock(real_map);
3840
3841                 if (original_map != map) {
3842                         vm_map_unlock_read(map);
3843                         vm_map_lock_read(original_map);
3844                         map = original_map;
3845                 }
3846                 real_map = map;
3847
3848                 laddr = vaddr;
3849                 hdelta = 0xFFFFF000;
3850                 ldelta = 0xFFFFF000;
3851
3852                 while (vm_map_lookup_entry(map, laddr, &entry)) {
3853                         if (ldelta > (laddr - entry->vme_start))
3854                                 ldelta = laddr - entry->vme_start;
3855                         if (hdelta > (entry->vme_end - laddr))
3856                                 hdelta = entry->vme_end - laddr;
3857                         if (entry->is_sub_map) {
3858
3859                                 laddr = (laddr - entry->vme_start)
3860                                                         + entry->offset;
3861                                 vm_map_lock_read(entry->object.sub_map);
3862
3863                                 if (map != real_map)
3864                                         vm_map_unlock_read(map);
3865                                 if (entry->use_pmap) {
3866                                         vm_map_unlock_read(real_map);
3867                                         real_map = entry->object.sub_map;
3868                                 }
3869                                 map = entry->object.sub_map;
3870
3871                         } else {
3872                                 break;
3873                         }
3874                 }
3875
3876                 if (vm_map_lookup_entry(map, laddr, &entry) &&
3877                                         (entry->object.vm_object != NULL) &&
3878                                         (entry->object.vm_object == object)) {
3879
3880                         int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
3881                         if (caller_pmap) {
3882                                 /*
3883                                  * Set up a block mapped area
3884                                  */
3885                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
3886                                 pmap_map_block(caller_pmap,
3887                                                (addr64_t)(caller_pmap_addr - ldelta),
3888                                                (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
3889                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
3890                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
3891                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
3892                         } else {
3893                                 /*
3894                                  * Set up a block mapped area
3895                                  */
3896                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
3897                                 pmap_map_block(real_map->pmap,
3898                                                (addr64_t)(vaddr - ldelta),
3899                                                (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
3900                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
3901                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
3902                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
3903                         }
3904                 }
3905         }
3906
3907         /*
3908          * Unlock everything, and return
3909          */
3910         vm_map_verify_done(map, &version);
3911         if (real_map != map)
3912                 vm_map_unlock(real_map);
3913
3914         if (m != VM_PAGE_NULL) {
3915                 PAGE_WAKEUP_DONE(m);
3916
3917                 vm_fault_cleanup(m->object, top_page);
3918         } else
3919                 vm_fault_cleanup(object, top_page);
3920
3921         vm_object_deallocate(object);
3922
3923 #undef  RELEASE_PAGE
3924
3925         kr = KERN_SUCCESS;
3926 done:
3927         thread_interrupt_level(interruptible_state);
3928
3929         KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3930                               (int)((uint64_t)vaddr >> 32),
3931                               (int)vaddr,
3932                               kr,
3933                               type_of_fault,
3934                               0);
3935
3936         return (kr);
3937 }
3938
3939 /*
3940  *      vm_fault_wire:
3941  *
3942  *      Wire down a range of virtual addresses in a map.
3943  */
3944 kern_return_t
3945 vm_fault_wire(
3946         vm_map_t        map,
3947         vm_map_entry_t  entry,
3948         pmap_t          pmap,
3949         vm_map_offset_t pmap_addr)
3950 {
3951
3952         register vm_map_offset_t        va;
3953         register vm_map_offset_t        end_addr = entry->vme_end;
3954         register kern_return_t  rc;
3955
3956         assert(entry->in_transition);
3957
3958         if ((entry->object.vm_object != NULL) &&
3959                         !entry->is_sub_map &&
3960                         entry->object.vm_object->phys_contiguous) {
3961                 return KERN_SUCCESS;
3962         }
3963
3964         /*
3965          *      Inform the physical mapping system that the
3966          *      range of addresses may not fault, so that
3967          *      page tables and such can be locked down as well.
3968          */
3969
3970         pmap_pageable(pmap, pmap_addr,
3971                 pmap_addr + (end_addr - entry->vme_start), FALSE);
3972
3973         /*
3974          *      We simulate a fault to get the page and enter it
3975          *      in the physical map.
3976          */
3977
3978         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
3979                 if ((rc = vm_fault_wire_fast(
3980                         map, va, entry, pmap,
3981                         pmap_addr + (va - entry->vme_start)
3982                         )) != KERN_SUCCESS) {
3983                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
3984                                 (pmap == kernel_pmap) ?
3985                                         THREAD_UNINT : THREAD_ABORTSAFE,
3986                                 pmap, pmap_addr + (va - entry->vme_start));
3987                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
3988                 }
3989
3990                 if (rc != KERN_SUCCESS) {
3991                         struct vm_map_entry     tmp_entry = *entry;
3992
3993                         /* unwire wired pages */
3994                         tmp_entry.vme_end = va;
3995                         vm_fault_unwire(map,
3996                                 &tmp_entry, FALSE, pmap, pmap_addr);
3997
3998                         return rc;
3999                 }
4000         }
4001         return KERN_SUCCESS;
4002 }
4003
4004 /*
4005  *      vm_fault_unwire:
4006  *
4007  *      Unwire a range of virtual addresses in a map.
4008  */
4009 void
4010 vm_fault_unwire(
4011         vm_map_t        map,
4012         vm_map_entry_t  entry,
4013         boolean_t       deallocate,
4014         pmap_t          pmap,
4015         vm_map_offset_t pmap_addr)
4016 {
4017         register vm_map_offset_t        va;
4018         register vm_map_offset_t        end_addr = entry->vme_end;
4019         vm_object_t             object;
4020         struct vm_object_fault_info fault_info;
4021
4022         object = (entry->is_sub_map)
4023                         ? VM_OBJECT_NULL : entry->object.vm_object;
4024
4025         /*
4026          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4027          * do anything since such memory is wired by default.  So we don't have
4028          * anything to undo here.
4029          */
4030
4031         if (object != VM_OBJECT_NULL && object->phys_contiguous)
4032                 return;
4033
4034         fault_info.interruptible = THREAD_UNINT;
4035         fault_info.behavior = entry->behavior;
4036         fault_info.user_tag = entry->alias;
4037         fault_info.lo_offset = entry->offset;
4038         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4039         fault_info.no_cache = entry->no_cache;
4040         fault_info.stealth = TRUE;
4041         fault_info.io_sync = FALSE;
4042         fault_info.cs_bypass = FALSE;
4043         fault_info.mark_zf_absent = FALSE;
4044
4045         /*
4046          *      Since the pages are wired down, we must be able to
4047          *      get their mappings from the physical map system.
4048          */
4049
4050         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4051
4052                 if (object == VM_OBJECT_NULL) {
4053                         if (pmap) {
4054                                 pmap_change_wiring(pmap,
4055                                                    pmap_addr + (va - entry->vme_start), FALSE);
4056                         }
4057                         (void) vm_fault(map, va, VM_PROT_NONE,
4058                                         TRUE, THREAD_UNINT, pmap, pmap_addr);
4059                 } else {
4060                         vm_prot_t       prot;
4061                         vm_page_t       result_page;
4062                         vm_page_t       top_page;
4063                         vm_object_t     result_object;
4064                         vm_fault_return_t result;
4065
4066                         if (end_addr - va > (vm_size_t) -1) {
4067                                 /* 32-bit overflow */
4068                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4069                         } else {
4070                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4071                                 assert(fault_info.cluster_size == end_addr - va);
4072                         }
4073
4074                         do {
4075                                 prot = VM_PROT_NONE;
4076
4077                                 vm_object_lock(object);
4078                                 vm_object_paging_begin(object);
4079                                 XPR(XPR_VM_FAULT,
4080                                         "vm_fault_unwire -> vm_fault_page\n",
4081                                         0,0,0,0,0);
4082                                 result = vm_fault_page(
4083                                         object,
4084                                         entry->offset + (va - entry->vme_start),
4085                                         VM_PROT_NONE, TRUE,
4086                                         &prot, &result_page, &top_page,
4087                                         (int *)0,
4088                                         NULL, map->no_zero_fill,
4089                                         FALSE, &fault_info);
4090                         } while (result == VM_FAULT_RETRY);
4091
4092                         /*
4093                          * If this was a mapping to a file on a device that has been forcibly
4094                          * unmounted, then we won't get a page back from vm_fault_page().  Just
4095                          * move on to the next one in case the remaining pages are mapped from
4096                          * different objects.  During a forced unmount, the object is terminated
4097                          * so the alive flag will be false if this happens.  A forced unmount will
4098                          * will occur when an external disk is unplugged before the user does an
4099                          * eject, so we don't want to panic in that situation.
4100                          */
4101
4102                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4103                                 continue;
4104
4105                         if (result != VM_FAULT_SUCCESS)
4106                                 panic("vm_fault_unwire: failure");
4107
4108                         result_object = result_page->object;
4109
4110                         if (deallocate) {
4111                                 assert(result_page->phys_page !=
4112                                        vm_page_fictitious_addr);
4113                                 pmap_disconnect(result_page->phys_page);
4114                                 VM_PAGE_FREE(result_page);
4115                         } else {
4116                                 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4117                                         pmap_change_wiring(pmap,
4118                                             pmap_addr + (va - entry->vme_start), FALSE);
4119
4120
4121                                 if (VM_PAGE_WIRED(result_page)) {
4122                                         vm_page_lockspin_queues();
4123                                         vm_page_unwire(result_page, TRUE);
4124                                         vm_page_unlock_queues();
4125                                 }
4126                                 if(entry->zero_wired_pages) {
4127                                         pmap_zero_page(result_page->phys_page);
4128                                         entry->zero_wired_pages = FALSE;
4129                                 }
4130
4131                                 PAGE_WAKEUP_DONE(result_page);
4132                         }
4133                         vm_fault_cleanup(result_object, top_page);
4134                 }
4135         }
4136
4137         /*
4138          *      Inform the physical mapping system that the range
4139          *      of addresses may fault, so that page tables and
4140          *      such may be unwired themselves.
4141          */
4142
4143         pmap_pageable(pmap, pmap_addr,
4144                 pmap_addr + (end_addr - entry->vme_start), TRUE);
4145
4146 }
4147
4148 /*
4149  *      vm_fault_wire_fast:
4150  *
4151  *      Handle common case of a wire down page fault at the given address.
4152  *      If successful, the page is inserted into the associated physical map.
4153  *      The map entry is passed in to avoid the overhead of a map lookup.
4154  *
4155  *      NOTE: the given address should be truncated to the
4156  *      proper page address.
4157  *
4158  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
4159  *      a standard error specifying why the fault is fatal is returned.
4160  *
4161  *      The map in question must be referenced, and remains so.
4162  *      Caller has a read lock on the map.
4163  *
4164  *      This is a stripped version of vm_fault() for wiring pages.  Anything
4165  *      other than the common case will return KERN_FAILURE, and the caller
4166  *      is expected to call vm_fault().
4167  */
4168 kern_return_t
4169 vm_fault_wire_fast(
4170         __unused vm_map_t       map,
4171         vm_map_offset_t va,
4172         vm_map_entry_t  entry,
4173         pmap_t                  pmap,
4174         vm_map_offset_t pmap_addr)
4175 {
4176         vm_object_t             object;
4177         vm_object_offset_t      offset;
4178         register vm_page_t      m;
4179         vm_prot_t               prot;
4180         thread_t                thread = current_thread();
4181         int                     type_of_fault;
4182         kern_return_t           kr;
4183
4184         VM_STAT_INCR(faults);
4185
4186         if (thread != THREAD_NULL && thread->task != TASK_NULL)
4187           thread->task->faults++;
4188
4189 /*
4190  *      Recovery actions
4191  */
4192
4193 #undef  RELEASE_PAGE
4194 #define RELEASE_PAGE(m) {                               \
4195         PAGE_WAKEUP_DONE(m);                            \
4196         vm_page_lockspin_queues();                      \
4197         vm_page_unwire(m, TRUE);                        \
4198         vm_page_unlock_queues();                        \
4199 }
4200
4201
4202 #undef  UNLOCK_THINGS
4203 #define UNLOCK_THINGS   {                               \
4204         vm_object_paging_end(object);                      \
4205         vm_object_unlock(object);                          \
4206 }
4207
4208 #undef  UNLOCK_AND_DEALLOCATE
4209 #define UNLOCK_AND_DEALLOCATE   {                       \
4210         UNLOCK_THINGS;                                  \
4211         vm_object_deallocate(object);                   \
4212 }
4213 /*
4214  *      Give up and have caller do things the hard way.
4215  */
4216
4217 #define GIVE_UP {                                       \
4218         UNLOCK_AND_DEALLOCATE;                          \
4219         return(KERN_FAILURE);                           \
4220 }
4221
4222
4223         /*
4224          *      If this entry is not directly to a vm_object, bail out.
4225          */
4226         if (entry->is_sub_map)
4227                 return(KERN_FAILURE);
4228
4229         /*
4230          *      Find the backing store object and offset into it.
4231          */
4232
4233         object = entry->object.vm_object;
4234         offset = (va - entry->vme_start) + entry->offset;
4235         prot = entry->protection;
4236
4237         /*
4238          *      Make a reference to this object to prevent its
4239          *      disposal while we are messing with it.
4240          */
4241
4242         vm_object_lock(object);
4243         vm_object_reference_locked(object);
4244         vm_object_paging_begin(object);
4245
4246         /*
4247          *      INVARIANTS (through entire routine):
4248          *
4249          *      1)      At all times, we must either have the object
4250          *              lock or a busy page in some object to prevent
4251          *              some other thread from trying to bring in
4252          *              the same page.
4253          *
4254          *      2)      Once we have a busy page, we must remove it from
4255          *              the pageout queues, so that the pageout daemon
4256          *              will not grab it away.
4257          *
4258          */
4259
4260         /*
4261          *      Look for page in top-level object.  If it's not there or
4262          *      there's something going on, give up.
4263          * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4264          * decrypt the page before wiring it down.
4265          */
4266         m = vm_page_lookup(object, offset);
4267         if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4268             (m->unusual && ( m->error || m->restart || m->absent))) {
4269
4270                 GIVE_UP;
4271         }
4272         ASSERT_PAGE_DECRYPTED(m);
4273
4274         if (m->fictitious &&
4275             m->phys_page == vm_page_guard_addr) {
4276                 /*
4277                  * Guard pages are fictitious pages and are never
4278                  * entered into a pmap, so let's say it's been wired...
4279                  */
4280                 kr = KERN_SUCCESS;
4281                 goto done;
4282         }
4283
4284         /*
4285          *      Wire the page down now.  All bail outs beyond this
4286          *      point must unwire the page.
4287          */
4288
4289         vm_page_lockspin_queues();
4290         vm_page_wire(m);
4291         vm_page_unlock_queues();
4292
4293         /*
4294          *      Mark page busy for other threads.
4295          */
4296         assert(!m->busy);
4297         m->busy = TRUE;
4298         assert(!m->absent);
4299
4300         /*
4301          *      Give up if the page is being written and there's a copy object
4302          */
4303         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4304                 RELEASE_PAGE(m);
4305                 GIVE_UP;
4306         }
4307
4308         /*
4309          *      Put this page into the physical map.
4310          */
4311         type_of_fault = DBG_CACHE_HIT_FAULT;
4312         kr = vm_fault_enter(m,
4313                             pmap,
4314                             pmap_addr,
4315                             prot,
4316                             prot,
4317                             TRUE,
4318                             FALSE,
4319                             FALSE,
4320                             FALSE,
4321                             &type_of_fault);
4322
4323 done:
4324         /*
4325          *      Unlock everything, and return
4326          */
4327
4328         PAGE_WAKEUP_DONE(m);
4329         UNLOCK_AND_DEALLOCATE;
4330
4331         return kr;
4332
4333 }
4334
4335 /*
4336  *      Routine:        vm_fault_copy_cleanup
4337  *      Purpose:
4338  *              Release a page used by vm_fault_copy.
4339  */
4340
4341 void
4342 vm_fault_copy_cleanup(
4343         vm_page_t       page,
4344         vm_page_t       top_page)
4345 {
4346         vm_object_t     object = page->object;
4347
4348         vm_object_lock(object);
4349         PAGE_WAKEUP_DONE(page);
4350         if (!page->active && !page->inactive && !page->throttled) {
4351                 vm_page_lockspin_queues();
4352                 if (!page->active && !page->inactive && !page->throttled)
4353                         vm_page_activate(page);
4354                 vm_page_unlock_queues();
4355         }
4356         vm_fault_cleanup(object, top_page);
4357 }
4358
4359 void
4360 vm_fault_copy_dst_cleanup(
4361         vm_page_t       page)
4362 {
4363         vm_object_t     object;
4364
4365         if (page != VM_PAGE_NULL) {
4366                 object = page->object;
4367                 vm_object_lock(object);
4368                 vm_page_lockspin_queues();
4369                 vm_page_unwire(page, TRUE);
4370                 vm_page_unlock_queues();
4371                 vm_object_paging_end(object);
4372                 vm_object_unlock(object);
4373         }
4374 }
4375
4376 /*
4377  *      Routine:        vm_fault_copy
4378  *
4379  *      Purpose:
4380  *              Copy pages from one virtual memory object to another --
4381  *              neither the source nor destination pages need be resident.
4382  *
4383  *              Before actually copying a page, the version associated with
4384  *              the destination address map wil be verified.
4385  *
4386  *      In/out conditions:
4387  *              The caller must hold a reference, but not a lock, to
4388  *              each of the source and destination objects and to the
4389  *              destination map.
4390  *
4391  *      Results:
4392  *              Returns KERN_SUCCESS if no errors were encountered in
4393  *              reading or writing the data.  Returns KERN_INTERRUPTED if
4394  *              the operation was interrupted (only possible if the
4395  *              "interruptible" argument is asserted).  Other return values
4396  *              indicate a permanent error in copying the data.
4397  *
4398  *              The actual amount of data copied will be returned in the
4399  *              "copy_size" argument.  In the event that the destination map
4400  *              verification failed, this amount may be less than the amount
4401  *              requested.
4402  */
4403 kern_return_t
4404 vm_fault_copy(
4405         vm_object_t             src_object,
4406         vm_object_offset_t      src_offset,
4407         vm_map_size_t           *copy_size,             /* INOUT */
4408         vm_object_t             dst_object,
4409         vm_object_offset_t      dst_offset,
4410         vm_map_t                dst_map,
4411         vm_map_version_t         *dst_version,
4412         int                     interruptible)
4413 {
4414         vm_page_t               result_page;
4415
4416         vm_page_t               src_page;
4417         vm_page_t               src_top_page;
4418         vm_prot_t               src_prot;
4419
4420         vm_page_t               dst_page;
4421         vm_page_t               dst_top_page;
4422         vm_prot_t               dst_prot;
4423
4424         vm_map_size_t           amount_left;
4425         vm_object_t             old_copy_object;
4426         kern_return_t           error = 0;
4427         vm_fault_return_t       result;
4428
4429         vm_map_size_t           part_size;
4430         struct vm_object_fault_info fault_info_src;
4431         struct vm_object_fault_info fault_info_dst;
4432
4433         /*
4434          * In order not to confuse the clustered pageins, align
4435          * the different offsets on a page boundary.
4436          */
4437
4438 #define RETURN(x)                                       \
4439         MACRO_BEGIN                                     \
4440         *copy_size -= amount_left;                      \
4441         MACRO_RETURN(x);                                \
4442         MACRO_END
4443
4444         amount_left = *copy_size;
4445
4446         fault_info_src.interruptible = interruptible;
4447         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
4448         fault_info_src.user_tag  = 0;
4449         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
4450         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
4451         fault_info_src.no_cache   = FALSE;
4452         fault_info_src.stealth = TRUE;
4453         fault_info_src.io_sync = FALSE;
4454         fault_info_src.cs_bypass = FALSE;
4455         fault_info_src.mark_zf_absent = FALSE;
4456
4457         fault_info_dst.interruptible = interruptible;
4458         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
4459         fault_info_dst.user_tag  = 0;
4460         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
4461         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
4462         fault_info_dst.no_cache   = FALSE;
4463         fault_info_dst.stealth = TRUE;
4464         fault_info_dst.io_sync = FALSE;
4465         fault_info_dst.cs_bypass = FALSE;
4466         fault_info_dst.mark_zf_absent = FALSE;
4467
4468         do { /* while (amount_left > 0) */
4469                 /*
4470                  * There may be a deadlock if both source and destination
4471                  * pages are the same. To avoid this deadlock, the copy must
4472                  * start by getting the destination page in order to apply
4473                  * COW semantics if any.
4474                  */
4475
4476         RetryDestinationFault: ;
4477
4478                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
4479
4480                 vm_object_lock(dst_object);
4481                 vm_object_paging_begin(dst_object);
4482
4483                 if (amount_left > (vm_size_t) -1) {
4484                         /* 32-bit overflow */
4485                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4486                 } else {
4487                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
4488                         assert(fault_info_dst.cluster_size == amount_left);
4489                 }
4490
4491                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
4492                 result = vm_fault_page(dst_object,
4493                                        vm_object_trunc_page(dst_offset),
4494                                        VM_PROT_WRITE|VM_PROT_READ,
4495                                        FALSE,
4496                                        &dst_prot, &dst_page, &dst_top_page,
4497                                        (int *)0,
4498                                        &error,
4499                                        dst_map->no_zero_fill,
4500                                        FALSE, &fault_info_dst);
4501                 switch (result) {
4502                 case VM_FAULT_SUCCESS:
4503                         break;
4504                 case VM_FAULT_RETRY:
4505                         goto RetryDestinationFault;
4506                 case VM_FAULT_MEMORY_SHORTAGE:
4507                         if (vm_page_wait(interruptible))
4508                                 goto RetryDestinationFault;
4509                         /* fall thru */
4510                 case VM_FAULT_INTERRUPTED:
4511                         RETURN(MACH_SEND_INTERRUPTED);
4512                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4513                         /* success but no VM page: fail the copy */
4514                         vm_object_paging_end(dst_object);
4515                         vm_object_unlock(dst_object);
4516                         /*FALLTHROUGH*/
4517                 case VM_FAULT_MEMORY_ERROR:
4518                         if (error)
4519                                 return (error);
4520                         else
4521                                 return(KERN_MEMORY_ERROR);
4522                 default:
4523                         panic("vm_fault_copy: unexpected error 0x%x from "
4524                               "vm_fault_page()\n", result);
4525                 }
4526                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
4527
4528                 old_copy_object = dst_page->object->copy;
4529
4530                 /*
4531                  * There exists the possiblity that the source and
4532                  * destination page are the same.  But we can't
4533                  * easily determine that now.  If they are the
4534                  * same, the call to vm_fault_page() for the
4535                  * destination page will deadlock.  To prevent this we
4536                  * wire the page so we can drop busy without having
4537                  * the page daemon steal the page.  We clean up the
4538                  * top page  but keep the paging reference on the object
4539                  * holding the dest page so it doesn't go away.
4540                  */
4541
4542                 vm_page_lockspin_queues();
4543                 vm_page_wire(dst_page);
4544                 vm_page_unlock_queues();
4545                 PAGE_WAKEUP_DONE(dst_page);
4546                 vm_object_unlock(dst_page->object);
4547
4548                 if (dst_top_page != VM_PAGE_NULL) {
4549                         vm_object_lock(dst_object);
4550                         VM_PAGE_FREE(dst_top_page);
4551                         vm_object_paging_end(dst_object);
4552                         vm_object_unlock(dst_object);
4553                 }
4554
4555         RetrySourceFault: ;
4556
4557                 if (src_object == VM_OBJECT_NULL) {
4558                         /*
4559                          *      No source object.  We will just
4560                          *      zero-fill the page in dst_object.
4561                          */
4562                         src_page = VM_PAGE_NULL;
4563                         result_page = VM_PAGE_NULL;
4564                 } else {
4565                         vm_object_lock(src_object);
4566                         src_page = vm_page_lookup(src_object,
4567                                                   vm_object_trunc_page(src_offset));
4568                         if (src_page == dst_page) {
4569                                 src_prot = dst_prot;
4570                                 result_page = VM_PAGE_NULL;
4571                         } else {
4572                                 src_prot = VM_PROT_READ;
4573                                 vm_object_paging_begin(src_object);
4574
4575                                 if (amount_left > (vm_size_t) -1) {
4576                                         /* 32-bit overflow */
4577                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4578                                 } else {
4579                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
4580                                         assert(fault_info_src.cluster_size == amount_left);
4581                                 }
4582
4583                                 XPR(XPR_VM_FAULT,
4584                                         "vm_fault_copy(2) -> vm_fault_page\n",
4585                                         0,0,0,0,0);
4586                                 result = vm_fault_page(
4587                                         src_object,
4588                                         vm_object_trunc_page(src_offset),
4589                                         VM_PROT_READ, FALSE,
4590                                         &src_prot,
4591                                         &result_page, &src_top_page,
4592                                         (int *)0, &error, FALSE,
4593                                         FALSE, &fault_info_src);
4594
4595                                 switch (result) {
4596                                 case VM_FAULT_SUCCESS:
4597                                         break;
4598                                 case VM_FAULT_RETRY:
4599                                         goto RetrySourceFault;
4600                                 case VM_FAULT_MEMORY_SHORTAGE:
4601                                         if (vm_page_wait(interruptible))
4602                                                 goto RetrySourceFault;
4603                                         /* fall thru */
4604                                 case VM_FAULT_INTERRUPTED:
4605                                         vm_fault_copy_dst_cleanup(dst_page);
4606                                         RETURN(MACH_SEND_INTERRUPTED);
4607                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
4608                                         /* success but no VM page: fail */
4609                                         vm_object_paging_end(src_object);
4610                                         vm_object_unlock(src_object);
4611                                         /*FALLTHROUGH*/
4612                                 case VM_FAULT_MEMORY_ERROR:
4613                                         vm_fault_copy_dst_cleanup(dst_page);
4614                                         if (error)
4615                                                 return (error);
4616                                         else
4617                                                 return(KERN_MEMORY_ERROR);
4618                                 default:
4619                                         panic("vm_fault_copy(2): unexpected "
4620                                               "error 0x%x from "
4621                                               "vm_fault_page()\n", result);
4622                                 }
4623
4624
4625                                 assert((src_top_page == VM_PAGE_NULL) ==
4626                                        (result_page->object == src_object));
4627                         }
4628                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
4629                         vm_object_unlock(result_page->object);
4630                 }
4631
4632                 if (!vm_map_verify(dst_map, dst_version)) {
4633                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
4634                                 vm_fault_copy_cleanup(result_page, src_top_page);
4635                         vm_fault_copy_dst_cleanup(dst_page);
4636                         break;
4637                 }
4638
4639                 vm_object_lock(dst_page->object);
4640
4641                 if (dst_page->object->copy != old_copy_object) {
4642                         vm_object_unlock(dst_page->object);
4643                         vm_map_verify_done(dst_map, dst_version);
4644                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
4645                                 vm_fault_copy_cleanup(result_page, src_top_page);
4646                         vm_fault_copy_dst_cleanup(dst_page);
4647                         break;
4648                 }
4649                 vm_object_unlock(dst_page->object);
4650
4651                 /*
4652                  *      Copy the page, and note that it is dirty
4653                  *      immediately.
4654                  */
4655
4656                 if (!page_aligned(src_offset) ||
4657                         !page_aligned(dst_offset) ||
4658                         !page_aligned(amount_left)) {
4659
4660                         vm_object_offset_t      src_po,
4661                                                 dst_po;
4662
4663                         src_po = src_offset - vm_object_trunc_page(src_offset);
4664                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
4665
4666                         if (dst_po > src_po) {
4667                                 part_size = PAGE_SIZE - dst_po;
4668                         } else {
4669                                 part_size = PAGE_SIZE - src_po;
4670                         }
4671                         if (part_size > (amount_left)){
4672                                 part_size = amount_left;
4673                         }
4674
4675                         if (result_page == VM_PAGE_NULL) {
4676                                 assert((vm_offset_t) dst_po == dst_po);
4677                                 assert((vm_size_t) part_size == part_size);
4678                                 vm_page_part_zero_fill(dst_page,
4679                                                        (vm_offset_t) dst_po,
4680                                                        (vm_size_t) part_size);
4681                         } else {
4682                                 assert((vm_offset_t) src_po == src_po);
4683                                 assert((vm_offset_t) dst_po == dst_po);
4684                                 assert((vm_size_t) part_size == part_size);
4685                                 vm_page_part_copy(result_page,
4686                                                   (vm_offset_t) src_po,
4687                                                   dst_page,
4688                                                   (vm_offset_t) dst_po,
4689                                                   (vm_size_t)part_size);
4690                                 if(!dst_page->dirty){
4691                                         vm_object_lock(dst_object);
4692                                         dst_page->dirty = TRUE;
4693                                         vm_object_unlock(dst_page->object);
4694                                 }
4695
4696                         }
4697                 } else {
4698                         part_size = PAGE_SIZE;
4699
4700                         if (result_page == VM_PAGE_NULL)
4701                                 vm_page_zero_fill(dst_page);
4702                         else{
4703                                 vm_page_copy(result_page, dst_page);
4704                                 if(!dst_page->dirty){
4705                                         vm_object_lock(dst_object);
4706                                         dst_page->dirty = TRUE;
4707                                         vm_object_unlock(dst_page->object);
4708                                 }
4709                         }
4710
4711                 }
4712
4713                 /*
4714                  *      Unlock everything, and return
4715                  */
4716
4717                 vm_map_verify_done(dst_map, dst_version);
4718
4719                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
4720                         vm_fault_copy_cleanup(result_page, src_top_page);
4721                 vm_fault_copy_dst_cleanup(dst_page);
4722
4723                 amount_left -= part_size;
4724                 src_offset += part_size;
4725                 dst_offset += part_size;
4726         } while (amount_left > 0);
4727
4728         RETURN(KERN_SUCCESS);
4729 #undef  RETURN
4730
4731         /*NOTREACHED*/
4732 }
4733
4734 #if     VM_FAULT_CLASSIFY
4735 /*
4736  *      Temporary statistics gathering support.
4737  */
4738
4739 /*
4740  *      Statistics arrays:
4741  */
4742 #define VM_FAULT_TYPES_MAX      5
4743 #define VM_FAULT_LEVEL_MAX      8
4744
4745 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
4746
4747 #define VM_FAULT_TYPE_ZERO_FILL 0
4748 #define VM_FAULT_TYPE_MAP_IN    1
4749 #define VM_FAULT_TYPE_PAGER     2
4750 #define VM_FAULT_TYPE_COPY      3
4751 #define VM_FAULT_TYPE_OTHER     4
4752
4753
4754 void
4755 vm_fault_classify(vm_object_t           object,
4756                   vm_object_offset_t    offset,
4757                   vm_prot_t             fault_type)
4758 {
4759         int             type, level = 0;
4760         vm_page_t       m;
4761
4762         while (TRUE) {
4763                 m = vm_page_lookup(object, offset);
4764                 if (m != VM_PAGE_NULL) {
4765                         if (m->busy || m->error || m->restart || m->absent) {
4766                                 type = VM_FAULT_TYPE_OTHER;
4767                                 break;
4768                         }
4769                         if (((fault_type & VM_PROT_WRITE) == 0) ||
4770                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
4771                                 type = VM_FAULT_TYPE_MAP_IN;
4772                                 break;
4773                         }
4774                         type = VM_FAULT_TYPE_COPY;
4775                         break;
4776                 }
4777                 else {
4778                         if (object->pager_created) {
4779                                 type = VM_FAULT_TYPE_PAGER;
4780                                 break;
4781                         }
4782                         if (object->shadow == VM_OBJECT_NULL) {
4783                                 type = VM_FAULT_TYPE_ZERO_FILL;
4784                                 break;
4785                         }
4786
4787                         offset += object->vo_shadow_offset;
4788                         object = object->shadow;
4789                         level++;
4790                         continue;
4791                 }
4792         }
4793
4794         if (level > VM_FAULT_LEVEL_MAX)
4795                 level = VM_FAULT_LEVEL_MAX;
4796
4797         vm_fault_stats[type][level] += 1;
4798
4799         return;
4800 }
4801
4802 /* cleanup routine to call from debugger */
4803
4804 void
4805 vm_fault_classify_init(void)
4806 {
4807         int type, level;
4808
4809         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
4810                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
4811                         vm_fault_stats[type][level] = 0;
4812                 }
4813         }
4814
4815         return;
4816 }
4817 #endif  /* VM_FAULT_CLASSIFY */
4818
4819
4820 extern int cs_validation;
4821
4822 void
4823 vm_page_validate_cs_mapped(
4824         vm_page_t       page,
4825         const void      *kaddr)
4826 {
4827         vm_object_t             object;
4828         vm_object_offset_t      offset;
4829         kern_return_t           kr;
4830         memory_object_t         pager;
4831         void                    *blobs;
4832         boolean_t               validated, tainted;
4833
4834         assert(page->busy);
4835         vm_object_lock_assert_exclusive(page->object);
4836
4837         if (!cs_validation) {
4838                 return;
4839         }
4840
4841         if (page->wpmapped && !page->cs_tainted) {
4842                 /*
4843                  * This page was mapped for "write" access sometime in the
4844                  * past and could still be modifiable in the future.
4845                  * Consider it tainted.
4846                  * [ If the page was already found to be "tainted", no
4847                  * need to re-validate. ]
4848                  */
4849                 page->cs_validated = TRUE;
4850                 page->cs_tainted = TRUE;
4851                 if (cs_debug) {
4852                         printf("CODESIGNING: vm_page_validate_cs: "
4853                                "page %p obj %p off 0x%llx "
4854                                "was modified\n",
4855                                page, page->object, page->offset);
4856                 }
4857                 vm_cs_validated_dirtied++;
4858         }
4859
4860         if (page->cs_validated) {
4861                 return;
4862         }
4863
4864         vm_cs_validates++;
4865
4866         object = page->object;
4867         assert(object->code_signed);
4868         offset = page->offset;
4869
4870         if (!object->alive || object->terminating || object->pager == NULL) {
4871                 /*
4872                  * The object is terminating and we don't have its pager
4873                  * so we can't validate the data...
4874                  */
4875                 return;
4876         }
4877         /*
4878          * Since we get here to validate a page that was brought in by
4879          * the pager, we know that this pager is all setup and ready
4880          * by now.
4881          */
4882         assert(!object->internal);
4883         assert(object->pager != NULL);
4884         assert(object->pager_ready);
4885
4886         pager = object->pager;
4887         assert(object->paging_in_progress);
4888         kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
4889         if (kr != KERN_SUCCESS) {
4890                 blobs = NULL;
4891         }
4892
4893         /* verify the SHA1 hash for this page */
4894         validated = cs_validate_page(blobs,
4895                                      offset + object->paging_offset,
4896                                      (const void *)kaddr,
4897                                      &tainted);
4898
4899         page->cs_validated = validated;
4900         if (validated) {
4901                 page->cs_tainted = tainted;
4902         }
4903 }
4904
4905 void
4906 vm_page_validate_cs(
4907         vm_page_t       page)
4908 {
4909         vm_object_t             object;
4910         vm_object_offset_t      offset;
4911         vm_map_offset_t         koffset;
4912         vm_map_size_t           ksize;
4913         vm_offset_t             kaddr;
4914         kern_return_t           kr;
4915         boolean_t               busy_page;
4916
4917         vm_object_lock_assert_held(page->object);
4918
4919         if (!cs_validation) {
4920                 return;
4921         }
4922
4923         if (page->wpmapped && !page->cs_tainted) {
4924                 vm_object_lock_assert_exclusive(page->object);
4925
4926                 /*
4927                  * This page was mapped for "write" access sometime in the
4928                  * past and could still be modifiable in the future.
4929                  * Consider it tainted.
4930                  * [ If the page was already found to be "tainted", no
4931                  * need to re-validate. ]
4932                  */
4933                 page->cs_validated = TRUE;
4934                 page->cs_tainted = TRUE;
4935                 if (cs_debug) {
4936                         printf("CODESIGNING: vm_page_validate_cs: "
4937                                "page %p obj %p off 0x%llx "
4938                                "was modified\n",
4939                                page, page->object, page->offset);
4940                 }
4941                 vm_cs_validated_dirtied++;
4942         }
4943
4944         if (page->cs_validated) {
4945                 return;
4946         }
4947
4948 #if CHECK_CS_VALIDATION_BITMAP
4949         if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
4950                 page->cs_validated = TRUE;
4951                 page->cs_tainted = FALSE;
4952                 vm_cs_bitmap_validated++;
4953                 return;
4954         }
4955 #endif
4956         vm_object_lock_assert_exclusive(page->object);
4957
4958         object = page->object;
4959         assert(object->code_signed);
4960         offset = page->offset;
4961
4962         busy_page = page->busy;
4963         if (!busy_page) {
4964                 /* keep page busy while we map (and unlock) the VM object */
4965                 page->busy = TRUE;
4966         }
4967
4968         /*
4969          * Take a paging reference on the VM object
4970          * to protect it from collapse or bypass,
4971          * and keep it from disappearing too.
4972          */
4973         vm_object_paging_begin(object);
4974
4975         /* map the page in the kernel address space */
4976         koffset = 0;
4977         ksize = PAGE_SIZE_64;
4978         kr = vm_paging_map_object(&koffset,
4979                                   page,
4980                                   object,
4981                                   offset,
4982                                   &ksize,
4983                                   VM_PROT_READ,
4984                                   FALSE); /* can't unlock object ! */
4985         if (kr != KERN_SUCCESS) {
4986                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
4987         }
4988         kaddr = CAST_DOWN(vm_offset_t, koffset);
4989
4990         /* validate the mapped page */
4991         vm_page_validate_cs_mapped(page, (const void *) kaddr);
4992
4993 #if CHECK_CS_VALIDATION_BITMAP
4994         if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
4995                 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
4996         }
4997 #endif
4998         assert(page->busy);
4999         assert(object == page->object);
5000         vm_object_lock_assert_exclusive(object);
5001
5002         if (!busy_page) {
5003                 PAGE_WAKEUP_DONE(page);
5004         }
5005         if (koffset != 0) {
5006                 /* unmap the map from the kernel address space */
5007                 vm_paging_unmap_object(object, koffset, koffset + ksize);
5008                 koffset = 0;
5009                 ksize = 0;
5010                 kaddr = 0;
5011         }
5012         vm_object_paging_end(object);
5013 }