osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75                                 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90
  91 #include <vm/vm_compressor.h>
  92 #include <vm/vm_compressor_pager.h>
  93 #include <vm/vm_fault.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_object.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/pmap.h>
  99 #include <vm/vm_pageout.h>
 100 #include <vm/vm_protos.h>
 101 #include <vm/vm_external.h>
 102 #include <vm/memory_object.h>
 103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 104 #include <vm/vm_shared_region.h>
 105
 106 #include <sys/codesign.h>
 107
 108 #include <libsa/sys/timers.h>   /* for struct timespec */
 109
 /*
  * Compile-time debug switches for this file; both are off (0).
  *   TRACEFAULTPAGE    - guards the dbgTrace() calls in the fault paths.
  *   VM_FAULT_CLASSIFY - guards the vm_fault_classify*() declarations.
  */
 #define TRACEFAULTPAGE          0       /* (TEST/DEBUG) */
 #define VM_FAULT_CLASSIFY       0

 /*
  * NOTE(review): not referenced in this part of the file; presumably a
  * limit on concurrent page-ins per object -- confirm against its users.
  */
 int     vm_object_pagein_throttle = 16;
 115
 116 /*
 117  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 118  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 119  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 120  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 121  * keep the UI active so that the user has a chance to kill the offending task before the system
 122  * completely hangs.
 123  *
 124  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 125  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 126  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 127  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 128  */
 129
 130 extern void throttle_lowpri_io(int);
 131
 132 uint64_t vm_hard_throttle_threshold;
 133
 134
 135
 136 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       ((current_task() != kernel_task && \
 137                                                   get_task_resident_size(current_task()) > (((AVAILABLE_NON_COMPRESSED_MEMORY) * PAGE_SIZE) / 5)) && \
 138                                                  (vm_low_on_space() || (vm_page_free_count < vm_page_throttle_limit && \
 139                                                                         proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED )))
 140
 141
 142
 143 #define HARD_THROTTLE_DELAY     20000   /* 20000 us == 20 ms */
 144 #define SOFT_THROTTLE_DELAY     2000    /* 2000 us == 2 ms */
 145
 146 boolean_t current_thread_aborted(void);
 147
 148 /* Forward declarations of internal routines. */
 149 extern kern_return_t vm_fault_wire_fast(
 150                                 vm_map_t        map,
 151                                 vm_map_offset_t va,
 152                                 vm_map_entry_t  entry,
 153                                 pmap_t          pmap,
 154                                 vm_map_offset_t pmap_addr);
 155
 156 extern void vm_fault_continue(void);
 157
 158 extern void vm_fault_copy_cleanup(
 159                                 vm_page_t       page,
 160                                 vm_page_t       top_page);
 161
 162 extern void vm_fault_copy_dst_cleanup(
 163                                 vm_page_t       page);
 164
 165 #if     VM_FAULT_CLASSIFY
 166 extern void vm_fault_classify(vm_object_t       object,
 167                           vm_object_offset_t    offset,
 168                           vm_prot_t             fault_type);
 169
 170 extern void vm_fault_classify_init(void);
 171 #endif
 172
 173 unsigned long vm_pmap_enter_blocked = 0;
 174 unsigned long vm_pmap_enter_retried = 0;
 175
 176 unsigned long vm_cs_validates = 0;
 177 unsigned long vm_cs_revalidates = 0;
 178 unsigned long vm_cs_query_modified = 0;
 179 unsigned long vm_cs_validated_dirtied = 0;
 180 unsigned long vm_cs_bitmap_validated = 0;
 181
 182 /*
 183  *      Routine:        vm_fault_init
 184  *      Purpose:
 185  *              Initialize our private data structures.
 186  */
 187 void
 188 vm_fault_init(void)
 189 {
 190         int i, vm_compressor_temp;
 191         boolean_t need_default_val = TRUE;
 192         /*
 193          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 194          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 195          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 196          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 197          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 198          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 199          */
 200
 201         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 202
 203         /*
 204          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 205          */
 206
 207         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
 208                 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
 209                         if (vm_compressor_temp > 0 &&
 210                             ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
 211                                 need_default_val = FALSE;
 212                                 vm_compressor_mode = vm_compressor_temp;
 213                                 break;
 214                         }
 215                 }
 216                 if (need_default_val)
 217                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 218         }
 219         if (need_default_val) {
 220                 /* If no boot arg or incorrect boot arg, try device tree. */
 221                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 222         }
 223         PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
 224         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 225 }
 226
 227 /*
 228  *      Routine:        vm_fault_cleanup
 229  *      Purpose:
 230  *              Clean up the result of vm_fault_page.
 231  *      Results:
 232  *              The paging reference for "object" is released.
 233  *              "object" is unlocked.
 234  *              If "top_page" is not null,  "top_page" is
 235  *              freed and the paging reference for the object
 236  *              containing it is released.
 237  *
 238  *      In/out conditions:
 239  *              "object" must be locked.
 240  */
 241 void
 242 vm_fault_cleanup(
 243         register vm_object_t    object,
 244         register vm_page_t      top_page)
 245 {
 246         vm_object_paging_end(object);
 247         vm_object_unlock(object);
 248
 249         if (top_page != VM_PAGE_NULL) {
 250                 object = top_page->object;
 251
 252                 vm_object_lock(object);
 253                 VM_PAGE_FREE(top_page);
 254                 vm_object_paging_end(object);
 255                 vm_object_unlock(object);
 256         }
 257 }
 258
 /*
  * Optional page-in cluster statistics.
  *
  * With MACH_CLUSTER_STATS enabled, CLUSTER_STAT(clause) expands to its
  * argument and the CLUSTER_STAT_* helpers bump the matching counter in
  * cluster_stats_in[x].  With it disabled, CLUSTER_STAT(clause) expands
  * to nothing and the helpers are not defined at all, so they may only
  * be used inside a CLUSTER_STAT(...) wrapper.
  */
 #if     MACH_CLUSTER_STATS

 #define MAXCLUSTERPAGES 16

 struct {
         unsigned long pages_in_cluster;
         unsigned long pages_at_higher_offsets;
         unsigned long pages_at_lower_offsets;
 } cluster_stats_in[MAXCLUSTERPAGES];

 #define CLUSTER_STAT(clause)    clause
 #define CLUSTER_STAT_HIGHER(x)  ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 #define CLUSTER_STAT_LOWER(x)   ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 #define CLUSTER_STAT_CLUSTER(x) ((cluster_stats_in[(x)].pages_in_cluster)++)

 #else   /* MACH_CLUSTER_STATS */

 #define CLUSTER_STAT(clause)

 #endif  /* MACH_CLUSTER_STATS */

 /* Non-zero when x is a multiple of PAGE_SIZE_64 (low bits all clear). */
 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 278
 279
 280 boolean_t       vm_page_deactivate_behind = TRUE;
 281 /*
 282  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 283  */
 284 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 285 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 286                                                                 /* we use it to size an array on the stack */
 287
 288 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 289
 290 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 291
 292 /*
 293  * vm_page_is_sequential
 294  *
 295  * Determine if sequential access is in progress
 296  * in accordance with the behavior specified.
 297  * Update state to indicate current access pattern.
 298  *
 299  * object must have at least the shared lock held
 300  */
 301 static
 302 void
 303 vm_fault_is_sequential(
 304         vm_object_t             object,
 305         vm_object_offset_t      offset,
 306         vm_behavior_t           behavior)
 307 {
 308         vm_object_offset_t      last_alloc;
 309         int                     sequential;
 310         int                     orig_sequential;
 311
 312         last_alloc = object->last_alloc;
 313         sequential = object->sequential;
 314         orig_sequential = sequential;
 315
 316         switch (behavior) {
 317         case VM_BEHAVIOR_RANDOM:
 318                 /*
 319                  * reset indicator of sequential behavior
 320                  */
 321                 sequential = 0;
 322                 break;
 323
 324         case VM_BEHAVIOR_SEQUENTIAL:
 325                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 326                         /*
 327                          * advance indicator of sequential behavior
 328                          */
 329                         if (sequential < MAX_SEQUENTIAL_RUN)
 330                                 sequential += PAGE_SIZE;
 331                 } else {
 332                         /*
 333                          * reset indicator of sequential behavior
 334                          */
 335                         sequential = 0;
 336                 }
 337                 break;
 338
 339         case VM_BEHAVIOR_RSEQNTL:
 340                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 341                         /*
 342                          * advance indicator of sequential behavior
 343                          */
 344                         if (sequential > -MAX_SEQUENTIAL_RUN)
 345                                 sequential -= PAGE_SIZE;
 346                 } else {
 347                         /*
 348                          * reset indicator of sequential behavior
 349                          */
 350                         sequential = 0;
 351                 }
 352                 break;
 353
 354         case VM_BEHAVIOR_DEFAULT:
 355         default:
 356                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 357                         /*
 358                          * advance indicator of sequential behavior
 359                          */
 360                         if (sequential < 0)
 361                                 sequential = 0;
 362                         if (sequential < MAX_SEQUENTIAL_RUN)
 363                                 sequential += PAGE_SIZE;
 364
 365                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 366                         /*
 367                          * advance indicator of sequential behavior
 368                          */
 369                         if (sequential > 0)
 370                                 sequential = 0;
 371                         if (sequential > -MAX_SEQUENTIAL_RUN)
 372                                 sequential -= PAGE_SIZE;
 373                 } else {
 374                         /*
 375                          * reset indicator of sequential behavior
 376                          */
 377                         sequential = 0;
 378                 }
 379                 break;
 380         }
 381         if (sequential != orig_sequential) {
 382                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 383                         /*
 384                          * if someone else has already updated object->sequential
 385                          * don't bother trying to update it or object->last_alloc
 386                          */
 387                         return;
 388                 }
 389         }
 390         /*
 391          * I'd like to do this with a OSCompareAndSwap64, but that
 392          * doesn't exist for PPC...  however, it shouldn't matter
 393          * that much... last_alloc is maintained so that we can determine
 394          * if a sequential access pattern is taking place... if only
 395          * one thread is banging on this object, no problem with the unprotected
 396          * update... if 2 or more threads are banging away, we run the risk of
 397          * someone seeing a mangled update... however, in the face of multiple
 398          * accesses, no sequential access pattern can develop anyway, so we
 399          * haven't lost any real info.
 400          */
 401         object->last_alloc = offset;
 402 }
 403
 404
 405 int vm_page_deactivate_behind_count = 0;
 406
 407 /*
 408  * vm_page_deactivate_behind
 409  *
 410  * Determine if sequential access is in progress
 411  * in accordance with the behavior specified.  If
 412  * so, compute a potential page to deactivate and
 413  * deactivate it.
 414  *
 415  * object must be locked.
 416  *
 417  * return TRUE if we actually deactivate a page
 418  */
 419 static
 420 boolean_t
 421 vm_fault_deactivate_behind(
 422         vm_object_t             object,
 423         vm_object_offset_t      offset,
 424         vm_behavior_t           behavior)
 425 {
 426         int             n;
 427         int             pages_in_run = 0;
 428         int             max_pages_in_run = 0;
 429         int             sequential_run;
 430         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 431         vm_object_offset_t      run_offset = 0;
 432         vm_object_offset_t      pg_offset = 0;
 433         vm_page_t       m;
 434         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 435
 436         pages_in_run = 0;
 437 #if TRACEFAULTPAGE
 438         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 439 #endif
 440
 441         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 442                 /*
 443                  * Do not deactivate pages from the kernel object: they
 444                  * are not intended to become pageable.
 445                  * or we've disabled the deactivate behind mechanism
 446                  */
 447                 return FALSE;
 448         }
 449         if ((sequential_run = object->sequential)) {
 450                   if (sequential_run < 0) {
 451                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 452                           sequential_run = 0 - sequential_run;
 453                   } else {
 454                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 455                   }
 456         }
 457         switch (behavior) {
 458         case VM_BEHAVIOR_RANDOM:
 459                 break;
 460         case VM_BEHAVIOR_SEQUENTIAL:
 461                 if (sequential_run >= (int)PAGE_SIZE) {
 462                         run_offset = 0 - PAGE_SIZE_64;
 463                         max_pages_in_run = 1;
 464                 }
 465                 break;
 466         case VM_BEHAVIOR_RSEQNTL:
 467                 if (sequential_run >= (int)PAGE_SIZE) {
 468                         run_offset = PAGE_SIZE_64;
 469                         max_pages_in_run = 1;
 470                 }
 471                 break;
 472         case VM_BEHAVIOR_DEFAULT:
 473         default:
 474         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 475
 476                 /*
 477                  * determine if the run of sequential accesss has been
 478                  * long enough on an object with default access behavior
 479                  * to consider it for deactivation
 480                  */
 481                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 482                         /*
 483                          * the comparisons between offset and behind are done
 484                          * in this kind of odd fashion in order to prevent wrap around
 485                          * at the end points
 486                          */
 487                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 488                                 if (offset >= behind) {
 489                                         run_offset = 0 - behind;
 490                                         pg_offset = PAGE_SIZE_64;
 491                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 492                                 }
 493                         } else {
 494                                 if (offset < -behind) {
 495                                         run_offset = behind;
 496                                         pg_offset = 0 - PAGE_SIZE_64;
 497                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 498                                 }
 499                         }
 500                 }
 501                 break;
 502         }
 503         }
 504         for (n = 0; n < max_pages_in_run; n++) {
 505                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 506
 507                 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
 508                         page_run[pages_in_run++] = m;
 509
 510                         /*
 511                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 512                          *
 513                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 514                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 515                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 516                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 517                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 518                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 519                          * have happened before we did the deactivate_behind.
 520                          */
 521                         pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 522                 }
 523         }
 524         if (pages_in_run) {
 525                 vm_page_lockspin_queues();
 526
 527                 for (n = 0; n < pages_in_run; n++) {
 528
 529                         m = page_run[n];
 530
 531                         vm_page_deactivate_internal(m, FALSE);
 532
 533                         vm_page_deactivate_behind_count++;
 534 #if TRACEFAULTPAGE
 535                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 536 #endif
 537                 }
 538                 vm_page_unlock_queues();
 539
 540                 return TRUE;
 541         }
 542         return FALSE;
 543 }
 544
 545
 546 static int
 547 vm_page_throttled(void)
 548 {
 549         clock_sec_t     elapsed_sec;
 550         clock_sec_t     tv_sec;
 551         clock_usec_t    tv_usec;
 552
 553         thread_t thread = current_thread();
 554
 555         if (thread->options & TH_OPT_VMPRIV)
 556                 return (0);
 557
 558         thread->t_page_creation_count++;
 559
 560         if (NEED_TO_HARD_THROTTLE_THIS_TASK())
 561                 return (HARD_THROTTLE_DELAY);
 562
 563         if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 564             thread->t_page_creation_count > vm_page_creation_throttle) {
 565
 566                 clock_get_system_microtime(&tv_sec, &tv_usec);
 567
 568                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 569
 570                 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
 571
 572                         if (elapsed_sec >= 60) {
 573                                 /*
 574                                  * we'll reset our stats to give a well behaved app
 575                                  * that was unlucky enough to accumulate a bunch of pages
 576                                  * over a long period of time a chance to get out of
 577                                  * the throttled state... we reset the counter and timestamp
 578                                  * so that if it stays under the rate limit for the next second
 579                                  * it will be back in our good graces... if it exceeds it, it
 580                                  * will remain in the throttled state
 581                                  */
 582                                 thread->t_page_creation_time = tv_sec;
 583                                 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
 584                         }
 585                         ++vm_page_throttle_count;
 586
 587                         if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
 588                                 return (HARD_THROTTLE_DELAY);
 589                         else
 590                                 return (SOFT_THROTTLE_DELAY);
 591                 }
 592                 thread->t_page_creation_time = tv_sec;
 593                 thread->t_page_creation_count = 0;
 594         }
 595         return (0);
 596 }
 597
 598
 599 /*
 600  * check for various conditions that would
 601  * prevent us from creating a ZF page...
 602  * cleanup is based on being called from vm_fault_page
 603  *
 604  * object must be locked
 605  * object == m->object
 606  */
 607 static vm_fault_return_t
 608 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
 609 {
 610         int throttle_delay;
 611
 612         if (object->shadow_severed ||
 613             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 614                 /*
 615                  * Either:
 616                  * 1. the shadow chain was severed,
 617                  * 2. the purgeable object is volatile or empty and is marked
 618                  *    to fault on access while volatile.
 619                  * Just have to return an error at this point
 620                  */
 621                 if (m != VM_PAGE_NULL)
 622                         VM_PAGE_FREE(m);
 623                 vm_fault_cleanup(object, first_m);
 624
 625                 thread_interrupt_level(interruptible_state);
 626
 627                 return (VM_FAULT_MEMORY_ERROR);
 628         }
 629         if (vm_backing_store_low) {
 630                 /*
 631                  * are we protecting the system from
 632                  * backing store exhaustion.  If so
 633                  * sleep unless we are privileged.
 634                  */
 635                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 636
 637                         if (m != VM_PAGE_NULL)
 638                                 VM_PAGE_FREE(m);
 639                         vm_fault_cleanup(object, first_m);
 640
 641                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 642
 643                         thread_block(THREAD_CONTINUE_NULL);
 644                         thread_interrupt_level(interruptible_state);
 645
 646                         return (VM_FAULT_RETRY);
 647                 }
 648         }
 649         if ((throttle_delay = vm_page_throttled())) {
 650                 /*
 651                  * we're throttling zero-fills...
 652                  * treat this as if we couldn't grab a page
 653                  */
 654                 if (m != VM_PAGE_NULL)
 655                         VM_PAGE_FREE(m);
 656                 vm_fault_cleanup(object, first_m);
 657
 658                 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 659
 660                 delay(throttle_delay);
 661
 662                 if (current_thread_aborted()) {
 663                         thread_interrupt_level(interruptible_state);
 664                         return VM_FAULT_INTERRUPTED;
 665                 }
 666                 thread_interrupt_level(interruptible_state);
 667
 668                 return (VM_FAULT_MEMORY_SHORTAGE);
 669         }
 670         return (VM_FAULT_SUCCESS);
 671 }
 672
 673
 674 /*
 675  * do the work to zero fill a page and
 676  * inject it into the correct paging queue
 677  *
 678  * m->object must be locked
 679  * page queue lock must NOT be held
 680  */
 681 static int
 682 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 683 {
 684         int my_fault = DBG_ZERO_FILL_FAULT;
 685
 686         /*
 687          * This is is a zero-fill page fault...
 688          *
 689          * Checking the page lock is a waste of
 690          * time;  this page was absent, so
 691          * it can't be page locked by a pager.
 692          *
 693          * we also consider it undefined
 694          * with respect to instruction
 695          * execution.  i.e. it is the responsibility
 696          * of higher layers to call for an instruction
 697          * sync after changing the contents and before
 698          * sending a program into this area.  We
 699          * choose this approach for performance
 700          */
 701         m->pmapped = TRUE;
 702
 703         m->cs_validated = FALSE;
 704         m->cs_tainted = FALSE;
 705
 706         if (no_zero_fill == TRUE) {
 707                 my_fault = DBG_NZF_PAGE_FAULT;
 708         } else {
 709                 vm_page_zero_fill(m);
 710
 711                 VM_STAT_INCR(zero_fill_count);
 712                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 713         }
 714         assert(!m->laundry);
 715         assert(m->object != kernel_object);
 716         //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
 717
 718         if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
 719                 (m->object->purgable == VM_PURGABLE_DENY ||
 720                  m->object->purgable == VM_PURGABLE_NONVOLATILE ||
 721                  m->object->purgable == VM_PURGABLE_VOLATILE )) {
 722
 723                 vm_page_lockspin_queues();
 724
 725                 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
 726                         assert(!VM_PAGE_WIRED(m));
 727
 728                         /*
 729                          * can't be on the pageout queue since we don't
 730                          * have a pager to try and clean to
 731                          */
 732                         assert(!m->pageout_queue);
 733
 734                         VM_PAGE_QUEUES_REMOVE(m);
 735
 736                         queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 737                         m->throttled = TRUE;
 738                         vm_page_throttled_count++;
 739                 }
 740                 vm_page_unlock_queues();
 741         }
 742         return (my_fault);
 743 }
 744
 745
 746 /*
 747  *      Routine:        vm_fault_page
 748  *      Purpose:
 749  *              Find the resident page for the virtual memory
 750  *              specified by the given virtual memory object
 751  *              and offset.
 752  *      Additional arguments:
 753  *              The required permissions for the page are given
 754  *              in "fault_type".  Desired permissions are included
 755  *              in "protection".
 756  *              fault_info is passed along to determine pagein cluster
 757  *              limits... it contains the expected reference pattern,
 758  *              cluster size if available, etc...
 759  *
 760  *              If the desired page is known to be resident (for
 761  *              example, because it was previously wired down), asserting
 762  *              the "must_be_resident" parameter will speed the search.
 763  *
 764  *              If the operation can be interrupted (by thread_abort
 765  *              or thread_terminate), then the "interruptible"
 766  *              field of "fault_info" should be set accordingly.
 767  *
 768  *      Results:
 769  *              The page containing the proper data is returned
 770  *              in "result_page".
 771  *
 772  *      In/out conditions:
 773  *              The source object must be locked and referenced,
 774  *              and must donate one paging reference.  The reference
 775  *              is not affected.  The paging reference and lock are
 776  *              consumed.
 777  *
 778  *              If the call succeeds, the object in which "result_page"
 779  *              resides is left locked and holding a paging reference.
 780  *              If this is not the original object, a busy page in the
 781  *              original object is returned in "top_page", to prevent other
 782  *              callers from pursuing this same data, along with a paging
 783  *              reference for the original object.  The "top_page" should
 784  *              be destroyed when this guarantee is no longer required.
 785  *              The "result_page" is also left busy.  It is not removed
 786  *              from the pageout queues.
 787  *      Special Case:
 788  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 789  *              fault succeeded but there's no VM page (i.e. the VM object
 790  *              does not actually hold VM pages, but device memory or
 791  *              large pages).  The object is still locked and we still hold a
 792  *              paging_in_progress reference.
 793  */
 794 unsigned int vm_fault_page_blocked_access = 0;  /* bumped once each time vm_fault_page() finishes sleeping on an object whose blocked_access flag was set */
 795 unsigned int vm_fault_page_forced_retry = 0;    /* NOTE(review): presumably counts fault retries forced via force_fault_retry; the update site is not visible in this chunk -- confirm */
 796
 797 vm_fault_return_t
 798 vm_fault_page(
 799         /* Arguments: */
 800         vm_object_t     first_object,   /* Object to begin search */
 801         vm_object_offset_t first_offset,        /* Offset into object */
 802         vm_prot_t       fault_type,     /* What access is requested */
 803         boolean_t       must_be_resident,/* Must page be resident? */
 804         boolean_t       caller_lookup,  /* caller looked up page */
 805         /* Modifies in place: */
 806         vm_prot_t       *protection,    /* Protection for mapping */
 807         vm_page_t       *result_page,   /* Page found, if successful */
 808         /* Returns: */
 809         vm_page_t       *top_page,      /* Page in top object, if
 810                                          * not result_page.  */
 811         int             *type_of_fault, /* if non-null, fill in with type of fault
 812                                          * COW, zero-fill, etc... returned in trace point */
 813         /* More arguments: */
 814         kern_return_t   *error_code,    /* code if page is in error */
 815         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 816         boolean_t       data_supply,    /* treat as data_supply if
 817                                          * it is a write fault and a full
 818                                          * page is provided */
 819         vm_object_fault_info_t fault_info)
 820 {
 821         vm_page_t               m;
 822         vm_object_t             object;
 823         vm_object_offset_t      offset;
 824         vm_page_t               first_m;
 825         vm_object_t             next_object;
 826         vm_object_t             copy_object;
 827         boolean_t               look_for_page;
 828         boolean_t               force_fault_retry = FALSE;
 829         vm_prot_t               access_required = fault_type;
 830         vm_prot_t               wants_copy_flag;
 831         CLUSTER_STAT(int pages_at_higher_offsets;)
 832         CLUSTER_STAT(int pages_at_lower_offsets;)
 833         kern_return_t           wait_result;
 834         boolean_t               interruptible_state;
 835         boolean_t               data_already_requested = FALSE;
 836         vm_behavior_t           orig_behavior;
 837         vm_size_t               orig_cluster_size;
 838         vm_fault_return_t       error;
 839         int                     my_fault;
 840         uint32_t                try_failed_count;
 841         int                     interruptible; /* how may fault be interrupted? */
 842         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 843         memory_object_t         pager;
 844         vm_fault_return_t       retval;
 845
 846 /*
 847  * MACH page map - an optional optimization where a bit map is maintained
 848  * by the VM subsystem for internal objects to indicate which pages of
 849  * the object currently reside on backing store.  This existence map
 850  * duplicates information maintained by the vnode pager.  It is
 851  * created at the time of the first pageout against the object, i.e.
 852  * at the same time pager for the object is created.  The optimization
 853  * is designed to eliminate pager interaction overhead, if it is
 854  * 'known' that the page does not exist on backing store.
 855  *
 856  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 857  * either marked as paged out in the existence map for the object or no
 858  * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 859  * criteria in the decision to invoke the pager.   It is also used as one
 860  * of the criteria to terminate the scan for adjacent pages in a clustered
 861  * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 862  * permanent objects.  Note also that if the pager for an internal object
 863  * has not been created, the pager is not invoked regardless of the value
 864  * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 865  * for which a pager has been created.
 866  *
 867  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 868  * is marked as paged out in the existence map for the object.
 869  * PAGED_OUT() is used to determine if a page has already been pushed
 870  * into a copy object in order to avoid a redundant page out operation.
 871  */
 872 #if MACH_PAGEMAP
 873 #define MUST_ASK_PAGER(o, f, s)                                 \
 874         ((vm_external_state_get((o)->existence_map, (f))        \
 875           != VM_EXTERNAL_STATE_ABSENT) &&                       \
 876          (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))))        \
 877          != VM_EXTERNAL_STATE_ABSENT)
 878 #define PAGED_OUT(o, f)                                         \
 879         ((vm_external_state_get((o)->existence_map, (f))        \
 880           == VM_EXTERNAL_STATE_EXISTS) ||                       \
 881          (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))               \
 882           == VM_EXTERNAL_STATE_EXISTS))
 883 #else /* MACH_PAGEMAP */
 884 #define MUST_ASK_PAGER(o, f, s)                                 \
 885         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 886 #define PAGED_OUT(o, f) \
 887         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 888 #endif /* MACH_PAGEMAP */
 889
 890 /*
 891  *      Recovery actions
 892  */
 893 #define RELEASE_PAGE(m)                                 \
 894         MACRO_BEGIN                                     \
 895         PAGE_WAKEUP_DONE(m);                            \
 896         if (!m->active && !m->inactive && !m->throttled) {              \
 897                 vm_page_lockspin_queues();                              \
 898                 if (!m->active && !m->inactive && !m->throttled) {      \
 899                         if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)   \
 900                                 vm_page_deactivate(m);                  \
 901                         else                                            \
 902                                 vm_page_activate(m);                    \
 903                 }                                                       \
 904                 vm_page_unlock_queues();                                \
 905         }                                                               \
 906         MACRO_END
 907
 908 #if TRACEFAULTPAGE
 909         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 910 #endif
 911
 912         interruptible = fault_info->interruptible;
 913         interruptible_state = thread_interrupt_level(interruptible);
 914
 915         /*
 916          *      INVARIANTS (through entire routine):
 917          *
 918          *      1)      At all times, we must either have the object
 919          *              lock or a busy page in some object to prevent
 920          *              some other thread from trying to bring in
 921          *              the same page.
 922          *
 923          *              Note that we cannot hold any locks during the
 924          *              pager access or when waiting for memory, so
 925          *              we use a busy page then.
 926          *
 927          *      2)      To prevent another thread from racing us down the
 928          *              shadow chain and entering a new page in the top
 929          *              object before we do, we must keep a busy page in
 930          *              the top object while following the shadow chain.
 931          *
 932          *      3)      We must increment paging_in_progress on any object
 933          *              for which we have a busy page before dropping
 934          *              the object lock
 935          *
 936          *      4)      We leave busy pages on the pageout queues.
 937          *              If the pageout daemon comes across a busy page,
 938          *              it will remove the page from the pageout queues.
 939          */
 940
 941         object = first_object;
 942         offset = first_offset;
 943         first_m = VM_PAGE_NULL;
 944         access_required = fault_type;
 945
 946
 947         XPR(XPR_VM_FAULT,
 948                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 949                 object, offset, fault_type, *protection, 0);
 950
 951         /*
 952          * default type of fault
 953          */
 954         my_fault = DBG_CACHE_HIT_FAULT;
 955
 956         while (TRUE) {
 957 #if TRACEFAULTPAGE
 958                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 959 #endif
 960                 if (!object->alive) {
 961                         /*
 962                          * object is no longer valid
 963                          * clean up and return error
 964                          */
 965                         vm_fault_cleanup(object, first_m);
 966                         thread_interrupt_level(interruptible_state);
 967
 968                         return (VM_FAULT_MEMORY_ERROR);
 969                 }
 970
 971                 if (!object->pager_created && object->phys_contiguous) {
 972                         /*
 973                          * A physically-contiguous object without a pager:
 974                          * must be a "large page" object.  We do not deal
 975                          * with VM pages for this object.
 976                          */
 977                         caller_lookup = FALSE;
 978                         m = VM_PAGE_NULL;
 979                         goto phys_contig_object;
 980                 }
 981
 982                 if (object->blocked_access) {
 983                         /*
 984                          * Access to this VM object has been blocked.
 985                          * Replace our "paging_in_progress" reference with
 986                          * an "activity_in_progress" reference and wait for
 987                          * access to be unblocked.
 988                          */
 989                         caller_lookup = FALSE; /* no longer valid after sleep */
 990                         vm_object_activity_begin(object);
 991                         vm_object_paging_end(object);
 992                         while (object->blocked_access) {
 993                                 vm_object_sleep(object,
 994                                                 VM_OBJECT_EVENT_UNBLOCKED,
 995                                                 THREAD_UNINT);
 996                         }
 997                         vm_fault_page_blocked_access++;
 998                         vm_object_paging_begin(object);
 999                         vm_object_activity_end(object);
1000                 }
1001
1002                 /*
1003                  * See whether the page at 'offset' is resident
1004                  */
1005                 if (caller_lookup == TRUE) {
1006                         /*
1007                          * The caller has already looked up the page
1008                          * and gave us the result in "result_page".
1009                          * We can use this for the first lookup but
1010                          * it loses its validity as soon as we unlock
1011                          * the object.
1012                          */
1013                         m = *result_page;
1014                         caller_lookup = FALSE; /* no longer valid after that */
1015                 } else {
1016                         m = vm_page_lookup(object, offset);
1017                 }
1018 #if TRACEFAULTPAGE
1019                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1020 #endif
1021                 if (m != VM_PAGE_NULL) {
1022
1023                         if (m->busy) {
1024                                 /*
1025                                  * The page is being brought in,
1026                                  * wait for it and then retry.
1027                                  */
1028 #if TRACEFAULTPAGE
1029                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1030 #endif
1031                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1032
1033                                 XPR(XPR_VM_FAULT,
1034                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1035                                     object, offset,
1036                                     m, 0, 0);
1037                                 counter(c_vm_fault_page_block_busy_kernel++);
1038
1039                                 if (wait_result != THREAD_AWAKENED) {
1040                                         vm_fault_cleanup(object, first_m);
1041                                         thread_interrupt_level(interruptible_state);
1042
1043                                         if (wait_result == THREAD_RESTART)
1044                                                 return (VM_FAULT_RETRY);
1045                                         else
1046                                                 return (VM_FAULT_INTERRUPTED);
1047                                 }
1048                                 continue;
1049                         }
1050                         if (m->laundry) {
1051                                 m->pageout = FALSE;
1052
1053                                 if (!m->cleaning)
1054                                         vm_pageout_steal_laundry(m, FALSE);
1055                         }
1056                         if (m->phys_page == vm_page_guard_addr) {
1057                                 /*
1058                                  * Guard page: off limits !
1059                                  */
1060                                 if (fault_type == VM_PROT_NONE) {
1061                                         /*
1062                                          * The fault is not requesting any
1063                                          * access to the guard page, so it must
1064                                          * be just to wire or unwire it.
1065                                          * Let's pretend it succeeded...
1066                                          */
1067                                         m->busy = TRUE;
1068                                         *result_page = m;
1069                                         assert(first_m == VM_PAGE_NULL);
1070                                         *top_page = first_m;
1071                                         if (type_of_fault)
1072                                                 *type_of_fault = DBG_GUARD_FAULT;
1073                                         thread_interrupt_level(interruptible_state);
1074                                         return VM_FAULT_SUCCESS;
1075                                 } else {
1076                                         /*
1077                                          * The fault requests access to the
1078                                          * guard page: let's deny that !
1079                                          */
1080                                         vm_fault_cleanup(object, first_m);
1081                                         thread_interrupt_level(interruptible_state);
1082                                         return VM_FAULT_MEMORY_ERROR;
1083                                 }
1084                         }
1085
1086                         if (m->error) {
1087                                 /*
1088                                  * The page is in error, give up now.
1089                                  */
1090 #if TRACEFAULTPAGE
1091                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1092 #endif
1093                                 if (error_code)
1094                                         *error_code = KERN_MEMORY_ERROR;
1095                                 VM_PAGE_FREE(m);
1096
1097                                 vm_fault_cleanup(object, first_m);
1098                                 thread_interrupt_level(interruptible_state);
1099
1100                                 return (VM_FAULT_MEMORY_ERROR);
1101                         }
1102                         if (m->restart) {
1103                                 /*
1104                                  * The pager wants us to restart
1105                                  * at the top of the chain,
1106                                  * typically because it has moved the
1107                                  * page to another pager, then do so.
1108                                  */
1109 #if TRACEFAULTPAGE
1110                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1111 #endif
1112                                 VM_PAGE_FREE(m);
1113
1114                                 vm_fault_cleanup(object, first_m);
1115                                 thread_interrupt_level(interruptible_state);
1116
1117                                 return (VM_FAULT_RETRY);
1118                         }
1119                         if (m->absent) {
1120                                 /*
1121                                  * The page isn't busy, but is absent,
1122                                  * therefore it's deemed "unavailable".
1123                                  *
1124                                  * Remove the non-existent page (unless it's
1125                                  * in the top object) and move on down to the
1126                                  * next object (if there is one).
1127                                  */
1128 #if TRACEFAULTPAGE
1129                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1130 #endif
1131                                 next_object = object->shadow;
1132
1133                                 if (next_object == VM_OBJECT_NULL) {
1134                                         /*
1135                                          * Absent page at bottom of shadow
1136                                          * chain; zero fill the page we left
1137                                          * busy in the first object, and free
1138                                          * the absent page.
1139                                          */
1140                                         assert(!must_be_resident);
1141
1142                                         /*
1143                                          * check for any conditions that prevent
1144                                          * us from creating a new zero-fill page;
1145                                          * vm_fault_check will do all of the
1146                                          * fault cleanup in the case of an error condition
1147                                          * including resetting the thread_interrupt_level
1148                                          */
1149                                         error = vm_fault_check(object, m, first_m, interruptible_state);
1150
1151                                         if (error != VM_FAULT_SUCCESS)
1152                                                 return (error);
1153
1154                                         XPR(XPR_VM_FAULT,
1155                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1156                                                 object, offset,
1157                                                 m,
1158                                                 first_object, 0);
1159
1160                                         if (object != first_object) {
1161                                                 /*
1162                                                  * free the absent page we just found
1163                                                  */
1164                                                 VM_PAGE_FREE(m);
1165
1166                                                 /*
1167                                                  * drop reference and lock on current object
1168                                                  */
1169                                                 vm_object_paging_end(object);
1170                                                 vm_object_unlock(object);
1171
1172                                                 /*
1173                                                  * grab the original page we
1174                                                  * 'soldered' in place and
1175                                                  * retake lock on 'first_object'
1176                                                  */
1177                                                 m = first_m;
1178                                                 first_m = VM_PAGE_NULL;
1179
1180                                                 object = first_object;
1181                                                 offset = first_offset;
1182
1183                                                 vm_object_lock(object);
1184                                         } else {
1185                                                 /*
1186                                                  * we're going to use the absent page we just found
1187                                                  * so convert it to a 'busy' page
1188                                                  */
1189                                                 m->absent = FALSE;
1190                                                 m->busy = TRUE;
1191                                         }
1192                                         /*
1193                                          * zero-fill the page and put it on
1194                                          * the correct paging queue
1195                                          */
1196                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1197
1198                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1199                                                 m->absent = TRUE;
1200
1201                                         break;
1202                                 } else {
1203                                         if (must_be_resident)
1204                                                 vm_object_paging_end(object);
1205                                         else if (object != first_object) {
1206                                                 vm_object_paging_end(object);
1207                                                 VM_PAGE_FREE(m);
1208                                         } else {
1209                                                 first_m = m;
1210                                                 m->absent = FALSE;
1211                                                 m->busy = TRUE;
1212
1213                                                 vm_page_lockspin_queues();
1214
1215                                                 assert(!m->pageout_queue);
1216                                                 VM_PAGE_QUEUES_REMOVE(m);
1217
1218                                                 vm_page_unlock_queues();
1219                                         }
1220                                         XPR(XPR_VM_FAULT,
1221                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1222                                                 object, offset,
1223                                                 next_object,
1224                                                 offset+object->vo_shadow_offset,0);
1225
1226                                         offset += object->vo_shadow_offset;
1227                                         fault_info->lo_offset += object->vo_shadow_offset;
1228                                         fault_info->hi_offset += object->vo_shadow_offset;
1229                                         access_required = VM_PROT_READ;
1230
1231                                         vm_object_lock(next_object);
1232                                         vm_object_unlock(object);
1233                                         object = next_object;
1234                                         vm_object_paging_begin(object);
1235
1236                                         /*
1237                                          * reset to default type of fault
1238                                          */
1239                                         my_fault = DBG_CACHE_HIT_FAULT;
1240
1241                                         continue;
1242                                 }
1243                         }
1244                         if ((m->cleaning)
1245                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1246                             && (fault_type & VM_PROT_WRITE)) {
1247                                 /*
1248                                  * This is a copy-on-write fault that will
1249                                  * cause us to revoke access to this page, but
1250                                  * this page is in the process of being cleaned
1251                                  * in a clustered pageout. We must wait until
1252                                  * the cleaning operation completes before
1253                                  * revoking access to the original page,
1254                                  * otherwise we might attempt to remove a
1255                                  * wired mapping.
1256                                  */
1257 #if TRACEFAULTPAGE
1258                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1259 #endif
1260                                 XPR(XPR_VM_FAULT,
1261                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1262                                         object, offset,
1263                                         m, 0, 0);
1264                                 /*
1265                                  * take an extra ref so that object won't die
1266                                  */
1267                                 vm_object_reference_locked(object);
1268
1269                                 vm_fault_cleanup(object, first_m);
1270
1271                                 counter(c_vm_fault_page_block_backoff_kernel++);
1272                                 vm_object_lock(object);
1273                                 assert(object->ref_count > 0);
1274
1275                                 m = vm_page_lookup(object, offset);
1276
1277                                 if (m != VM_PAGE_NULL && m->cleaning) {
1278                                         PAGE_ASSERT_WAIT(m, interruptible);
1279
1280                                         vm_object_unlock(object);
1281                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1282                                         vm_object_deallocate(object);
1283
1284                                         goto backoff;
1285                                 } else {
1286                                         vm_object_unlock(object);
1287
1288                                         vm_object_deallocate(object);
1289                                         thread_interrupt_level(interruptible_state);
1290
1291                                         return (VM_FAULT_RETRY);
1292                                 }
1293                         }
1294                         if (type_of_fault == NULL && m->speculative &&
1295                             !(fault_info != NULL && fault_info->stealth)) {
1296                                 /*
1297                                  * If we were passed a non-NULL pointer for
1298                                  * "type_of_fault", then we came from
1299                                  * vm_fault... we'll let it deal with
1300                                  * this condition, since it
1301                                  * needs to see m->speculative to correctly
1302                                  * account the pageins, otherwise...
1303                                  * take it off the speculative queue, we'll
1304                                  * let the caller of vm_fault_page deal
1305                                  * with getting it onto the correct queue
1306                                  *
1307                                  * If the caller specified in fault_info that
1308                                  * it wants a "stealth" fault, we also leave
1309                                  * the page in the speculative queue.
1310                                  */
1311                                 vm_page_lockspin_queues();
1312                                 if (m->speculative)
1313                                         VM_PAGE_QUEUES_REMOVE(m);
1314                                 vm_page_unlock_queues();
1315                         }
1316
1317                         if (m->encrypted) {
1318                                 /*
1319                                  * ENCRYPTED SWAP:
1320                                  * the user needs access to a page that we
1321                                  * encrypted before paging it out.
1322                                  * Decrypt the page now.
1323                                  * Keep it busy to prevent anyone from
1324                                  * accessing it during the decryption.
1325                                  */
1326                                 m->busy = TRUE;
1327                                 vm_page_decrypt(m, 0);
1328                                 assert(object == m->object);
1329                                 assert(m->busy);
1330                                 PAGE_WAKEUP_DONE(m);
1331
1332                                 /*
1333                                  * Retry from the top, in case
1334                                  * something changed while we were
1335                                  * decrypting.
1336                                  */
1337                                 continue;
1338                         }
1339                         ASSERT_PAGE_DECRYPTED(m);
1340
1341                         if (m->object->code_signed) {
1342                                 /*
1343                                  * CODE SIGNING:
1344                                  * We just paged in a page from a signed
1345                                  * memory object but we don't need to
1346                                  * validate it now.  We'll validate it if
1347                                  * and when it gets mapped into a user address
1348                                  * space for the first time or when the page
1349                                  * gets copied to another object as a result
1350                                  * of a copy-on-write.
1351                                  */
1352                         }
1353
1354                         /*
1355                          * We mark the page busy and leave it on
1356                          * the pageout queues.  If the pageout
1357                          * daemon comes across it, then it will
1358                          * remove the page from the queue, but not the object
1359                          */
1360 #if TRACEFAULTPAGE
1361                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1362 #endif
1363                         XPR(XPR_VM_FAULT,
1364                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1365                                 object, offset, m, 0, 0);
1366                         assert(!m->busy);
1367                         assert(!m->absent);
1368
1369                         m->busy = TRUE;
1370                         break;
1371                 }
1372
1373
1374                 /*
1375                  * we get here when there is no page present in the object at
1376                  * the offset we're interested in... we'll allocate a page
1377                  * at this point if the pager associated with
1378                  * this object can provide the data or we're the top object...
1379                  * object is locked;  m == NULL
1380                  */
1381                 if (must_be_resident) {
1382                         if (fault_type == VM_PROT_NONE &&
1383                             object == kernel_object) {
1384                                 /*
1385                                  * We've been called from vm_fault_unwire()
1386                                  * while removing a map entry that was allocated
1387                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1388                                  * is not present and there's nothing more to
1389                                  * do here (nothing to unwire).
1390                                  */
1391                                 vm_fault_cleanup(object, first_m);
1392                                 thread_interrupt_level(interruptible_state);
1393
1394                                 return VM_FAULT_MEMORY_ERROR;
1395                         }
1396
1397                         goto dont_look_for_page;
1398                 }
1399
1400 #if !MACH_PAGEMAP
1401                 data_supply = FALSE;
1402 #endif /* !MACH_PAGEMAP */
1403
1404                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1405
1406 #if TRACEFAULTPAGE
1407                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1408 #endif
1409                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1410                         /*
1411                          * Allocate a new page for this object/offset pair as a placeholder
1412                          */
1413                         m = vm_page_grab();
1414 #if TRACEFAULTPAGE
1415                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1416 #endif
1417                         if (m == VM_PAGE_NULL) {
1418
1419                                 vm_fault_cleanup(object, first_m);
1420                                 thread_interrupt_level(interruptible_state);
1421
1422                                 return (VM_FAULT_MEMORY_SHORTAGE);
1423                         }
1424
1425                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1426                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1427                         } else {
1428                                 vm_page_insert(m, object, offset);
1429                         }
1430                 }
1431                 if (look_for_page) {
1432                         kern_return_t   rc;
1433                         int             my_fault_type;
1434
1435                         /*
1436                          *      If the memory manager is not ready, we
1437                          *      cannot make requests.
1438                          */
1439                         if (!object->pager_ready) {
1440 #if TRACEFAULTPAGE
1441                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1442 #endif
1443                                 if (m != VM_PAGE_NULL)
1444                                         VM_PAGE_FREE(m);
1445
1446                                 XPR(XPR_VM_FAULT,
1447                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1448                                         object, offset, 0, 0, 0);
1449
1450                                 /*
1451                                  * take an extra ref so object won't die
1452                                  */
1453                                 vm_object_reference_locked(object);
1454                                 vm_fault_cleanup(object, first_m);
1455                                 counter(c_vm_fault_page_block_backoff_kernel++);
1456
1457                                 vm_object_lock(object);
1458                                 assert(object->ref_count > 0);
1459
1460                                 if (!object->pager_ready) {
1461                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1462
1463                                         vm_object_unlock(object);
1464                                         if (wait_result == THREAD_WAITING)
1465                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1466                                         vm_object_deallocate(object);
1467
1468                                         goto backoff;
1469                                 } else {
1470                                         vm_object_unlock(object);
1471                                         vm_object_deallocate(object);
1472                                         thread_interrupt_level(interruptible_state);
1473
1474                                         return (VM_FAULT_RETRY);
1475                                 }
1476                         }
1477                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1478                                 /*
1479                                  * If there are too many outstanding page
1480                                  * requests pending on this external object, we
1481                                  * wait for them to be resolved now.
1482                                  */
1483 #if TRACEFAULTPAGE
1484                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1485 #endif
1486                                 if (m != VM_PAGE_NULL)
1487                                         VM_PAGE_FREE(m);
1488                                 /*
1489                                  * take an extra ref so object won't die
1490                                  */
1491                                 vm_object_reference_locked(object);
1492
1493                                 vm_fault_cleanup(object, first_m);
1494
1495                                 counter(c_vm_fault_page_block_backoff_kernel++);
1496
1497                                 vm_object_lock(object);
1498                                 assert(object->ref_count > 0);
1499
1500                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1501                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1502
1503                                         vm_object_unlock(object);
1504                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1505                                         vm_object_deallocate(object);
1506
1507                                         goto backoff;
1508                                 } else {
1509                                         vm_object_unlock(object);
1510                                         vm_object_deallocate(object);
1511                                         thread_interrupt_level(interruptible_state);
1512
1513                                         return (VM_FAULT_RETRY);
1514                                 }
1515                         }
1516                         if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && object->internal) {
1517
1518                                 if (m == VM_PAGE_NULL) {
1519                                         /*
1520                                          * Allocate a new page for this object/offset pair as a placeholder
1521                                          */
1522                                         m = vm_page_grab();
1523 #if TRACEFAULTPAGE
1524                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1525 #endif
1526                                         if (m == VM_PAGE_NULL) {
1527
1528                                                 vm_fault_cleanup(object, first_m);
1529                                                 thread_interrupt_level(interruptible_state);
1530
1531                                                 return (VM_FAULT_MEMORY_SHORTAGE);
1532                                         }
1533
1534                                         m->absent = TRUE;
1535                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1536                                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1537                                         } else {
1538                                                 vm_page_insert(m, object, offset);
1539                                         }
1540                                 }
1541                                 assert(m->busy);
1542
1543                                 m->absent = TRUE;
1544                                 pager = object->pager;
1545
1546                                 vm_object_unlock(object);
1547
1548                                 rc = vm_compressor_pager_get(pager, offset + object->paging_offset, m->phys_page, &my_fault_type, 0);
1549
1550                                 vm_object_lock(object);
1551
1552                                 switch (rc) {
1553                                 case KERN_SUCCESS:
1554                                         m->absent = FALSE;
1555                                         m->dirty = TRUE;
1556                                         if ((m->object->wimg_bits &
1557                                              VM_WIMG_MASK) !=
1558                                             VM_WIMG_USE_DEFAULT) {
1559                                                 /*
1560                                                  * If the page is not cacheable,
1561                                                  * we can't let its contents
1562                                                  * linger in the data cache
1563                                                  * after the decompression.
1564                                                  */
1565                                                 pmap_sync_page_attributes_phys(
1566                                                         m->phys_page);
1567                                         } else
1568                                                 m->written_by_kernel = TRUE;
1569                                         break;
1570                                 case KERN_MEMORY_FAILURE:
1571                                         m->unusual = TRUE;
1572                                         m->error = TRUE;
1573                                         m->absent = FALSE;
1574                                         break;
1575                                 case KERN_MEMORY_ERROR:
1576                                         assert(m->absent);
1577                                         break;
1578                                 default:
1579                                         panic("?");
1580                                 }
1581                                 PAGE_WAKEUP_DONE(m);
1582
1583                                 rc = KERN_SUCCESS;
1584                                 goto data_requested;
1585                         }
1586                         my_fault_type = DBG_PAGEIN_FAULT;
1587
1588                         if (m != VM_PAGE_NULL) {
1589                                 VM_PAGE_FREE(m);
1590                                 m = VM_PAGE_NULL;
1591                         }
1592
1593 #if TRACEFAULTPAGE
1594                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1595 #endif
1596
1597                         /*
1598                          * It's possible someone called vm_object_destroy while we weren't
1599                          * holding the object lock.  If that has happened, then bail out
1600                          * here.
1601                          */
1602
1603                         pager = object->pager;
1604
1605                         if (pager == MEMORY_OBJECT_NULL) {
1606                                 vm_fault_cleanup(object, first_m);
1607                                 thread_interrupt_level(interruptible_state);
1608                                 return VM_FAULT_MEMORY_ERROR;
1609                         }
1610
1611                         /*
1612                          * We have an absent page in place for the faulting offset,
1613                          * so we can release the object lock.
1614                          */
1615
1616                         vm_object_unlock(object);
1617
1618                         /*
1619                          * If this object uses a copy_call strategy,
1620                          * and we are interested in a copy of this object
1621                          * (having gotten here only by following a
1622                          * shadow chain), then tell the memory manager
1623                          * via a flag added to the desired_access
1624                          * parameter, so that it can detect a race
1625                          * between our walking down the shadow chain
1626                          * and its pushing pages up into a copy of
1627                          * the object that it manages.
1628                          */
1629                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1630                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1631                         else
1632                                 wants_copy_flag = VM_PROT_NONE;
1633
1634                         XPR(XPR_VM_FAULT,
1635                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1636                                 object, offset, m,
1637                                 access_required | wants_copy_flag, 0);
1638
1639                         if (object->copy == first_object) {
1640                                 /*
1641                                  * if we issue the memory_object_data_request in
1642                                  * this state, we are subject to a deadlock with
1643                                  * the underlying filesystem if it is trying to
1644                                  * shrink the file resulting in a push of pages
1645                                  * into the copy object...  that push will stall
1646                                  * on the placeholder page, and if the pushing thread
1647                                  * is holding a lock that is required on the pagein
1648                                  * path (such as a truncate lock), we'll deadlock...
1649                                  * to avoid this potential deadlock, we throw away
1650                                  * our placeholder page before calling memory_object_data_request
1651                                  * and force this thread to retry the vm_fault_page after
1652                                  * we have issued the I/O.  the second time through this path
1653                                  * we will find the page already in the cache (presumably still
1654                                  * busy waiting for the I/O to complete) and then complete
1655                                  * the fault w/o having to go through memory_object_data_request again
1656                                  */
1657                                 assert(first_m != VM_PAGE_NULL);
1658                                 assert(first_m->object == first_object);
1659
1660                                 vm_object_lock(first_object);
1661                                 VM_PAGE_FREE(first_m);
1662                                 vm_object_paging_end(first_object);
1663                                 vm_object_unlock(first_object);
1664
1665                                 first_m = VM_PAGE_NULL;
1666                                 force_fault_retry = TRUE;
1667
1668                                 vm_fault_page_forced_retry++;
1669                         }
1670
1671                         if (data_already_requested == TRUE) {
1672                                 orig_behavior = fault_info->behavior;
1673                                 orig_cluster_size = fault_info->cluster_size;
1674
1675                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1676                                 fault_info->cluster_size = PAGE_SIZE;
1677                         }
1678                         /*
1679                          * Call the memory manager to retrieve the data.
1680                          */
1681                         rc = memory_object_data_request(
1682                                 pager,
1683                                 offset + object->paging_offset,
1684                                 PAGE_SIZE,
1685                                 access_required | wants_copy_flag,
1686                                 (memory_object_fault_info_t)fault_info);
1687
1688                         if (data_already_requested == TRUE) {
1689                                 fault_info->behavior = orig_behavior;
1690                                 fault_info->cluster_size = orig_cluster_size;
1691                         } else
1692                                 data_already_requested = TRUE;
1693
1694 #if TRACEFAULTPAGE
1695                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1696 #endif
1697                         vm_object_lock(object);
1698
1699                 data_requested:
1700                         if (rc != KERN_SUCCESS) {
1701
1702                                 vm_fault_cleanup(object, first_m);
1703                                 thread_interrupt_level(interruptible_state);
1704
1705                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1706                                         VM_FAULT_INTERRUPTED :
1707                                         VM_FAULT_MEMORY_ERROR);
1708                         } else {
1709                                 clock_sec_t     tv_sec;
1710                                 clock_usec_t    tv_usec;
1711
1712                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1713                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1714                                         current_thread()->t_page_creation_time = tv_sec;
1715                                         current_thread()->t_page_creation_count = 0;
1716                                 }
1717                         }
1718                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1719
1720                                 vm_fault_cleanup(object, first_m);
1721                                 thread_interrupt_level(interruptible_state);
1722
1723                                 return (VM_FAULT_INTERRUPTED);
1724                         }
1725                         if (force_fault_retry == TRUE) {
1726
1727                                 vm_fault_cleanup(object, first_m);
1728                                 thread_interrupt_level(interruptible_state);
1729
1730                                 return (VM_FAULT_RETRY);
1731                         }
1732                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1733                                 /*
1734                                  * No page here means that the object we
1735                                  * initially looked up was "physically
1736                                  * contiguous" (i.e. device memory).  However,
1737                                  * with Virtual VRAM, the object might not
1738                                  * be backed by that device memory anymore,
1739                                  * so we're done here only if the object is
1740                                  * still "phys_contiguous".
1741                                  * Otherwise, if the object is no longer
1742                                  * "phys_contiguous", we need to retry the
1743                                  * page fault against the object's new backing
1744                                  * store (different memory object).
1745                                  */
1746                         phys_contig_object:
1747                                 goto done;
1748                         }
1749                         /*
1750                          * potentially a pagein fault
1751                          * if we make it through the state checks
1752                          * above, then we'll count it as such
1753                          */
1754                         my_fault = my_fault_type;
1755
1756                         /*
1757                          * Retry with same object/offset, since new data may
1758                          * be in a different page (i.e., m is meaningless at
1759                          * this point).
1760                          */
1761                         continue;
1762                 }
1763 dont_look_for_page:
1764                 /*
1765                  * We get here if the object has no pager, or an existence map
1766                  * exists and indicates the page isn't present on the pager
1767                  * or we're unwiring a page.  If a pager exists, but there
1768                  * is no existence map, then the m->absent case above handles
1769                  * the ZF case when the pager can't provide the page
1770                  */
1771 #if TRACEFAULTPAGE
1772                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1773 #endif
1774                 if (object == first_object)
1775                         first_m = m;
1776                 else
1777                         assert(m == VM_PAGE_NULL);
1778
1779                 XPR(XPR_VM_FAULT,
1780                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1781                         object, offset, m,
1782                         object->shadow, 0);
1783
1784                 next_object = object->shadow;
1785
1786                 if (next_object == VM_OBJECT_NULL) {
1787                         /*
1788                          * we've hit the bottom of the shadow chain,
1789                          * fill the page in the top object with zeros.
1790                          */
1791                         assert(!must_be_resident);
1792
1793                         if (object != first_object) {
1794                                 vm_object_paging_end(object);
1795                                 vm_object_unlock(object);
1796
1797                                 object = first_object;
1798                                 offset = first_offset;
1799                                 vm_object_lock(object);
1800                         }
1801                         m = first_m;
1802                         assert(m->object == object);
1803                         first_m = VM_PAGE_NULL;
1804
1805                         /*
1806                          * check for any conditions that prevent
1807                          * us from creating a new zero-fill page
1808                          * vm_fault_check will do all of the
1809                          * fault cleanup in the case of an error condition
1810                          * including resetting the thread_interrupt_level
1811                          */
1812                         error = vm_fault_check(object, m, first_m, interruptible_state);
1813
1814                         if (error != VM_FAULT_SUCCESS)
1815                                 return (error);
1816
1817                         if (m == VM_PAGE_NULL) {
1818                                 m = vm_page_grab();
1819
1820                                 if (m == VM_PAGE_NULL) {
1821                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1822                                         thread_interrupt_level(interruptible_state);
1823
1824                                         return (VM_FAULT_MEMORY_SHORTAGE);
1825                                 }
1826                                 vm_page_insert(m, object, offset);
1827                         }
1828                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1829
1830                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1831                                 m->absent = TRUE;
1832                         break;
1833
1834                 } else {
1835                         /*
1836                          * Move on to the next object.  Lock the next
1837                          * object before unlocking the current one.
1838                          */
1839                         if ((object != first_object) || must_be_resident)
1840                                 vm_object_paging_end(object);
1841
1842                         offset += object->vo_shadow_offset;
1843                         fault_info->lo_offset += object->vo_shadow_offset;
1844                         fault_info->hi_offset += object->vo_shadow_offset;
1845                         access_required = VM_PROT_READ;
1846
1847                         vm_object_lock(next_object);
1848                         vm_object_unlock(object);
1849
1850                         object = next_object;
1851                         vm_object_paging_begin(object);
1852                 }
1853         }
1854
1855         /*
1856          *      PAGE HAS BEEN FOUND.
1857          *
1858          *      This page (m) is:
1859          *              busy, so that we can play with it;
1860          *              not absent, so that nobody else will fill it;
1861          *              possibly eligible for pageout;
1862          *
1863          *      The top-level page (first_m) is:
1864          *              VM_PAGE_NULL if the page was found in the
1865          *               top-level object;
1866          *              busy, not absent, and ineligible for pageout.
1867          *
1868          *      The current object (object) is locked.  A paging
1869          *      reference is held for the current and top-level
1870          *      objects.
1871          */
1872
1873 #if TRACEFAULTPAGE
1874         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1875 #endif
1876 #if     EXTRA_ASSERTIONS
1877         assert(m->busy && !m->absent);
1878         assert((first_m == VM_PAGE_NULL) ||
1879                (first_m->busy && !first_m->absent &&
1880                 !first_m->active && !first_m->inactive));
1881 #endif  /* EXTRA_ASSERTIONS */
1882
1883         /*
1884          * ENCRYPTED SWAP:
1885          * If we found a page, we must have decrypted it before we
1886          * get here...
1887          */
1888         ASSERT_PAGE_DECRYPTED(m);
1889
1890         XPR(XPR_VM_FAULT,
1891             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1892                 object, offset, m,
1893                 first_object, first_m);
1894
1895         /*
1896          * If the page is being written, but isn't
1897          * already owned by the top-level object,
1898          * we have to copy it into a new page owned
1899          * by the top-level object.
1900          */
1901         if (object != first_object) {
1902
1903 #if TRACEFAULTPAGE
1904                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1905 #endif
1906                 if (fault_type & VM_PROT_WRITE) {
1907                         vm_page_t copy_m;
1908
1909                         /*
1910                          * We only really need to copy if we
1911                          * want to write it.
1912                          */
1913                         assert(!must_be_resident);
1914
1915                         /*
1916                          * are we protecting the system from
1917                          * backing store exhaustion.  If so
1918                          * sleep unless we are privileged.
1919                          */
1920                         if (vm_backing_store_low) {
1921                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1922
1923                                         RELEASE_PAGE(m);
1924                                         vm_fault_cleanup(object, first_m);
1925
1926                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1927
1928                                         thread_block(THREAD_CONTINUE_NULL);
1929                                         thread_interrupt_level(interruptible_state);
1930
1931                                         return (VM_FAULT_RETRY);
1932                                 }
1933                         }
1934                         /*
1935                          * If we try to collapse first_object at this
1936                          * point, we may deadlock when we try to get
1937                          * the lock on an intermediate object (since we
1938                          * have the bottom object locked).  We can't
1939                          * unlock the bottom object, because the page
1940                          * we found may move (by collapse) if we do.
1941                          *
1942                          * Instead, we first copy the page.  Then, when
1943                          * we have no more use for the bottom object,
1944                          * we unlock it and try to collapse.
1945                          *
1946                          * Note that we copy the page even if we didn't
1947                          * need to... that's the breaks.
1948                          */
1949
1950                         /*
1951                          * Allocate a page for the copy
1952                          */
1953                         copy_m = vm_page_grab();
1954
1955                         if (copy_m == VM_PAGE_NULL) {
1956                                 RELEASE_PAGE(m);
1957
1958                                 vm_fault_cleanup(object, first_m);
1959                                 thread_interrupt_level(interruptible_state);
1960
1961                                 return (VM_FAULT_MEMORY_SHORTAGE);
1962                         }
1963                         XPR(XPR_VM_FAULT,
1964                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
1965                                 object, offset,
1966                                 m, copy_m, 0);
1967
1968                         vm_page_copy(m, copy_m);
1969
1970                         /*
1971                          * If another map is truly sharing this
1972                          * page with us, we have to flush all
1973                          * uses of the original page, since we
1974                          * can't distinguish those which want the
1975                          * original from those which need the
1976                          * new copy.
1977                          *
1978                          * XXXO If we know that only one map has
1979                          * access to this page, then we could
1980                          * avoid the pmap_disconnect() call.
1981                          */
1982                         if (m->pmapped)
1983                                 pmap_disconnect(m->phys_page);
1984
1985                         assert(!m->cleaning);
1986
1987                         /*
1988                          * We no longer need the old page or object.
1989                          */
1990                         RELEASE_PAGE(m);
1991
1992                         vm_object_paging_end(object);
1993                         vm_object_unlock(object);
1994
1995                         my_fault = DBG_COW_FAULT;
1996                         VM_STAT_INCR(cow_faults);
1997                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
1998                         current_task()->cow_faults++;
1999
2000                         object = first_object;
2001                         offset = first_offset;
2002
2003                         vm_object_lock(object);
2004                         /*
2005                          * get rid of the place holder
2006                          * page that we soldered in earlier
2007                          */
2008                         VM_PAGE_FREE(first_m);
2009                         first_m = VM_PAGE_NULL;
2010
2011                         /*
2012                          * and replace it with the
2013                          * page we just copied into
2014                          */
2015                         assert(copy_m->busy);
2016                         vm_page_insert(copy_m, object, offset);
2017                         SET_PAGE_DIRTY(copy_m, TRUE);
2018
2019                         m = copy_m;
2020                         /*
2021                          * Now that we've gotten the copy out of the
2022                          * way, let's try to collapse the top object.
2023                          * But we have to play ugly games with
2024                          * paging_in_progress to do that...
2025                          */
2026                         vm_object_paging_end(object);
2027                         vm_object_collapse(object, offset, TRUE);
2028                         vm_object_paging_begin(object);
2029
2030                 } else
2031                         *protection &= (~VM_PROT_WRITE);
2032         }
2033         /*
2034          * Now check whether the page needs to be pushed into the
2035          * copy object.  The use of asymmetric copy on write for
2036          * shared temporary objects means that we may do two copies to
2037          * satisfy the fault; one above to get the page from a
2038          * shadowed object, and one here to push it into the copy.
2039          */
2040         try_failed_count = 0;
2041
2042         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2043                 vm_object_offset_t      copy_offset;
2044                 vm_page_t               copy_m;
2045
2046 #if TRACEFAULTPAGE
2047                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2048 #endif
2049                 /*
2050                  * If the page is being written, but hasn't been
2051                  * copied to the copy-object, we have to copy it there.
2052                  */
2053                 if ((fault_type & VM_PROT_WRITE) == 0) {
2054                         *protection &= ~VM_PROT_WRITE;
2055                         break;
2056                 }
2057
2058                 /*
2059                  * If the page was guaranteed to be resident,
2060                  * we must have already performed the copy.
2061                  */
2062                 if (must_be_resident)
2063                         break;
2064
2065                 /*
2066                  * Try to get the lock on the copy_object.
2067                  */
2068                 if (!vm_object_lock_try(copy_object)) {
2069
2070                         vm_object_unlock(object);
2071                         try_failed_count++;
2072
2073                         mutex_pause(try_failed_count);  /* wait a bit */
2074                         vm_object_lock(object);
2075
2076                         continue;
2077                 }
2078                 try_failed_count = 0;
2079
2080                 /*
2081                  * Make another reference to the copy-object,
2082                  * to keep it from disappearing during the
2083                  * copy.
2084                  */
2085                 vm_object_reference_locked(copy_object);
2086
2087                 /*
2088                  * Does the page exist in the copy?
2089                  */
2090                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2091
2092                 if (copy_object->vo_size <= copy_offset)
2093                         /*
2094                          * Copy object doesn't cover this page -- do nothing.
2095                          */
2096                         ;
2097                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2098                         /*
2099                          * Page currently exists in the copy object
2100                          */
2101                         if (copy_m->busy) {
2102                                 /*
2103                                  * If the page is being brought
2104                                  * in, wait for it and then retry.
2105                                  */
2106                                 RELEASE_PAGE(m);
2107
2108                                 /*
2109                                  * take an extra ref so object won't die
2110                                  */
2111                                 vm_object_reference_locked(copy_object);
2112                                 vm_object_unlock(copy_object);
2113                                 vm_fault_cleanup(object, first_m);
2114                                 counter(c_vm_fault_page_block_backoff_kernel++);
2115
2116                                 vm_object_lock(copy_object);
2117                                 assert(copy_object->ref_count > 0);
2118                                 VM_OBJ_RES_DECR(copy_object);
2119                                 vm_object_lock_assert_exclusive(copy_object);
2120                                 copy_object->ref_count--;
2121                                 assert(copy_object->ref_count > 0);
2122                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2123                                 /*
2124                                  * ENCRYPTED SWAP:
2125                                  * it's OK if the "copy_m" page is encrypted,
2126                                  * because we're not moving it nor handling its
2127                                  * contents.
2128                                  */
2129                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2130                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2131
2132                                         vm_object_unlock(copy_object);
2133                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2134                                         vm_object_deallocate(copy_object);
2135
2136                                         goto backoff;
2137                                 } else {
2138                                         vm_object_unlock(copy_object);
2139                                         vm_object_deallocate(copy_object);
2140                                         thread_interrupt_level(interruptible_state);
2141
2142                                         return (VM_FAULT_RETRY);
2143                                 }
2144                         }
2145                 }
2146                 else if (!PAGED_OUT(copy_object, copy_offset)) {
2147                         /*
2148                          * If PAGED_OUT is TRUE, then the page used to exist
2149                          * in the copy-object, and has already been paged out.
2150                          * We don't need to repeat this. If PAGED_OUT is
2151                          * FALSE, then either we don't know (!pager_created,
2152                          * for example) or it hasn't been paged out.
2153                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2154                          * We must copy the page to the copy object.
2155                          */
2156
2157                         if (vm_backing_store_low) {
2158                                 /*
2159                                  * we are protecting the system from
2160                                  * backing store exhaustion.  If so
2161                                  * sleep unless we are privileged.
2162                                  */
2163                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2164                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2165
2166                                         RELEASE_PAGE(m);
2167                                         VM_OBJ_RES_DECR(copy_object);
2168                                         vm_object_lock_assert_exclusive(copy_object);
2169                                         copy_object->ref_count--;
2170                                         assert(copy_object->ref_count > 0);
2171
2172                                         vm_object_unlock(copy_object);
2173                                         vm_fault_cleanup(object, first_m);
2174                                         thread_block(THREAD_CONTINUE_NULL);
2175                                         thread_interrupt_level(interruptible_state);
2176
2177                                         return (VM_FAULT_RETRY);
2178                                 }
2179                         }
2180                         /*
2181                          * Allocate a page for the copy
2182                          */
2183                         copy_m = vm_page_alloc(copy_object, copy_offset);
2184
2185                         if (copy_m == VM_PAGE_NULL) {
2186                                 RELEASE_PAGE(m);
2187
2188                                 VM_OBJ_RES_DECR(copy_object);
2189                                 vm_object_lock_assert_exclusive(copy_object);
2190                                 copy_object->ref_count--;
2191                                 assert(copy_object->ref_count > 0);
2192
2193                                 vm_object_unlock(copy_object);
2194                                 vm_fault_cleanup(object, first_m);
2195                                 thread_interrupt_level(interruptible_state);
2196
2197                                 return (VM_FAULT_MEMORY_SHORTAGE);
2198                         }
2199                         /*
2200                          * Must copy page into copy-object.
2201                          */
2202                         vm_page_copy(m, copy_m);
2203
2204                         /*
2205                          * If the old page was in use by any users
2206                          * of the copy-object, it must be removed
2207                          * from all pmaps.  (We can't know which
2208                          * pmaps use it.)
2209                          */
2210                         if (m->pmapped)
2211                                 pmap_disconnect(m->phys_page);
2212
2213                         /*
2214                          * If there's a pager, then immediately
2215                          * page out this page, using the "initialize"
2216                          * option.  Else, we use the copy.
2217                          */
2218                         if ((!copy_object->pager_created)
2219 #if MACH_PAGEMAP
2220                             || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2221 #endif
2222                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2223                             ) {
2224
2225                                 vm_page_lockspin_queues();
2226                                 assert(!m->cleaning);
2227                                 vm_page_activate(copy_m);
2228                                 vm_page_unlock_queues();
2229
2230                                 SET_PAGE_DIRTY(copy_m, TRUE);
2231                                 PAGE_WAKEUP_DONE(copy_m);
2232
2233                         } else if (copy_object->internal &&
2234                                    (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) {
2235                                 /*
2236                                  * For internal objects check with the pager to see
2237                                  * if the page already exists in the backing store.
2238                                  * If yes, then we can drop the copy page. If not,
2239                                  * then we'll activate it, mark it dirty and keep it
2240                                  * around.
2241                                  */
2242
2243                                 kern_return_t kr = KERN_SUCCESS;
2244
2245                                 memory_object_t copy_pager = copy_object->pager;
2246                                 assert(copy_pager != MEMORY_OBJECT_NULL);
2247                                 vm_object_paging_begin(copy_object);
2248
2249                                 vm_object_unlock(copy_object);
2250
2251                                 kr = memory_object_data_request(
2252                                         copy_pager,
2253                                         copy_offset + copy_object->paging_offset,
2254                                         0, /* Only query the pager. */
2255                                         VM_PROT_READ,
2256                                         NULL);
2257
2258                                 vm_object_lock(copy_object);
2259
2260                                 vm_object_paging_end(copy_object);
2261
2262                                 /*
2263                                  * Since we dropped the copy_object's lock,
2264                                  * check whether we'll have to deallocate
2265                                  * the hard way.
2266                                  */
2267                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2268                                         vm_object_unlock(copy_object);
2269                                         vm_object_deallocate(copy_object);
2270                                         vm_object_lock(object);
2271
2272                                         continue;
2273                                 }
2274                                 if (kr == KERN_SUCCESS) {
2275                                         /*
2276                                          * The pager has the page. We don't want to overwrite
2277                                          * that page by sending this one out to the backing store.
2278                                          * So we drop the copy page.
2279                                          */
2280                                         VM_PAGE_FREE(copy_m);
2281
2282                                 } else {
2283                                         /*
2284                                          * The pager doesn't have the page. We'll keep this one
2285                                          * around in the copy object. It might get sent out to
2286                                          * the backing store under memory pressure.
2287                                          */
2288                                         vm_page_lockspin_queues();
2289                                         assert(!m->cleaning);
2290                                         vm_page_activate(copy_m);
2291                                         vm_page_unlock_queues();
2292
2293                                         SET_PAGE_DIRTY(copy_m, TRUE);
2294                                         PAGE_WAKEUP_DONE(copy_m);
2295                                 }
2296                         } else {
2297
2298                                 assert(copy_m->busy == TRUE);
2299                                 assert(!m->cleaning);
2300
2301                                 /*
2302                                  * dirty is protected by the object lock
2303                                  */
2304                                 SET_PAGE_DIRTY(copy_m, TRUE);
2305
2306                                 /*
2307                                  * The page is already ready for pageout:
2308                                  * not on pageout queues and busy.
2309                                  * Unlock everything except the
2310                                  * copy_object itself.
2311                                  */
2312                                 vm_object_unlock(object);
2313
2314                                 /*
2315                                  * Write the page to the copy-object,
2316                                  * flushing it from the kernel.
2317                                  */
2318                                 vm_pageout_initialize_page(copy_m);
2319
2320                                 /*
2321                                  * Since the pageout may have
2322                                  * temporarily dropped the
2323                                  * copy_object's lock, we
2324                                  * check whether we'll have
2325                                  * to deallocate the hard way.
2326                                  */
2327                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2328                                         vm_object_unlock(copy_object);
2329                                         vm_object_deallocate(copy_object);
2330                                         vm_object_lock(object);
2331
2332                                         continue;
2333                                 }
2334                                 /*
2335                                  * Pick back up the old object's
2336                                  * lock.  [It is safe to do so,
2337                                  * since it must be deeper in the
2338                                  * object tree.]
2339                                  */
2340                                 vm_object_lock(object);
2341                         }
2342
2343                         /*
2344                          * Because we're pushing a page upward
2345                          * in the object tree, we must restart
2346                          * any faults that are waiting here.
2347                          * [Note that this is an expansion of
2348                          * PAGE_WAKEUP that uses the THREAD_RESTART
2349                          * wait result].  Can't turn off the page's
2350                          * busy bit because we're not done with it.
2351                          */
2352                         if (m->wanted) {
2353                                 m->wanted = FALSE;
2354                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2355                         }
2356                 }
2357                 /*
2358                  * The reference count on copy_object must be
2359                  * at least 2: one for our extra reference,
2360                  * and at least one from the outside world
2361                  * (we checked that when we last locked
2362                  * copy_object).
2363                  */
2364                 vm_object_lock_assert_exclusive(copy_object);
2365                 copy_object->ref_count--;
2366                 assert(copy_object->ref_count > 0);
2367
2368                 VM_OBJ_RES_DECR(copy_object);
2369                 vm_object_unlock(copy_object);
2370
2371                 break;
2372         }
2373
2374 done:
2375         *result_page = m;
2376         *top_page = first_m;
2377
2378         XPR(XPR_VM_FAULT,
2379                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2380                 object, offset, m, first_m, 0);
2381
2382         if (m != VM_PAGE_NULL) {
2383                 retval = VM_FAULT_SUCCESS;
2384                 if (my_fault == DBG_PAGEIN_FAULT) {
2385
2386                         if (!m->object->internal || (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE))
2387                                 VM_STAT_INCR(pageins);
2388                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2389                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
2390                         current_task()->pageins++;
2391
2392                         if (m->object->internal) {
2393                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2394                                 my_fault = DBG_PAGEIND_FAULT;
2395                         } else {
2396                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2397                                 my_fault = DBG_PAGEINV_FAULT;
2398                         }
2399
2400                         /*
2401                          * evaluate access pattern and update state
2402                          * vm_fault_deactivate_behind depends on the
2403                          * state being up to date
2404                          */
2405                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2406
2407                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2408                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2409
2410                         VM_STAT_INCR(decompressions);
2411                 }
2412                 if (type_of_fault)
2413                         *type_of_fault = my_fault;
2414         } else {
2415                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2416                 assert(first_m == VM_PAGE_NULL);
2417                 assert(object == first_object);
2418         }
2419
2420         thread_interrupt_level(interruptible_state);
2421
2422 #if TRACEFAULTPAGE
2423         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2424 #endif
2425         return retval;
2426
2427 backoff:
2428         thread_interrupt_level(interruptible_state);
2429
2430         if (wait_result == THREAD_INTERRUPTED)
2431                 return (VM_FAULT_INTERRUPTED);
2432         return (VM_FAULT_RETRY);
2433
2434 #undef  RELEASE_PAGE
2435 }
2436
2437
2438
2439 /*
2440  * CODE SIGNING:
2441  * A page being soft-faulted needs (re)validation only when ALL hold:
2442  *   1. it is being entered into a pmap other than kernel_pmap,
2443  *   2. its cs_tainted bit is clear,
2444  *   3. its object's code_signed bit is set,
2445  *   4. cs_validated is clear, or wpmapped is set ("page" expands 4 times).
2446  */
2447 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page)                         \
2448         (kernel_pmap != (pmap) &&                       /* 1 */         \
2449          !(page)->cs_tainted &&                         /* 2 */         \
2450          (page)->object->code_signed &&                 /* 3 */         \
2451          !((page)->cs_validated && !(page)->wpmapped))  /* 4 */
2452
2453
2454 /*
2455  * page queue lock must NOT be held
2456  * m->object must be locked
2457  *
2458  * NOTE: m->object could be locked "shared" only if we are called
2459  * from vm_fault() as part of a soft fault.  If so, we must be
2460  * careful not to modify the VM object in any way that is not
2461  * legal under a shared lock...
2462  */
2463 extern int proc_selfpid(void);                  /* prototype only; no definition in this part of the file */
2464 extern char *proc_name_address(void *p);        /* prototype only; no definition in this part of the file */
2465 unsigned long cs_enter_tainted_rejected = 0;    /* NOTE(review): presumably a tally of tainted pages refused on fault entry -- confirm against its users */
2466 unsigned long cs_enter_tainted_accepted = 0;    /* NOTE(review): presumably a tally of tainted pages let through on fault entry -- confirm against its users */
2467 kern_return_t
2468 vm_fault_enter(vm_page_t m,
2469                pmap_t pmap,
2470                vm_map_offset_t vaddr,
2471                vm_prot_t prot,
2472                vm_prot_t fault_type,
2473                boolean_t wired,
2474                boolean_t change_wiring,
2475                boolean_t no_cache,
2476                boolean_t cs_bypass,
2477                boolean_t *need_retry,
2478                int *type_of_fault)
2479 {
2480         kern_return_t   kr, pe_result;
2481         boolean_t       previously_pmapped = m->pmapped;
2482         boolean_t       must_disconnect = 0;
2483         boolean_t       map_is_switched, map_is_switch_protected;
2484         int             cs_enforcement_enabled;
2485
2486         vm_object_lock_assert_held(m->object);
2487 #if DEBUG
2488         lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2489 #endif /* DEBUG */
2490
2491         if (m->phys_page == vm_page_guard_addr) {
2492                 assert(m->fictitious);
2493                 return KERN_SUCCESS;
2494         }
2495
2496         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2497
2498                 vm_object_lock_assert_exclusive(m->object);
2499
2500         } else if ((fault_type & VM_PROT_WRITE) == 0) {
2501                 /*
2502                  * This is not a "write" fault, so we
2503                  * might not have taken the object lock
2504                  * exclusively and we might not be able
2505                  * to update the "wpmapped" bit in
2506                  * vm_fault_enter().
2507                  * Let's just grant read access to
2508                  * the page for now and we'll
2509                  * soft-fault again if we need write
2510                  * access later...
2511                  */
2512                 prot &= ~VM_PROT_WRITE;
2513         }
2514         if (m->pmapped == FALSE) {
2515
2516                 if ((*type_of_fault == DBG_CACHE_HIT_FAULT) && m->clustered) {
2517                         /*
2518                          * found it in the cache, but this
2519                          * is the first fault-in of the page (m->pmapped == FALSE)
2520                          * so it must have come in as part of
2521                          * a cluster... account 1 pagein against it
2522                          */
2523                         VM_STAT_INCR(pageins);
2524                         DTRACE_VM2(pgin, int, 1, (uint64_t *), NULL);
2525
2526                         if (m->object->internal) {
2527                                 DTRACE_VM2(anonpgin, int, 1, (uint64_t *), NULL);
2528                                 *type_of_fault = DBG_PAGEIND_FAULT;
2529                         } else {
2530                                 DTRACE_VM2(fspgin, int, 1, (uint64_t *), NULL);
2531                                 *type_of_fault = DBG_PAGEINV_FAULT;
2532                         }
2533
2534                         current_task()->pageins++;
2535                 }
2536                 VM_PAGE_CONSUME_CLUSTERED(m);
2537
2538         }
2539
2540         if (*type_of_fault != DBG_COW_FAULT) {
2541                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2542
2543                 if (pmap == kernel_pmap) {
2544                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2545                 }
2546         }
2547
2548         /* Validate code signature if necessary. */
2549         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2550                 vm_object_lock_assert_exclusive(m->object);
2551
2552                 if (m->cs_validated) {
2553                         vm_cs_revalidates++;
2554                 }
2555
2556                 /* VM map is locked, so 1 ref will remain on VM object -
2557                  * so no harm if vm_page_validate_cs drops the object lock */
2558                 vm_page_validate_cs(m);
2559         }
2560
2561 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2562
2563         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2564                            (pmap == vm_map_pmap(current_thread()->map)));
2565         map_is_switch_protected = current_thread()->map->switch_protect;
2566
2567         /* If the map is switched, and is switch-protected, we must protect
2568          * some pages from being write-faulted: immutable pages because by
2569          * definition they may not be written, and executable pages because that
2570          * would provide a way to inject unsigned code.
2571          * If the page is immutable, we can simply return. However, we can't
2572          * immediately determine whether a page is executable anywhere. But,
2573          * we can disconnect it everywhere and remove the executable protection
2574          * from the current map. We do that below right before we do the
2575          * PMAP_ENTER.
2576          */
2577         cs_enforcement_enabled = cs_enforcement(NULL);
2578
2579         if(cs_enforcement_enabled && map_is_switched &&
2580            map_is_switch_protected && page_immutable(m, prot) &&
2581            (prot & VM_PROT_WRITE))
2582         {
2583                 return KERN_CODESIGN_ERROR;
2584         }
2585
2586         /* A page could be tainted, or pose a risk of being tainted later.
2587          * Check whether the receiving process wants it, and make it feel
2588          * the consequences (that happens in cs_invalid_page()).
2589          * For CS Enforcement, two other conditions will
2590          * cause that page to be tainted as well:
2591          * - pmapping an unsigned page executable - this means unsigned code;
2592          * - writeable mapping of a validated page - the content of that page
2593          *   can be changed without the kernel noticing, therefore unsigned
2594          *   code can be created
2595          */
2596         if (m->cs_tainted ||
2597             ((cs_enforcement_enabled && !cs_bypass ) &&
2598              (/* The page is unsigned and wants to be executable */
2599               (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2600               /* The page should be immutable, but is in danger of being modified
2601                 * This is the case where we want policy from the code directory -
2602                 * is the page immutable or not? For now we have to assume that
2603                 * code pages will be immutable, data pages not.
2604                 * We'll assume a page is a code page if it has a code directory
2605                 * and we fault for execution.
2606                 * That is good enough since if we faulted the code page for
2607                 * writing in another map before, it is wpmapped; if we fault
2608                 * it for writing in this map later it will also be faulted for executing
2609                 * at the same time; and if we fault for writing in another map
2610                 * later, we will disconnect it from this pmap so we'll notice
2611                 * the change.
2612                 */
2613               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2614               ))
2615                 )
2616         {
2617                 /* We will have a tainted page. Have to handle the special case
2618                  * of a switched map now. If the map is not switched, standard
2619                  * procedure applies - call cs_invalid_page().
2620                  * If the map is switched, the real owner is invalid already.
2621                  * There is no point in invalidating the switching process since
2622                  * it will not be executing from the map. So we don't call
2623                  * cs_invalid_page() in that case. */
2624                 boolean_t reject_page;
2625                 if(map_is_switched) {
2626                         assert(pmap==vm_map_pmap(current_thread()->map));
2627                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2628                         reject_page = FALSE;
2629                 } else {
2630                         if (cs_debug > 5)
2631                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2632                                        m->object->code_signed ? "yes" : "no",
2633                                        m->cs_validated ? "yes" : "no",
2634                                        m->cs_tainted ? "yes" : "no",
2635                                        m->wpmapped ? "yes" : "no",
2636                                        m->slid ? "yes" : "no",
2637                                        (int)prot);
2638                         reject_page = cs_invalid_page((addr64_t) vaddr);
2639                 }
2640
2641                 if (reject_page) {
2642                         /* reject the tainted page: abort the page fault */
2643                         int                     pid;
2644                         const char              *procname;
2645                         task_t                  task;
2646                         vm_object_t             file_object, shadow;
2647                         vm_object_offset_t      file_offset;
2648                         char                    *pathname, *filename;
2649                         vm_size_t               pathname_len, filename_len;
2650                         boolean_t               truncated_path;
2651 #define __PATH_MAX 1024
2652                         struct timespec         mtime, cs_mtime;
2653
2654                         kr = KERN_CODESIGN_ERROR;
2655                         cs_enter_tainted_rejected++;
2656
2657                         /* get process name and pid */
2658                         procname = "?";
2659                         task = current_task();
2660                         pid = proc_selfpid();
2661                         if (task->bsd_info != NULL)
2662                                 procname = proc_name_address(task->bsd_info);
2663
2664                         /* get file's VM object */
2665                         file_object = m->object;
2666                         file_offset = m->offset;
2667                         for (shadow = file_object->shadow;
2668                              shadow != VM_OBJECT_NULL;
2669                              shadow = file_object->shadow) {
2670                                 vm_object_lock_shared(shadow);
2671                                 if (file_object != m->object) {
2672                                         vm_object_unlock(file_object);
2673                                 }
2674                                 file_offset += file_object->vo_shadow_offset;
2675                                 file_object = shadow;
2676                         }
2677
2678                         mtime.tv_sec = 0;
2679                         mtime.tv_nsec = 0;
2680                         cs_mtime.tv_sec = 0;
2681                         cs_mtime.tv_nsec = 0;
2682
2683                         /* get file's pathname and/or filename */
2684                         pathname = NULL;
2685                         filename = NULL;
2686                         pathname_len = 0;
2687                         filename_len = 0;
2688                         truncated_path = FALSE;
2689                         if (file_object->pager == NULL) {
2690                                 /* no pager -> no file -> no pathname */
2691                                 pathname = (char *) "<nil>";
2692                         } else {
2693                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2694                                 if (pathname) {
2695                                         pathname_len = __PATH_MAX;
2696                                         filename = pathname + pathname_len;
2697                                         filename_len = __PATH_MAX;
2698                                 }
2699                                 vnode_pager_get_object_name(file_object->pager,
2700                                                             pathname,
2701                                                             pathname_len,
2702                                                             filename,
2703                                                             filename_len,
2704                                                             &truncated_path);
2705                                 vnode_pager_get_object_mtime(file_object->pager,
2706                                                              &mtime,
2707                                                              &cs_mtime);
2708                         }
2709                         printf("CODE SIGNING: process %d[%s]: "
2710                                "rejecting invalid page at address 0x%llx "
2711                                "from offset 0x%llx in file \"%s%s%s\" "
2712                                "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2713                                "(signed:%d validated:%d tainted:%d "
2714                                "wpmapped:%d slid:%d)\n",
2715                                pid, procname, (addr64_t) vaddr,
2716                                file_offset,
2717                                pathname,
2718                                (truncated_path ? "/.../" : ""),
2719                                (truncated_path ? filename : ""),
2720                                cs_mtime.tv_sec, cs_mtime.tv_nsec,
2721                                ((cs_mtime.tv_sec == mtime.tv_sec &&
2722                                  cs_mtime.tv_nsec == mtime.tv_nsec)
2723                                 ? "=="
2724                                 : "!="),
2725                                mtime.tv_sec, mtime.tv_nsec,
2726                                m->object->code_signed,
2727                                m->cs_validated,
2728                                m->cs_tainted,
2729                                m->wpmapped,
2730                                m->slid);
2731                         if (file_object != m->object) {
2732                                 vm_object_unlock(file_object);
2733                         }
2734                         if (pathname_len != 0) {
2735                                 kfree(pathname, __PATH_MAX * 2);
2736                                 pathname = NULL;
2737                                 filename = NULL;
2738                         }
2739                 } else {
2740                         /* proceed with the tainted page */
2741                         kr = KERN_SUCCESS;
2742                         /* Page might have been tainted before or not; now it
2743                          * definitively is. If the page wasn't tainted, we must
2744                          * disconnect it from all pmaps later. */
2745                         must_disconnect = !m->cs_tainted;
2746                         m->cs_tainted = TRUE;
2747                         cs_enter_tainted_accepted++;
2748                 }
2749                 if (kr != KERN_SUCCESS) {
2750                         if (cs_debug) {
2751                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2752                                        "page %p obj %p off 0x%llx *** INVALID PAGE ***\n",
2753                                        (long long)vaddr, m, m->object, m->offset);
2754                         }
2755 #if !SECURE_KERNEL
2756                         if (cs_enforcement_panic) {
2757                                 panic("CODESIGNING: panicking on invalid page\n");
2758                         }
2759 #endif
2760                 }
2761
2762         } else {
2763                 /* proceed with the valid page */
2764                 kr = KERN_SUCCESS;
2765         }
2766
2767         boolean_t       page_queues_locked = FALSE;
2768 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
2769 MACRO_BEGIN                                     \
2770         if (! page_queues_locked) {             \
2771                 page_queues_locked = TRUE;      \
2772                 vm_page_lockspin_queues();      \
2773         }                                       \
2774 MACRO_END
2775 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
2776 MACRO_BEGIN                                     \
2777         if (page_queues_locked) {               \
2778                 page_queues_locked = FALSE;     \
2779                 vm_page_unlock_queues();        \
2780         }                                       \
2781 MACRO_END
2782
2783         /*
2784          * Hold queues lock to manipulate
2785          * the page queues.  Change wiring
2786          * case is obvious.
2787          */
2788         assert(m->compressor || m->object != compressor_object);
2789         if (m->compressor) {
2790                 /*
2791                  * Compressor pages are neither wired
2792                  * nor pageable and should never change.
2793                  */
2794                 assert(m->object == compressor_object);
2795         } else if (change_wiring) {
2796                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2797
2798                 if (wired) {
2799                         if (kr == KERN_SUCCESS) {
2800                                 vm_page_wire(m);
2801                         }
2802                 } else {
2803                         vm_page_unwire(m, TRUE);
2804                 }
2805                 /* we keep the page queues lock, if we need it later */
2806
2807         } else {
2808                 if (kr != KERN_SUCCESS) {
2809                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2810                         vm_page_deactivate(m);
2811                         /* we keep the page queues lock, if we need it later */
2812                 } else if (((!m->active && !m->inactive) ||
2813                             m->clean_queue ||
2814                             no_cache) &&
2815                            !VM_PAGE_WIRED(m) && !m->throttled) {
2816
2817                         if (vm_page_local_q &&
2818                             !no_cache &&
2819                             (*type_of_fault == DBG_COW_FAULT ||
2820                              *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2821                                 struct vpl      *lq;
2822                                 uint32_t        lid;
2823
2824                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2825                                 vm_object_lock_assert_exclusive(m->object);
2826
2827                                 /*
2828                                  * we got a local queue to stuff this
2829                                  * new page on...
2830                                  * it's safe to manipulate local and
2831                                  * local_id at this point since we're
2832                                  * behind an exclusive object lock and
2833                                  * the page is not on any global queue.
2834                                  *
2835                                  * we'll use the current cpu number to
2836                                  * select the queue... note that we don't
2837                                  * need to disable preemption... we're
2838                                  * going to be behind the local queue's
2839                                  * lock to do the real work
2840                                  */
2841                                 lid = cpu_number();
2842
2843                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
2844
2845                                 VPL_LOCK(&lq->vpl_lock);
2846
2847                                 queue_enter(&lq->vpl_queue, m,
2848                                             vm_page_t, pageq);
2849                                 m->local = TRUE;
2850                                 m->local_id = lid;
2851                                 lq->vpl_count++;
2852
2853                                 if (m->object->internal)
2854                                         lq->vpl_internal_count++;
2855                                 else
2856                                         lq->vpl_external_count++;
2857
2858                                 VPL_UNLOCK(&lq->vpl_lock);
2859
2860                                 if (lq->vpl_count > vm_page_local_q_soft_limit)
2861                                 {
2862                                         /*
2863                                          * we're beyond the soft limit
2864                                          * for the local queue
2865                                          * vm_page_reactivate_local will
2866                                          * 'try' to take the global page
2867                                          * queue lock... if it can't
2868                                          * that's ok... we'll let the
2869                                          * queue continue to grow up
2870                                          * to the hard limit... at that
2871                                          * point we'll wait for the
2872                                          * lock... once we've got the
2873                                          * lock, we'll transfer all of
2874                                          * the pages from the local
2875                                          * queue to the global active
2876                                          * queue
2877                                          */
2878                                         vm_page_reactivate_local(lid, FALSE, FALSE);
2879                                 }
2880                         } else {
2881
2882                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2883
2884                                 /*
2885                                  * test again now that we hold the
2886                                  * page queue lock
2887                                  */
2888                                 if (!VM_PAGE_WIRED(m)) {
2889                                         if (m->clean_queue) {
2890                                                 VM_PAGE_QUEUES_REMOVE(m);
2891
2892                                                 vm_pageout_cleaned_reactivated++;
2893                                                 vm_pageout_cleaned_fault_reactivated++;
2894                                         }
2895
2896                                         if ((!m->active &&
2897                                              !m->inactive) ||
2898                                             no_cache) {
2899                                                 /*
2900                                                  * If this is a no_cache mapping
2901                                                  * and the page has never been
2902                                                  * mapped before or was
2903                                                  * previously a no_cache page,
2904                                                  * then we want to leave pages
2905                                                  * in the speculative state so
2906                                                  * that they can be readily
2907                                                  * recycled if free memory runs
2908                                                  * low.  Otherwise the page is
2909                                                  * activated as normal.
2910                                                  */
2911
2912                                                 if (no_cache &&
2913                                                     (!previously_pmapped ||
2914                                                      m->no_cache)) {
2915                                                         m->no_cache = TRUE;
2916
2917                                                         if (!m->speculative)
2918                                                                 vm_page_speculate(m, FALSE);
2919
2920                                                 } else if (!m->active &&
2921                                                            !m->inactive) {
2922
2923                                                         vm_page_activate(m);
2924                                                 }
2925                                         }
2926                                 }
2927                                 /* we keep the page queues lock, if we need it later */
2928                         }
2929                 }
2930         }
2931
2932         if ((prot & VM_PROT_EXECUTE) &&
2933             ! m->xpmapped) {
2934
2935                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2936
2937                 /*
2938                  * xpmapped is protected by the page queues lock
2939                  * so it matters not that we might only hold the
2940                  * object lock in the shared state
2941                  */
2942
2943                 if (! m->xpmapped) {
2944
2945                         m->xpmapped = TRUE;
2946                         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2947
2948                         if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) &&
2949                             m->object->internal &&
2950                             m->object->pager != NULL) {
2951                                 /*
2952                                  * This page could have been
2953                                  * uncompressed by the
2954                                  * compressor pager and its
2955                                  * contents might be only in
2956                                  * the data cache.
2957                                  * Since it's being mapped for
2958                                  * "execute" for the first time,
2959                                  * make sure the icache is in
2960                                  * sync.
2961                                  */
2962                                 pmap_sync_page_data_phys(m->phys_page);
2963                         }
2964
2965                 }
2966         }
2967         /* we're done with the page queues lock, if we ever took it */
2968         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2969
2970
2971         /* If we have a KERN_SUCCESS from the previous checks, we either have
2972          * a good page, or a tainted page that has been accepted by the process.
2973          * In both cases the page will be entered into the pmap.
2974          * If the page is writeable, we need to disconnect it from other pmaps
2975          * now so those processes can take note.
2976          */
2977         if (kr == KERN_SUCCESS) {
2978                 /*
2979                  * NOTE: we may only hold the vm_object lock SHARED
2980                  * at this point, but the update of pmapped is ok
2981                  * since this is the ONLY bit updated behind the SHARED
2982                  * lock... however, we need to figure out how to do an atomic
2983                  * update on a bit field to make this less fragile... right
2984                  * now I don't know how to coerce 'C' to give me the offset info
2985                  * that's needed for an AtomicCompareAndSwap
2986                  */
2987                 m->pmapped = TRUE;
2988                 if(vm_page_is_slideable(m)) {
2989                         boolean_t was_busy = m->busy;
2990
2991                         vm_object_lock_assert_exclusive(m->object);
2992
2993                         m->busy = TRUE;
2994                         kr = vm_page_slide(m, 0);
2995                         assert(m->busy);
2996                         if(!was_busy) {
2997                                 PAGE_WAKEUP_DONE(m);
2998                         }
2999                         if (kr != KERN_SUCCESS) {
3000                                 /*
3001                                  * This page has not been slid correctly,
3002                                  * do not do the pmap_enter() !
3003                                  * Let vm_fault_enter() return the error
3004                                  * so the caller can fail the fault.
3005                                  */
3006                                 goto after_the_pmap_enter;
3007                         }
3008                 }
3009
3010                 if (fault_type & VM_PROT_WRITE) {
3011
3012                         if (m->wpmapped == FALSE) {
3013                                 vm_object_lock_assert_exclusive(m->object);
3014
3015                                 m->wpmapped = TRUE;
3016                         }
3017                         if (must_disconnect) {
3018                                 /*
3019                                  * We can only get here
3020                                  * because of the CSE logic
3021                                  */
3022                                 assert(cs_enforcement_enabled);
3023                                 pmap_disconnect(m->phys_page);
3024                                 /*
3025                                  * If we are faulting for a write, we can clear
3026                                  * the execute bit - that will ensure the page is
3027                                  * checked again before being executable, which
3028                                  * protects against a map switch.
3029                                  * This only happens the first time the page
3030                                  * gets tainted, so we won't get stuck here
3031                                  * to make an already writeable page executable.
3032                                  */
3033                                 if (!cs_bypass){
3034                                         prot &= ~VM_PROT_EXECUTE;
3035                                 }
3036                         }
3037                 }
3038
3039                 /* Prevent a deadlock by not
3040                  * holding the object lock if we need to wait for a page in
3041                  * pmap_enter() - <rdar://problem/7138958> */
3042                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3043                                   wired, PMAP_OPTIONS_NOWAIT, pe_result);
3044
3045                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3046
3047                         if (need_retry) {
3048                                 /*
3049                                  * this will be non-null in the case where we hold the lock
3050                                  * on the top-object in this chain... we can't just drop
3051                                  * the lock on the object we're inserting the page into
3052                                  * and recall the PMAP_ENTER since we can still cause
3053                                  * a deadlock if one of the critical paths tries to
3054                                  * acquire the lock on the top-object and we're blocked
3055                                  * in PMAP_ENTER waiting for memory... our only recourse
3056                                  * is to deal with it at a higher level where we can
3057                                  * drop both locks.
3058                                  */
3059                                 *need_retry = TRUE;
3060                                 vm_pmap_enter_retried++;
3061                                 goto after_the_pmap_enter;
3062                         }
3063                         /* The nonblocking version of pmap_enter did not succeed,
3064                          * and we don't need to drop other locks and retry
3065                          * at the level above us, so
3066                          * use the blocking version instead. Requires marking
3067                          * the page busy and unlocking the object */
3068                         boolean_t was_busy = m->busy;
3069
3070                         vm_object_lock_assert_exclusive(m->object);
3071
3072                         m->busy = TRUE;
3073                         vm_object_unlock(m->object);
3074
3075                         PMAP_ENTER(pmap, vaddr, m, prot, fault_type, 0, wired);
3076
3077                         /* Take the object lock again. */
3078                         vm_object_lock(m->object);
3079
3080                         /* If the page was busy, someone else will wake it up.
3081                          * Otherwise, we have to do it now. */
3082                         assert(m->busy);
3083                         if(!was_busy) {
3084                                 PAGE_WAKEUP_DONE(m);
3085                         }
3086                         vm_pmap_enter_blocked++;
3087                 }
3088         }
3089
3090 after_the_pmap_enter:
3091         return kr;
3092 }
3093
3094
3095 /*
3096  *      Routine:        vm_fault
3097  *      Purpose:
3098  *              Handle page faults, including pseudo-faults
3099  *              used to change the wiring status of pages.
3100  *      Returns:
3101  *              Explicit continuations have been removed.
3102  *      Implementation:
3103  *              vm_fault and vm_fault_page save mucho state
3104  *              in the moral equivalent of a closure.  The state
3105  *              structure is allocated when first entering vm_fault
3106  *              and deallocated when leaving vm_fault.
3107  */
3108
3109 extern int _map_enter_debug;	/* NOTE(review): declared extern here; no use is visible in this part of the file -- confirm it is still needed */
3110
3111 unsigned long vm_fault_collapse_total = 0;	/* NOTE(review): presumably counts collapse attempts made from vm_fault() (see its need_collapse local) -- confirm */
3112 unsigned long vm_fault_collapse_skipped = 0;	/* NOTE(review): presumably counts collapse attempts that were skipped -- confirm against the code that updates it */
3113
3114
3115 kern_return_t
3116 vm_fault(
3117         vm_map_t        map,
3118         vm_map_offset_t vaddr,
3119         vm_prot_t       fault_type,
3120         boolean_t       change_wiring,
3121         int             interruptible,
3122         pmap_t          caller_pmap,
3123         vm_map_offset_t caller_pmap_addr)
3124 {
3125         vm_map_version_t        version;        /* Map version for verificiation */
3126         boolean_t               wired;          /* Should mapping be wired down? */
3127         vm_object_t             object;         /* Top-level object */
3128         vm_object_offset_t      offset;         /* Top-level offset */
3129         vm_prot_t               prot;           /* Protection for mapping */
3130         vm_object_t             old_copy_object; /* Saved copy object */
3131         vm_page_t               result_page;    /* Result of vm_fault_page */
3132         vm_page_t               top_page;       /* Placeholder page */
3133         kern_return_t           kr;
3134
3135         vm_page_t               m;      /* Fast access to result_page */
3136         kern_return_t           error_code;
3137         vm_object_t             cur_object;
3138         vm_object_offset_t      cur_offset;
3139         vm_page_t               cur_m;
3140         vm_object_t             new_object;
3141         int                     type_of_fault;
3142         pmap_t                  pmap;
3143         boolean_t               interruptible_state;
3144         vm_map_t                real_map = map;
3145         vm_map_t                original_map = map;
3146         vm_prot_t               original_fault_type;
3147         struct vm_object_fault_info fault_info;
3148         boolean_t               need_collapse = FALSE;
3149         boolean_t               need_retry = FALSE;
3150         boolean_t               *need_retry_ptr = NULL;
3151         int                     object_lock_type = 0;
3152         int                     cur_object_lock_type;
3153         vm_object_t             top_object = VM_OBJECT_NULL;
3154         int                     throttle_delay;
3155
3156
3157         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3158                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3159                               ((uint64_t)vaddr >> 32),
3160                               vaddr,
3161                               (map == kernel_map),
3162                               0,
3163                               0);
3164
3165         if (get_preemption_level() != 0) {
3166                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3167                                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3168                                       ((uint64_t)vaddr >> 32),
3169                                       vaddr,
3170                                       KERN_FAILURE,
3171                                       0,
3172                                       0);
3173
3174                 return (KERN_FAILURE);
3175         }
3176
3177         interruptible_state = thread_interrupt_level(interruptible);
3178
3179         VM_STAT_INCR(faults);
3180         current_task()->faults++;
3181         original_fault_type = fault_type;
3182
3183         if (fault_type & VM_PROT_WRITE)
3184                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3185         else
3186                 object_lock_type = OBJECT_LOCK_SHARED;
3187
3188         cur_object_lock_type = OBJECT_LOCK_SHARED;
3189
3190 RetryFault:
3191         /*
3192          * assume we will hit a page in the cache
3193          * otherwise, explicitly override with
3194          * the real fault type once we determine it
3195          */
3196         type_of_fault = DBG_CACHE_HIT_FAULT;
3197
3198         /*
3199          *      Find the backing store object and offset into
3200          *      it to begin the search.
3201          */
3202         fault_type = original_fault_type;
3203         map = original_map;
3204         vm_map_lock_read(map);
3205
3206         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3207                                   object_lock_type, &version,
3208                                   &object, &offset, &prot, &wired,
3209                                   &fault_info,
3210                                   &real_map);
3211
3212         if (kr != KERN_SUCCESS) {
3213                 vm_map_unlock_read(map);
3214                 goto done;
3215         }
3216         pmap = real_map->pmap;
3217         fault_info.interruptible = interruptible;
3218         fault_info.stealth = FALSE;
3219         fault_info.io_sync = FALSE;
3220         fault_info.mark_zf_absent = FALSE;
3221         fault_info.batch_pmap_op = FALSE;
3222
3223         /*
3224          * If the page is wired, we must fault for the current protection
3225          * value, to avoid further faults.
3226          */
3227         if (wired) {
3228                 fault_type = prot | VM_PROT_WRITE;
3229                 /*
3230                  * since we're treating this fault as a 'write'
3231                  * we must hold the top object lock exclusively
3232                  */
3233                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3234
3235                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3236
3237                         if (vm_object_lock_upgrade(object) == FALSE) {
3238                                 /*
3239                                  * couldn't upgrade, so explicitly
3240                                  * take the lock exclusively
3241                                  */
3242                                 vm_object_lock(object);
3243                         }
3244                 }
3245         }
3246
3247 #if     VM_FAULT_CLASSIFY
3248         /*
3249          *      Temporary data gathering code
3250          */
3251         vm_fault_classify(object, offset, fault_type);
3252 #endif
3253         /*
3254          *      Fast fault code.  The basic idea is to do as much as
3255          *      possible while holding the map lock and object locks.
3256          *      Busy pages are not used until the object lock has to
3257          *      be dropped to do something (copy, zero fill, pmap enter).
3258          *      Similarly, paging references aren't acquired until that
3259          *      point, and object references aren't used.
3260          *
3261          *      If we can figure out what to do
3262          *      (zero fill, copy on write, pmap enter) while holding
3263          *      the locks, then it gets done.  Otherwise, we give up,
3264          *      and use the original fault path (which doesn't hold
3265          *      the map lock, and relies on busy pages).
3266          *      The give up cases include:
3267          *              - Have to talk to pager.
3268          *              - Page is busy, absent or in error.
3269          *              - Pager has locked out desired access.
3270          *              - Fault needs to be restarted.
3271          *              - Have to push page into copy object.
3272          *
3273          *      The code is an infinite loop that moves one level down
3274          *      the shadow chain each time.  cur_object and cur_offset
3275          *      refer to the current object being examined. object and offset
3276          *      are the original object from the map.  The loop is at the
3277          *      top level if and only if object and cur_object are the same.
3278          *
3279          *      Invariants:  Map lock is held throughout.  Lock is held on
3280          *              original object and cur_object (if different) when
3281          *              continuing or exiting loop.
3282          *
3283          */
3284
3285
3286         /*
3287          * If this page is to be inserted in a copy delay object
3288          * for writing, and if the object has a copy, then the
3289          * copy delay strategy is implemented in the slow fault path.
3290          */
3291         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3292             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3293                 goto handle_copy_delay;
3294
3295         cur_object = object;
3296         cur_offset = offset;
3297
3298         while (TRUE) {
3299                 if (!cur_object->pager_created &&
3300                     cur_object->phys_contiguous) /* superpage */
3301                         break;
3302
3303                 if (cur_object->blocked_access) {
3304                         /*
3305                          * Access to this VM object has been blocked.
3306                          * Let the slow path handle it.
3307                          */
3308                         break;
3309                 }
3310
3311                 m = vm_page_lookup(cur_object, cur_offset);
3312
3313                 if (m != VM_PAGE_NULL) {
3314                         if (m->busy) {
3315                                 wait_result_t   result;
3316
3317                                 /*
3318                                  * in order to do the PAGE_ASSERT_WAIT, we must
3319                                  * have object that 'm' belongs to locked exclusively
3320                                  */
3321                                 if (object != cur_object) {
3322
3323                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3324
3325                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3326
3327                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3328                                                         /*
3329                                                          * couldn't upgrade so go do a full retry
3330                                                          * immediately since we can no longer be
3331                                                          * certain about cur_object (since we
3332                                                          * don't hold a reference on it)...
3333                                                          * first drop the top object lock
3334                                                          */
3335                                                         vm_object_unlock(object);
3336
3337                                                         vm_map_unlock_read(map);
3338                                                         if (real_map != map)
3339                                                                 vm_map_unlock(real_map);
3340
3341                                                         goto RetryFault;
3342                                                 }
3343                                         }
3344                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3345
3346                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3347
3348                                         if (vm_object_lock_upgrade(object) == FALSE) {
3349                                                 /*
3350                                                  * couldn't upgrade, so explicitly take the lock
3351                                                  * exclusively and go relookup the page since we
3352                                                  * will have dropped the object lock and
3353                                                  * a different thread could have inserted
3354                                                  * a page at this offset
3355                                                  * no need for a full retry since we're
3356                                                  * at the top level of the object chain
3357                                                  */
3358                                                 vm_object_lock(object);
3359
3360                                                 continue;
3361                                         }
3362                                 }
3363                                 if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) {
3364                                         /*
3365                                          * m->busy == TRUE and the object is locked exclusively
3366                                          * if m->pageout_queue == TRUE after we acquire the
3367                                          * queues lock, we are guaranteed that it is stable on
3368                                          * the pageout queue and therefore reclaimable
3369                                          *
3370                                          * NOTE: this is only true for the internal pageout queue
3371                                          * in the compressor world
3372                                          */
3373                                         vm_page_lock_queues();
3374
3375                                         if (m->pageout_queue) {
3376                                                 vm_pageout_throttle_up(m);
3377                                                 vm_page_unlock_queues();
3378
3379                                                 PAGE_WAKEUP_DONE(m);
3380                                                 goto reclaimed_from_pageout;
3381                                         }
3382                                         vm_page_unlock_queues();
3383                                 }
3384                                 if (object != cur_object)
3385                                         vm_object_unlock(object);
3386
3387                                 vm_map_unlock_read(map);
3388                                 if (real_map != map)
3389                                         vm_map_unlock(real_map);
3390
3391                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3392
3393                                 vm_object_unlock(cur_object);
3394
3395                                 if (result == THREAD_WAITING) {
3396                                         result = thread_block(THREAD_CONTINUE_NULL);
3397
3398                                         counter(c_vm_fault_page_block_busy_kernel++);
3399                                 }
3400                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3401                                         goto RetryFault;
3402
3403                                 kr = KERN_ABORTED;
3404                                 goto done;
3405                         }
3406 reclaimed_from_pageout:
3407                         if (m->laundry) {
3408                                 if (object != cur_object) {
3409                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3410                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3411
3412                                                 vm_object_unlock(object);
3413                                                 vm_object_unlock(cur_object);
3414
3415                                                 vm_map_unlock_read(map);
3416                                                 if (real_map != map)
3417                                                         vm_map_unlock(real_map);
3418
3419                                                 goto RetryFault;
3420                                         }
3421
3422                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3423
3424                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3425
3426                                         if (vm_object_lock_upgrade(object) == FALSE) {
3427                                                 /*
3428                                                  * couldn't upgrade, so explicitly take the lock
3429                                                  * exclusively and go relookup the page since we
3430                                                  * will have dropped the object lock and
3431                                                  * a different thread could have inserted
3432                                                  * a page at this offset
3433                                                  * no need for a full retry since we're
3434                                                  * at the top level of the object chain
3435                                                  */
3436                                                 vm_object_lock(object);
3437
3438                                                 continue;
3439                                         }
3440                                 }
3441                                 m->pageout = FALSE;
3442
3443                                 vm_pageout_steal_laundry(m, FALSE);
3444                         }
3445
3446                         if (m->phys_page == vm_page_guard_addr) {
3447                                 /*
3448                                  * Guard page: let the slow path deal with it
3449                                  */
3450                                 break;
3451                         }
3452                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3453                                 /*
3454                                  * Unusual case... let the slow path deal with it
3455                                  */
3456                                 break;
3457                         }
3458                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3459                                 if (object != cur_object)
3460                                         vm_object_unlock(object);
3461                                 vm_map_unlock_read(map);
3462                                 if (real_map != map)
3463                                         vm_map_unlock(real_map);
3464                                 vm_object_unlock(cur_object);
3465                                 kr = KERN_MEMORY_ERROR;
3466                                 goto done;
3467                         }
3468
3469                         if (m->encrypted) {
3470                                 /*
3471                                  * ENCRYPTED SWAP:
3472                                  * We've soft-faulted (because it's not in the page
3473                                  * table) on an encrypted page.
3474                                  * Keep the page "busy" so that no one messes with
3475                                  * it during the decryption.
3476                                  * Release the extra locks we're holding, keep only
3477                                  * the page's VM object lock.
3478                                  *
3479                                  * in order to set 'busy' on 'm', we must
3480                                  * have object that 'm' belongs to locked exclusively
3481                                  */
3482                                 if (object != cur_object) {
3483                                         vm_object_unlock(object);
3484
3485                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3486
3487                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3488
3489                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3490                                                         /*
3491                                                          * couldn't upgrade so go do a full retry
3492                                                          * immediately since we've already dropped
3493                                                          * the top object lock associated with this page
3494                                                          * and the current one got dropped due to the
3495                                                          * failed upgrade... the state is no longer valid
3496                                                          */
3497                                                         vm_map_unlock_read(map);
3498                                                         if (real_map != map)
3499                                                                 vm_map_unlock(real_map);
3500
3501                                                         goto RetryFault;
3502                                                 }
3503                                         }
3504                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3505
3506                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3507
3508                                         if (vm_object_lock_upgrade(object) == FALSE) {
3509                                                 /*
3510                                                  * couldn't upgrade, so explicitly take the lock
3511                                                  * exclusively and go relookup the page since we
3512                                                  * will have dropped the object lock and
3513                                                  * a different thread could have inserted
3514                                                  * a page at this offset
3515                                                  * no need for a full retry since we're
3516                                                  * at the top level of the object chain
3517                                                  */
3518                                                 vm_object_lock(object);
3519
3520                                                 continue;
3521                                         }
3522                                 }
3523                                 m->busy = TRUE;
3524
3525                                 vm_map_unlock_read(map);
3526                                 if (real_map != map)
3527                                         vm_map_unlock(real_map);
3528
3529                                 vm_page_decrypt(m, 0);
3530
3531                                 assert(m->busy);
3532                                 PAGE_WAKEUP_DONE(m);
3533
3534                                 vm_object_unlock(cur_object);
3535                                 /*
3536                                  * Retry from the top, in case anything
3537                                  * changed while we were decrypting...
3538                                  */
3539                                 goto RetryFault;
3540                         }
3541                         ASSERT_PAGE_DECRYPTED(m);
3542
3543                         if(vm_page_is_slideable(m)) {
3544                                 /*
3545                                  * We might need to slide this page, and so,
3546                                  * we want to hold the VM object exclusively.
3547                                  */
3548                                 if (object != cur_object) {
3549                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3550                                                 vm_object_unlock(object);
3551                                                 vm_object_unlock(cur_object);
3552
3553                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3554
3555                                                 vm_map_unlock_read(map);
3556                                                 if (real_map != map)
3557                                                         vm_map_unlock(real_map);
3558
3559                                                 goto RetryFault;
3560                                         }
3561                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3562
3563                                         vm_object_unlock(object);
3564                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3565                                         vm_map_unlock_read(map);
3566                                         goto RetryFault;
3567                                 }
3568                         }
3569
3570                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m)) {
3571 upgrade_for_validation:
3572                                 /*
3573                                  * We might need to validate this page
3574                                  * against its code signature, so we
3575                                  * want to hold the VM object exclusively.
3576                                  */
3577                                 if (object != cur_object) {
3578                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3579                                                 vm_object_unlock(object);
3580                                                 vm_object_unlock(cur_object);
3581
3582                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3583
3584                                                 vm_map_unlock_read(map);
3585                                                 if (real_map != map)
3586                                                         vm_map_unlock(real_map);
3587
3588                                                 goto RetryFault;
3589                                         }
3590
3591                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3592
3593                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3594
3595                                         if (vm_object_lock_upgrade(object) == FALSE) {
3596                                                 /*
3597                                                  * couldn't upgrade, so explicitly take the lock
3598                                                  * exclusively and go relookup the page since we
3599                                                  * will have dropped the object lock and
3600                                                  * a different thread could have inserted
3601                                                  * a page at this offset
3602                                                  * no need for a full retry since we're
3603                                                  * at the top level of the object chain
3604                                                  */
3605                                                 vm_object_lock(object);
3606
3607                                                 continue;
3608                                         }
3609                                 }
3610                         }
3611                         /*
3612                          *      Two cases of map in faults:
3613                          *          - At top level w/o copy object.
3614                          *          - Read fault anywhere.
3615                          *              --> must disallow write.
3616                          */
3617
3618                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3619
3620                                 goto FastPmapEnter;
3621                         }
3622
3623                         if ((fault_type & VM_PROT_WRITE) == 0) {
3624
3625                                 if (object != cur_object) {
3626                                         /*
3627                                          * We still need to hold the top object
3628                                          * lock here to prevent a race between
3629                                          * a read fault (taking only "shared"
3630                                          * locks) and a write fault (taking
3631                                          * an "exclusive" lock on the top
3632                                          * object).
3633                                          * Otherwise, as soon as we release the
3634                                          * top lock, the write fault could
3635                                          * proceed and actually complete before
3636                                          * the read fault, and the copied page's
3637                                          * translation could then be overwritten
3638                                          * by the read fault's translation for
3639                                          * the original page.
3640                                          *
3641                                          * Let's just record what the top object
3642                                          * is and we'll release it later.
3643                                          */
3644                                         top_object = object;
3645
3646                                         /*
3647                                          * switch to the object that has the new page
3648                                          */
3649                                         object = cur_object;
3650                                         object_lock_type = cur_object_lock_type;
3651                                 }
3652 FastPmapEnter:
3653                                 /*
3654                                  * prepare for the pmap_enter...
3655                                  * object and map are both locked
3656                                  * m contains valid data
3657                                  * object == m->object
3658                                  * cur_object == NULL or it's been unlocked
3659                                  * no paging references on either object or cur_object
3660                                  */
3661                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3662                                         need_retry_ptr = &need_retry;
3663                                 else
3664                                         need_retry_ptr = NULL;
3665
3666                                 if (caller_pmap) {
3667                                         kr = vm_fault_enter(m,
3668                                                             caller_pmap,
3669                                                             caller_pmap_addr,
3670                                                             prot,
3671                                                             fault_type,
3672                                                             wired,
3673                                                             change_wiring,
3674                                                             fault_info.no_cache,
3675                                                             fault_info.cs_bypass,
3676                                                             need_retry_ptr,
3677                                                             &type_of_fault);
3678                                 } else {
3679                                         kr = vm_fault_enter(m,
3680                                                             pmap,
3681                                                             vaddr,
3682                                                             prot,
3683                                                             fault_type,
3684                                                             wired,
3685                                                             change_wiring,
3686                                                             fault_info.no_cache,
3687                                                             fault_info.cs_bypass,
3688                                                             need_retry_ptr,
3689                                                             &type_of_fault);
3690                                 }
3691
3692                                 if (top_object != VM_OBJECT_NULL) {
3693                                         /*
3694                                          * It's safe to drop the top object
3695                                          * now that we've done our
3696                                          * vm_fault_enter().  Any other fault
3697                                          * in progress for that virtual
3698                                          * address will either find our page
3699                                          * and translation or put in a new page
3700                                          * and translation.
3701                                          */
3702                                         vm_object_unlock(top_object);
3703                                         top_object = VM_OBJECT_NULL;
3704                                 }
3705
3706                                 if (need_collapse == TRUE)
3707                                         vm_object_collapse(object, offset, TRUE);
3708
3709                                 if (need_retry == FALSE &&
3710                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3711                                         /*
3712                                          * evaluate access pattern and update state
3713                                          * vm_fault_deactivate_behind depends on the
3714                                          * state being up to date
3715                                          */
3716                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3717
3718                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3719                                 }
3720                                 /*
3721                                  * That's it, clean up and return.
3722                                  */
3723                                 if (m->busy)
3724                                         PAGE_WAKEUP_DONE(m);
3725
3726                                 vm_object_unlock(object);
3727
3728                                 vm_map_unlock_read(map);
3729                                 if (real_map != map)
3730                                         vm_map_unlock(real_map);
3731
3732                                 if (need_retry == TRUE) {
3733                                         /*
3734                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
3735                                          * at this point we don't hold any locks so it's safe
3736                                          * to ask the pmap layer to expand the page table to
3737                                          * accommodate this mapping... once expanded, we'll
3738                                          * re-drive the fault which should result in vm_fault_enter
3739                                          * being able to successfully enter the mapping this time around
3740                                          */
3741                                         (void)pmap_enter_options(pmap, vaddr, 0, 0, 0, 0, 0, PMAP_OPTIONS_NOENTER, NULL);
3742
3743                                         need_retry = FALSE;
3744                                         goto RetryFault;
3745                                 }
3746                                 goto done;
3747                         }
3748                         /*
3749                          * COPY ON WRITE FAULT
3750                          */
3751                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3752
3753                         if ((throttle_delay = vm_page_throttled())) {
3754                                 /*
3755                                  * drop all of our locks...
3756                                  * wait until the free queue is
3757                                  * pumped back up and then
3758                                  * redrive the fault
3759                                  */
3760                                 if (object != cur_object)
3761                                         vm_object_unlock(cur_object);
3762                                 vm_object_unlock(object);
3763                                 vm_map_unlock_read(map);
3764                                 if (real_map != map)
3765                                         vm_map_unlock(real_map);
3766
3767                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3768
3769                                 delay(throttle_delay);
3770
3771                                 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3772                                                  THREAD_UNINT :
3773                                                  THREAD_ABORTSAFE))
3774                                         goto RetryFault;
3775                                 kr = KERN_ABORTED;
3776                                 goto done;
3777                         }
3778                         /*
3779                          * If objects match, then
3780                          * object->copy must not be NULL (else control
3781                          * would be in previous code block), and we
3782                          * have a potential push into the copy object
3783                          * with which we can't cope here.
3784                          */
3785                         if (cur_object == object) {
3786                                 /*
3787                                  * must take the slow path to
3788                                  * deal with the copy push
3789                                  */
3790                                 break;
3791                         }
3792
3793                         /*
3794                          * This is now a shadow based copy on write
3795                          * fault -- it requires a copy up the shadow
3796                          * chain.
3797                          */
3798
3799                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3800                             VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3801                                 goto upgrade_for_validation;
3802                         }
3803
3804                         /*
3805                          * Allocate a page in the original top level
3806                          * object. Give up if allocate fails.  Also
3807                          * need to remember current page, as it's the
3808                          * source of the copy.
3809                          *
3810                          * at this point we hold locks on both
3811                          * object and cur_object... no need to take
3812                          * paging refs or mark pages BUSY since
3813                          * we don't drop either object lock until
3814                          * the page has been copied and inserted
3815                          */
3816                         cur_m = m;
3817                         m = vm_page_grab();
3818
3819                         if (m == VM_PAGE_NULL) {
3820                                 /*
3821                                  * no free page currently available...
3822                                  * must take the slow path
3823                                  */
3824                                 break;
3825                         }
3826                         /*
3827                          * Now do the copy (no need to mark the source page busy)...
3828                          *
3829                          *      NOTE: This code holds the map lock across
3830                          *      the page copy.
3831                          */
3832                         vm_page_copy(cur_m, m);
3833                         vm_page_insert(m, object, offset);
3834                         SET_PAGE_DIRTY(m, FALSE);
3835
3836                         /*
3837                          * Now cope with the source page and object
3838                          */
3839                         if (object->ref_count > 1 && cur_m->pmapped)
3840                                 pmap_disconnect(cur_m->phys_page);
3841
3842                         need_collapse = TRUE;
3843
3844                         if (!cur_object->internal &&
3845                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3846                                 /*
3847                                  * The object from which we've just
3848                                  * copied a page is most probably backed
3849                                  * by a vnode.  We don't want to waste too
3850                                  * much time trying to collapse the VM objects
3851                                  * and create a bottleneck when several tasks
3852                                  * map the same file.
3853                                  */
3854                                 if (cur_object->copy == object) {
3855                                         /*
3856                                          * Shared mapping or no COW yet.
3857                                          * We can never collapse a copy
3858                                          * object into its backing object.
3859                                          */
3860                                         need_collapse = FALSE;
3861                                 } else if (cur_object->copy == object->shadow &&
3862                                            object->shadow->resident_page_count == 0) {
3863                                         /*
3864                                          * Shared mapping after a COW occurred.
3865                                          */
3866                                         need_collapse = FALSE;
3867                                 }
3868                         }
3869                         vm_object_unlock(cur_object);
3870
3871                         if (need_collapse == FALSE)
3872                                 vm_fault_collapse_skipped++;
3873                         vm_fault_collapse_total++;
3874
3875                         type_of_fault = DBG_COW_FAULT;
3876                         VM_STAT_INCR(cow_faults);
3877                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
3878                         current_task()->cow_faults++;
3879
3880                         goto FastPmapEnter;
3881
3882                 } else {
3883                         /*
3884                          * No page at cur_object, cur_offset... m == NULL
3885                          */
3886                         if (cur_object->pager_created) {
3887                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
3888
3889                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
3890                                         int             my_fault_type;
3891                                         int             c_flags = C_DONT_BLOCK;
3892                                         boolean_t       insert_cur_object = FALSE;
3893
3894                                         /*
3895                                          * May have to talk to a pager...
3896                                          * if so, take the slow path by
3897                                          * doing a 'break' from the while (TRUE) loop
3898                                          *
3899                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
3900                                          * if the compressor is active and the page exists there
3901                                          */
3902                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
3903                                                 break;
3904
3905                                         if (map == kernel_map || real_map == kernel_map) {
3906                                                 /*
3907                                                  * can't call into the compressor with the kernel_map
3908                                                  * lock held, since the compressor may try to operate
3909                                                  * on the kernel map in order to return an empty c_segment
3910                                                  */
3911                                                 break;
3912                                         }
3913                                         if (object != cur_object) {
3914                                                 if (fault_type & VM_PROT_WRITE)
3915                                                         c_flags |= C_KEEP;
3916                                                 else
3917                                                         insert_cur_object = TRUE;
3918                                         }
3919                                         if (insert_cur_object == TRUE) {
3920
3921                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3922
3923                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3924
3925                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
3926                                                                 /*
3927                                                                  * couldn't upgrade so go do a full retry
3928                                                                  * immediately since we can no longer be
3929                                                                  * certain about cur_object (since we
3930                                                                  * don't hold a reference on it)...
3931                                                                  * first drop the top object lock
3932                                                                  */
3933                                                                 vm_object_unlock(object);
3934
3935                                                                 vm_map_unlock_read(map);
3936                                                                 if (real_map != map)
3937                                                                         vm_map_unlock(real_map);
3938
3939                                                                 goto RetryFault;
3940                                                         }
3941                                                 }
3942                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3943
3944                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3945
3946                                                 if (object != cur_object) {
3947                                                         /*
3948                                                          * we can't go for the upgrade on the top
3949                                                          * lock since the upgrade may block waiting
3950                                                          * for readers to drain... since we hold
3951                                                          * cur_object locked at this point, waiting
3952                                                          * for the readers to drain would represent
3953                                                          * a lock order inversion since the lock order
3954                                                          * for objects is the reference order in the
3955                                                          * shadow chain
3956                                                          */
3957                                                         vm_object_unlock(object);
3958                                                         vm_object_unlock(cur_object);
3959
3960                                                         vm_map_unlock_read(map);
3961                                                         if (real_map != map)
3962                                                                 vm_map_unlock(real_map);
3963
3964                                                         goto RetryFault;
3965                                                 }
3966                                                 if (vm_object_lock_upgrade(object) == FALSE) {
3967                                                         /*
3968                                                          * couldn't upgrade, so explicitly take the lock
3969                                                          * exclusively and go relookup the page since we
3970                                                          * will have dropped the object lock and
3971                                                          * a different thread could have inserted
3972                                                          * a page at this offset
3973                                                          * no need for a full retry since we're
3974                                                          * at the top level of the object chain
3975                                                          */
3976                                                         vm_object_lock(object);
3977
3978                                                         continue;
3979                                                 }
3980                                         }
3981                                         m = vm_page_grab();
3982
3983                                         if (m == VM_PAGE_NULL) {
3984                                                 /*
3985                                                  * no free page currently available...
3986                                                  * must take the slow path
3987                                                  */
3988                                                 break;
3989                                         }
3990                                         if (vm_compressor_pager_get(cur_object->pager, cur_offset + cur_object->paging_offset,
3991                                                                     m->phys_page, &my_fault_type, c_flags) != KERN_SUCCESS) {
3992                                                 vm_page_release(m);
3993                                                 break;
3994                                         }
3995                                         m->dirty = TRUE;
3996
3997                                         if (insert_cur_object)
3998                                                 vm_page_insert(m, cur_object, cur_offset);
3999                                         else
4000                                                 vm_page_insert(m, object, offset);
4001
4002                                         if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4003                                                 /*
4004                                                  * If the page is not cacheable,
4005                                                  * we can't let its contents
4006                                                  * linger in the data cache
4007                                                  * after the decompression.
4008                                                  */
4009                                                 pmap_sync_page_attributes_phys(m->phys_page);
4010                                         }
4011                                         type_of_fault = my_fault_type;
4012
4013                                         VM_STAT_INCR(decompressions);
4014
4015                                         if (cur_object != object) {
4016                                                 if (insert_cur_object) {
4017                                                         top_object = object;
4018                                                         /*
4019                                                          * switch to the object that has the new page
4020                                                          */
4021                                                         object = cur_object;
4022                                                         object_lock_type = cur_object_lock_type;
4023                                                 } else {
4024                                                         vm_object_unlock(cur_object);
4025                                                         cur_object = object;
4026                                                 }
4027                                         }
4028                                         goto FastPmapEnter;
4029                                 }
4030                                 /*
4031                                  * existence map present and indicates
4032                                  * that the pager doesn't have this page
4033                                  */
4034                         }
4035                         if (cur_object->shadow == VM_OBJECT_NULL) {
4036                                 /*
4037                                  * Zero fill fault.  Page gets
4038                                  * inserted into the original object.
4039                                  */
4040                                 if (cur_object->shadow_severed ||
4041                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
4042                                 {
4043                                         if (object != cur_object)
4044                                                 vm_object_unlock(cur_object);
4045                                         vm_object_unlock(object);
4046
4047                                         vm_map_unlock_read(map);
4048                                         if (real_map != map)
4049                                                 vm_map_unlock(real_map);
4050
4051                                         kr = KERN_MEMORY_ERROR;
4052                                         goto done;
4053                                 }
4054                                 if ((throttle_delay = vm_page_throttled())) {
4055                                         /*
4056                                          * drop all of our locks...
4057                                          * wait until the free queue is
4058                                          * pumped back up and then
4059                                          * redrive the fault
4060                                          */
4061                                         if (object != cur_object)
4062                                                 vm_object_unlock(cur_object);
4063                                         vm_object_unlock(object);
4064                                         vm_map_unlock_read(map);
4065                                         if (real_map != map)
4066                                                 vm_map_unlock(real_map);
4067
4068                                         VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4069
4070                                         delay(throttle_delay);
4071
4072                                         if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
4073                                                          THREAD_UNINT :
4074                                                          THREAD_ABORTSAFE))
4075                                                 goto RetryFault;
4076                                         kr = KERN_ABORTED;
4077                                         goto done;
4078                                 }
4079                                 if (vm_backing_store_low) {
4080                                         /*
4081                                          * we are protecting the system from
4082                                          * backing store exhaustion...
4083                                          * must take the slow path if we're
4084                                          * not privileged
4085                                          */
4086                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4087                                                 break;
4088                                 }
4089                                 if (cur_object != object) {
4090                                         vm_object_unlock(cur_object);
4091
4092                                         cur_object = object;
4093                                 }
4094                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4095
4096                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4097
4098                                         if (vm_object_lock_upgrade(object) == FALSE) {
4099                                                 /*
4100                                                  * couldn't upgrade so do a full retry on the fault
4101                                                  * since we dropped the object lock which
4102                                                  * could allow another thread to insert
4103                                                  * a page at this offset
4104                                                  */
4105                                                 vm_map_unlock_read(map);
4106                                                 if (real_map != map)
4107                                                         vm_map_unlock(real_map);
4108
4109                                                 goto RetryFault;
4110                                         }
4111                                 }
4112                                 m = vm_page_alloc(object, offset);
4113
4114                                 if (m == VM_PAGE_NULL) {
4115                                         /*
4116                                          * no free page currently available...
4117                                          * must take the slow path
4118                                          */
4119                                         break;
4120                                 }
4121
4122                                 /*
4123                                  * Now zero fill page...
4124                                  * the page is probably going to
4125                                  * be written soon, so don't bother
4126                                  * to clear the modified bit
4127                                  *
4128                                  *   NOTE: This code holds the map
4129                                  *   lock across the zero fill.
4130                                  */
4131                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4132
4133                                 goto FastPmapEnter;
4134                         }
4135                         /*
4136                          * On to the next level in the shadow chain
4137                          */
4138                         cur_offset += cur_object->vo_shadow_offset;
4139                         new_object = cur_object->shadow;
4140
4141                         /*
4142                          * take the new_object's lock with the indicated state
4143                          */
4144                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4145                                 vm_object_lock_shared(new_object);
4146                         else
4147                                 vm_object_lock(new_object);
4148
4149                         if (cur_object != object)
4150                                 vm_object_unlock(cur_object);
4151
4152                         cur_object = new_object;
4153
4154                         continue;
4155                 }
4156         }
4157         /*
4158          * Cleanup from fast fault failure.  Drop any object
4159          * lock other than original and drop map lock.
4160          */
4161         if (object != cur_object)
4162                 vm_object_unlock(cur_object);
4163
4164         /*
4165          * must own the object lock exclusively at this point
4166          */
4167         if (object_lock_type == OBJECT_LOCK_SHARED) {
4168                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4169
4170                 if (vm_object_lock_upgrade(object) == FALSE) {
4171                         /*
4172                          * couldn't upgrade, so explicitly
4173                          * take the lock exclusively
4174                          * no need to retry the fault at this
4175                          * point since "vm_fault_page" will
4176                          * completely re-evaluate the state
4177                          */
4178                         vm_object_lock(object);
4179                 }
4180         }
4181
4182 handle_copy_delay:
4183         vm_map_unlock_read(map);
4184         if (real_map != map)
4185                 vm_map_unlock(real_map);
4186
4187         /*
4188          * Make a reference to this object to
4189          * prevent its disposal while we are messing with
4190          * it.  Once we have the reference, the map is free
4191          * to be diddled.  Since objects reference their
4192          * shadows (and copies), they will stay around as well.
4193          */
4194         vm_object_reference_locked(object);
4195         vm_object_paging_begin(object);
4196
4197         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4198
4199         error_code = 0;
4200
4201         result_page = VM_PAGE_NULL;
4202         kr = vm_fault_page(object, offset, fault_type,
4203                            (change_wiring && !wired),
4204                            FALSE, /* page not looked up */
4205                            &prot, &result_page, &top_page,
4206                            &type_of_fault,
4207                            &error_code, map->no_zero_fill,
4208                            FALSE, &fault_info);
4209
4210         /*
4211          * if kr != VM_FAULT_SUCCESS, then the paging reference
4212          * has been dropped and the object unlocked... the ref_count
4213          * is still held
4214          *
4215          * if kr == VM_FAULT_SUCCESS, then the paging reference
4216          * is still held along with the ref_count on the original object
4217          *
4218          *      the object is returned locked with a paging reference
4219          *
4220          *      if top_page != NULL, then it's BUSY and the
4221          *      object it belongs to has a paging reference
4222          *      but is returned unlocked
4223          */
4224         if (kr != VM_FAULT_SUCCESS &&
4225             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4226                 /*
4227                  * we didn't succeed, lose the object reference immediately.
4228                  */
4229                 vm_object_deallocate(object);
4230
4231                 /*
4232                  * See why we failed, and take corrective action.
4233                  */
4234                 switch (kr) {
4235                 case VM_FAULT_MEMORY_SHORTAGE:
4236                         if (vm_page_wait((change_wiring) ?
4237                                          THREAD_UNINT :
4238                                          THREAD_ABORTSAFE))
4239                                 goto RetryFault;
4240                         /*
4241                          * fall thru
4242                          */
4243                 case VM_FAULT_INTERRUPTED:
4244                         kr = KERN_ABORTED;
4245                         goto done;
4246                 case VM_FAULT_RETRY:
4247                         goto RetryFault;
4248                 case VM_FAULT_MEMORY_ERROR:
4249                         if (error_code)
4250                                 kr = error_code;
4251                         else
4252                                 kr = KERN_MEMORY_ERROR;
4253                         goto done;
4254                 default:
4255                         panic("vm_fault: unexpected error 0x%x from "
4256                               "vm_fault_page()\n", kr);
4257                 }
4258         }
4259         m = result_page;
4260
4261         if (m != VM_PAGE_NULL) {
4262                 assert((change_wiring && !wired) ?
4263                     (top_page == VM_PAGE_NULL) :
4264                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
4265         }
4266
4267         /*
4268          * RELEASE_PAGE(m): what to do with the resulting page from vm_fault_page if it doesn't get entered
4269          * into the physical map: PAGE_WAKEUP_DONE it, then activate it unless it is already active, inactive or throttled.
4270          */
4271 #define RELEASE_PAGE(m)                                                 \
4272         MACRO_BEGIN     /* 'm' is expanded several times: pass a side-effect-free expression */ \
4273         PAGE_WAKEUP_DONE(m);                                            \
4274         if (!(m)->active && !(m)->inactive && !(m)->throttled) {        \
4275                 vm_page_lockspin_queues();                              \
4276                 if (!(m)->active && !(m)->inactive && !(m)->throttled) /* recheck with the queues lock held */ \
4277                         vm_page_activate(m);                            \
4278                 vm_page_unlock_queues();                                \
4279         }                                                               \
4280         MACRO_END
4281
4282         /*
4283          * We must verify that the maps have not changed
4284          * since our last lookup.
4285          */
4286         if (m != VM_PAGE_NULL) {
4287                 old_copy_object = m->object->copy;
4288                 vm_object_unlock(m->object);
4289         } else {
4290                 old_copy_object = VM_OBJECT_NULL;
4291                 vm_object_unlock(object);
4292         }
4293
4294         /*
4295          * no object locks are held at this point
4296          */
4297         if ((map != original_map) || !vm_map_verify(map, &version)) {
4298                 vm_object_t             retry_object;
4299                 vm_object_offset_t      retry_offset;
4300                 vm_prot_t               retry_prot;
4301
4302                 /*
4303                  * To avoid trying to write_lock the map while another
4304                  * thread has it read_locked (in vm_map_pageable), we
4305                  * do not try for write permission.  If the page is
4306                  * still writable, we will get write permission.  If it
4307                  * is not, or has been marked needs_copy, we enter the
4308                  * mapping without write permission, and will merely
4309                  * take another fault.
4310                  */
4311                 map = original_map;
4312                 vm_map_lock_read(map);
4313
4314                 kr = vm_map_lookup_locked(&map, vaddr,
4315                                           fault_type & ~VM_PROT_WRITE,
4316                                           OBJECT_LOCK_EXCLUSIVE, &version,
4317                                           &retry_object, &retry_offset, &retry_prot,
4318                                           &wired,
4319                                           &fault_info,
4320                                           &real_map);
4321                 pmap = real_map->pmap;
4322
4323                 if (kr != KERN_SUCCESS) {
4324                         vm_map_unlock_read(map);
4325
4326                         if (m != VM_PAGE_NULL) {
4327                                 /*
4328                                  * retake the lock so that
4329                                  * we can drop the paging reference
4330                                  * in vm_fault_cleanup and do the
4331                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4332                                  */
4333                                 vm_object_lock(m->object);
4334
4335                                 RELEASE_PAGE(m);
4336
4337                                 vm_fault_cleanup(m->object, top_page);
4338                         } else {
4339                                 /*
4340                                  * retake the lock so that
4341                                  * we can drop the paging reference
4342                                  * in vm_fault_cleanup
4343                                  */
4344                                 vm_object_lock(object);
4345
4346                                 vm_fault_cleanup(object, top_page);
4347                         }
4348                         vm_object_deallocate(object);
4349
4350                         goto done;
4351                 }
4352                 vm_object_unlock(retry_object);
4353
4354                 if ((retry_object != object) || (retry_offset != offset)) {
4355
4356                         vm_map_unlock_read(map);
4357                         if (real_map != map)
4358                                 vm_map_unlock(real_map);
4359
4360                         if (m != VM_PAGE_NULL) {
4361                                 /*
4362                                  * retake the lock so that
4363                                  * we can drop the paging reference
4364                                  * in vm_fault_cleanup and do the
4365                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4366                                  */
4367                                 vm_object_lock(m->object);
4368
4369                                 RELEASE_PAGE(m);
4370
4371                                 vm_fault_cleanup(m->object, top_page);
4372                         } else {
4373                                 /*
4374                                  * retake the lock so that
4375                                  * we can drop the paging reference
4376                                  * in vm_fault_cleanup
4377                                  */
4378                                 vm_object_lock(object);
4379
4380                                 vm_fault_cleanup(object, top_page);
4381                         }
4382                         vm_object_deallocate(object);
4383
4384                         goto RetryFault;
4385                 }
4386                 /*
4387                  * Check whether the protection has changed or the object
4388                  * has been copied while we left the map unlocked.
4389                  */
4390                 prot &= retry_prot;
4391         }
4392         if (m != VM_PAGE_NULL) {
4393                 vm_object_lock(m->object);
4394
4395                 if (m->object->copy != old_copy_object) {
4396                         /*
4397                          * The copy object changed while the top-level object
4398                          * was unlocked, so take away write permission.
4399                          */
4400                         prot &= ~VM_PROT_WRITE;
4401                 }
4402         } else
4403                 vm_object_lock(object);
4404
4405         /*
4406          * If we want to wire down this page, but no longer have
4407          * adequate permissions, we must start all over.
4408          */
4409         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4410
4411                 vm_map_verify_done(map, &version);
4412                 if (real_map != map)
4413                         vm_map_unlock(real_map);
4414
4415                 if (m != VM_PAGE_NULL) {
4416                         RELEASE_PAGE(m);
4417
4418                         vm_fault_cleanup(m->object, top_page);
4419                 } else
4420                         vm_fault_cleanup(object, top_page);
4421
4422                 vm_object_deallocate(object);
4423
4424                 goto RetryFault;
4425         }
4426         if (m != VM_PAGE_NULL) {
4427                 /*
4428                  * Put this page into the physical map.
4429                  * We had to do the unlock above because pmap_enter
4430                  * may cause other faults.  The page may be on
4431                  * the pageout queues.  If the pageout daemon comes
4432                  * across the page, it will remove it from the queues.
4433                  */
4434                 if (caller_pmap) {
4435                         kr = vm_fault_enter(m,
4436                                             caller_pmap,
4437                                             caller_pmap_addr,
4438                                             prot,
4439                                             fault_type,
4440                                             wired,
4441                                             change_wiring,
4442                                             fault_info.no_cache,
4443                                             fault_info.cs_bypass,
4444                                             NULL,
4445                                             &type_of_fault);
4446                 } else {
4447                         kr = vm_fault_enter(m,
4448                                             pmap,
4449                                             vaddr,
4450                                             prot,
4451                                             fault_type,
4452                                             wired,
4453                                             change_wiring,
4454                                             fault_info.no_cache,
4455                                             fault_info.cs_bypass,
4456                                             NULL,
4457                                             &type_of_fault);
4458                 }
4459                 if (kr != KERN_SUCCESS) {
4460                         /* abort this page fault */
4461                         vm_map_verify_done(map, &version);
4462                         if (real_map != map)
4463                                 vm_map_unlock(real_map);
4464                         PAGE_WAKEUP_DONE(m);
4465                         vm_fault_cleanup(m->object, top_page);
4466                         vm_object_deallocate(object);
4467                         goto done;
4468                 }
4469         } else {
4470
4471                 vm_map_entry_t          entry;
4472                 vm_map_offset_t         laddr;
4473                 vm_map_offset_t         ldelta, hdelta;
4474
4475                 /*
4476                  * do a pmap block mapping from the physical address
4477                  * in the object
4478                  */
4479
4480 #ifdef ppc
4481                 /* While we do not worry about execution protection in   */
4482                 /* general, certain pages may have instruction execution */
4483                 /* disallowed.  We will check here, and if not allowed   */
4484                 /* to execute, we return with a protection failure.      */
4485
4486                 if ((fault_type & VM_PROT_EXECUTE) &&
4487                         (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4488
4489                         vm_map_verify_done(map, &version);
4490
4491                         if (real_map != map)
4492                                 vm_map_unlock(real_map);
4493
4494                         vm_fault_cleanup(object, top_page);
4495                         vm_object_deallocate(object);
4496
4497                         kr = KERN_PROTECTION_FAILURE;
4498                         goto done;
4499                 }
4500 #endif  /* ppc */
4501
4502                 if (real_map != map)
4503                         vm_map_unlock(real_map);
4504
4505                 if (original_map != map) {
4506                         vm_map_unlock_read(map);
4507                         vm_map_lock_read(original_map);
4508                         map = original_map;
4509                 }
4510                 real_map = map;
4511
4512                 laddr = vaddr;
4513                 hdelta = 0xFFFFF000;
4514                 ldelta = 0xFFFFF000;
4515
4516                 while (vm_map_lookup_entry(map, laddr, &entry)) {
4517                         if (ldelta > (laddr - entry->vme_start))
4518                                 ldelta = laddr - entry->vme_start;
4519                         if (hdelta > (entry->vme_end - laddr))
4520                                 hdelta = entry->vme_end - laddr;
4521                         if (entry->is_sub_map) {
4522
4523                                 laddr = (laddr - entry->vme_start)
4524                                                         + entry->offset;
4525                                 vm_map_lock_read(entry->object.sub_map);
4526
4527                                 if (map != real_map)
4528                                         vm_map_unlock_read(map);
4529                                 if (entry->use_pmap) {
4530                                         vm_map_unlock_read(real_map);
4531                                         real_map = entry->object.sub_map;
4532                                 }
4533                                 map = entry->object.sub_map;
4534
4535                         } else {
4536                                 break;
4537                         }
4538                 }
4539
4540                 if (vm_map_lookup_entry(map, laddr, &entry) &&
4541                                         (entry->object.vm_object != NULL) &&
4542                                         (entry->object.vm_object == object)) {
4543
4544                         int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
4545                         if (caller_pmap) {
4546                                 /*
4547                                  * Set up a block mapped area
4548                                  */
4549                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4550                                 pmap_map_block(caller_pmap,
4551                                                (addr64_t)(caller_pmap_addr - ldelta),
4552                                                (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4553                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4554                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
4555                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4556                         } else {
4557                                 /*
4558                                  * Set up a block mapped area
4559                                  */
4560                                 assert((uint32_t)((ldelta + hdelta) >> 12) == ((ldelta + hdelta) >> 12));
4561                                 pmap_map_block(real_map->pmap,
4562                                                (addr64_t)(vaddr - ldelta),
4563                                                (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4564                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> 12),
4565                                                (uint32_t)((ldelta + hdelta) >> 12), prot,
4566                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4567                         }
4568                 }
4569         }
4570
4571         /*
4572          * Unlock everything, and return
4573          */
4574         vm_map_verify_done(map, &version);
4575         if (real_map != map)
4576                 vm_map_unlock(real_map);
4577
4578         if (m != VM_PAGE_NULL) {
4579                 PAGE_WAKEUP_DONE(m);
4580
4581                 vm_fault_cleanup(m->object, top_page);
4582         } else
4583                 vm_fault_cleanup(object, top_page);
4584
4585         vm_object_deallocate(object);
4586
4587 #undef  RELEASE_PAGE
4588
4589         kr = KERN_SUCCESS;
4590 done:
4591         thread_interrupt_level(interruptible_state);
4592
4593         /*
4594          * Only throttle on faults which cause a pagein.
4595          */
4596         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
4597                 throttle_lowpri_io(1);
4598         }
4599
4600         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4601                               (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4602                               ((uint64_t)vaddr >> 32),
4603                               vaddr,
4604                               kr,
4605                               type_of_fault,
4606                               0);
4607
4608         return (kr);
4609 }
4610
4611 /*
4612  *      vm_fault_wire:
4613  *
4614  *      Wire down a range of virtual addresses in a map.
4615  */
4616 kern_return_t
4617 vm_fault_wire(
4618         vm_map_t        map,
4619         vm_map_entry_t  entry,
4620         pmap_t          pmap,
4621         vm_map_offset_t pmap_addr)
4622 {
4623
4624         register vm_map_offset_t        va;
4625         register vm_map_offset_t        end_addr = entry->vme_end;
4626         register kern_return_t  rc;
4627
4628         assert(entry->in_transition);
4629
4630         if ((entry->object.vm_object != NULL) &&
4631                         !entry->is_sub_map &&
4632                         entry->object.vm_object->phys_contiguous) {
4633                 return KERN_SUCCESS;
4634         }
4635
4636         /*
4637          *      Inform the physical mapping system that the
4638          *      range of addresses may not fault, so that
4639          *      page tables and such can be locked down as well.
4640          */
4641
4642         pmap_pageable(pmap, pmap_addr,
4643                 pmap_addr + (end_addr - entry->vme_start), FALSE);
4644
4645         /*
4646          *      We simulate a fault to get the page and enter it
4647          *      in the physical map.
4648          */
4649
4650         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4651                 if ((rc = vm_fault_wire_fast(
4652                         map, va, entry, pmap,
4653                         pmap_addr + (va - entry->vme_start)
4654                         )) != KERN_SUCCESS) {
4655                         rc = vm_fault(map, va, VM_PROT_NONE, TRUE,
4656                                 (pmap == kernel_pmap) ?
4657                                         THREAD_UNINT : THREAD_ABORTSAFE,
4658                                 pmap, pmap_addr + (va - entry->vme_start));
4659                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4660                 }
4661
4662                 if (rc != KERN_SUCCESS) {
4663                         struct vm_map_entry     tmp_entry = *entry;
4664
4665                         /* unwire wired pages */
4666                         tmp_entry.vme_end = va;
4667                         vm_fault_unwire(map,
4668                                 &tmp_entry, FALSE, pmap, pmap_addr);
4669
4670                         return rc;
4671                 }
4672         }
4673         return KERN_SUCCESS;
4674 }
4675
4676 /*
4677  *      vm_fault_unwire:
4678  *
4679  *      Unwire a range of virtual addresses in a map.
4680  */
4681 void
4682 vm_fault_unwire(
4683         vm_map_t        map,
4684         vm_map_entry_t  entry,
4685         boolean_t       deallocate,
4686         pmap_t          pmap,
4687         vm_map_offset_t pmap_addr)
4688 {
4689         register vm_map_offset_t        va;
4690         register vm_map_offset_t        end_addr = entry->vme_end;
4691         vm_object_t             object;
4692         struct vm_object_fault_info fault_info;
4693
4694         object = (entry->is_sub_map)
4695                         ? VM_OBJECT_NULL : entry->object.vm_object;
4696
4697         /*
4698          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4699          * do anything since such memory is wired by default.  So we don't have
4700          * anything to undo here.
4701          */
4702
4703         if (object != VM_OBJECT_NULL && object->phys_contiguous)
4704                 return;
4705
4706         fault_info.interruptible = THREAD_UNINT;
4707         fault_info.behavior = entry->behavior;
4708         fault_info.user_tag = entry->alias;
4709         fault_info.lo_offset = entry->offset;
4710         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4711         fault_info.no_cache = entry->no_cache;
4712         fault_info.stealth = TRUE;
4713         fault_info.io_sync = FALSE;
4714         fault_info.cs_bypass = FALSE;
4715         fault_info.mark_zf_absent = FALSE;
4716         fault_info.batch_pmap_op = FALSE;
4717
4718         /*
4719          *      Since the pages are wired down, we must be able to
4720          *      get their mappings from the physical map system.
4721          */
4722
4723         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4724
4725                 if (object == VM_OBJECT_NULL) {
4726                         if (pmap) {
4727                                 pmap_change_wiring(pmap,
4728                                                    pmap_addr + (va - entry->vme_start), FALSE);
4729                         }
4730                         (void) vm_fault(map, va, VM_PROT_NONE,
4731                                         TRUE, THREAD_UNINT, pmap, pmap_addr);
4732                 } else {
4733                         vm_prot_t       prot;
4734                         vm_page_t       result_page;
4735                         vm_page_t       top_page;
4736                         vm_object_t     result_object;
4737                         vm_fault_return_t result;
4738
4739                         if (end_addr - va > (vm_size_t) -1) {
4740                                 /* 32-bit overflow */
4741                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4742                         } else {
4743                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4744                                 assert(fault_info.cluster_size == end_addr - va);
4745                         }
4746
4747                         do {
4748                                 prot = VM_PROT_NONE;
4749
4750                                 vm_object_lock(object);
4751                                 vm_object_paging_begin(object);
4752                                 XPR(XPR_VM_FAULT,
4753                                         "vm_fault_unwire -> vm_fault_page\n",
4754                                         0,0,0,0,0);
4755                                 result_page = VM_PAGE_NULL;
4756                                 result = vm_fault_page(
4757                                         object,
4758                                         entry->offset + (va - entry->vme_start),
4759                                         VM_PROT_NONE, TRUE,
4760                                         FALSE, /* page not looked up */
4761                                         &prot, &result_page, &top_page,
4762                                         (int *)0,
4763                                         NULL, map->no_zero_fill,
4764                                         FALSE, &fault_info);
4765                         } while (result == VM_FAULT_RETRY);
4766
4767                         /*
4768                          * If this was a mapping to a file on a device that has been forcibly
4769                          * unmounted, then we won't get a page back from vm_fault_page().  Just
4770                          * move on to the next one in case the remaining pages are mapped from
4771                          * different objects.  During a forced unmount, the object is terminated
4772                          * so the alive flag will be false if this happens.  A forced unmount will
4773                          * will occur when an external disk is unplugged before the user does an
4774                          * eject, so we don't want to panic in that situation.
4775                          */
4776
4777                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
4778                                 continue;
4779
4780                         if (result == VM_FAULT_MEMORY_ERROR &&
4781                             object == kernel_object) {
4782                                 /*
4783                                  * This must have been allocated with
4784                                  * KMA_KOBJECT and KMA_VAONLY and there's
4785                                  * no physical page at this offset.
4786                                  * We're done (no page to free).
4787                                  */
4788                                 assert(deallocate);
4789                                 continue;
4790                         }
4791
4792                         if (result != VM_FAULT_SUCCESS)
4793                                 panic("vm_fault_unwire: failure");
4794
4795                         result_object = result_page->object;
4796
4797                         if (deallocate) {
4798                                 assert(result_page->phys_page !=
4799                                        vm_page_fictitious_addr);
4800                                 pmap_disconnect(result_page->phys_page);
4801                                 VM_PAGE_FREE(result_page);
4802                         } else {
4803                                 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
4804                                         pmap_change_wiring(pmap,
4805                                             pmap_addr + (va - entry->vme_start), FALSE);
4806
4807
4808                                 if (VM_PAGE_WIRED(result_page)) {
4809                                         vm_page_lockspin_queues();
4810                                         vm_page_unwire(result_page, TRUE);
4811                                         vm_page_unlock_queues();
4812                                 }
4813                                 if(entry->zero_wired_pages) {
4814                                         pmap_zero_page(result_page->phys_page);
4815                                         entry->zero_wired_pages = FALSE;
4816                                 }
4817
4818                                 PAGE_WAKEUP_DONE(result_page);
4819                         }
4820                         vm_fault_cleanup(result_object, top_page);
4821                 }
4822         }
4823
4824         /*
4825          *      Inform the physical mapping system that the range
4826          *      of addresses may fault, so that page tables and
4827          *      such may be unwired themselves.
4828          */
4829
4830         pmap_pageable(pmap, pmap_addr,
4831                 pmap_addr + (end_addr - entry->vme_start), TRUE);
4832
4833 }
4834
4835 /*
4836  *      vm_fault_wire_fast:
4837  *
4838  *      Handle common case of a wire down page fault at the given address.
4839  *      If successful, the page is inserted into the associated physical map.
4840  *      The map entry is passed in to avoid the overhead of a map lookup.
4841  *
4842  *      NOTE: the given address should be truncated to the
4843  *      proper page address.
4844  *
4845  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
4846  *      a standard error specifying why the fault is fatal is returned.
4847  *
4848  *      The map in question must be referenced, and remains so.
4849  *      Caller has a read lock on the map.
4850  *
4851  *      This is a stripped version of vm_fault() for wiring pages.  Anything
4852  *      other than the common case will return KERN_FAILURE, and the caller
4853  *      is expected to call vm_fault().
4854  */
4855 kern_return_t
4856 vm_fault_wire_fast(
4857         __unused vm_map_t       map,
4858         vm_map_offset_t va,
4859         vm_map_entry_t  entry,
4860         pmap_t                  pmap,
4861         vm_map_offset_t pmap_addr)
4862 {
4863         vm_object_t             object;
4864         vm_object_offset_t      offset;
4865         register vm_page_t      m;
4866         vm_prot_t               prot;
4867         thread_t                thread = current_thread();
4868         int                     type_of_fault;
4869         kern_return_t           kr;
4870
4871         VM_STAT_INCR(faults);
4872
4873         if (thread != THREAD_NULL && thread->task != TASK_NULL)
4874           thread->task->faults++;
4875
4876 /*
4877  *      Recovery actions
4878  */
4879
4880 #undef  RELEASE_PAGE
4881 #define RELEASE_PAGE(m) {                               \
4882         PAGE_WAKEUP_DONE(m);                            \
4883         vm_page_lockspin_queues();                      \
4884         vm_page_unwire(m, TRUE);                        \
4885         vm_page_unlock_queues();                        \
4886 }
4887
4888
4889 #undef  UNLOCK_THINGS
4890 #define UNLOCK_THINGS   {                               \
4891         vm_object_paging_end(object);                      \
4892         vm_object_unlock(object);                          \
4893 }
4894
4895 #undef  UNLOCK_AND_DEALLOCATE
4896 #define UNLOCK_AND_DEALLOCATE   {                       \
4897         UNLOCK_THINGS;                                  \
4898         vm_object_deallocate(object);                   \
4899 }
4900 /*
4901  *      Give up and have caller do things the hard way.
4902  */
4903
4904 #define GIVE_UP {                                       \
4905         UNLOCK_AND_DEALLOCATE;                          \
4906         return(KERN_FAILURE);                           \
4907 }
4908
4909
4910         /*
4911          *      If this entry is not directly to a vm_object, bail out.
4912          */
4913         if (entry->is_sub_map)
4914                 return(KERN_FAILURE);
4915
4916         /*
4917          *      Find the backing store object and offset into it.
4918          */
4919
4920         object = entry->object.vm_object;
4921         offset = (va - entry->vme_start) + entry->offset;
4922         prot = entry->protection;
4923
4924         /*
4925          *      Make a reference to this object to prevent its
4926          *      disposal while we are messing with it.
4927          */
4928
4929         vm_object_lock(object);
4930         vm_object_reference_locked(object);
4931         vm_object_paging_begin(object);
4932
4933         /*
4934          *      INVARIANTS (through entire routine):
4935          *
4936          *      1)      At all times, we must either have the object
4937          *              lock or a busy page in some object to prevent
4938          *              some other thread from trying to bring in
4939          *              the same page.
4940          *
4941          *      2)      Once we have a busy page, we must remove it from
4942          *              the pageout queues, so that the pageout daemon
4943          *              will not grab it away.
4944          *
4945          */
4946
4947         /*
4948          *      Look for page in top-level object.  If it's not there or
4949          *      there's something going on, give up.
4950          * ENCRYPTED SWAP: use the slow fault path, since we'll need to
4951          * decrypt the page before wiring it down.
4952          */
4953         m = vm_page_lookup(object, offset);
4954         if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
4955             (m->unusual && ( m->error || m->restart || m->absent))) {
4956
4957                 GIVE_UP;
4958         }
4959         ASSERT_PAGE_DECRYPTED(m);
4960
4961         if (m->fictitious &&
4962             m->phys_page == vm_page_guard_addr) {
4963                 /*
4964                  * Guard pages are fictitious pages and are never
4965                  * entered into a pmap, so let's say it's been wired...
4966                  */
4967                 kr = KERN_SUCCESS;
4968                 goto done;
4969         }
4970
4971         /*
4972          *      Wire the page down now.  All bail outs beyond this
4973          *      point must unwire the page.
4974          */
4975
4976         vm_page_lockspin_queues();
4977         vm_page_wire(m);
4978         vm_page_unlock_queues();
4979
4980         /*
4981          *      Mark page busy for other threads.
4982          */
4983         assert(!m->busy);
4984         m->busy = TRUE;
4985         assert(!m->absent);
4986
4987         /*
4988          *      Give up if the page is being written and there's a copy object
4989          */
4990         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
4991                 RELEASE_PAGE(m);
4992                 GIVE_UP;
4993         }
4994
4995         /*
4996          *      Put this page into the physical map.
4997          */
4998         type_of_fault = DBG_CACHE_HIT_FAULT;
4999         kr = vm_fault_enter(m,
5000                             pmap,
5001                             pmap_addr,
5002                             prot,
5003                             prot,
5004                             TRUE,
5005                             FALSE,
5006                             FALSE,
5007                             FALSE,
5008                             NULL,
5009                             &type_of_fault);
5010
5011 done:
5012         /*
5013          *      Unlock everything, and return
5014          */
5015
5016         PAGE_WAKEUP_DONE(m);
5017         UNLOCK_AND_DEALLOCATE;
5018
5019         return kr;
5020
5021 }
5022
5023 /*
5024  *      Routine:        vm_fault_copy_cleanup
5025  *      Purpose:
5026  *              Release a page used by vm_fault_copy.
5027  */
5028
5029 void
5030 vm_fault_copy_cleanup(
5031         vm_page_t       page,
5032         vm_page_t       top_page)
5033 {
5034         vm_object_t     object = page->object;
5035
5036         vm_object_lock(object);
5037         PAGE_WAKEUP_DONE(page);
5038         if (!page->active && !page->inactive && !page->throttled) {
5039                 vm_page_lockspin_queues();
5040                 if (!page->active && !page->inactive && !page->throttled)
5041                         vm_page_activate(page);
5042                 vm_page_unlock_queues();
5043         }
5044         vm_fault_cleanup(object, top_page);
5045 }
5046
5047 void
5048 vm_fault_copy_dst_cleanup(
5049         vm_page_t       page)
5050 {
5051         vm_object_t     object;
5052
5053         if (page != VM_PAGE_NULL) {
5054                 object = page->object;
5055                 vm_object_lock(object);
5056                 vm_page_lockspin_queues();
5057                 vm_page_unwire(page, TRUE);
5058                 vm_page_unlock_queues();
5059                 vm_object_paging_end(object);
5060                 vm_object_unlock(object);
5061         }
5062 }
5063
5064 /*
5065  *      Routine:        vm_fault_copy
5066  *
5067  *      Purpose:
5068  *              Copy pages from one virtual memory object to another --
5069  *              neither the source nor destination pages need be resident.
5070  *
5071  *              Before actually copying a page, the version associated with
5072  *              the destination address map wil be verified.
5073  *
5074  *      In/out conditions:
5075  *              The caller must hold a reference, but not a lock, to
5076  *              each of the source and destination objects and to the
5077  *              destination map.
5078  *
5079  *      Results:
5080  *              Returns KERN_SUCCESS if no errors were encountered in
5081  *              reading or writing the data.  Returns KERN_INTERRUPTED if
5082  *              the operation was interrupted (only possible if the
5083  *              "interruptible" argument is asserted).  Other return values
5084  *              indicate a permanent error in copying the data.
5085  *
5086  *              The actual amount of data copied will be returned in the
5087  *              "copy_size" argument.  In the event that the destination map
5088  *              verification failed, this amount may be less than the amount
5089  *              requested.
5090  */
5091 kern_return_t
5092 vm_fault_copy(
5093         vm_object_t             src_object,
5094         vm_object_offset_t      src_offset,
5095         vm_map_size_t           *copy_size,             /* INOUT */
5096         vm_object_t             dst_object,
5097         vm_object_offset_t      dst_offset,
5098         vm_map_t                dst_map,
5099         vm_map_version_t         *dst_version,
5100         int                     interruptible)
5101 {
5102         vm_page_t               result_page;
5103
5104         vm_page_t               src_page;
5105         vm_page_t               src_top_page;
5106         vm_prot_t               src_prot;
5107
5108         vm_page_t               dst_page;
5109         vm_page_t               dst_top_page;
5110         vm_prot_t               dst_prot;
5111
5112         vm_map_size_t           amount_left;
5113         vm_object_t             old_copy_object;
5114         kern_return_t           error = 0;
5115         vm_fault_return_t       result;
5116
5117         vm_map_size_t           part_size;
5118         struct vm_object_fault_info fault_info_src;
5119         struct vm_object_fault_info fault_info_dst;
5120
5121         /*
5122          * In order not to confuse the clustered pageins, align
5123          * the different offsets on a page boundary.
5124          */
5125
5126 #define RETURN(x)                                       \
5127         MACRO_BEGIN                                     \
5128         *copy_size -= amount_left;                      \
5129         MACRO_RETURN(x);                                \
5130         MACRO_END
5131
5132         amount_left = *copy_size;
5133
5134         fault_info_src.interruptible = interruptible;
5135         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5136         fault_info_src.user_tag  = 0;
5137         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5138         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5139         fault_info_src.no_cache   = FALSE;
5140         fault_info_src.stealth = TRUE;
5141         fault_info_src.io_sync = FALSE;
5142         fault_info_src.cs_bypass = FALSE;
5143         fault_info_src.mark_zf_absent = FALSE;
5144         fault_info_src.batch_pmap_op = FALSE;
5145
5146         fault_info_dst.interruptible = interruptible;
5147         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5148         fault_info_dst.user_tag  = 0;
5149         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5150         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5151         fault_info_dst.no_cache   = FALSE;
5152         fault_info_dst.stealth = TRUE;
5153         fault_info_dst.io_sync = FALSE;
5154         fault_info_dst.cs_bypass = FALSE;
5155         fault_info_dst.mark_zf_absent = FALSE;
5156         fault_info_dst.batch_pmap_op = FALSE;
5157
5158         do { /* while (amount_left > 0) */
5159                 /*
5160                  * There may be a deadlock if both source and destination
5161                  * pages are the same. To avoid this deadlock, the copy must
5162                  * start by getting the destination page in order to apply
5163                  * COW semantics if any.
5164                  */
5165
5166         RetryDestinationFault: ;
5167
5168                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5169
5170                 vm_object_lock(dst_object);
5171                 vm_object_paging_begin(dst_object);
5172
5173                 if (amount_left > (vm_size_t) -1) {
5174                         /* 32-bit overflow */
5175                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5176                 } else {
5177                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
5178                         assert(fault_info_dst.cluster_size == amount_left);
5179                 }
5180
5181                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5182                 dst_page = VM_PAGE_NULL;
5183                 result = vm_fault_page(dst_object,
5184                                        vm_object_trunc_page(dst_offset),
5185                                        VM_PROT_WRITE|VM_PROT_READ,
5186                                        FALSE,
5187                                        FALSE, /* page not looked up */
5188                                        &dst_prot, &dst_page, &dst_top_page,
5189                                        (int *)0,
5190                                        &error,
5191                                        dst_map->no_zero_fill,
5192                                        FALSE, &fault_info_dst);
5193                 switch (result) {
5194                 case VM_FAULT_SUCCESS:
5195                         break;
5196                 case VM_FAULT_RETRY:
5197                         goto RetryDestinationFault;
5198                 case VM_FAULT_MEMORY_SHORTAGE:
5199                         if (vm_page_wait(interruptible))
5200                                 goto RetryDestinationFault;
5201                         /* fall thru */
5202                 case VM_FAULT_INTERRUPTED:
5203                         RETURN(MACH_SEND_INTERRUPTED);
5204                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5205                         /* success but no VM page: fail the copy */
5206                         vm_object_paging_end(dst_object);
5207                         vm_object_unlock(dst_object);
5208                         /*FALLTHROUGH*/
5209                 case VM_FAULT_MEMORY_ERROR:
5210                         if (error)
5211                                 return (error);
5212                         else
5213                                 return(KERN_MEMORY_ERROR);
5214                 default:
5215                         panic("vm_fault_copy: unexpected error 0x%x from "
5216                               "vm_fault_page()\n", result);
5217                 }
5218                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5219
5220                 old_copy_object = dst_page->object->copy;
5221
5222                 /*
5223                  * There exists the possiblity that the source and
5224                  * destination page are the same.  But we can't
5225                  * easily determine that now.  If they are the
5226                  * same, the call to vm_fault_page() for the
5227                  * destination page will deadlock.  To prevent this we
5228                  * wire the page so we can drop busy without having
5229                  * the page daemon steal the page.  We clean up the
5230                  * top page  but keep the paging reference on the object
5231                  * holding the dest page so it doesn't go away.
5232                  */
5233
5234                 vm_page_lockspin_queues();
5235                 vm_page_wire(dst_page);
5236                 vm_page_unlock_queues();
5237                 PAGE_WAKEUP_DONE(dst_page);
5238                 vm_object_unlock(dst_page->object);
5239
5240                 if (dst_top_page != VM_PAGE_NULL) {
5241                         vm_object_lock(dst_object);
5242                         VM_PAGE_FREE(dst_top_page);
5243                         vm_object_paging_end(dst_object);
5244                         vm_object_unlock(dst_object);
5245                 }
5246
5247         RetrySourceFault: ;
5248
5249                 if (src_object == VM_OBJECT_NULL) {
5250                         /*
5251                          *      No source object.  We will just
5252                          *      zero-fill the page in dst_object.
5253                          */
5254                         src_page = VM_PAGE_NULL;
5255                         result_page = VM_PAGE_NULL;
5256                 } else {
5257                         vm_object_lock(src_object);
5258                         src_page = vm_page_lookup(src_object,
5259                                                   vm_object_trunc_page(src_offset));
5260                         if (src_page == dst_page) {
5261                                 src_prot = dst_prot;
5262                                 result_page = VM_PAGE_NULL;
5263                         } else {
5264                                 src_prot = VM_PROT_READ;
5265                                 vm_object_paging_begin(src_object);
5266
5267                                 if (amount_left > (vm_size_t) -1) {
5268                                         /* 32-bit overflow */
5269                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5270                                 } else {
5271                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
5272                                         assert(fault_info_src.cluster_size == amount_left);
5273                                 }
5274
5275                                 XPR(XPR_VM_FAULT,
5276                                         "vm_fault_copy(2) -> vm_fault_page\n",
5277                                         0,0,0,0,0);
5278                                 result_page = VM_PAGE_NULL;
5279                                 result = vm_fault_page(
5280                                         src_object,
5281                                         vm_object_trunc_page(src_offset),
5282                                         VM_PROT_READ, FALSE,
5283                                         FALSE, /* page not looked up */
5284                                         &src_prot,
5285                                         &result_page, &src_top_page,
5286                                         (int *)0, &error, FALSE,
5287                                         FALSE, &fault_info_src);
5288
5289                                 switch (result) {
5290                                 case VM_FAULT_SUCCESS:
5291                                         break;
5292                                 case VM_FAULT_RETRY:
5293                                         goto RetrySourceFault;
5294                                 case VM_FAULT_MEMORY_SHORTAGE:
5295                                         if (vm_page_wait(interruptible))
5296                                                 goto RetrySourceFault;
5297                                         /* fall thru */
5298                                 case VM_FAULT_INTERRUPTED:
5299                                         vm_fault_copy_dst_cleanup(dst_page);
5300                                         RETURN(MACH_SEND_INTERRUPTED);
5301                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5302                                         /* success but no VM page: fail */
5303                                         vm_object_paging_end(src_object);
5304                                         vm_object_unlock(src_object);
5305                                         /*FALLTHROUGH*/
5306                                 case VM_FAULT_MEMORY_ERROR:
5307                                         vm_fault_copy_dst_cleanup(dst_page);
5308                                         if (error)
5309                                                 return (error);
5310                                         else
5311                                                 return(KERN_MEMORY_ERROR);
5312                                 default:
5313                                         panic("vm_fault_copy(2): unexpected "
5314                                               "error 0x%x from "
5315                                               "vm_fault_page()\n", result);
5316                                 }
5317
5318
5319                                 assert((src_top_page == VM_PAGE_NULL) ==
5320                                        (result_page->object == src_object));
5321                         }
5322                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5323                         vm_object_unlock(result_page->object);
5324                 }
5325
5326                 if (!vm_map_verify(dst_map, dst_version)) {
5327                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5328                                 vm_fault_copy_cleanup(result_page, src_top_page);
5329                         vm_fault_copy_dst_cleanup(dst_page);
5330                         break;
5331                 }
5332
5333                 vm_object_lock(dst_page->object);
5334
5335                 if (dst_page->object->copy != old_copy_object) {
5336                         vm_object_unlock(dst_page->object);
5337                         vm_map_verify_done(dst_map, dst_version);
5338                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5339                                 vm_fault_copy_cleanup(result_page, src_top_page);
5340                         vm_fault_copy_dst_cleanup(dst_page);
5341                         break;
5342                 }
5343                 vm_object_unlock(dst_page->object);
5344
5345                 /*
5346                  *      Copy the page, and note that it is dirty
5347                  *      immediately.
5348                  */
5349
5350                 if (!page_aligned(src_offset) ||
5351                         !page_aligned(dst_offset) ||
5352                         !page_aligned(amount_left)) {
5353
5354                         vm_object_offset_t      src_po,
5355                                                 dst_po;
5356
5357                         src_po = src_offset - vm_object_trunc_page(src_offset);
5358                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5359
5360                         if (dst_po > src_po) {
5361                                 part_size = PAGE_SIZE - dst_po;
5362                         } else {
5363                                 part_size = PAGE_SIZE - src_po;
5364                         }
5365                         if (part_size > (amount_left)){
5366                                 part_size = amount_left;
5367                         }
5368
5369                         if (result_page == VM_PAGE_NULL) {
5370                                 assert((vm_offset_t) dst_po == dst_po);
5371                                 assert((vm_size_t) part_size == part_size);
5372                                 vm_page_part_zero_fill(dst_page,
5373                                                        (vm_offset_t) dst_po,
5374                                                        (vm_size_t) part_size);
5375                         } else {
5376                                 assert((vm_offset_t) src_po == src_po);
5377                                 assert((vm_offset_t) dst_po == dst_po);
5378                                 assert((vm_size_t) part_size == part_size);
5379                                 vm_page_part_copy(result_page,
5380                                                   (vm_offset_t) src_po,
5381                                                   dst_page,
5382                                                   (vm_offset_t) dst_po,
5383                                                   (vm_size_t)part_size);
5384                                 if(!dst_page->dirty){
5385                                         vm_object_lock(dst_object);
5386                                         SET_PAGE_DIRTY(dst_page, TRUE);
5387                                         vm_object_unlock(dst_page->object);
5388                                 }
5389
5390                         }
5391                 } else {
5392                         part_size = PAGE_SIZE;
5393
5394                         if (result_page == VM_PAGE_NULL)
5395                                 vm_page_zero_fill(dst_page);
5396                         else{
5397                                 vm_object_lock(result_page->object);
5398                                 vm_page_copy(result_page, dst_page);
5399                                 vm_object_unlock(result_page->object);
5400
5401                                 if(!dst_page->dirty){
5402                                         vm_object_lock(dst_object);
5403                                         SET_PAGE_DIRTY(dst_page, TRUE);
5404                                         vm_object_unlock(dst_page->object);
5405                                 }
5406                         }
5407
5408                 }
5409
5410                 /*
5411                  *      Unlock everything, and return
5412                  */
5413
5414                 vm_map_verify_done(dst_map, dst_version);
5415
5416                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5417                         vm_fault_copy_cleanup(result_page, src_top_page);
5418                 vm_fault_copy_dst_cleanup(dst_page);
5419
5420                 amount_left -= part_size;
5421                 src_offset += part_size;
5422                 dst_offset += part_size;
5423         } while (amount_left > 0);
5424
5425         RETURN(KERN_SUCCESS);
5426 #undef  RETURN
5427
5428         /*NOTREACHED*/
5429 }
5430
5431 #if     VM_FAULT_CLASSIFY
5432 /*
5433  *      Temporary statistics gathering support.
5434  */
5435
5436 /*
5437  *      Statistics arrays:
5438  */
5439 #define VM_FAULT_TYPES_MAX      5
5440 #define VM_FAULT_LEVEL_MAX      8
5441
5442 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5443
5444 #define VM_FAULT_TYPE_ZERO_FILL 0
5445 #define VM_FAULT_TYPE_MAP_IN    1
5446 #define VM_FAULT_TYPE_PAGER     2
5447 #define VM_FAULT_TYPE_COPY      3
5448 #define VM_FAULT_TYPE_OTHER     4
5449
5450
5451 void
5452 vm_fault_classify(vm_object_t           object,
5453                   vm_object_offset_t    offset,
5454                   vm_prot_t             fault_type)
5455 {
5456         int             type, level = 0;
5457         vm_page_t       m;
5458
5459         while (TRUE) {
5460                 m = vm_page_lookup(object, offset);
5461                 if (m != VM_PAGE_NULL) {
5462                         if (m->busy || m->error || m->restart || m->absent) {
5463                                 type = VM_FAULT_TYPE_OTHER;
5464                                 break;
5465                         }
5466                         if (((fault_type & VM_PROT_WRITE) == 0) ||
5467                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
5468                                 type = VM_FAULT_TYPE_MAP_IN;
5469                                 break;
5470                         }
5471                         type = VM_FAULT_TYPE_COPY;
5472                         break;
5473                 }
5474                 else {
5475                         if (object->pager_created) {
5476                                 type = VM_FAULT_TYPE_PAGER;
5477                                 break;
5478                         }
5479                         if (object->shadow == VM_OBJECT_NULL) {
5480                                 type = VM_FAULT_TYPE_ZERO_FILL;
5481                                 break;
5482                         }
5483
5484                         offset += object->vo_shadow_offset;
5485                         object = object->shadow;
5486                         level++;
5487                         continue;
5488                 }
5489         }
5490
5491         if (level > VM_FAULT_LEVEL_MAX)
5492                 level = VM_FAULT_LEVEL_MAX;
5493
5494         vm_fault_stats[type][level] += 1;
5495
5496         return;
5497 }
5498
5499 /* cleanup routine to call from debugger */
5500
5501 void
5502 vm_fault_classify_init(void)
5503 {
5504         int type, level;
5505
5506         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
5507                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
5508                         vm_fault_stats[type][level] = 0;
5509                 }
5510         }
5511
5512         return;
5513 }
5514 #endif  /* VM_FAULT_CLASSIFY */
5515
5516
5517 void
5518 vm_page_validate_cs_mapped(
5519         vm_page_t       page,
5520         const void      *kaddr)
5521 {
5522         vm_object_t             object;
5523         vm_object_offset_t      offset;
5524         kern_return_t           kr;
5525         memory_object_t         pager;
5526         void                    *blobs;
5527         boolean_t               validated, tainted;
5528
5529         assert(page->busy);
5530         vm_object_lock_assert_exclusive(page->object);
5531
5532         if (!cs_validation) {
5533                 return;
5534         }
5535
5536         if (page->wpmapped && !page->cs_tainted) {
5537                 /*
5538                  * This page was mapped for "write" access sometime in the
5539                  * past and could still be modifiable in the future.
5540                  * Consider it tainted.
5541                  * [ If the page was already found to be "tainted", no
5542                  * need to re-validate. ]
5543                  */
5544                 page->cs_validated = TRUE;
5545                 page->cs_tainted = TRUE;
5546                 if (cs_debug) {
5547                         printf("CODESIGNING: vm_page_validate_cs: "
5548                                "page %p obj %p off 0x%llx "
5549                                "was modified\n",
5550                                page, page->object, page->offset);
5551                 }
5552                 vm_cs_validated_dirtied++;
5553         }
5554
5555         if (page->cs_validated) {
5556                 return;
5557         }
5558
5559         vm_cs_validates++;
5560
5561         object = page->object;
5562         assert(object->code_signed);
5563         offset = page->offset;
5564
5565         if (!object->alive || object->terminating || object->pager == NULL) {
5566                 /*
5567                  * The object is terminating and we don't have its pager
5568                  * so we can't validate the data...
5569                  */
5570                 return;
5571         }
5572         /*
5573          * Since we get here to validate a page that was brought in by
5574          * the pager, we know that this pager is all setup and ready
5575          * by now.
5576          */
5577         assert(!object->internal);
5578         assert(object->pager != NULL);
5579         assert(object->pager_ready);
5580
5581         pager = object->pager;
5582         assert(object->paging_in_progress);
5583         kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5584         if (kr != KERN_SUCCESS) {
5585                 blobs = NULL;
5586         }
5587
5588         /* verify the SHA1 hash for this page */
5589         validated = cs_validate_page(blobs,
5590                                      pager,
5591                                      offset + object->paging_offset,
5592                                      (const void *)kaddr,
5593                                      &tainted);
5594
5595         page->cs_validated = validated;
5596         if (validated) {
5597                 page->cs_tainted = tainted;
5598         }
5599 }
5600
5601 extern int panic_on_cs_killed;
5602 void
5603 vm_page_validate_cs(
5604         vm_page_t       page)
5605 {
5606         vm_object_t             object;
5607         vm_object_offset_t      offset;
5608         vm_map_offset_t         koffset;
5609         vm_map_size_t           ksize;
5610         vm_offset_t             kaddr;
5611         kern_return_t           kr;
5612         boolean_t               busy_page;
5613         boolean_t               need_unmap;
5614
5615         vm_object_lock_assert_held(page->object);
5616
5617         if (!cs_validation) {
5618                 return;
5619         }
5620
5621         if (page->wpmapped && !page->cs_tainted) {
5622                 vm_object_lock_assert_exclusive(page->object);
5623
5624                 /*
5625                  * This page was mapped for "write" access sometime in the
5626                  * past and could still be modifiable in the future.
5627                  * Consider it tainted.
5628                  * [ If the page was already found to be "tainted", no
5629                  * need to re-validate. ]
5630                  */
5631                 page->cs_validated = TRUE;
5632                 page->cs_tainted = TRUE;
5633                 if (cs_debug) {
5634                         printf("CODESIGNING: vm_page_validate_cs: "
5635                                "page %p obj %p off 0x%llx "
5636                                "was modified\n",
5637                                page, page->object, page->offset);
5638                 }
5639                 vm_cs_validated_dirtied++;
5640         }
5641
5642         if (page->cs_validated) {
5643                 return;
5644         }
5645
5646         if (panic_on_cs_killed &&
5647             page->slid) {
5648                 panic("vm_page_validate_cs(%p): page is slid\n", page);
5649         }
5650         assert(!page->slid);
5651
5652 #if CHECK_CS_VALIDATION_BITMAP
5653         if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5654                 page->cs_validated = TRUE;
5655                 page->cs_tainted = FALSE;
5656                 vm_cs_bitmap_validated++;
5657                 return;
5658         }
5659 #endif
5660         vm_object_lock_assert_exclusive(page->object);
5661
5662         object = page->object;
5663         assert(object->code_signed);
5664         offset = page->offset;
5665
5666         busy_page = page->busy;
5667         if (!busy_page) {
5668                 /* keep page busy while we map (and unlock) the VM object */
5669                 page->busy = TRUE;
5670         }
5671
5672         /*
5673          * Take a paging reference on the VM object
5674          * to protect it from collapse or bypass,
5675          * and keep it from disappearing too.
5676          */
5677         vm_object_paging_begin(object);
5678
5679         /* map the page in the kernel address space */
5680         ksize = PAGE_SIZE_64;
5681         koffset = 0;
5682         need_unmap = FALSE;
5683         kr = vm_paging_map_object(page,
5684                                   object,
5685                                   offset,
5686                                   VM_PROT_READ,
5687                                   FALSE, /* can't unlock object ! */
5688                                   &ksize,
5689                                   &koffset,
5690                                   &need_unmap);
5691         if (kr != KERN_SUCCESS) {
5692                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5693         }
5694         kaddr = CAST_DOWN(vm_offset_t, koffset);
5695
5696         /* validate the mapped page */
5697         vm_page_validate_cs_mapped(page, (const void *) kaddr);
5698
5699 #if CHECK_CS_VALIDATION_BITMAP
5700         if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5701                 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5702         }
5703 #endif
5704         assert(page->busy);
5705         assert(object == page->object);
5706         vm_object_lock_assert_exclusive(object);
5707
5708         if (!busy_page) {
5709                 PAGE_WAKEUP_DONE(page);
5710         }
5711         if (need_unmap) {
5712                 /* unmap the map from the kernel address space */
5713                 vm_paging_unmap_object(object, koffset, koffset + ksize);
5714                 koffset = 0;
5715                 ksize = 0;
5716                 kaddr = 0;
5717         }
5718         vm_object_paging_end(object);
5719 }