1 /*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65 #include <mach_cluster_stats.h>
66 #include <mach_pagemap.h>
67 #include <libkern/OSAtomic.h>
68
69 #include <mach/mach_types.h>
70 #include <mach/kern_return.h>
71 #include <mach/message.h> /* for error codes */
72 #include <mach/vm_param.h>
73 #include <mach/vm_behavior.h>
74 #include <mach/memory_object.h>
75 /* For memory_object_data_{request,unlock} */
76 #include <mach/sdt.h>
77
78 #include <kern/kern_types.h>
79 #include <kern/host_statistics.h>
80 #include <kern/counters.h>
81 #include <kern/task.h>
82 #include <kern/thread.h>
83 #include <kern/sched_prim.h>
84 #include <kern/host.h>
85 #include <kern/xpr.h>
86 #include <kern/mach_param.h>
87 #include <kern/macro_help.h>
88 #include <kern/zalloc.h>
89 #include <kern/misc_protos.h>
90
91 #include <vm/vm_compressor.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_fault.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_kern.h>
98 #include <vm/pmap.h>
99 #include <vm/vm_pageout.h>
100 #include <vm/vm_protos.h>
101 #include <vm/vm_external.h>
102 #include <vm/memory_object.h>
103 #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
104 #include <vm/vm_shared_region.h>
105
106 #include <sys/codesign.h>
107
108 #include <libsa/sys/timers.h> /* for struct timespec */
109
110 #define VM_FAULT_CLASSIFY 0
111
112 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
113
114 unsigned int vm_object_pagein_throttle = 16;
115
116 /*
 117 * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control; it
 118 * kicks in when swap space runs out. 64-bit programs have massive address spaces, and if they're buggy they
 119 * can leak enormous amounts of memory and run the system completely out of swap space. If this happens, we
120 * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
121 * keep the UI active so that the user has a chance to kill the offending task before the system
122 * completely hangs.
123 *
124 * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
125 * to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
126 * will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
127 * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
128 */
129
130 extern void throttle_lowpri_io(int);
131
132 uint64_t vm_hard_throttle_threshold;
133
134
135
136 #define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \
137 (vm_page_free_count < vm_page_throttle_limit && \
138 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
139
140
141 #define HARD_THROTTLE_DELAY 20000 /* 20000 us == 20 ms */
142 #define SOFT_THROTTLE_DELAY 2000 /* 2000 us == 2 ms */
143
144 boolean_t current_thread_aborted(void);
145
146 /* Forward declarations of internal routines. */
147 extern kern_return_t vm_fault_wire_fast(
148 vm_map_t map,
149 vm_map_offset_t va,
150 vm_map_entry_t entry,
151 pmap_t pmap,
152 vm_map_offset_t pmap_addr,
153 ppnum_t *physpage_p);
154
155 extern void vm_fault_continue(void);
156
157 extern void vm_fault_copy_cleanup(
158 vm_page_t page,
159 vm_page_t top_page);
160
161 extern void vm_fault_copy_dst_cleanup(
162 vm_page_t page);
163
164 #if VM_FAULT_CLASSIFY
165 extern void vm_fault_classify(vm_object_t object,
166 vm_object_offset_t offset,
167 vm_prot_t fault_type);
168
169 extern void vm_fault_classify_init(void);
170 #endif
171
172 unsigned long vm_pmap_enter_blocked = 0;
173 unsigned long vm_pmap_enter_retried = 0;
174
175 unsigned long vm_cs_validates = 0;
176 unsigned long vm_cs_revalidates = 0;
177 unsigned long vm_cs_query_modified = 0;
178 unsigned long vm_cs_validated_dirtied = 0;
179 unsigned long vm_cs_bitmap_validated = 0;
180
181 void vm_pre_fault(vm_map_offset_t);
182
183 /*
184 * Routine: vm_fault_init
185 * Purpose:
186 * Initialize our private data structures.
187 */
188 void
189 vm_fault_init(void)
190 {
191 int i, vm_compressor_temp;
192 boolean_t need_default_val = TRUE;
193 /*
194 * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
195 * computed as a percentage of available memory, and the percentage used is scaled inversely with
196 * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
197 * and reduce the value down to 10% for very large memory configurations. This helps give us a
198 * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
199 * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
200 */
201
202 vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
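/*
 * Worked example of the formula above (illustrative only; the exact value
 * depends on sane_size at boot, whole gigabytes assumed for simplicity):
 *
 *	 4 GB of RAM:  sane_size * (35 -  4) / 100  ->  ~31% of RAM
 *	16 GB of RAM:  sane_size * (35 - 16) / 100  ->  ~19% of RAM
 *	32 GB of RAM:  sane_size * (35 - 25) / 100  ->   10% of RAM (MIN clamps at 25)
 *
 * so small machines tolerate a proportionally larger footprint before a task
 * is treated as a memory hog, while very large machines bottom out at 10%.
 */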
203
204 /*
205 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
206 */
207
208 if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
209 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
210 if (vm_compressor_temp > 0 &&
211 ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
212 need_default_val = FALSE;
213 vm_compressor_mode = vm_compressor_temp;
214 break;
215 }
216 }
217 if (need_default_val)
218 printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
219 }
220 if (need_default_val) {
221 /* If no boot arg or incorrect boot arg, try device tree. */
222 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
223 }
224 PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
225 printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
226 }
227
228 /*
229 * Routine: vm_fault_cleanup
230 * Purpose:
231 * Clean up the result of vm_fault_page.
232 * Results:
233 * The paging reference for "object" is released.
234 * "object" is unlocked.
235 * If "top_page" is not null, "top_page" is
236 * freed and the paging reference for the object
237 * containing it is released.
238 *
239 * In/out conditions:
240 * "object" must be locked.
241 */
242 void
243 vm_fault_cleanup(
244 register vm_object_t object,
245 register vm_page_t top_page)
246 {
247 vm_object_paging_end(object);
248 vm_object_unlock(object);
249
250 if (top_page != VM_PAGE_NULL) {
251 object = top_page->object;
252
253 vm_object_lock(object);
254 VM_PAGE_FREE(top_page);
255 vm_object_paging_end(object);
256 vm_object_unlock(object);
257 }
258 }
259
260 #if MACH_CLUSTER_STATS
261 #define MAXCLUSTERPAGES 16
262 struct {
263 unsigned long pages_in_cluster;
264 unsigned long pages_at_higher_offsets;
265 unsigned long pages_at_lower_offsets;
266 } cluster_stats_in[MAXCLUSTERPAGES];
267 #define CLUSTER_STAT(clause) clause
268 #define CLUSTER_STAT_HIGHER(x) \
269 ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
270 #define CLUSTER_STAT_LOWER(x) \
271 ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
272 #define CLUSTER_STAT_CLUSTER(x) \
273 ((cluster_stats_in[(x)].pages_in_cluster)++)
274 #else /* MACH_CLUSTER_STATS */
275 #define CLUSTER_STAT(clause)
276 #endif /* MACH_CLUSTER_STATS */
277
278 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
279
280
281 boolean_t vm_page_deactivate_behind = TRUE;
282 /*
283 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
284 */
285 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
286 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
287 /* we use it to size an array on the stack */
288
289 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
290
291 #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
292
293 /*
 294 * vm_fault_is_sequential
295 *
296 * Determine if sequential access is in progress
297 * in accordance with the behavior specified.
298 * Update state to indicate current access pattern.
299 *
300 * object must have at least the shared lock held
301 */
302 static
303 void
304 vm_fault_is_sequential(
305 vm_object_t object,
306 vm_object_offset_t offset,
307 vm_behavior_t behavior)
308 {
309 vm_object_offset_t last_alloc;
310 int sequential;
311 int orig_sequential;
312
313 last_alloc = object->last_alloc;
314 sequential = object->sequential;
315 orig_sequential = sequential;
316
317 switch (behavior) {
318 case VM_BEHAVIOR_RANDOM:
319 /*
320 * reset indicator of sequential behavior
321 */
322 sequential = 0;
323 break;
324
325 case VM_BEHAVIOR_SEQUENTIAL:
326 if (offset && last_alloc == offset - PAGE_SIZE_64) {
327 /*
328 * advance indicator of sequential behavior
329 */
330 if (sequential < MAX_SEQUENTIAL_RUN)
331 sequential += PAGE_SIZE;
332 } else {
333 /*
334 * reset indicator of sequential behavior
335 */
336 sequential = 0;
337 }
338 break;
339
340 case VM_BEHAVIOR_RSEQNTL:
341 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
342 /*
343 * advance indicator of sequential behavior
344 */
345 if (sequential > -MAX_SEQUENTIAL_RUN)
346 sequential -= PAGE_SIZE;
347 } else {
348 /*
349 * reset indicator of sequential behavior
350 */
351 sequential = 0;
352 }
353 break;
354
355 case VM_BEHAVIOR_DEFAULT:
356 default:
357 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
358 /*
359 * advance indicator of sequential behavior
360 */
361 if (sequential < 0)
362 sequential = 0;
363 if (sequential < MAX_SEQUENTIAL_RUN)
364 sequential += PAGE_SIZE;
365
366 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
367 /*
368 * advance indicator of sequential behavior
369 */
370 if (sequential > 0)
371 sequential = 0;
372 if (sequential > -MAX_SEQUENTIAL_RUN)
373 sequential -= PAGE_SIZE;
374 } else {
375 /*
376 * reset indicator of sequential behavior
377 */
378 sequential = 0;
379 }
380 break;
381 }
382 if (sequential != orig_sequential) {
383 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
384 /*
385 * if someone else has already updated object->sequential
386 * don't bother trying to update it or object->last_alloc
387 */
388 return;
389 }
390 }
391 /*
392 * I'd like to do this with a OSCompareAndSwap64, but that
393 * doesn't exist for PPC... however, it shouldn't matter
394 * that much... last_alloc is maintained so that we can determine
395 * if a sequential access pattern is taking place... if only
396 * one thread is banging on this object, no problem with the unprotected
397 * update... if 2 or more threads are banging away, we run the risk of
398 * someone seeing a mangled update... however, in the face of multiple
399 * accesses, no sequential access pattern can develop anyway, so we
400 * haven't lost any real info.
401 */
402 object->last_alloc = offset;
403 }
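/*
 * Illustrative trace of the logic above (VM_BEHAVIOR_DEFAULT, hypothetical
 * offsets): after a fault at offset 0, faults at 0x1000, 0x2000 and 0x3000
 * each satisfy "last_alloc == offset - PAGE_SIZE_64", so object->sequential
 * climbs by PAGE_SIZE per fault (capped at MAX_SEQUENTIAL_RUN).  A later
 * fault at an unrelated offset, say 0x9000, matches neither the forward nor
 * the backward test and resets object->sequential to 0.
 */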
404
405
406 int vm_page_deactivate_behind_count = 0;
407
408 /*
 409 * vm_fault_deactivate_behind
410 *
411 * Determine if sequential access is in progress
412 * in accordance with the behavior specified. If
413 * so, compute a potential page to deactivate and
414 * deactivate it.
415 *
416 * object must be locked.
417 *
418 * return TRUE if we actually deactivate a page
419 */
420 static
421 boolean_t
422 vm_fault_deactivate_behind(
423 vm_object_t object,
424 vm_object_offset_t offset,
425 vm_behavior_t behavior)
426 {
427 int n;
428 int pages_in_run = 0;
429 int max_pages_in_run = 0;
430 int sequential_run;
431 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
432 vm_object_offset_t run_offset = 0;
433 vm_object_offset_t pg_offset = 0;
434 vm_page_t m;
435 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
436
437 pages_in_run = 0;
438 #if TRACEFAULTPAGE
439 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
440 #endif
441
442 if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
443 /*
 444 * Do not deactivate pages from the kernel object: they
 445 * are not intended to become pageable.  Also bail out if
 446 * the deactivate-behind mechanism has been disabled.
447 */
448 return FALSE;
449 }
450 if ((sequential_run = object->sequential)) {
451 if (sequential_run < 0) {
452 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
453 sequential_run = 0 - sequential_run;
454 } else {
455 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
456 }
457 }
458 switch (behavior) {
459 case VM_BEHAVIOR_RANDOM:
460 break;
461 case VM_BEHAVIOR_SEQUENTIAL:
462 if (sequential_run >= (int)PAGE_SIZE) {
463 run_offset = 0 - PAGE_SIZE_64;
464 max_pages_in_run = 1;
465 }
466 break;
467 case VM_BEHAVIOR_RSEQNTL:
468 if (sequential_run >= (int)PAGE_SIZE) {
469 run_offset = PAGE_SIZE_64;
470 max_pages_in_run = 1;
471 }
472 break;
473 case VM_BEHAVIOR_DEFAULT:
474 default:
475 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
476
477 /*
 478 * determine if the run of sequential accesses has been
479 * long enough on an object with default access behavior
480 * to consider it for deactivation
481 */
482 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
483 /*
484 * the comparisons between offset and behind are done
485 * in this kind of odd fashion in order to prevent wrap around
486 * at the end points
487 */
488 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
489 if (offset >= behind) {
490 run_offset = 0 - behind;
491 pg_offset = PAGE_SIZE_64;
492 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
493 }
494 } else {
495 if (offset < -behind) {
496 run_offset = behind;
497 pg_offset = 0 - PAGE_SIZE_64;
498 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
499 }
500 }
501 }
502 break;
503 }
504 }
505 for (n = 0; n < max_pages_in_run; n++) {
506 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
507
508 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
509 page_run[pages_in_run++] = m;
510
511 /*
512 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
513 *
514 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
515 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 516 * new reference happens. If no further references happen on the page after that remote TLB flushes
517 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
518 * by pageout_scan, which is just fine since the last reference would have happened quite far
519 * in the past (TLB caches don't hang around for very long), and of course could just as easily
520 * have happened before we did the deactivate_behind.
521 */
522 pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
523 }
524 }
525 if (pages_in_run) {
526 vm_page_lockspin_queues();
527
528 for (n = 0; n < pages_in_run; n++) {
529
530 m = page_run[n];
531
532 vm_page_deactivate_internal(m, FALSE);
533
534 vm_page_deactivate_behind_count++;
535 #if TRACEFAULTPAGE
536 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
537 #endif
538 }
539 vm_page_unlock_queues();
540
541 return TRUE;
542 }
543 return FALSE;
544 }
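/*
 * Illustrative example of the VM_BEHAVIOR_DEFAULT case above (numbers assume
 * 4K pages and the default tunables): once a forward sequential run reaches
 * vm_default_behind pages (128 pages == 512KB) and the run length is a
 * multiple of the 16-page cluster, the loop above looks up the 16 pages
 * starting 512KB behind the current fault offset, clears their referenced
 * bits without a TLB flush, and moves them to the inactive queue so that a
 * large streaming read doesn't flood the active queue.
 */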
545
546
547 static int
548 vm_page_throttled(void)
549 {
550 clock_sec_t elapsed_sec;
551 clock_sec_t tv_sec;
552 clock_usec_t tv_usec;
553
554 thread_t thread = current_thread();
555
556 if (thread->options & TH_OPT_VMPRIV)
557 return (0);
558
559 thread->t_page_creation_count++;
560
561 if (NEED_TO_HARD_THROTTLE_THIS_TASK())
562 return (HARD_THROTTLE_DELAY);
563
564 if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
565 thread->t_page_creation_count > vm_page_creation_throttle) {
566
567 clock_get_system_microtime(&tv_sec, &tv_usec);
568
569 elapsed_sec = tv_sec - thread->t_page_creation_time;
570
571 if (elapsed_sec <= 6 || (thread->t_page_creation_count / elapsed_sec) >= (vm_page_creation_throttle / 6)) {
572
573 if (elapsed_sec >= 60) {
574 /*
575 * we'll reset our stats to give a well behaved app
576 * that was unlucky enough to accumulate a bunch of pages
577 * over a long period of time a chance to get out of
578 * the throttled state... we reset the counter and timestamp
579 * so that if it stays under the rate limit for the next second
580 * it will be back in our good graces... if it exceeds it, it
581 * will remain in the throttled state
582 */
583 thread->t_page_creation_time = tv_sec;
584 thread->t_page_creation_count = (vm_page_creation_throttle / 6) * 5;
585 }
586 ++vm_page_throttle_count;
587
588 if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED())
589 return (HARD_THROTTLE_DELAY);
590 else
591 return (SOFT_THROTTLE_DELAY);
592 }
593 thread->t_page_creation_time = tv_sec;
594 thread->t_page_creation_count = 0;
595 }
596 return (0);
597 }
598
599
600 /*
601 * check for various conditions that would
602 * prevent us from creating a ZF page...
603 * cleanup is based on being called from vm_fault_page
604 *
605 * object must be locked
606 * object == m->object
607 */
608 static vm_fault_return_t
609 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state)
610 {
611 int throttle_delay;
612
613 if (object->shadow_severed ||
614 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
615 /*
616 * Either:
617 * 1. the shadow chain was severed,
618 * 2. the purgeable object is volatile or empty and is marked
619 * to fault on access while volatile.
620 * Just have to return an error at this point
621 */
622 if (m != VM_PAGE_NULL)
623 VM_PAGE_FREE(m);
624 vm_fault_cleanup(object, first_m);
625
626 thread_interrupt_level(interruptible_state);
627
628 return (VM_FAULT_MEMORY_ERROR);
629 }
630 if (vm_backing_store_low) {
631 /*
 632 * Are we protecting the system from
 633 * backing store exhaustion?  If so,
 634 * sleep unless we are privileged.
635 */
636 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
637
638 if (m != VM_PAGE_NULL)
639 VM_PAGE_FREE(m);
640 vm_fault_cleanup(object, first_m);
641
642 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
643
644 thread_block(THREAD_CONTINUE_NULL);
645 thread_interrupt_level(interruptible_state);
646
647 return (VM_FAULT_RETRY);
648 }
649 }
650 if ((throttle_delay = vm_page_throttled())) {
651 /*
652 * we're throttling zero-fills...
653 * treat this as if we couldn't grab a page
654 */
655 if (m != VM_PAGE_NULL)
656 VM_PAGE_FREE(m);
657 vm_fault_cleanup(object, first_m);
658
659 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
660
661 delay(throttle_delay);
662
663 if (current_thread_aborted()) {
664 thread_interrupt_level(interruptible_state);
665 return VM_FAULT_INTERRUPTED;
666 }
667 thread_interrupt_level(interruptible_state);
668
669 return (VM_FAULT_MEMORY_SHORTAGE);
670 }
671 return (VM_FAULT_SUCCESS);
672 }
673
674
675 /*
676 * do the work to zero fill a page and
677 * inject it into the correct paging queue
678 *
679 * m->object must be locked
680 * page queue lock must NOT be held
681 */
682 static int
683 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
684 {
685 int my_fault = DBG_ZERO_FILL_FAULT;
686
687 /*
 688 * This is a zero-fill page fault...
689 *
690 * Checking the page lock is a waste of
691 * time; this page was absent, so
692 * it can't be page locked by a pager.
693 *
694 * we also consider it undefined
695 * with respect to instruction
696 * execution. i.e. it is the responsibility
697 * of higher layers to call for an instruction
698 * sync after changing the contents and before
699 * sending a program into this area. We
700 * choose this approach for performance
701 */
702 m->pmapped = TRUE;
703
704 m->cs_validated = FALSE;
705 m->cs_tainted = FALSE;
706
707 if (no_zero_fill == TRUE) {
708 my_fault = DBG_NZF_PAGE_FAULT;
709
710 if (m->absent && m->busy)
711 return (my_fault);
712 } else {
713 vm_page_zero_fill(m);
714
715 VM_STAT_INCR(zero_fill_count);
716 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
717 }
718 assert(!m->laundry);
719 assert(m->object != kernel_object);
720 //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
721
722 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
723 (m->object->purgable == VM_PURGABLE_DENY ||
724 m->object->purgable == VM_PURGABLE_NONVOLATILE ||
725 m->object->purgable == VM_PURGABLE_VOLATILE )) {
726
727 vm_page_lockspin_queues();
728
729 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
730 assert(!VM_PAGE_WIRED(m));
731
732 /*
733 * can't be on the pageout queue since we don't
734 * have a pager to try and clean to
735 */
736 assert(!m->pageout_queue);
737
738 VM_PAGE_QUEUES_REMOVE(m);
739
740 queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
741 m->throttled = TRUE;
742 vm_page_throttled_count++;
743 }
744 vm_page_unlock_queues();
745 }
746 return (my_fault);
747 }
748
749
750 /*
751 * Routine: vm_fault_page
752 * Purpose:
753 * Find the resident page for the virtual memory
754 * specified by the given virtual memory object
755 * and offset.
756 * Additional arguments:
 757 * The required permissions for the page are given
758 * in "fault_type". Desired permissions are included
759 * in "protection".
760 * fault_info is passed along to determine pagein cluster
761 * limits... it contains the expected reference pattern,
762 * cluster size if available, etc...
763 *
764 * If the desired page is known to be resident (for
765 * example, because it was previously wired down), asserting
766 * the "unwiring" parameter will speed the search.
767 *
768 * If the operation can be interrupted (by thread_abort
769 * or thread_terminate), then the "interruptible"
770 * parameter should be asserted.
771 *
772 * Results:
773 * The page containing the proper data is returned
774 * in "result_page".
775 *
776 * In/out conditions:
777 * The source object must be locked and referenced,
778 * and must donate one paging reference. The reference
779 * is not affected. The paging reference and lock are
780 * consumed.
781 *
782 * If the call succeeds, the object in which "result_page"
783 * resides is left locked and holding a paging reference.
784 * If this is not the original object, a busy page in the
785 * original object is returned in "top_page", to prevent other
786 * callers from pursuing this same data, along with a paging
787 * reference for the original object. The "top_page" should
788 * be destroyed when this guarantee is no longer required.
789 * The "result_page" is also left busy. It is not removed
790 * from the pageout queues.
791 * Special Case:
792 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
793 * fault succeeded but there's no VM page (i.e. the VM object
794 * does not actually hold VM pages, but device memory or
795 * large pages). The object is still locked and we still hold a
796 * paging_in_progress reference.
797 */
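/*
 * Minimal caller sketch implied by the contract above (illustrative only;
 * "object", "offset", "prot", "result_page", "top_page", "error_code",
 * "fault_info" and "kr" are assumed to be locals set up by the caller, the
 * caller is assumed to already hold a reference on "object", and handling
 * of the other VM_FAULT_* return values is omitted):
 *
 *	vm_object_lock(object);
 *	vm_object_paging_begin(object);		// donate the paging reference
 *	kr = vm_fault_page(object, offset, VM_PROT_READ,
 *			   FALSE,		// must_be_resident
 *			   FALSE,		// caller_lookup
 *			   &prot, &result_page, &top_page,
 *			   NULL,		// type_of_fault
 *			   &error_code,
 *			   FALSE,		// no_zero_fill
 *			   FALSE,		// data_supply
 *			   &fault_info);
 *	if (kr == VM_FAULT_SUCCESS) {
 *		// result_page is returned busy, with its object locked and
 *		// still holding a paging reference
 *		... consume the page ...
 *		PAGE_WAKEUP_DONE(result_page);
 *		vm_fault_cleanup(result_page->object, top_page);
 *	}
 */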
798 unsigned int vm_fault_page_blocked_access = 0;
799 unsigned int vm_fault_page_forced_retry = 0;
800
801 vm_fault_return_t
802 vm_fault_page(
803 /* Arguments: */
804 vm_object_t first_object, /* Object to begin search */
805 vm_object_offset_t first_offset, /* Offset into object */
806 vm_prot_t fault_type, /* What access is requested */
807 boolean_t must_be_resident,/* Must page be resident? */
808 boolean_t caller_lookup, /* caller looked up page */
809 /* Modifies in place: */
810 vm_prot_t *protection, /* Protection for mapping */
811 vm_page_t *result_page, /* Page found, if successful */
812 /* Returns: */
813 vm_page_t *top_page, /* Page in top object, if
814 * not result_page. */
815 int *type_of_fault, /* if non-null, fill in with type of fault
816 * COW, zero-fill, etc... returned in trace point */
817 /* More arguments: */
818 kern_return_t *error_code, /* code if page is in error */
819 boolean_t no_zero_fill, /* don't zero fill absent pages */
820 boolean_t data_supply, /* treat as data_supply if
821 * it is a write fault and a full
822 * page is provided */
823 vm_object_fault_info_t fault_info)
824 {
825 vm_page_t m;
826 vm_object_t object;
827 vm_object_offset_t offset;
828 vm_page_t first_m;
829 vm_object_t next_object;
830 vm_object_t copy_object;
831 boolean_t look_for_page;
832 boolean_t force_fault_retry = FALSE;
833 vm_prot_t access_required = fault_type;
834 vm_prot_t wants_copy_flag;
835 CLUSTER_STAT(int pages_at_higher_offsets;)
836 CLUSTER_STAT(int pages_at_lower_offsets;)
837 kern_return_t wait_result;
838 boolean_t interruptible_state;
839 boolean_t data_already_requested = FALSE;
840 vm_behavior_t orig_behavior;
841 vm_size_t orig_cluster_size;
842 vm_fault_return_t error;
843 int my_fault;
844 uint32_t try_failed_count;
845 int interruptible; /* how may fault be interrupted? */
846 int external_state = VM_EXTERNAL_STATE_UNKNOWN;
847 memory_object_t pager;
848 vm_fault_return_t retval;
849
850 /*
851 * MACH page map - an optional optimization where a bit map is maintained
852 * by the VM subsystem for internal objects to indicate which pages of
853 * the object currently reside on backing store. This existence map
854 * duplicates information maintained by the vnode pager. It is
855 * created at the time of the first pageout against the object, i.e.
 856 * at the same time the pager for the object is created. The optimization
857 * is designed to eliminate pager interaction overhead, if it is
858 * 'known' that the page does not exist on backing store.
859 *
860 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
861 * either marked as paged out in the existence map for the object or no
862 * existence map exists for the object. MUST_ASK_PAGER() is one of the
863 * criteria in the decision to invoke the pager. It is also used as one
864 * of the criteria to terminate the scan for adjacent pages in a clustered
865 * pagein operation. Note that MUST_ASK_PAGER() always evaluates to TRUE for
866 * permanent objects. Note also that if the pager for an internal object
867 * has not been created, the pager is not invoked regardless of the value
868 * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
869 * for which a pager has been created.
870 *
871 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 872 * is marked as paged out in the existence map for the object.
 873 * PAGED_OUT() is used to determine if a page has already been pushed
874 * into a copy object in order to avoid a redundant page out operation.
875 */
876 #if MACH_PAGEMAP
877 #define MUST_ASK_PAGER(o, f, s) \
878 ((vm_external_state_get((o)->existence_map, (f)) \
879 != VM_EXTERNAL_STATE_ABSENT) && \
880 (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)))) \
881 != VM_EXTERNAL_STATE_ABSENT)
882 #define PAGED_OUT(o, f) \
883 ((vm_external_state_get((o)->existence_map, (f)) \
884 == VM_EXTERNAL_STATE_EXISTS) || \
885 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) \
886 == VM_EXTERNAL_STATE_EXISTS))
887 #else /* MACH_PAGEMAP */
888 #define MUST_ASK_PAGER(o, f, s) \
889 ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
890 #define PAGED_OUT(o, f) \
891 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
892 #endif /* MACH_PAGEMAP */
893
894 /*
895 * Recovery actions
896 */
897 #define RELEASE_PAGE(m) \
898 MACRO_BEGIN \
899 PAGE_WAKEUP_DONE(m); \
900 if (!m->active && !m->inactive && !m->throttled) { \
901 vm_page_lockspin_queues(); \
902 if (!m->active && !m->inactive && !m->throttled) { \
903 if (COMPRESSED_PAGER_IS_ACTIVE) \
904 vm_page_deactivate(m); \
905 else \
906 vm_page_activate(m); \
907 } \
908 vm_page_unlock_queues(); \
909 } \
910 MACRO_END
911
912 #if TRACEFAULTPAGE
913 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
914 #endif
915
916 interruptible = fault_info->interruptible;
917 interruptible_state = thread_interrupt_level(interruptible);
918
919 /*
920 * INVARIANTS (through entire routine):
921 *
922 * 1) At all times, we must either have the object
923 * lock or a busy page in some object to prevent
924 * some other thread from trying to bring in
925 * the same page.
926 *
927 * Note that we cannot hold any locks during the
928 * pager access or when waiting for memory, so
929 * we use a busy page then.
930 *
931 * 2) To prevent another thread from racing us down the
932 * shadow chain and entering a new page in the top
933 * object before we do, we must keep a busy page in
934 * the top object while following the shadow chain.
935 *
936 * 3) We must increment paging_in_progress on any object
937 * for which we have a busy page before dropping
938 * the object lock
939 *
940 * 4) We leave busy pages on the pageout queues.
941 * If the pageout daemon comes across a busy page,
942 * it will remove the page from the pageout queues.
943 */
944
945 object = first_object;
946 offset = first_offset;
947 first_m = VM_PAGE_NULL;
948 access_required = fault_type;
949
950
951 XPR(XPR_VM_FAULT,
952 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
953 object, offset, fault_type, *protection, 0);
954
955 /*
956 * default type of fault
957 */
958 my_fault = DBG_CACHE_HIT_FAULT;
959
960 while (TRUE) {
961 #if TRACEFAULTPAGE
962 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
963 #endif
964 if (!object->alive) {
965 /*
966 * object is no longer valid
967 * clean up and return error
968 */
969 vm_fault_cleanup(object, first_m);
970 thread_interrupt_level(interruptible_state);
971
972 return (VM_FAULT_MEMORY_ERROR);
973 }
974
975 if (!object->pager_created && object->phys_contiguous) {
976 /*
977 * A physically-contiguous object without a pager:
978 * must be a "large page" object. We do not deal
979 * with VM pages for this object.
980 */
981 caller_lookup = FALSE;
982 m = VM_PAGE_NULL;
983 goto phys_contig_object;
984 }
985
986 if (object->blocked_access) {
987 /*
988 * Access to this VM object has been blocked.
989 * Replace our "paging_in_progress" reference with
990 * a "activity_in_progress" reference and wait for
991 * access to be unblocked.
992 */
993 caller_lookup = FALSE; /* no longer valid after sleep */
994 vm_object_activity_begin(object);
995 vm_object_paging_end(object);
996 while (object->blocked_access) {
997 vm_object_sleep(object,
998 VM_OBJECT_EVENT_UNBLOCKED,
999 THREAD_UNINT);
1000 }
1001 vm_fault_page_blocked_access++;
1002 vm_object_paging_begin(object);
1003 vm_object_activity_end(object);
1004 }
1005
1006 /*
1007 * See whether the page at 'offset' is resident
1008 */
1009 if (caller_lookup == TRUE) {
1010 /*
1011 * The caller has already looked up the page
1012 * and gave us the result in "result_page".
1013 * We can use this for the first lookup but
1014 * it loses its validity as soon as we unlock
1015 * the object.
1016 */
1017 m = *result_page;
1018 caller_lookup = FALSE; /* no longer valid after that */
1019 } else {
1020 m = vm_page_lookup(object, offset);
1021 }
1022 #if TRACEFAULTPAGE
1023 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1024 #endif
1025 if (m != VM_PAGE_NULL) {
1026
1027 if (m->busy) {
1028 /*
1029 * The page is being brought in,
1030 * wait for it and then retry.
1031 */
1032 #if TRACEFAULTPAGE
1033 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1034 #endif
1035 wait_result = PAGE_SLEEP(object, m, interruptible);
1036
1037 XPR(XPR_VM_FAULT,
1038 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1039 object, offset,
1040 m, 0, 0);
1041 counter(c_vm_fault_page_block_busy_kernel++);
1042
1043 if (wait_result != THREAD_AWAKENED) {
1044 vm_fault_cleanup(object, first_m);
1045 thread_interrupt_level(interruptible_state);
1046
1047 if (wait_result == THREAD_RESTART)
1048 return (VM_FAULT_RETRY);
1049 else
1050 return (VM_FAULT_INTERRUPTED);
1051 }
1052 continue;
1053 }
1054 if (m->laundry) {
1055 m->pageout = FALSE;
1056
1057 if (!m->cleaning)
1058 vm_pageout_steal_laundry(m, FALSE);
1059 }
1060 if (m->phys_page == vm_page_guard_addr) {
1061 /*
1062 * Guard page: off limits !
1063 */
1064 if (fault_type == VM_PROT_NONE) {
1065 /*
1066 * The fault is not requesting any
1067 * access to the guard page, so it must
1068 * be just to wire or unwire it.
1069 * Let's pretend it succeeded...
1070 */
1071 m->busy = TRUE;
1072 *result_page = m;
1073 assert(first_m == VM_PAGE_NULL);
1074 *top_page = first_m;
1075 if (type_of_fault)
1076 *type_of_fault = DBG_GUARD_FAULT;
1077 thread_interrupt_level(interruptible_state);
1078 return VM_FAULT_SUCCESS;
1079 } else {
1080 /*
1081 * The fault requests access to the
1082 * guard page: let's deny that !
1083 */
1084 vm_fault_cleanup(object, first_m);
1085 thread_interrupt_level(interruptible_state);
1086 return VM_FAULT_MEMORY_ERROR;
1087 }
1088 }
1089
1090 if (m->error) {
1091 /*
1092 * The page is in error, give up now.
1093 */
1094 #if TRACEFAULTPAGE
1095 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1096 #endif
1097 if (error_code)
1098 *error_code = KERN_MEMORY_ERROR;
1099 VM_PAGE_FREE(m);
1100
1101 vm_fault_cleanup(object, first_m);
1102 thread_interrupt_level(interruptible_state);
1103
1104 return (VM_FAULT_MEMORY_ERROR);
1105 }
1106 if (m->restart) {
1107 /*
1108 * The pager wants us to restart
1109 * at the top of the chain,
1110 * typically because it has moved the
1111 * page to another pager, then do so.
1112 */
1113 #if TRACEFAULTPAGE
1114 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1115 #endif
1116 VM_PAGE_FREE(m);
1117
1118 vm_fault_cleanup(object, first_m);
1119 thread_interrupt_level(interruptible_state);
1120
1121 return (VM_FAULT_RETRY);
1122 }
1123 if (m->absent) {
1124 /*
1125 * The page isn't busy, but is absent,
1126 * therefore it's deemed "unavailable".
1127 *
1128 * Remove the non-existent page (unless it's
1129 * in the top object) and move on down to the
1130 * next object (if there is one).
1131 */
1132 #if TRACEFAULTPAGE
1133 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1134 #endif
1135 next_object = object->shadow;
1136
1137 if (next_object == VM_OBJECT_NULL) {
1138 /*
1139 * Absent page at bottom of shadow
1140 * chain; zero fill the page we left
1141 * busy in the first object, and free
1142 * the absent page.
1143 */
1144 assert(!must_be_resident);
1145
1146 /*
1147 * check for any conditions that prevent
 1148 * us from creating a new zero-fill page...
1149 * vm_fault_check will do all of the
1150 * fault cleanup in the case of an error condition
1151 * including resetting the thread_interrupt_level
1152 */
1153 error = vm_fault_check(object, m, first_m, interruptible_state);
1154
1155 if (error != VM_FAULT_SUCCESS)
1156 return (error);
1157
1158 XPR(XPR_VM_FAULT,
1159 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1160 object, offset,
1161 m,
1162 first_object, 0);
1163
1164 if (object != first_object) {
1165 /*
1166 * free the absent page we just found
1167 */
1168 VM_PAGE_FREE(m);
1169
1170 /*
1171 * drop reference and lock on current object
1172 */
1173 vm_object_paging_end(object);
1174 vm_object_unlock(object);
1175
1176 /*
1177 * grab the original page we
1178 * 'soldered' in place and
1179 * retake lock on 'first_object'
1180 */
1181 m = first_m;
1182 first_m = VM_PAGE_NULL;
1183
1184 object = first_object;
1185 offset = first_offset;
1186
1187 vm_object_lock(object);
1188 } else {
1189 /*
1190 * we're going to use the absent page we just found
1191 * so convert it to a 'busy' page
1192 */
1193 m->absent = FALSE;
1194 m->busy = TRUE;
1195 }
1196 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1197 m->absent = TRUE;
1198 /*
1199 * zero-fill the page and put it on
1200 * the correct paging queue
1201 */
1202 my_fault = vm_fault_zero_page(m, no_zero_fill);
1203
1204 break;
1205 } else {
1206 if (must_be_resident)
1207 vm_object_paging_end(object);
1208 else if (object != first_object) {
1209 vm_object_paging_end(object);
1210 VM_PAGE_FREE(m);
1211 } else {
1212 first_m = m;
1213 m->absent = FALSE;
1214 m->busy = TRUE;
1215
1216 vm_page_lockspin_queues();
1217
1218 assert(!m->pageout_queue);
1219 VM_PAGE_QUEUES_REMOVE(m);
1220
1221 vm_page_unlock_queues();
1222 }
1223 XPR(XPR_VM_FAULT,
1224 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1225 object, offset,
1226 next_object,
1227 offset+object->vo_shadow_offset,0);
1228
1229 offset += object->vo_shadow_offset;
1230 fault_info->lo_offset += object->vo_shadow_offset;
1231 fault_info->hi_offset += object->vo_shadow_offset;
1232 access_required = VM_PROT_READ;
1233
1234 vm_object_lock(next_object);
1235 vm_object_unlock(object);
1236 object = next_object;
1237 vm_object_paging_begin(object);
1238
1239 /*
1240 * reset to default type of fault
1241 */
1242 my_fault = DBG_CACHE_HIT_FAULT;
1243
1244 continue;
1245 }
1246 }
1247 if ((m->cleaning)
1248 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1249 && (fault_type & VM_PROT_WRITE)) {
1250 /*
1251 * This is a copy-on-write fault that will
1252 * cause us to revoke access to this page, but
1253 * this page is in the process of being cleaned
1254 * in a clustered pageout. We must wait until
1255 * the cleaning operation completes before
1256 * revoking access to the original page,
1257 * otherwise we might attempt to remove a
1258 * wired mapping.
1259 */
1260 #if TRACEFAULTPAGE
1261 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1262 #endif
1263 XPR(XPR_VM_FAULT,
1264 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1265 object, offset,
1266 m, 0, 0);
1267 /*
1268 * take an extra ref so that object won't die
1269 */
1270 vm_object_reference_locked(object);
1271
1272 vm_fault_cleanup(object, first_m);
1273
1274 counter(c_vm_fault_page_block_backoff_kernel++);
1275 vm_object_lock(object);
1276 assert(object->ref_count > 0);
1277
1278 m = vm_page_lookup(object, offset);
1279
1280 if (m != VM_PAGE_NULL && m->cleaning) {
1281 PAGE_ASSERT_WAIT(m, interruptible);
1282
1283 vm_object_unlock(object);
1284 wait_result = thread_block(THREAD_CONTINUE_NULL);
1285 vm_object_deallocate(object);
1286
1287 goto backoff;
1288 } else {
1289 vm_object_unlock(object);
1290
1291 vm_object_deallocate(object);
1292 thread_interrupt_level(interruptible_state);
1293
1294 return (VM_FAULT_RETRY);
1295 }
1296 }
1297 if (type_of_fault == NULL && m->speculative &&
1298 !(fault_info != NULL && fault_info->stealth)) {
1299 /*
1300 * If we were passed a non-NULL pointer for
1301 * "type_of_fault", than we came from
1302 * vm_fault... we'll let it deal with
1303 * this condition, since it
1304 * needs to see m->speculative to correctly
1305 * account the pageins, otherwise...
1306 * take it off the speculative queue, we'll
1307 * let the caller of vm_fault_page deal
1308 * with getting it onto the correct queue
1309 *
1310 * If the caller specified in fault_info that
1311 * it wants a "stealth" fault, we also leave
1312 * the page in the speculative queue.
1313 */
1314 vm_page_lockspin_queues();
1315 if (m->speculative)
1316 VM_PAGE_QUEUES_REMOVE(m);
1317 vm_page_unlock_queues();
1318 }
1319
1320 if (m->encrypted) {
1321 /*
1322 * ENCRYPTED SWAP:
1323 * the user needs access to a page that we
1324 * encrypted before paging it out.
1325 * Decrypt the page now.
1326 * Keep it busy to prevent anyone from
1327 * accessing it during the decryption.
1328 */
1329 m->busy = TRUE;
1330 vm_page_decrypt(m, 0);
1331 assert(object == m->object);
1332 assert(m->busy);
1333 PAGE_WAKEUP_DONE(m);
1334
1335 /*
1336 * Retry from the top, in case
1337 * something changed while we were
1338 * decrypting.
1339 */
1340 continue;
1341 }
1342 ASSERT_PAGE_DECRYPTED(m);
1343
1344 if (m->object->code_signed) {
1345 /*
1346 * CODE SIGNING:
1347 * We just paged in a page from a signed
1348 * memory object but we don't need to
 1349 * validate it now. We'll validate it
1350 * when it gets mapped into a user address
1351 * space for the first time or when the page
1352 * gets copied to another object as a result
1353 * of a copy-on-write.
1354 */
1355 }
1356
1357 /*
1358 * We mark the page busy and leave it on
1359 * the pageout queues. If the pageout
 1360 * daemon comes across it, then it will
1361 * remove the page from the queue, but not the object
1362 */
1363 #if TRACEFAULTPAGE
1364 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1365 #endif
1366 XPR(XPR_VM_FAULT,
1367 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1368 object, offset, m, 0, 0);
1369 assert(!m->busy);
1370 assert(!m->absent);
1371
1372 m->busy = TRUE;
1373 break;
1374 }
1375
1376
1377 /*
1378 * we get here when there is no page present in the object at
1379 * the offset we're interested in... we'll allocate a page
1380 * at this point if the pager associated with
1381 * this object can provide the data or we're the top object...
1382 * object is locked; m == NULL
1383 */
1384 if (must_be_resident) {
1385 if (fault_type == VM_PROT_NONE &&
1386 object == kernel_object) {
1387 /*
1388 * We've been called from vm_fault_unwire()
1389 * while removing a map entry that was allocated
1390 * with KMA_KOBJECT and KMA_VAONLY. This page
1391 * is not present and there's nothing more to
1392 * do here (nothing to unwire).
1393 */
1394 vm_fault_cleanup(object, first_m);
1395 thread_interrupt_level(interruptible_state);
1396
1397 return VM_FAULT_MEMORY_ERROR;
1398 }
1399
1400 goto dont_look_for_page;
1401 }
1402
1403 #if !MACH_PAGEMAP
1404 data_supply = FALSE;
1405 #endif /* !MACH_PAGEMAP */
1406
1407 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1408
1409 #if TRACEFAULTPAGE
1410 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1411 #endif
1412 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1413 /*
1414 * Allocate a new page for this object/offset pair as a placeholder
1415 */
1416 m = vm_page_grab();
1417 #if TRACEFAULTPAGE
1418 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1419 #endif
1420 if (m == VM_PAGE_NULL) {
1421
1422 vm_fault_cleanup(object, first_m);
1423 thread_interrupt_level(interruptible_state);
1424
1425 return (VM_FAULT_MEMORY_SHORTAGE);
1426 }
1427
1428 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1429 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1430 } else {
1431 vm_page_insert(m, object, offset);
1432 }
1433 }
1434 if (look_for_page) {
1435 kern_return_t rc;
1436 int my_fault_type;
1437
1438 /*
1439 * If the memory manager is not ready, we
1440 * cannot make requests.
1441 */
1442 if (!object->pager_ready) {
1443 #if TRACEFAULTPAGE
1444 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1445 #endif
1446 if (m != VM_PAGE_NULL)
1447 VM_PAGE_FREE(m);
1448
1449 XPR(XPR_VM_FAULT,
1450 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1451 object, offset, 0, 0, 0);
1452
1453 /*
1454 * take an extra ref so object won't die
1455 */
1456 vm_object_reference_locked(object);
1457 vm_fault_cleanup(object, first_m);
1458 counter(c_vm_fault_page_block_backoff_kernel++);
1459
1460 vm_object_lock(object);
1461 assert(object->ref_count > 0);
1462
1463 if (!object->pager_ready) {
1464 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1465
1466 vm_object_unlock(object);
1467 if (wait_result == THREAD_WAITING)
1468 wait_result = thread_block(THREAD_CONTINUE_NULL);
1469 vm_object_deallocate(object);
1470
1471 goto backoff;
1472 } else {
1473 vm_object_unlock(object);
1474 vm_object_deallocate(object);
1475 thread_interrupt_level(interruptible_state);
1476
1477 return (VM_FAULT_RETRY);
1478 }
1479 }
1480 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1481 /*
1482 * If there are too many outstanding page
1483 * requests pending on this external object, we
1484 * wait for them to be resolved now.
1485 */
1486 #if TRACEFAULTPAGE
1487 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1488 #endif
1489 if (m != VM_PAGE_NULL)
1490 VM_PAGE_FREE(m);
1491 /*
1492 * take an extra ref so object won't die
1493 */
1494 vm_object_reference_locked(object);
1495
1496 vm_fault_cleanup(object, first_m);
1497
1498 counter(c_vm_fault_page_block_backoff_kernel++);
1499
1500 vm_object_lock(object);
1501 assert(object->ref_count > 0);
1502
1503 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1504 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1505
1506 vm_object_unlock(object);
1507 wait_result = thread_block(THREAD_CONTINUE_NULL);
1508 vm_object_deallocate(object);
1509
1510 goto backoff;
1511 } else {
1512 vm_object_unlock(object);
1513 vm_object_deallocate(object);
1514 thread_interrupt_level(interruptible_state);
1515
1516 return (VM_FAULT_RETRY);
1517 }
1518 }
1519 if (object->internal &&
1520 (COMPRESSED_PAGER_IS_ACTIVE
1521 || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) {
1522 int compressed_count_delta;
1523
1524 if (m == VM_PAGE_NULL) {
1525 /*
1526 * Allocate a new page for this object/offset pair as a placeholder
1527 */
1528 m = vm_page_grab();
1529 #if TRACEFAULTPAGE
1530 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1531 #endif
1532 if (m == VM_PAGE_NULL) {
1533
1534 vm_fault_cleanup(object, first_m);
1535 thread_interrupt_level(interruptible_state);
1536
1537 return (VM_FAULT_MEMORY_SHORTAGE);
1538 }
1539
1540 m->absent = TRUE;
1541 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1542 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1543 } else {
1544 vm_page_insert(m, object, offset);
1545 }
1546 }
1547 assert(m->busy);
1548
1549 m->absent = TRUE;
1550 pager = object->pager;
1551
1552 assert(object->paging_in_progress > 0);
1553 vm_object_unlock(object);
1554
1555 rc = vm_compressor_pager_get(
1556 pager,
1557 offset + object->paging_offset,
1558 m->phys_page,
1559 &my_fault_type,
1560 0,
1561 &compressed_count_delta);
1562
1563 vm_object_lock(object);
1564 assert(object->paging_in_progress > 0);
1565
1566 vm_compressor_pager_count(
1567 pager,
1568 compressed_count_delta,
1569 FALSE, /* shared_lock */
1570 object);
1571
1572 switch (rc) {
1573 case KERN_SUCCESS:
1574 m->absent = FALSE;
1575 m->dirty = TRUE;
1576 if ((m->object->wimg_bits &
1577 VM_WIMG_MASK) !=
1578 VM_WIMG_USE_DEFAULT) {
1579 /*
1580 * If the page is not cacheable,
1581 * we can't let its contents
1582 * linger in the data cache
1583 * after the decompression.
1584 */
1585 pmap_sync_page_attributes_phys(
1586 m->phys_page);
1587 } else {
1588 m->written_by_kernel = TRUE;
1589 }
1590
1591 /*
1592 * If the object is purgeable, its
1593 * owner's purgeable ledgers have been
1594 * updated in vm_page_insert() but the
1595 * page was also accounted for in a
1596 * "compressed purgeable" ledger, so
1597 * update that now.
1598 */
1599 if ((object->purgable !=
1600 VM_PURGABLE_DENY) &&
1601 (object->vo_purgeable_owner !=
1602 NULL)) {
1603 /*
1604 * One less compressed
1605 * purgeable page.
1606 */
1607 vm_purgeable_compressed_update(
1608 object,
1609 -1);
1610 }
1611
1612 break;
1613 case KERN_MEMORY_FAILURE:
1614 m->unusual = TRUE;
1615 m->error = TRUE;
1616 m->absent = FALSE;
1617 break;
1618 case KERN_MEMORY_ERROR:
1619 assert(m->absent);
1620 break;
1621 default:
1622 panic("vm_fault_page(): unexpected "
1623 "error %d from "
1624 "vm_compressor_pager_get()\n",
1625 rc);
1626 }
1627 PAGE_WAKEUP_DONE(m);
1628
1629 rc = KERN_SUCCESS;
1630 goto data_requested;
1631 }
1632 my_fault_type = DBG_PAGEIN_FAULT;
1633
1634 if (m != VM_PAGE_NULL) {
1635 VM_PAGE_FREE(m);
1636 m = VM_PAGE_NULL;
1637 }
1638
1639 #if TRACEFAULTPAGE
1640 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1641 #endif
1642
1643 /*
1644 * It's possible someone called vm_object_destroy while we weren't
1645 * holding the object lock. If that has happened, then bail out
1646 * here.
1647 */
1648
1649 pager = object->pager;
1650
1651 if (pager == MEMORY_OBJECT_NULL) {
1652 vm_fault_cleanup(object, first_m);
1653 thread_interrupt_level(interruptible_state);
1654 return VM_FAULT_MEMORY_ERROR;
1655 }
1656
1657 /*
1658 * We have an absent page in place for the faulting offset,
1659 * so we can release the object lock.
1660 */
1661
1662 vm_object_unlock(object);
1663
1664 /*
1665 * If this object uses a copy_call strategy,
1666 * and we are interested in a copy of this object
1667 * (having gotten here only by following a
1668 * shadow chain), then tell the memory manager
1669 * via a flag added to the desired_access
1670 * parameter, so that it can detect a race
1671 * between our walking down the shadow chain
1672 * and its pushing pages up into a copy of
1673 * the object that it manages.
1674 */
1675 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1676 wants_copy_flag = VM_PROT_WANTS_COPY;
1677 else
1678 wants_copy_flag = VM_PROT_NONE;
1679
1680 XPR(XPR_VM_FAULT,
1681 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1682 object, offset, m,
1683 access_required | wants_copy_flag, 0);
1684
1685 if (object->copy == first_object) {
1686 /*
1687 * if we issue the memory_object_data_request in
1688 * this state, we are subject to a deadlock with
1689 * the underlying filesystem if it is trying to
1690 * shrink the file resulting in a push of pages
1691 * into the copy object... that push will stall
1692 * on the placeholder page, and if the pushing thread
1693 * is holding a lock that is required on the pagein
1694 * path (such as a truncate lock), we'll deadlock...
1695 * to avoid this potential deadlock, we throw away
1696 * our placeholder page before calling memory_object_data_request
1697 * and force this thread to retry the vm_fault_page after
1698 * we have issued the I/O. the second time through this path
1699 * we will find the page already in the cache (presumably still
1700 * busy waiting for the I/O to complete) and then complete
1701 * the fault w/o having to go through memory_object_data_request again
1702 */
1703 assert(first_m != VM_PAGE_NULL);
1704 assert(first_m->object == first_object);
1705
1706 vm_object_lock(first_object);
1707 VM_PAGE_FREE(first_m);
1708 vm_object_paging_end(first_object);
1709 vm_object_unlock(first_object);
1710
1711 first_m = VM_PAGE_NULL;
1712 force_fault_retry = TRUE;
1713
1714 vm_fault_page_forced_retry++;
1715 }
1716
1717 if (data_already_requested == TRUE) {
1718 orig_behavior = fault_info->behavior;
1719 orig_cluster_size = fault_info->cluster_size;
1720
1721 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1722 fault_info->cluster_size = PAGE_SIZE;
1723 }
1724 /*
1725 * Call the memory manager to retrieve the data.
1726 */
1727 rc = memory_object_data_request(
1728 pager,
1729 offset + object->paging_offset,
1730 PAGE_SIZE,
1731 access_required | wants_copy_flag,
1732 (memory_object_fault_info_t)fault_info);
1733
1734 if (data_already_requested == TRUE) {
1735 fault_info->behavior = orig_behavior;
1736 fault_info->cluster_size = orig_cluster_size;
1737 } else
1738 data_already_requested = TRUE;
1739
1740 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1741 #if TRACEFAULTPAGE
1742 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1743 #endif
1744 vm_object_lock(object);
1745
1746 data_requested:
1747 if (rc != KERN_SUCCESS) {
1748
1749 vm_fault_cleanup(object, first_m);
1750 thread_interrupt_level(interruptible_state);
1751
1752 return ((rc == MACH_SEND_INTERRUPTED) ?
1753 VM_FAULT_INTERRUPTED :
1754 VM_FAULT_MEMORY_ERROR);
1755 } else {
1756 clock_sec_t tv_sec;
1757 clock_usec_t tv_usec;
1758
1759 if (my_fault_type == DBG_PAGEIN_FAULT) {
1760 clock_get_system_microtime(&tv_sec, &tv_usec);
1761 current_thread()->t_page_creation_time = tv_sec;
1762 current_thread()->t_page_creation_count = 0;
1763 }
1764 }
1765 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1766
1767 vm_fault_cleanup(object, first_m);
1768 thread_interrupt_level(interruptible_state);
1769
1770 return (VM_FAULT_INTERRUPTED);
1771 }
1772 if (force_fault_retry == TRUE) {
1773
1774 vm_fault_cleanup(object, first_m);
1775 thread_interrupt_level(interruptible_state);
1776
1777 return (VM_FAULT_RETRY);
1778 }
1779 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1780 /*
1781 * No page here means that the object we
1782 * initially looked up was "physically
1783 * contiguous" (i.e. device memory). However,
1784 * with Virtual VRAM, the object might not
1785 * be backed by that device memory anymore,
1786 * so we're done here only if the object is
1787 * still "phys_contiguous".
1788 * Otherwise, if the object is no longer
1789 * "phys_contiguous", we need to retry the
1790 * page fault against the object's new backing
1791 * store (different memory object).
1792 */
1793 phys_contig_object:
1794 goto done;
1795 }
1796 /*
1797 * potentially a pagein fault
1798 * if we make it through the state checks
 1799 * above, then we'll count it as such
1800 */
1801 my_fault = my_fault_type;
1802
1803 /*
1804 * Retry with same object/offset, since new data may
1805 * be in a different page (i.e., m is meaningless at
1806 * this point).
1807 */
1808 continue;
1809 }
1810 dont_look_for_page:
1811 /*
1812 * We get here if the object has no pager, or an existence map
1813 * exists and indicates the page isn't present on the pager
1814 * or we're unwiring a page. If a pager exists, but there
1815 * is no existence map, then the m->absent case above handles
1816 * the ZF case when the pager can't provide the page
1817 */
1818 #if TRACEFAULTPAGE
1819 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1820 #endif
1821 if (object == first_object)
1822 first_m = m;
1823 else
1824 assert(m == VM_PAGE_NULL);
1825
1826 XPR(XPR_VM_FAULT,
1827 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1828 object, offset, m,
1829 object->shadow, 0);
1830
1831 next_object = object->shadow;
1832
1833 if (next_object == VM_OBJECT_NULL) {
1834 /*
1835 * we've hit the bottom of the shadow chain,
1836 * fill the page in the top object with zeros.
1837 */
1838 assert(!must_be_resident);
1839
1840 if (object != first_object) {
1841 vm_object_paging_end(object);
1842 vm_object_unlock(object);
1843
1844 object = first_object;
1845 offset = first_offset;
1846 vm_object_lock(object);
1847 }
1848 m = first_m;
1849 assert(m->object == object);
1850 first_m = VM_PAGE_NULL;
1851
1852 /*
1853 * check for any conditions that prevent
1854 * us from creating a new zero-fill page.
1855 * vm_fault_check will do all of the
1856 * fault cleanup in the case of an error condition,
1857 * including resetting the thread_interrupt_level.
1858 */
1859 error = vm_fault_check(object, m, first_m, interruptible_state);
1860
1861 if (error != VM_FAULT_SUCCESS)
1862 return (error);
1863
1864 if (m == VM_PAGE_NULL) {
1865 m = vm_page_grab();
1866
1867 if (m == VM_PAGE_NULL) {
1868 vm_fault_cleanup(object, VM_PAGE_NULL);
1869 thread_interrupt_level(interruptible_state);
1870
1871 return (VM_FAULT_MEMORY_SHORTAGE);
1872 }
1873 vm_page_insert(m, object, offset);
1874 }
1875 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1876 m->absent = TRUE;
1877
1878 my_fault = vm_fault_zero_page(m, no_zero_fill);
1879
1880 break;
1881
1882 } else {
1883 /*
1884 * Move on to the next object. Lock the next
1885 * object before unlocking the current one.
1886 */
1887 if ((object != first_object) || must_be_resident)
1888 vm_object_paging_end(object);
1889
1890 offset += object->vo_shadow_offset;
1891 fault_info->lo_offset += object->vo_shadow_offset;
1892 fault_info->hi_offset += object->vo_shadow_offset;
1893 access_required = VM_PROT_READ;
1894
1895 vm_object_lock(next_object);
1896 vm_object_unlock(object);
1897
1898 object = next_object;
1899 vm_object_paging_begin(object);
1900 }
1901 }
1902
1903 /*
1904 * PAGE HAS BEEN FOUND.
1905 *
1906 * This page (m) is:
1907 * busy, so that we can play with it;
1908 * not absent, so that nobody else will fill it;
1909 * possibly eligible for pageout;
1910 *
1911 * The top-level page (first_m) is:
1912 * VM_PAGE_NULL if the page was found in the
1913 * top-level object;
1914 * busy, not absent, and ineligible for pageout.
1915 *
1916 * The current object (object) is locked. A paging
1917 * reference is held for the current and top-level
1918 * objects.
1919 */
1920
1921 #if TRACEFAULTPAGE
1922 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1923 #endif
1924 #if EXTRA_ASSERTIONS
1925 assert(m->busy && !m->absent);
1926 assert((first_m == VM_PAGE_NULL) ||
1927 (first_m->busy && !first_m->absent &&
1928 !first_m->active && !first_m->inactive));
1929 #endif /* EXTRA_ASSERTIONS */
1930
1931 /*
1932 * ENCRYPTED SWAP:
1933 * If we found a page, we must have decrypted it before we
1934 * get here...
1935 */
1936 ASSERT_PAGE_DECRYPTED(m);
1937
1938 XPR(XPR_VM_FAULT,
1939 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1940 object, offset, m,
1941 first_object, first_m);
1942
1943 /*
1944 * If the page is being written, but isn't
1945 * already owned by the top-level object,
1946 * we have to copy it into a new page owned
1947 * by the top-level object.
1948 */
1949 if (object != first_object) {
1950
1951 #if TRACEFAULTPAGE
1952 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1953 #endif
1954 if (fault_type & VM_PROT_WRITE) {
1955 vm_page_t copy_m;
1956
1957 /*
1958 * We only really need to copy if we
1959 * want to write it.
1960 */
1961 assert(!must_be_resident);
1962
1963 /*
1964 * are we protecting the system from
1965 * backing store exhaustion? If so,
1966 * sleep unless we are privileged.
1967 */
1968 if (vm_backing_store_low) {
1969 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
1970
1971 RELEASE_PAGE(m);
1972 vm_fault_cleanup(object, first_m);
1973
1974 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
1975
1976 thread_block(THREAD_CONTINUE_NULL);
1977 thread_interrupt_level(interruptible_state);
1978
1979 return (VM_FAULT_RETRY);
1980 }
1981 }
1982 /*
1983 * If we try to collapse first_object at this
1984 * point, we may deadlock when we try to get
1985 * the lock on an intermediate object (since we
1986 * have the bottom object locked). We can't
1987 * unlock the bottom object, because the page
1988 * we found may move (by collapse) if we do.
1989 *
1990 * Instead, we first copy the page. Then, when
1991 * we have no more use for the bottom object,
1992 * we unlock it and try to collapse.
1993 *
1994 * Note that we copy the page even if we didn't
1995 * need to... that's the breaks.
1996 */
1997
1998 /*
1999 * Allocate a page for the copy
2000 */
2001 copy_m = vm_page_grab();
2002
2003 if (copy_m == VM_PAGE_NULL) {
2004 RELEASE_PAGE(m);
2005
2006 vm_fault_cleanup(object, first_m);
2007 thread_interrupt_level(interruptible_state);
2008
2009 return (VM_FAULT_MEMORY_SHORTAGE);
2010 }
2011 XPR(XPR_VM_FAULT,
2012 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2013 object, offset,
2014 m, copy_m, 0);
2015
2016 vm_page_copy(m, copy_m);
2017
2018 /*
2019 * If another map is truly sharing this
2020 * page with us, we have to flush all
2021 * uses of the original page, since we
2022 * can't distinguish those which want the
2023 * original from those which need the
2024 * new copy.
2025 *
2026 * XXXO If we know that only one map has
2027 * access to this page, then we could
2028 * avoid the pmap_disconnect() call.
2029 */
2030 if (m->pmapped)
2031 pmap_disconnect(m->phys_page);
2032
2033 if (m->clustered) {
2034 VM_PAGE_COUNT_AS_PAGEIN(m);
2035 VM_PAGE_CONSUME_CLUSTERED(m);
2036 }
2037 assert(!m->cleaning);
2038
2039 /*
2040 * We no longer need the old page or object.
2041 */
2042 RELEASE_PAGE(m);
2043
2044 vm_object_paging_end(object);
2045 vm_object_unlock(object);
2046
2047 my_fault = DBG_COW_FAULT;
2048 VM_STAT_INCR(cow_faults);
2049 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2050 current_task()->cow_faults++;
2051
2052 object = first_object;
2053 offset = first_offset;
2054
2055 vm_object_lock(object);
2056 /*
2057 * get rid of the placeholder
2058 * page that we soldered in earlier
2059 */
2060 VM_PAGE_FREE(first_m);
2061 first_m = VM_PAGE_NULL;
2062
2063 /*
2064 * and replace it with the
2065 * page we just copied into
2066 */
2067 assert(copy_m->busy);
2068 vm_page_insert(copy_m, object, offset);
2069 SET_PAGE_DIRTY(copy_m, TRUE);
2070
2071 m = copy_m;
2072 /*
2073 * Now that we've gotten the copy out of the
2074 * way, let's try to collapse the top object.
2075 * But we have to play ugly games with
2076 * paging_in_progress to do that...
2077 */
2078 vm_object_paging_end(object);
2079 vm_object_collapse(object, offset, TRUE);
2080 vm_object_paging_begin(object);
2081
2082 } else
2083 *protection &= (~VM_PROT_WRITE);
2084 }
2085 /*
2086 * Now check whether the page needs to be pushed into the
2087 * copy object. The use of asymmetric copy on write for
2088 * shared temporary objects means that we may do two copies to
2089 * satisfy the fault; one above to get the page from a
2090 * shadowed object, and one here to push it into the copy.
2091 */
2092 try_failed_count = 0;
2093
2094 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2095 vm_object_offset_t copy_offset;
2096 vm_page_t copy_m;
2097
2098 #if TRACEFAULTPAGE
2099 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2100 #endif
2101 /*
2102 * If the page is being written, but hasn't been
2103 * copied to the copy-object, we have to copy it there.
2104 */
2105 if ((fault_type & VM_PROT_WRITE) == 0) {
2106 *protection &= ~VM_PROT_WRITE;
2107 break;
2108 }
2109
2110 /*
2111 * If the page was guaranteed to be resident,
2112 * we must have already performed the copy.
2113 */
2114 if (must_be_resident)
2115 break;
2116
2117 /*
2118 * Try to get the lock on the copy_object.
2119 */
2120 if (!vm_object_lock_try(copy_object)) {
2121
2122 vm_object_unlock(object);
2123 try_failed_count++;
2124
2125 mutex_pause(try_failed_count); /* wait a bit */
2126 vm_object_lock(object);
2127
2128 continue;
2129 }
2130 try_failed_count = 0;
2131
2132 /*
2133 * Make another reference to the copy-object,
2134 * to keep it from disappearing during the
2135 * copy.
2136 */
2137 vm_object_reference_locked(copy_object);
2138
2139 /*
2140 * Does the page exist in the copy?
2141 */
2142 copy_offset = first_offset - copy_object->vo_shadow_offset;
2143
2144 if (copy_object->vo_size <= copy_offset)
2145 /*
2146 * Copy object doesn't cover this page -- do nothing.
2147 */
2148 ;
2149 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2150 /*
2151 * Page currently exists in the copy object
2152 */
2153 if (copy_m->busy) {
2154 /*
2155 * If the page is being brought
2156 * in, wait for it and then retry.
2157 */
2158 RELEASE_PAGE(m);
2159
2160 /*
2161 * take an extra ref so object won't die
2162 */
2163 vm_object_reference_locked(copy_object);
2164 vm_object_unlock(copy_object);
2165 vm_fault_cleanup(object, first_m);
2166 counter(c_vm_fault_page_block_backoff_kernel++);
2167
2168 vm_object_lock(copy_object);
2169 assert(copy_object->ref_count > 0);
2170 VM_OBJ_RES_DECR(copy_object);
2171 vm_object_lock_assert_exclusive(copy_object);
2172 copy_object->ref_count--;
2173 assert(copy_object->ref_count > 0);
2174 copy_m = vm_page_lookup(copy_object, copy_offset);
2175 /*
2176 * ENCRYPTED SWAP:
2177 * it's OK if the "copy_m" page is encrypted,
2178 * because we're not moving it nor handling its
2179 * contents.
2180 */
2181 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2182 PAGE_ASSERT_WAIT(copy_m, interruptible);
2183
2184 vm_object_unlock(copy_object);
2185 wait_result = thread_block(THREAD_CONTINUE_NULL);
2186 vm_object_deallocate(copy_object);
2187
2188 goto backoff;
2189 } else {
2190 vm_object_unlock(copy_object);
2191 vm_object_deallocate(copy_object);
2192 thread_interrupt_level(interruptible_state);
2193
2194 return (VM_FAULT_RETRY);
2195 }
2196 }
2197 }
2198 else if (!PAGED_OUT(copy_object, copy_offset)) {
2199 /*
2200 * If PAGED_OUT is TRUE, then the page used to exist
2201 * in the copy-object, and has already been paged out.
2202 * We don't need to repeat this. If PAGED_OUT is
2203 * FALSE, then either we don't know (!pager_created,
2204 * for example) or it hasn't been paged out.
2205 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2206 * We must copy the page to the copy object.
2207 */
2208
2209 if (vm_backing_store_low) {
2210 /*
2211 * we are protecting the system from
2212 * backing store exhaustion, so
2213 * sleep unless we are privileged.
2214 */
2215 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2216 assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2217
2218 RELEASE_PAGE(m);
2219 VM_OBJ_RES_DECR(copy_object);
2220 vm_object_lock_assert_exclusive(copy_object);
2221 copy_object->ref_count--;
2222 assert(copy_object->ref_count > 0);
2223
2224 vm_object_unlock(copy_object);
2225 vm_fault_cleanup(object, first_m);
2226 thread_block(THREAD_CONTINUE_NULL);
2227 thread_interrupt_level(interruptible_state);
2228
2229 return (VM_FAULT_RETRY);
2230 }
2231 }
2232 /*
2233 * Allocate a page for the copy
2234 */
2235 copy_m = vm_page_alloc(copy_object, copy_offset);
2236
2237 if (copy_m == VM_PAGE_NULL) {
2238 RELEASE_PAGE(m);
2239
2240 VM_OBJ_RES_DECR(copy_object);
2241 vm_object_lock_assert_exclusive(copy_object);
2242 copy_object->ref_count--;
2243 assert(copy_object->ref_count > 0);
2244
2245 vm_object_unlock(copy_object);
2246 vm_fault_cleanup(object, first_m);
2247 thread_interrupt_level(interruptible_state);
2248
2249 return (VM_FAULT_MEMORY_SHORTAGE);
2250 }
2251 /*
2252 * Must copy page into copy-object.
2253 */
2254 vm_page_copy(m, copy_m);
2255
2256 /*
2257 * If the old page was in use by any users
2258 * of the copy-object, it must be removed
2259 * from all pmaps. (We can't know which
2260 * pmaps use it.)
2261 */
2262 if (m->pmapped)
2263 pmap_disconnect(m->phys_page);
2264
2265 if (m->clustered) {
2266 VM_PAGE_COUNT_AS_PAGEIN(m);
2267 VM_PAGE_CONSUME_CLUSTERED(m);
2268 }
2269 /*
2270 * If there's a pager, then immediately
2271 * page out this page, using the "initialize"
2272 * option. Else, we use the copy.
2273 */
2274 if ((!copy_object->pager_ready)
2275 #if MACH_PAGEMAP
2276 || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2277 #endif
2278 || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2279 ) {
2280
2281 vm_page_lockspin_queues();
2282 assert(!m->cleaning);
2283 vm_page_activate(copy_m);
2284 vm_page_unlock_queues();
2285
2286 SET_PAGE_DIRTY(copy_m, TRUE);
2287 PAGE_WAKEUP_DONE(copy_m);
2288
2289 } else if (copy_object->internal &&
2290 (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) {
2291 /*
2292 * For internal objects check with the pager to see
2293 * if the page already exists in the backing store.
2294 * If yes, then we can drop the copy page. If not,
2295 * then we'll activate it, mark it dirty and keep it
2296 * around.
2297 */
2298
2299 kern_return_t kr = KERN_SUCCESS;
2300
2301 memory_object_t copy_pager = copy_object->pager;
2302 assert(copy_pager != MEMORY_OBJECT_NULL);
2303 vm_object_paging_begin(copy_object);
2304
2305 vm_object_unlock(copy_object);
2306
2307 kr = memory_object_data_request(
2308 copy_pager,
2309 copy_offset + copy_object->paging_offset,
2310 0, /* Only query the pager. */
2311 VM_PROT_READ,
2312 NULL);
2313
2314 vm_object_lock(copy_object);
2315
2316 vm_object_paging_end(copy_object);
2317
2318 /*
2319 * Since we dropped the copy_object's lock,
2320 * check whether we'll have to deallocate
2321 * the hard way.
2322 */
2323 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2324 vm_object_unlock(copy_object);
2325 vm_object_deallocate(copy_object);
2326 vm_object_lock(object);
2327
2328 continue;
2329 }
2330 if (kr == KERN_SUCCESS) {
2331 /*
2332 * The pager has the page. We don't want to overwrite
2333 * that page by sending this one out to the backing store.
2334 * So we drop the copy page.
2335 */
2336 VM_PAGE_FREE(copy_m);
2337
2338 } else {
2339 /*
2340 * The pager doesn't have the page. We'll keep this one
2341 * around in the copy object. It might get sent out to
2342 * the backing store under memory pressure.
2343 */
2344 vm_page_lockspin_queues();
2345 assert(!m->cleaning);
2346 vm_page_activate(copy_m);
2347 vm_page_unlock_queues();
2348
2349 SET_PAGE_DIRTY(copy_m, TRUE);
2350 PAGE_WAKEUP_DONE(copy_m);
2351 }
2352 } else {
2353
2354 assert(copy_m->busy == TRUE);
2355 assert(!m->cleaning);
2356
2357 /*
2358 * dirty is protected by the object lock
2359 */
2360 SET_PAGE_DIRTY(copy_m, TRUE);
2361
2362 /*
2363 * The page is already ready for pageout:
2364 * not on pageout queues and busy.
2365 * Unlock everything except the
2366 * copy_object itself.
2367 */
2368 vm_object_unlock(object);
2369
2370 /*
2371 * Write the page to the copy-object,
2372 * flushing it from the kernel.
2373 */
2374 vm_pageout_initialize_page(copy_m);
2375
2376 /*
2377 * Since the pageout may have
2378 * temporarily dropped the
2379 * copy_object's lock, we
2380 * check whether we'll have
2381 * to deallocate the hard way.
2382 */
2383 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2384 vm_object_unlock(copy_object);
2385 vm_object_deallocate(copy_object);
2386 vm_object_lock(object);
2387
2388 continue;
2389 }
2390 /*
2391 * Pick back up the old object's
2392 * lock. [It is safe to do so,
2393 * since it must be deeper in the
2394 * object tree.]
2395 */
2396 vm_object_lock(object);
2397 }
2398
2399 /*
2400 * Because we're pushing a page upward
2401 * in the object tree, we must restart
2402 * any faults that are waiting here.
2403 * [Note that this is an expansion of
2404 * PAGE_WAKEUP that uses the THREAD_RESTART
2405 * wait result]. Can't turn off the page's
2406 * busy bit because we're not done with it.
2407 */
2408 if (m->wanted) {
2409 m->wanted = FALSE;
2410 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2411 }
2412 }
2413 /*
2414 * The reference count on copy_object must be
2415 * at least 2: one for our extra reference,
2416 * and at least one from the outside world
2417 * (we checked that when we last locked
2418 * copy_object).
2419 */
2420 vm_object_lock_assert_exclusive(copy_object);
2421 copy_object->ref_count--;
2422 assert(copy_object->ref_count > 0);
2423
2424 VM_OBJ_RES_DECR(copy_object);
2425 vm_object_unlock(copy_object);
2426
2427 break;
2428 }
2429
2430 done:
2431 *result_page = m;
2432 *top_page = first_m;
2433
2434 XPR(XPR_VM_FAULT,
2435 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2436 object, offset, m, first_m, 0);
2437
2438 if (m != VM_PAGE_NULL) {
2439 retval = VM_FAULT_SUCCESS;
2440
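/*
 * Refine a generic pagein into an internal (DBG_PAGEIND_FAULT)
 * or external (DBG_PAGEINV_FAULT) pagein for tracing and
 * accounting, based on whether the page's object is internal.
 */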
2441 if (my_fault == DBG_PAGEIN_FAULT) {
2442
2443 VM_PAGE_COUNT_AS_PAGEIN(m);
2444
2445 if (m->object->internal)
2446 my_fault = DBG_PAGEIND_FAULT;
2447 else
2448 my_fault = DBG_PAGEINV_FAULT;
2449
2450 /*
2451 * evaluate access pattern and update state
2452 * vm_fault_deactivate_behind depends on the
2453 * state being up to date
2454 */
2455 vm_fault_is_sequential(object, offset, fault_info->behavior);
2456
2457 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2458 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2459
2460 VM_STAT_INCR(decompressions);
2461 }
2462 if (type_of_fault)
2463 *type_of_fault = my_fault;
2464 } else {
2465 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2466 assert(first_m == VM_PAGE_NULL);
2467 assert(object == first_object);
2468 }
2469
2470 thread_interrupt_level(interruptible_state);
2471
2472 #if TRACEFAULTPAGE
2473 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2474 #endif
2475 return retval;
2476
2477 backoff:
2478 thread_interrupt_level(interruptible_state);
2479
2480 if (wait_result == THREAD_INTERRUPTED)
2481 return (VM_FAULT_INTERRUPTED);
2482 return (VM_FAULT_RETRY);
2483
2484 #undef RELEASE_PAGE
2485 }
2486
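/*
 * Summary of vm_fault_page()'s exit state (see the "PAGE HAS BEEN
 * FOUND" comment above): on VM_FAULT_SUCCESS, *result_page is the
 * busy page that satisfied the fault, and *top_page is either
 * VM_PAGE_NULL (page found in the top-level object) or the busy
 * placeholder page left in the top-level object for the caller to
 * dispose of.
 */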
2487
2488
2489 /*
2490 * CODE SIGNING:
2491 * When soft faulting a page, we have to validate the page if:
2492 * 1. the page is being mapped in user space
2493 * 2. the page hasn't already been found to be "tainted"
2494 * 3. the page belongs to a code-signed object
2495 * 4. the page has not been validated yet or has been mapped for write.
2496 */
2497 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page) \
2498 ((pmap) != kernel_pmap /*1*/ && \
2499 !(page)->cs_tainted /*2*/ && \
2500 (page)->object->code_signed /*3*/ && \
2501 (!(page)->cs_validated || (page)->wpmapped /*4*/))
2502
2503
2504 /*
2505 * page queue lock must NOT be held
2506 * m->object must be locked
2507 *
2508 * NOTE: m->object could be locked "shared" only if we are called
2509 * from vm_fault() as part of a soft fault. If so, we must be
2510 * careful not to modify the VM object in any way that is not
2511 * legal under a shared lock...
2512 */
2513 extern int proc_selfpid(void);
2514 extern char *proc_name_address(void *p);
2515 unsigned long cs_enter_tainted_rejected = 0;
2516 unsigned long cs_enter_tainted_accepted = 0;
2517 kern_return_t
2518 vm_fault_enter(vm_page_t m,
2519 pmap_t pmap,
2520 vm_map_offset_t vaddr,
2521 vm_prot_t prot,
2522 vm_prot_t fault_type,
2523 boolean_t wired,
2524 boolean_t change_wiring,
2525 boolean_t no_cache,
2526 boolean_t cs_bypass,
2527 __unused int user_tag,
2528 int pmap_options,
2529 boolean_t *need_retry,
2530 int *type_of_fault)
2531 {
2532 kern_return_t kr, pe_result;
2533 boolean_t previously_pmapped = m->pmapped;
2534 boolean_t must_disconnect = 0;
2535 boolean_t map_is_switched, map_is_switch_protected;
2536 int cs_enforcement_enabled;
2537
2538 vm_object_lock_assert_held(m->object);
2539 #if DEBUG
2540 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2541 #endif /* DEBUG */
2542
2543 if (m->phys_page == vm_page_guard_addr) {
2544 assert(m->fictitious);
2545 return KERN_SUCCESS;
2546 }
2547
2548 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2549
2550 vm_object_lock_assert_exclusive(m->object);
2551
2552 } else if ((fault_type & VM_PROT_WRITE) == 0) {
2553 /*
2554 * This is not a "write" fault, so we
2555 * might not have taken the object lock
2556 * exclusively and we might not be able
2557 * to update the "wpmapped" bit in
2558 * vm_fault_enter().
2559 * Let's just grant read access to
2560 * the page for now and we'll
2561 * soft-fault again if we need write
2562 * access later...
2563 */
2564 prot &= ~VM_PROT_WRITE;
2565 }
2566 if (m->pmapped == FALSE) {
2567
2568 if (m->clustered) {
2569 if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2570 /*
2571 * found it in the cache, but this
2572 * is the first fault-in of the page (m->pmapped == FALSE)
2573 * so it must have come in as part of
2574 * a cluster... account 1 pagein against it
2575 */
2576 if (m->object->internal)
2577 *type_of_fault = DBG_PAGEIND_FAULT;
2578 else
2579 *type_of_fault = DBG_PAGEINV_FAULT;
2580
2581 VM_PAGE_COUNT_AS_PAGEIN(m);
2582 }
2583 VM_PAGE_CONSUME_CLUSTERED(m);
2584 }
2585 }
2586
2587 if (*type_of_fault != DBG_COW_FAULT) {
2588 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2589
2590 if (pmap == kernel_pmap) {
2591 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2592 }
2593 }
2594
2595 /* Validate code signature if necessary. */
2596 if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2597 vm_object_lock_assert_exclusive(m->object);
2598
2599 if (m->cs_validated) {
2600 vm_cs_revalidates++;
2601 }
2602
2603 /* VM map is locked, so 1 ref will remain on VM object -
2604 * so no harm if vm_page_validate_cs drops the object lock */
2605 vm_page_validate_cs(m);
2606 }
2607
2608 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
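/*
 * Note: the VM_PROT_EXECUTE check in page_immutable() is currently
 * commented out, so any cs_validated page is treated as immutable
 * here, whatever protection it is being mapped with.
 */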
2609
2610 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2611 (pmap == vm_map_pmap(current_thread()->map)));
2612 map_is_switch_protected = current_thread()->map->switch_protect;
2613
2614 /* If the map is switched, and is switch-protected, we must protect
2615 * some pages from being write-faulted: immutable pages because by
2616 * definition they may not be written, and executable pages because that
2617 * would provide a way to inject unsigned code.
2618 * If the page is immutable, we can simply return. However, we can't
2619 * immediately determine whether a page is executable anywhere. But,
2620 * we can disconnect it everywhere and remove the executable protection
2621 * from the current map. We do that below right before we do the
2622 * PMAP_ENTER.
2623 */
2624 cs_enforcement_enabled = cs_enforcement(NULL);
2625
2626 if(cs_enforcement_enabled && map_is_switched &&
2627 map_is_switch_protected && page_immutable(m, prot) &&
2628 (prot & VM_PROT_WRITE))
2629 {
2630 return KERN_CODESIGN_ERROR;
2631 }
2632
2633 /* A page could be tainted, or pose a risk of being tainted later.
2634 * Check whether the receiving process wants it, and make it feel
2635 * the consequences (that happens in cs_invalid_page()).
2636 * For CS Enforcement, two other conditions will
2637 * cause that page to be tainted as well:
2638 * - pmapping an unsigned page executable - this means unsigned code;
2639 * - writeable mapping of a validated page - the content of that page
2640 * can be changed without the kernel noticing, therefore unsigned
2641 * code can be created
2642 */
2643 if (m->cs_tainted ||
2644 ((cs_enforcement_enabled && !cs_bypass ) &&
2645 (/* The page is unsigned and wants to be executable */
2646 (!m->cs_validated && (prot & VM_PROT_EXECUTE)) ||
2647 /* The page should be immutable, but is in danger of being modified
2648 * This is the case where we want policy from the code directory -
2649 * is the page immutable or not? For now we have to assume that
2650 * code pages will be immutable, data pages not.
2651 * We'll assume a page is a code page if it has a code directory
2652 * and we fault for execution.
2653 * That is good enough since if we faulted the code page for
2654 * writing in another map before, it is wpmapped; if we fault
2655 * it for writing in this map later it will also be faulted for executing
2656 * at the same time; and if we fault for writing in another map
2657 * later, we will disconnect it from this pmap so we'll notice
2658 * the change.
2659 */
2660 (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2661 ))
2662 )
2663 {
2664 /* We will have a tainted page. Have to handle the special case
2665 * of a switched map now. If the map is not switched, standard
2666 * procedure applies - call cs_invalid_page().
2667 * If the map is switched, the real owner is invalid already.
2668 * There is no point in invalidating the switching process since
2669 * it will not be executing from the map. So we don't call
2670 * cs_invalid_page() in that case. */
2671 boolean_t reject_page;
2672 if(map_is_switched) {
2673 assert(pmap==vm_map_pmap(current_thread()->map));
2674 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2675 reject_page = FALSE;
2676 } else {
2677 if (cs_debug > 5)
2678 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2679 m->object->code_signed ? "yes" : "no",
2680 m->cs_validated ? "yes" : "no",
2681 m->cs_tainted ? "yes" : "no",
2682 m->wpmapped ? "yes" : "no",
2683 m->slid ? "yes" : "no",
2684 (int)prot);
2685 reject_page = cs_invalid_page((addr64_t) vaddr);
2686 }
2687
2688 if (reject_page) {
2689 /* reject the invalid page: abort the page fault */
2690 int pid;
2691 const char *procname;
2692 task_t task;
2693 vm_object_t file_object, shadow;
2694 vm_object_offset_t file_offset;
2695 char *pathname, *filename;
2696 vm_size_t pathname_len, filename_len;
2697 boolean_t truncated_path;
2698 #define __PATH_MAX 1024
2699 struct timespec mtime, cs_mtime;
2700
2701 kr = KERN_CODESIGN_ERROR;
2702 cs_enter_tainted_rejected++;
2703
2704 /* get process name and pid */
2705 procname = "?";
2706 task = current_task();
2707 pid = proc_selfpid();
2708 if (task->bsd_info != NULL)
2709 procname = proc_name_address(task->bsd_info);
2710
2711 /* get file's VM object */
2712 file_object = m->object;
2713 file_offset = m->offset;
2714 for (shadow = file_object->shadow;
2715 shadow != VM_OBJECT_NULL;
2716 shadow = file_object->shadow) {
2717 vm_object_lock_shared(shadow);
2718 if (file_object != m->object) {
2719 vm_object_unlock(file_object);
2720 }
2721 file_offset += file_object->vo_shadow_offset;
2722 file_object = shadow;
2723 }
2724
2725 mtime.tv_sec = 0;
2726 mtime.tv_nsec = 0;
2727 cs_mtime.tv_sec = 0;
2728 cs_mtime.tv_nsec = 0;
2729
2730 /* get file's pathname and/or filename */
2731 pathname = NULL;
2732 filename = NULL;
2733 pathname_len = 0;
2734 filename_len = 0;
2735 truncated_path = FALSE;
2736 if (file_object->pager == NULL) {
2737 /* no pager -> no file -> no pathname */
2738 pathname = (char *) "<nil>";
2739 } else {
2740 pathname = (char *)kalloc(__PATH_MAX * 2);
2741 if (pathname) {
2742 pathname[0] = '\0';
2743 pathname_len = __PATH_MAX;
2744 filename = pathname + pathname_len;
2745 filename_len = __PATH_MAX;
2746 }
2747 vnode_pager_get_object_name(file_object->pager,
2748 pathname,
2749 pathname_len,
2750 filename,
2751 filename_len,
2752 &truncated_path);
2753 vnode_pager_get_object_mtime(file_object->pager,
2754 &mtime,
2755 &cs_mtime);
2756 }
2757 printf("CODE SIGNING: process %d[%s]: "
2758 "rejecting invalid page at address 0x%llx "
2759 "from offset 0x%llx in file \"%s%s%s\" "
2760 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2761 "(signed:%d validated:%d tainted:%d "
2762 "wpmapped:%d slid:%d)\n",
2763 pid, procname, (addr64_t) vaddr,
2764 file_offset,
2765 (pathname ? pathname : ""),
2766 (truncated_path ? "/.../" : ""),
2767 (truncated_path ? filename : ""),
2768 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2769 ((cs_mtime.tv_sec == mtime.tv_sec &&
2770 cs_mtime.tv_nsec == mtime.tv_nsec)
2771 ? "=="
2772 : "!="),
2773 mtime.tv_sec, mtime.tv_nsec,
2774 m->object->code_signed,
2775 m->cs_validated,
2776 m->cs_tainted,
2777 m->wpmapped,
2778 m->slid);
2779 if (file_object != m->object) {
2780 vm_object_unlock(file_object);
2781 }
2782 if (pathname_len != 0) {
2783 kfree(pathname, __PATH_MAX * 2);
2784 pathname = NULL;
2785 filename = NULL;
2786 }
2787 } else {
2788 /* proceed with the invalid page */
2789 kr = KERN_SUCCESS;
2790 if (!m->cs_validated) {
2791 /*
2792 * This page has not been validated, so it
2793 * must not belong to a code-signed object
2794 * and should not be forcefully considered
2795 * as tainted.
2796 * We're just concerned about it here because
2797 * we've been asked to "execute" it but that
2798 * does not mean that it should cause other
2799 * accesses to fail.
2800 * This happens when a debugger sets a
2801 * breakpoint and we then execute code in
2802 * that page. Marking the page as "tainted"
2803 * would cause any inspection tool ("leaks",
2804 * "vmmap", "CrashReporter", ...) to get killed
2805 * due to code-signing violation on that page,
2806 * even though they're just reading it and not
2807 * executing from it.
2808 */
2809 assert(!m->object->code_signed);
2810 } else {
2811 /*
2812 * Page might have been tainted before or not;
2813 * now it definitively is. If the page wasn't
2814 * tainted, we must disconnect it from all
2815 * pmaps later, to force existing mappings
2816 * through that code path for re-consideration
2817 * of the validity of that page.
2818 */
2819 must_disconnect = !m->cs_tainted;
2820 m->cs_tainted = TRUE;
2821 }
2822 cs_enter_tainted_accepted++;
2823 }
2824 if (kr != KERN_SUCCESS) {
2825 if (cs_debug) {
2826 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2827 "*** INVALID PAGE ***\n",
2828 (long long)vaddr);
2829 }
2830 #if !SECURE_KERNEL
2831 if (cs_enforcement_panic) {
2832 panic("CODESIGNING: panicking on invalid page\n");
2833 }
2834 #endif
2835 }
2836
2837 } else {
2838 /* proceed with the valid page */
2839 kr = KERN_SUCCESS;
2840 }
2841
2842 boolean_t page_queues_locked = FALSE;
2843 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
2844 MACRO_BEGIN \
2845 if (! page_queues_locked) { \
2846 page_queues_locked = TRUE; \
2847 vm_page_lockspin_queues(); \
2848 } \
2849 MACRO_END
2850 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
2851 MACRO_BEGIN \
2852 if (page_queues_locked) { \
2853 page_queues_locked = FALSE; \
2854 vm_page_unlock_queues(); \
2855 } \
2856 MACRO_END
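/*
 * The two helper macros above let the code below take the
 * page-queues spinlock lazily (at most once) across the branches
 * that follow, and drop it exactly once when it's no longer needed.
 */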
2857
2858 /*
2859 * Hold queues lock to manipulate
2860 * the page queues. Change wiring
2861 * case is obvious.
2862 */
2863 assert(m->compressor || m->object != compressor_object);
2864 if (m->compressor) {
2865 /*
2866 * Compressor pages are neither wired
2867 * nor pageable and should never change.
2868 */
2869 assert(m->object == compressor_object);
2870 } else if (change_wiring) {
2871 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2872
2873 if (wired) {
2874 if (kr == KERN_SUCCESS) {
2875 vm_page_wire(m);
2876 }
2877 } else {
2878 vm_page_unwire(m, TRUE);
2879 }
2880 /* we keep the page queues lock, if we need it later */
2881
2882 } else {
2883 if (kr != KERN_SUCCESS) {
2884 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2885 vm_page_deactivate(m);
2886 /* we keep the page queues lock, if we need it later */
2887 } else if (((!m->active && !m->inactive) ||
2888 m->clean_queue ||
2889 no_cache) &&
2890 !VM_PAGE_WIRED(m) && !m->throttled) {
2891
2892 if (vm_page_local_q &&
2893 !no_cache &&
2894 (*type_of_fault == DBG_COW_FAULT ||
2895 *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2896 struct vpl *lq;
2897 uint32_t lid;
2898
2899 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2900 vm_object_lock_assert_exclusive(m->object);
2901
2902 /*
2903 * we got a local queue to stuff this
2904 * new page on...
2905 * it's safe to manipulate local and
2906 * local_id at this point since we're
2907 * behind an exclusive object lock and
2908 * the page is not on any global queue.
2909 *
2910 * we'll use the current cpu number to
2911 * select the queue. note that we don't
2912 * need to disable preemption... we're
2913 * going to be behind the local queue's
2914 * lock to do the real work
2915 */
2916 lid = cpu_number();
2917
2918 lq = &vm_page_local_q[lid].vpl_un.vpl;
2919
2920 VPL_LOCK(&lq->vpl_lock);
2921
2922 queue_enter(&lq->vpl_queue, m,
2923 vm_page_t, pageq);
2924 m->local = TRUE;
2925 m->local_id = lid;
2926 lq->vpl_count++;
2927
2928 if (m->object->internal)
2929 lq->vpl_internal_count++;
2930 else
2931 lq->vpl_external_count++;
2932
2933 VPL_UNLOCK(&lq->vpl_lock);
2934
2935 if (lq->vpl_count > vm_page_local_q_soft_limit)
2936 {
2937 /*
2938 * we're beyond the soft limit
2939 * for the local queue
2940 * vm_page_reactivate_local will
2941 * 'try' to take the global page
2942 * queue lock... if it can't
2943 * that's ok... we'll let the
2944 * queue continue to grow up
2945 * to the hard limit... at that
2946 * point we'll wait for the
2947 * lock... once we've got the
2948 * lock, we'll transfer all of
2949 * the pages from the local
2950 * queue to the global active
2951 * queue
2952 */
2953 vm_page_reactivate_local(lid, FALSE, FALSE);
2954 }
2955 } else {
2956
2957 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2958
2959 /*
2960 * test again now that we hold the
2961 * page queue lock
2962 */
2963 if (!VM_PAGE_WIRED(m)) {
2964 if (m->clean_queue) {
2965 VM_PAGE_QUEUES_REMOVE(m);
2966
2967 vm_pageout_cleaned_reactivated++;
2968 vm_pageout_cleaned_fault_reactivated++;
2969 }
2970
2971 if ((!m->active &&
2972 !m->inactive) ||
2973 no_cache) {
2974 /*
2975 * If this is a no_cache mapping
2976 * and the page has never been
2977 * mapped before or was
2978 * previously a no_cache page,
2979 * then we want to leave pages
2980 * in the speculative state so
2981 * that they can be readily
2982 * recycled if free memory runs
2983 * low. Otherwise the page is
2984 * activated as normal.
2985 */
2986
2987 if (no_cache &&
2988 (!previously_pmapped ||
2989 m->no_cache)) {
2990 m->no_cache = TRUE;
2991
2992 if (!m->speculative)
2993 vm_page_speculate(m, FALSE);
2994
2995 } else if (!m->active &&
2996 !m->inactive) {
2997
2998 vm_page_activate(m);
2999 }
3000 }
3001 }
3002 /* we keep the page queues lock, if we need it later */
3003 }
3004 }
3005 }
3006 /* we're done with the page queues lock, if we ever took it */
3007 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3008
3009
3010 /* If we have a KERN_SUCCESS from the previous checks, we either have
3011 * a good page, or a tainted page that has been accepted by the process.
3012 * In both cases the page will be entered into the pmap.
3013 * If the page is writeable, we need to disconnect it from other pmaps
3014 * now so those processes can take note.
3015 */
3016 if (kr == KERN_SUCCESS) {
3017
3018 /*
3019 * NOTE: we may only hold the vm_object lock SHARED
3020 * at this point, so we need the phys_page lock to
3021 * properly serialize updating the pmapped and
3022 * xpmapped bits
3023 */
3024 if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) {
3025
3026 pmap_lock_phys_page(m->phys_page);
3027 /*
3028 * go ahead and take the opportunity
3029 * to set 'pmapped' here so that we don't
3030 * need to grab this lock a 2nd time
3031 * just below
3032 */
3033 m->pmapped = TRUE;
3034
3035 if (!m->xpmapped) {
3036
3037 m->xpmapped = TRUE;
3038
3039 pmap_unlock_phys_page(m->phys_page);
3040
3041 if (!m->object->internal)
3042 OSAddAtomic(1, &vm_page_xpmapped_external_count);
3043
3044 if ((COMPRESSED_PAGER_IS_ACTIVE) &&
3045 m->object->internal &&
3046 m->object->pager != NULL) {
3047 /*
3048 * This page could have been
3049 * uncompressed by the
3050 * compressor pager and its
3051 * contents might be only in
3052 * the data cache.
3053 * Since it's being mapped for
3054 * "execute" for the fist time,
3055 * make sure the icache is in
3056 * sync.
3057 */
3058 pmap_sync_page_data_phys(m->phys_page);
3059 }
3060 } else
3061 pmap_unlock_phys_page(m->phys_page);
3062 } else {
3063 if (m->pmapped == FALSE) {
3064 pmap_lock_phys_page(m->phys_page);
3065 m->pmapped = TRUE;
3066 pmap_unlock_phys_page(m->phys_page);
3067 }
3068 }
3069 if (vm_page_is_slideable(m)) {
3070 boolean_t was_busy = m->busy;
3071
3072 vm_object_lock_assert_exclusive(m->object);
3073
3074 m->busy = TRUE;
3075 kr = vm_page_slide(m, 0);
3076 assert(m->busy);
3077 if(!was_busy) {
3078 PAGE_WAKEUP_DONE(m);
3079 }
3080 if (kr != KERN_SUCCESS) {
3081 /*
3082 * This page has not been slid correctly,
3083 * do not do the pmap_enter() !
3084 * Let vm_fault_enter() return the error
3085 * so the caller can fail the fault.
3086 */
3087 goto after_the_pmap_enter;
3088 }
3089 }
3090
3091 if (fault_type & VM_PROT_WRITE) {
3092
3093 if (m->wpmapped == FALSE) {
3094 vm_object_lock_assert_exclusive(m->object);
3095
3096 m->wpmapped = TRUE;
3097 }
3098 if (must_disconnect) {
3099 /*
3100 * We can only get here
3101 * because of the CSE logic
3102 */
3103 assert(cs_enforcement_enabled);
3104 pmap_disconnect(m->phys_page);
3105 /*
3106 * If we are faulting for a write, we can clear
3107 * the execute bit - that will ensure the page is
3108 * checked again before being executable, which
3109 * protects against a map switch.
3110 * This only happens the first time the page
3111 * gets tainted, so we won't get stuck here
3112 * to make an already writeable page executable.
3113 */
3114 if (!cs_bypass){
3115 prot &= ~VM_PROT_EXECUTE;
3116 }
3117 }
3118 }
3119
3120 /* Prevent a deadlock by not
3121 * holding the object lock if we need to wait for a page in
3122 * pmap_enter() - <rdar://problem/7138958> */
3123 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3124 wired,
3125 pmap_options | PMAP_OPTIONS_NOWAIT,
3126 pe_result);
3127
3128 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3129
3130 if (need_retry) {
3131 /*
3132 * this will be non-null in the case where we hold the lock
3133 * on the top-object in this chain... we can't just drop
3134 * the lock on the object we're inserting the page into
3135 * and recall the PMAP_ENTER since we can still cause
3136 * a deadlock if one of the critical paths tries to
3137 * acquire the lock on the top-object and we're blocked
3138 * in PMAP_ENTER waiting for memory... our only recourse
3139 * is to deal with it at a higher level where we can
3140 * drop both locks.
3141 */
3142 *need_retry = TRUE;
3143 vm_pmap_enter_retried++;
3144 goto after_the_pmap_enter;
3145 }
3146 /* The nonblocking version of pmap_enter did not succeed,
3147 * and we don't need to drop other locks and retry
3148 * at the level above us, so
3149 * use the blocking version instead. Requires marking
3150 * the page busy and unlocking the object */
3151 boolean_t was_busy = m->busy;
3152
3153 vm_object_lock_assert_exclusive(m->object);
3154
3155 m->busy = TRUE;
3156 vm_object_unlock(m->object);
3157
3158 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3159 0, wired,
3160 pmap_options, pe_result);
3161
3162 /* Take the object lock again. */
3163 vm_object_lock(m->object);
3164
3165 /* If the page was busy, someone else will wake it up.
3166 * Otherwise, we have to do it now. */
3167 assert(m->busy);
3168 if(!was_busy) {
3169 PAGE_WAKEUP_DONE(m);
3170 }
3171 vm_pmap_enter_blocked++;
3172 }
3173 }
3174
3175 after_the_pmap_enter:
3176 return kr;
3177 }
3178
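/*
 * Routine: vm_pre_fault
 * Purpose:
 * Pre-fault a user address: if no physical page is currently
 * mapped at "vaddr" in the current map, take a read fault now
 * so that a later access doesn't have to.
 */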
3179 void
3180 vm_pre_fault(vm_map_offset_t vaddr)
3181 {
3182 if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3183
3184 vm_fault(current_map(), /* map */
3185 vaddr, /* vaddr */
3186 VM_PROT_READ, /* fault_type */
3187 FALSE, /* change_wiring */
3188 THREAD_UNINT, /* interruptible */
3189 NULL, /* caller_pmap */
3190 0 /* caller_pmap_addr */);
3191 }
3192 }
3193
3194
3195 /*
3196 * Routine: vm_fault
3197 * Purpose:
3198 * Handle page faults, including pseudo-faults
3199 * used to change the wiring status of pages.
3200 * Returns:
3201 * Explicit continuations have been removed.
3202 * Implementation:
3203 * vm_fault and vm_fault_page save mucho state
3204 * in the moral equivalent of a closure. The state
3205 * structure is allocated when first entering vm_fault
3206 * and deallocated when leaving vm_fault.
3207 */
3208
3209 extern int _map_enter_debug;
3210
3211 unsigned long vm_fault_collapse_total = 0;
3212 unsigned long vm_fault_collapse_skipped = 0;
3213
3214
3215 kern_return_t
3216 vm_fault(
3217 vm_map_t map,
3218 vm_map_offset_t vaddr,
3219 vm_prot_t fault_type,
3220 boolean_t change_wiring,
3221 int interruptible,
3222 pmap_t caller_pmap,
3223 vm_map_offset_t caller_pmap_addr)
3224 {
3225 return vm_fault_internal(map, vaddr, fault_type, change_wiring,
3226 interruptible, caller_pmap, caller_pmap_addr,
3227 NULL);
3228 }
3229
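/*
 * vm_fault_internal: same as vm_fault(), but can additionally
 * report the physical page number of the page that satisfied the
 * fault through the optional "physpage_p" argument (used by
 * vm_map_wire_and_extract()).
 */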
3230 kern_return_t
3231 vm_fault_internal(
3232 vm_map_t map,
3233 vm_map_offset_t vaddr,
3234 vm_prot_t fault_type,
3235 boolean_t change_wiring,
3236 int interruptible,
3237 pmap_t caller_pmap,
3238 vm_map_offset_t caller_pmap_addr,
3239 ppnum_t *physpage_p)
3240 {
3241 vm_map_version_t version; /* Map version for verification */
3242 boolean_t wired; /* Should mapping be wired down? */
3243 vm_object_t object; /* Top-level object */
3244 vm_object_offset_t offset; /* Top-level offset */
3245 vm_prot_t prot; /* Protection for mapping */
3246 vm_object_t old_copy_object; /* Saved copy object */
3247 vm_page_t result_page; /* Result of vm_fault_page */
3248 vm_page_t top_page; /* Placeholder page */
3249 kern_return_t kr;
3250
3251 vm_page_t m; /* Fast access to result_page */
3252 kern_return_t error_code;
3253 vm_object_t cur_object;
3254 vm_object_offset_t cur_offset;
3255 vm_page_t cur_m;
3256 vm_object_t new_object;
3257 int type_of_fault;
3258 pmap_t pmap;
3259 boolean_t interruptible_state;
3260 vm_map_t real_map = map;
3261 vm_map_t original_map = map;
3262 vm_prot_t original_fault_type;
3263 struct vm_object_fault_info fault_info;
3264 boolean_t need_collapse = FALSE;
3265 boolean_t need_retry = FALSE;
3266 boolean_t *need_retry_ptr = NULL;
3267 int object_lock_type = 0;
3268 int cur_object_lock_type;
3269 vm_object_t top_object = VM_OBJECT_NULL;
3270 int throttle_delay;
3271 int compressed_count_delta;
3272
3273
3274 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3275 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3276 ((uint64_t)vaddr >> 32),
3277 vaddr,
3278 (map == kernel_map),
3279 0,
3280 0);
3281
3282 if (get_preemption_level() != 0) {
3283 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3284 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3285 ((uint64_t)vaddr >> 32),
3286 vaddr,
3287 KERN_FAILURE,
3288 0,
3289 0);
3290
3291 return (KERN_FAILURE);
3292 }
3293
3294 interruptible_state = thread_interrupt_level(interruptible);
3295
3296 VM_STAT_INCR(faults);
3297 current_task()->faults++;
3298 original_fault_type = fault_type;
3299
3300 if (fault_type & VM_PROT_WRITE)
3301 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3302 else
3303 object_lock_type = OBJECT_LOCK_SHARED;
3304
3305 cur_object_lock_type = OBJECT_LOCK_SHARED;
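/*
 * A write fault will end up modifying page and object state
 * (dirty, wpmapped), so take the top object lock exclusive from
 * the start; read faults start with shared locks and upgrade only
 * when they turn out to need exclusive access.
 */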
3306
3307 RetryFault:
3308 /*
3309 * assume we will hit a page in the cache
3310 * otherwise, explicitly override with
3311 * the real fault type once we determine it
3312 */
3313 type_of_fault = DBG_CACHE_HIT_FAULT;
3314
3315 /*
3316 * Find the backing store object and offset into
3317 * it to begin the search.
3318 */
3319 fault_type = original_fault_type;
3320 map = original_map;
3321 vm_map_lock_read(map);
3322
3323 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3324 object_lock_type, &version,
3325 &object, &offset, &prot, &wired,
3326 &fault_info,
3327 &real_map);
3328
3329 if (kr != KERN_SUCCESS) {
3330 vm_map_unlock_read(map);
3331 goto done;
3332 }
3333 pmap = real_map->pmap;
3334 fault_info.interruptible = interruptible;
3335 fault_info.stealth = FALSE;
3336 fault_info.io_sync = FALSE;
3337 fault_info.mark_zf_absent = FALSE;
3338 fault_info.batch_pmap_op = FALSE;
3339
3340 /*
3341 * If the page is wired, we must fault for the current protection
3342 * value, to avoid further faults.
3343 */
3344 if (wired) {
3345 fault_type = prot | VM_PROT_WRITE;
3346 /*
3347 * since we're treating this fault as a 'write'
3348 * we must hold the top object lock exclusively
3349 */
3350 if (object_lock_type == OBJECT_LOCK_SHARED) {
3351
3352 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3353
3354 if (vm_object_lock_upgrade(object) == FALSE) {
3355 /*
3356 * couldn't upgrade, so explicitly
3357 * take the lock exclusively
3358 */
3359 vm_object_lock(object);
3360 }
3361 }
3362 }
3363
3364 #if VM_FAULT_CLASSIFY
3365 /*
3366 * Temporary data gathering code
3367 */
3368 vm_fault_classify(object, offset, fault_type);
3369 #endif
3370 /*
3371 * Fast fault code. The basic idea is to do as much as
3372 * possible while holding the map lock and object locks.
3373 * Busy pages are not used until the object lock has to
3374 * be dropped to do something (copy, zero fill, pmap enter).
3375 * Similarly, paging references aren't acquired until that
3376 * point, and object references aren't used.
3377 *
3378 * If we can figure out what to do
3379 * (zero fill, copy on write, pmap enter) while holding
3380 * the locks, then it gets done. Otherwise, we give up,
3381 * and use the original fault path (which doesn't hold
3382 * the map lock, and relies on busy pages).
3383 * The give up cases include:
3384 * - Have to talk to pager.
3385 * - Page is busy, absent or in error.
3386 * - Pager has locked out desired access.
3387 * - Fault needs to be restarted.
3388 * - Have to push page into copy object.
3389 *
3390 * The code is an infinite loop that moves one level down
3391 * the shadow chain each time. cur_object and cur_offset
3392 * refer to the current object being examined. object and offset
3393 * are the original object from the map. The loop is at the
3394 * top level if and only if object and cur_object are the same.
3395 *
3396 * Invariants: Map lock is held throughout. Lock is held on
3397 * original object and cur_object (if different) when
3398 * continuing or exiting loop.
3399 *
3400 */
3401
3402
3403 /*
3404 * If this page is to be inserted in a copy delay object
3405 * for writing, and if the object has a copy, then the
3406 * copy delay strategy is implemented in the slow fault path (vm_fault_page).
3407 */
3408 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3409 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3410 goto handle_copy_delay;
3411
3412 cur_object = object;
3413 cur_offset = offset;
3414
3415 while (TRUE) {
3416 if (!cur_object->pager_created &&
3417 cur_object->phys_contiguous) /* superpage */
3418 break;
3419
3420 if (cur_object->blocked_access) {
3421 /*
3422 * Access to this VM object has been blocked.
3423 * Let the slow path handle it.
3424 */
3425 break;
3426 }
3427
3428 m = vm_page_lookup(cur_object, cur_offset);
3429
3430 if (m != VM_PAGE_NULL) {
3431 if (m->busy) {
3432 wait_result_t result;
3433
3434 /*
3435 * in order to do the PAGE_ASSERT_WAIT, we must
3436 * have the object that 'm' belongs to locked exclusively
3437 */
3438 if (object != cur_object) {
3439
3440 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3441
3442 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3443
3444 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3445 /*
3446 * couldn't upgrade so go do a full retry
3447 * immediately since we can no longer be
3448 * certain about cur_object (since we
3449 * don't hold a reference on it)...
3450 * first drop the top object lock
3451 */
3452 vm_object_unlock(object);
3453
3454 vm_map_unlock_read(map);
3455 if (real_map != map)
3456 vm_map_unlock(real_map);
3457
3458 goto RetryFault;
3459 }
3460 }
3461 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3462
3463 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3464
3465 if (vm_object_lock_upgrade(object) == FALSE) {
3466 /*
3467 * couldn't upgrade, so explicitly take the lock
3468 * exclusively and go relookup the page since we
3469 * will have dropped the object lock and
3470 * a different thread could have inserted
3471 * a page at this offset
3472 * no need for a full retry since we're
3473 * at the top level of the object chain
3474 */
3475 vm_object_lock(object);
3476
3477 continue;
3478 }
3479 }
3480 if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) {
3481 /*
3482 * m->busy == TRUE and the object is locked exclusively
3483 * if m->pageout_queue == TRUE after we acquire the
3484 * queues lock, we are guaranteed that it is stable on
3485 * the pageout queue and therefore reclaimable
3486 *
3487 * NOTE: this is only true for the internal pageout queue
3488 * in the compressor world
3489 */
3490 vm_page_lock_queues();
3491
3492 if (m->pageout_queue) {
3493 vm_pageout_throttle_up(m);
3494 vm_page_unlock_queues();
3495
3496 PAGE_WAKEUP_DONE(m);
3497 goto reclaimed_from_pageout;
3498 }
3499 vm_page_unlock_queues();
3500 }
3501 if (object != cur_object)
3502 vm_object_unlock(object);
3503
3504 vm_map_unlock_read(map);
3505 if (real_map != map)
3506 vm_map_unlock(real_map);
3507
3508 result = PAGE_ASSERT_WAIT(m, interruptible);
3509
3510 vm_object_unlock(cur_object);
3511
3512 if (result == THREAD_WAITING) {
3513 result = thread_block(THREAD_CONTINUE_NULL);
3514
3515 counter(c_vm_fault_page_block_busy_kernel++);
3516 }
3517 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3518 goto RetryFault;
3519
3520 kr = KERN_ABORTED;
3521 goto done;
3522 }
3523 reclaimed_from_pageout:
3524 if (m->laundry) {
3525 if (object != cur_object) {
3526 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3527 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3528
3529 vm_object_unlock(object);
3530 vm_object_unlock(cur_object);
3531
3532 vm_map_unlock_read(map);
3533 if (real_map != map)
3534 vm_map_unlock(real_map);
3535
3536 goto RetryFault;
3537 }
3538
3539 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3540
3541 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3542
3543 if (vm_object_lock_upgrade(object) == FALSE) {
3544 /*
3545 * couldn't upgrade, so explicitly take the lock
3546 * exclusively and go relookup the page since we
3547 * will have dropped the object lock and
3548 * a different thread could have inserted
3549 * a page at this offset
3550 * no need for a full retry since we're
3551 * at the top level of the object chain
3552 */
3553 vm_object_lock(object);
3554
3555 continue;
3556 }
3557 }
3558 m->pageout = FALSE;
3559
3560 vm_pageout_steal_laundry(m, FALSE);
3561 }
3562
3563 if (m->phys_page == vm_page_guard_addr) {
3564 /*
3565 * Guard page: let the slow path deal with it
3566 */
3567 break;
3568 }
3569 if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3570 /*
3571 * Unusual case... let the slow path deal with it
3572 */
3573 break;
3574 }
3575 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3576 if (object != cur_object)
3577 vm_object_unlock(object);
3578 vm_map_unlock_read(map);
3579 if (real_map != map)
3580 vm_map_unlock(real_map);
3581 vm_object_unlock(cur_object);
3582 kr = KERN_MEMORY_ERROR;
3583 goto done;
3584 }
3585
3586 if (m->encrypted) {
3587 /*
3588 * ENCRYPTED SWAP:
3589 * We've soft-faulted (because it's not in the page
3590 * table) on an encrypted page.
3591 * Keep the page "busy" so that no one messes with
3592 * it during the decryption.
3593 * Release the extra locks we're holding, keep only
3594 * the page's VM object lock.
3595 *
3596 * in order to set 'busy' on 'm', we must
3597 * have the object that 'm' belongs to locked exclusively
3598 */
3599 if (object != cur_object) {
3600 vm_object_unlock(object);
3601
3602 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3603
3604 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3605
3606 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3607 /*
3608 * couldn't upgrade so go do a full retry
3609 * immediately since we've already dropped
3610 * the top object lock associated with this page
3611 * and the current one got dropped due to the
3612 * failed upgrade... the state is no longer valid
3613 */
3614 vm_map_unlock_read(map);
3615 if (real_map != map)
3616 vm_map_unlock(real_map);
3617
3618 goto RetryFault;
3619 }
3620 }
3621 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3622
3623 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3624
3625 if (vm_object_lock_upgrade(object) == FALSE) {
3626 /*
3627 * couldn't upgrade, so explicitly take the lock
3628 * exclusively and go relookup the page since we
3629 * will have dropped the object lock and
3630 * a different thread could have inserted
3631 * a page at this offset
3632 * no need for a full retry since we're
3633 * at the top level of the object chain
3634 */
3635 vm_object_lock(object);
3636
3637 continue;
3638 }
3639 }
3640 m->busy = TRUE;
3641
3642 vm_map_unlock_read(map);
3643 if (real_map != map)
3644 vm_map_unlock(real_map);
3645
3646 vm_page_decrypt(m, 0);
3647
3648 assert(m->busy);
3649 PAGE_WAKEUP_DONE(m);
3650
3651 vm_object_unlock(cur_object);
3652 /*
3653 * Retry from the top, in case anything
3654 * changed while we were decrypting...
3655 */
3656 goto RetryFault;
3657 }
3658 ASSERT_PAGE_DECRYPTED(m);
3659
3660 if(vm_page_is_slideable(m)) {
3661 /*
3662 * We might need to slide this page, and so,
3663 * we want to hold the VM object exclusively.
3664 */
3665 if (object != cur_object) {
3666 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3667 vm_object_unlock(object);
3668 vm_object_unlock(cur_object);
3669
3670 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3671
3672 vm_map_unlock_read(map);
3673 if (real_map != map)
3674 vm_map_unlock(real_map);
3675
3676 goto RetryFault;
3677 }
3678 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3679
3680 vm_object_unlock(object);
3681 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3682 vm_map_unlock_read(map);
3683 goto RetryFault;
3684 }
3685 }
3686
3687 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m) ||
3688 (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3689 upgrade_for_validation:
3690 /*
3691 * We might need to validate this page
3692 * against its code signature, so we
3693 * want to hold the VM object exclusively.
3694 */
3695 if (object != cur_object) {
3696 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3697 vm_object_unlock(object);
3698 vm_object_unlock(cur_object);
3699
3700 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3701
3702 vm_map_unlock_read(map);
3703 if (real_map != map)
3704 vm_map_unlock(real_map);
3705
3706 goto RetryFault;
3707 }
3708
3709 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3710
3711 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3712
3713 if (vm_object_lock_upgrade(object) == FALSE) {
3714 /*
3715 * couldn't upgrade, so explicitly take the lock
3716 * exclusively and go relookup the page since we
3717 * will have dropped the object lock and
3718 * a different thread could have inserted
3719 * a page at this offset
3720 * no need for a full retry since we're
3721 * at the top level of the object chain
3722 */
3723 vm_object_lock(object);
3724
3725 continue;
3726 }
3727 }
3728 }
3729 /*
3730 * Two cases of map in faults:
3731 * - At top level w/o copy object.
3732 * - Read fault anywhere.
3733 * --> must disallow write.
3734 */
3735
3736 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3737
3738 goto FastPmapEnter;
3739 }
3740
3741 if ((fault_type & VM_PROT_WRITE) == 0) {
3742
3743 if (object != cur_object) {
3744 /*
3745 * We still need to hold the top object
3746 * lock here to prevent a race between
3747 * a read fault (taking only "shared"
3748 * locks) and a write fault (taking
3749 * an "exclusive" lock on the top
3750 * object.
3751 * Otherwise, as soon as we release the
3752 * top lock, the write fault could
3753 * proceed and actually complete before
3754 * the read fault, and the copied page's
3755 * translation could then be overwritten
3756 * by the read fault's translation for
3757 * the original page.
3758 *
3759 * Let's just record what the top object
3760 * is and we'll release it later.
3761 */
3762 top_object = object;
3763
3764 /*
3765 * switch to the object that has the new page
3766 */
3767 object = cur_object;
3768 object_lock_type = cur_object_lock_type;
3769 }
3770 FastPmapEnter:
3771 /*
3772 * prepare for the pmap_enter...
3773 * object and map are both locked
3774 * m contains valid data
3775 * object == m->object
3776 * cur_object == NULL or it's been unlocked
3777 * no paging references on either object or cur_object
3778 */
3779 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3780 need_retry_ptr = &need_retry;
3781 else
3782 need_retry_ptr = NULL;
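/*
 * When "need_retry_ptr" is non-NULL, vm_fault_enter() will not
 * fall back to a blocking PMAP_ENTER on a resource shortage
 * (we may still hold the top object lock, or only a shared object
 * lock); instead it sets need_retry and we re-drive the fault
 * below after asking the pmap layer to pre-expand the page table.
 */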
3783
3784 if (caller_pmap) {
3785 kr = vm_fault_enter(m,
3786 caller_pmap,
3787 caller_pmap_addr,
3788 prot,
3789 fault_type,
3790 wired,
3791 change_wiring,
3792 fault_info.no_cache,
3793 fault_info.cs_bypass,
3794 fault_info.user_tag,
3795 fault_info.pmap_options,
3796 need_retry_ptr,
3797 &type_of_fault);
3798 } else {
3799 kr = vm_fault_enter(m,
3800 pmap,
3801 vaddr,
3802 prot,
3803 fault_type,
3804 wired,
3805 change_wiring,
3806 fault_info.no_cache,
3807 fault_info.cs_bypass,
3808 fault_info.user_tag,
3809 fault_info.pmap_options,
3810 need_retry_ptr,
3811 &type_of_fault);
3812 }
3813
3814 if (kr == KERN_SUCCESS &&
3815 physpage_p != NULL) {
3816 /* for vm_map_wire_and_extract() */
3817 *physpage_p = m->phys_page;
3818 if (prot & VM_PROT_WRITE) {
3819 vm_object_lock_assert_exclusive(
3820 m->object);
3821 m->dirty = TRUE;
3822 }
3823 }
3824
3825 if (top_object != VM_OBJECT_NULL) {
3826 /*
3827 * It's safe to drop the top object
3828 * now that we've done our
3829 * vm_fault_enter(). Any other fault
3830 * in progress for that virtual
3831 * address will either find our page
3832 * and translation or put in a new page
3833 * and translation.
3834 */
3835 vm_object_unlock(top_object);
3836 top_object = VM_OBJECT_NULL;
3837 }
3838
3839 if (need_collapse == TRUE)
3840 vm_object_collapse(object, offset, TRUE);
3841
3842 if (need_retry == FALSE &&
3843 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3844 /*
3845 * evaluate access pattern and update state
3846 * vm_fault_deactivate_behind depends on the
3847 * state being up to date
3848 */
3849 vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3850
3851 vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3852 }
3853 /*
3854 * That's it, clean up and return.
3855 */
3856 if (m->busy)
3857 PAGE_WAKEUP_DONE(m);
3858
3859 vm_object_unlock(object);
3860
3861 vm_map_unlock_read(map);
3862 if (real_map != map)
3863 vm_map_unlock(real_map);
3864
3865 if (need_retry == TRUE) {
3866 /*
3867 * vm_fault_enter couldn't complete the PMAP_ENTER...
3868 * at this point we don't hold any locks so it's safe
3869 * to ask the pmap layer to expand the page table to
3870 * accommodate this mapping... once expanded, we'll
3871 * re-drive the fault which should result in vm_fault_enter
3872 * being able to successfully enter the mapping this time around
3873 */
3874 (void)pmap_enter_options(
3875 pmap, vaddr, 0, 0, 0, 0, 0,
3876 PMAP_OPTIONS_NOENTER, NULL);
3877
3878 need_retry = FALSE;
3879 goto RetryFault;
3880 }
3881 goto done;
3882 }
3883 /*
3884 * COPY ON WRITE FAULT
3885 */
3886 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3887
3888 if ((throttle_delay = vm_page_throttled())) {
3889 /*
3890 * drop all of our locks...
3891 * wait until the free queue is
3892 * pumped back up and then
3893 * redrive the fault
3894 */
3895 if (object != cur_object)
3896 vm_object_unlock(cur_object);
3897 vm_object_unlock(object);
3898 vm_map_unlock_read(map);
3899 if (real_map != map)
3900 vm_map_unlock(real_map);
3901
3902 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
3903
3904 delay(throttle_delay);
3905
3906 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
3907 THREAD_UNINT :
3908 THREAD_ABORTSAFE))
3909 goto RetryFault;
3910 kr = KERN_ABORTED;
3911 goto done;
3912 }
3913 /*
3914 * If objects match, then
3915 * object->copy must not be NULL (else control
3916 * would be in the previous code block), and we
3917 * have a potential push into the copy object
3918 * which we can't cope with here.
3919 */
3920 if (cur_object == object) {
3921 /*
3922 * must take the slow path to
3923 * deal with the copy push
3924 */
3925 break;
3926 }
3927
3928 /*
3929 * This is now a shadow based copy on write
3930 * fault -- it requires a copy up the shadow
3931 * chain.
3932 */
3933
3934 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3935 VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3936 goto upgrade_for_validation;
3937 }
3938
3939 /*
3940 * Allocate a page in the original top level
3941 * object. Give up if allocate fails. Also
3942 * need to remember current page, as it's the
3943 * source of the copy.
3944 *
3945 * at this point we hold locks on both
3946 * object and cur_object... no need to take
3947 * paging refs or mark pages BUSY since
3948 * we don't drop either object lock until
3949 * the page has been copied and inserted
3950 */
3951 cur_m = m;
3952 m = vm_page_grab();
3953
3954 if (m == VM_PAGE_NULL) {
3955 /*
3956 * no free page currently available...
3957 * must take the slow path
3958 */
3959 break;
3960 }
3961 /*
3962 * Now do the copy. Mark the source page busy...
3963 *
3964 * NOTE: This code holds the map lock across
3965 * the page copy.
3966 */
3967 vm_page_copy(cur_m, m);
3968 vm_page_insert(m, object, offset);
3969 SET_PAGE_DIRTY(m, FALSE);
3970
3971 /*
3972 * Now cope with the source page and object
3973 */
3974 if (object->ref_count > 1 && cur_m->pmapped)
3975 pmap_disconnect(cur_m->phys_page);
3976
3977 if (cur_m->clustered) {
3978 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
3979 VM_PAGE_CONSUME_CLUSTERED(cur_m);
3980 }
3981 need_collapse = TRUE;
3982
3983 if (!cur_object->internal &&
3984 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
3985 /*
3986 * The object from which we've just
3987 * copied a page is most probably backed
3988 * by a vnode. We don't want to waste too
3989 * much time trying to collapse the VM objects
3990 * and create a bottleneck when several tasks
3991 * map the same file.
3992 */
3993 if (cur_object->copy == object) {
3994 /*
3995 * Shared mapping or no COW yet.
3996 * We can never collapse a copy
3997 * object into its backing object.
3998 */
3999 need_collapse = FALSE;
4000 } else if (cur_object->copy == object->shadow &&
4001 object->shadow->resident_page_count == 0) {
4002 /*
4003 * Shared mapping after a COW occurred.
4004 */
4005 need_collapse = FALSE;
4006 }
4007 }
4008 vm_object_unlock(cur_object);
4009
4010 if (need_collapse == FALSE)
4011 vm_fault_collapse_skipped++;
4012 vm_fault_collapse_total++;
4013
4014 type_of_fault = DBG_COW_FAULT;
4015 VM_STAT_INCR(cow_faults);
4016 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4017 current_task()->cow_faults++;
4018
4019 goto FastPmapEnter;
4020
4021 } else {
4022 /*
4023 * No page at cur_object, cur_offset... m == NULL
4024 */
4025 if (cur_object->pager_created) {
4026 int compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4027
4028 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4029 int my_fault_type;
4030 int c_flags = C_DONT_BLOCK;
4031 boolean_t insert_cur_object = FALSE;
4032
4033 /*
4034 * May have to talk to a pager...
4035 * if so, take the slow path by
4036 * doing a 'break' from the while (TRUE) loop
4037 *
4038 * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4039 * if the compressor is active and the page exists there
4040 */
4041 if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4042 break;
4043
4044 if (map == kernel_map || real_map == kernel_map) {
4045 /*
4046 * can't call into the compressor with the kernel_map
4047 * lock held, since the compressor may try to operate
4048 * on the kernel map in order to return an empty c_segment
4049 */
4050 break;
4051 }
4052 if (object != cur_object) {
4053 if (fault_type & VM_PROT_WRITE)
4054 c_flags |= C_KEEP;
4055 else
4056 insert_cur_object = TRUE;
4057 }
4058 if (insert_cur_object == TRUE) {
4059
4060 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4061
4062 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4063
4064 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4065 /*
4066 * couldn't upgrade so go do a full retry
4067 * immediately since we can no longer be
4068 * certain about cur_object (since we
4069 * don't hold a reference on it)...
4070 * first drop the top object lock
4071 */
4072 vm_object_unlock(object);
4073
4074 vm_map_unlock_read(map);
4075 if (real_map != map)
4076 vm_map_unlock(real_map);
4077
4078 goto RetryFault;
4079 }
4080 }
4081 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4082
4083 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4084
4085 if (object != cur_object) {
4086 /*
4087 * we can't go for the upgrade on the top
4088 * lock since the upgrade may block waiting
4089 * for readers to drain... since we hold
4090 * cur_object locked at this point, waiting
4091 * for the readers to drain would represent
4092 * a lock order inversion since the lock order
4093 * for objects is the reference order in the
4094 * shadow chain
4095 */
4096 vm_object_unlock(object);
4097 vm_object_unlock(cur_object);
4098
4099 vm_map_unlock_read(map);
4100 if (real_map != map)
4101 vm_map_unlock(real_map);
4102
4103 goto RetryFault;
4104 }
4105 if (vm_object_lock_upgrade(object) == FALSE) {
4106 /*
4107 * couldn't upgrade, so explicitly take the lock
4108 * exclusively and go relookup the page since we
4109 * will have dropped the object lock and
4110 * a different thread could have inserted
4111 * a page at this offset
4112 * no need for a full retry since we're
4113 * at the top level of the object chain
4114 */
4115 vm_object_lock(object);
4116
4117 continue;
4118 }
4119 }
4120 m = vm_page_grab();
4121
4122 if (m == VM_PAGE_NULL) {
4123 /*
4124 * no free page currently available...
4125 * must take the slow path
4126 */
4127 break;
4128 }
4129
4130 /*
4131 * The object is and remains locked
4132 * so no need to take a
4133 * "paging_in_progress" reference.
4134 */
4135 boolean_t shared_lock;
4136 if ((object == cur_object &&
4137 object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4138 (object != cur_object &&
4139 cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4140 shared_lock = FALSE;
4141 } else {
4142 shared_lock = TRUE;
4143 }
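/*
 * Added note: "shared_lock" records whether the object whose compressed
 * page count is about to be adjusted is held only with a shared lock here;
 * vm_compressor_pager_count() below is presumably expected to use that hint
 * so it can update its accounting safely when the lock is not exclusive.
 */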
4144
4145 kr = vm_compressor_pager_get(
4146 cur_object->pager,
4147 (cur_offset +
4148 cur_object->paging_offset),
4149 m->phys_page,
4150 &my_fault_type,
4151 c_flags,
4152 &compressed_count_delta);
4153
4154 vm_compressor_pager_count(
4155 cur_object->pager,
4156 compressed_count_delta,
4157 shared_lock,
4158 cur_object);
4159
4160 if (kr != KERN_SUCCESS) {
4161 vm_page_release(m);
4162 break;
4163 }
4164 m->dirty = TRUE;
4165
4166 /*
4167 * If the object is purgeable, its
4168 * owner's purgeable ledgers will be
4169 * updated in vm_page_insert() but the
4170 * page was also accounted for in a
4171 * "compressed purgeable" ledger, so
4172 * update that now.
4173 */
4174 if (object != cur_object &&
4175 !insert_cur_object) {
4176 /*
4177 * We're not going to insert
4178 * the decompressed page into
4179 * the object it came from.
4180 *
4181 * We're dealing with a
4182 * copy-on-write fault on
4183 * "object".
4184 * We're going to decompress
4185 * the page directly into the
4186 * target "object" while
4187 * keeping the compressed
4188 * page for "cur_object", so
4189 * no ledger update in that
4190 * case.
4191 */
4192 } else if ((cur_object->purgable ==
4193 VM_PURGABLE_DENY) ||
4194 (cur_object->vo_purgeable_owner ==
4195 NULL)) {
4196 /*
4197 * "cur_object" is not purgeable
4198 * or is not owned, so no
4199 * purgeable ledgers to update.
4200 */
4201 } else {
4202 /*
4203 * One less compressed
4204 * purgeable page for
4205 * cur_object's owner.
4206 */
4207 vm_purgeable_compressed_update(
4208 cur_object,
4209 -1);
4210 }
4211
4212 if (insert_cur_object) {
4213 vm_page_insert(m, cur_object, cur_offset);
4214 } else {
4215 vm_page_insert(m, object, offset);
4216 }
4217
4218 if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4219 /*
4220 * If the page is not cacheable,
4221 * we can't let its contents
4222 * linger in the data cache
4223 * after the decompression.
4224 */
4225 pmap_sync_page_attributes_phys(m->phys_page);
4226 }
4227
4228 type_of_fault = my_fault_type;
4229
4230 VM_STAT_INCR(decompressions);
4231
4232 if (cur_object != object) {
4233 if (insert_cur_object) {
4234 top_object = object;
4235 /*
4236 * switch to the object that has the new page
4237 */
4238 object = cur_object;
4239 object_lock_type = cur_object_lock_type;
4240 } else {
4241 vm_object_unlock(cur_object);
4242 cur_object = object;
4243 }
4244 }
4245 goto FastPmapEnter;
4246 }
4247 /*
4248 * existence map present and indicates
4249 * that the pager doesn't have this page
4250 */
4251 }
4252 if (cur_object->shadow == VM_OBJECT_NULL) {
4253 /*
4254 * Zero fill fault. Page gets
4255 * inserted into the original object.
4256 */
4257 if (cur_object->shadow_severed ||
4258 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
4259 {
4260 if (object != cur_object)
4261 vm_object_unlock(cur_object);
4262 vm_object_unlock(object);
4263
4264 vm_map_unlock_read(map);
4265 if (real_map != map)
4266 vm_map_unlock(real_map);
4267
4268 kr = KERN_MEMORY_ERROR;
4269 goto done;
4270 }
4271 if ((throttle_delay = vm_page_throttled())) {
4272 /*
4273 * drop all of our locks...
4274 * wait until the free queue is
4275 * pumped back up and then
4276 * redrive the fault
4277 */
4278 if (object != cur_object)
4279 vm_object_unlock(cur_object);
4280 vm_object_unlock(object);
4281 vm_map_unlock_read(map);
4282 if (real_map != map)
4283 vm_map_unlock(real_map);
4284
4285 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4286
4287 delay(throttle_delay);
4288
4289 if (!current_thread_aborted() && vm_page_wait((change_wiring) ?
4290 THREAD_UNINT :
4291 THREAD_ABORTSAFE))
4292 goto RetryFault;
4293 kr = KERN_ABORTED;
4294 goto done;
4295 }
4296 if (vm_backing_store_low) {
4297 /*
4298 * we are protecting the system from
4299 * backing store exhaustion...
4300 * must take the slow path if we're
4301 * not privileged
4302 */
4303 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4304 break;
4305 }
4306 if (cur_object != object) {
4307 vm_object_unlock(cur_object);
4308
4309 cur_object = object;
4310 }
4311 if (object_lock_type == OBJECT_LOCK_SHARED) {
4312
4313 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4314
4315 if (vm_object_lock_upgrade(object) == FALSE) {
4316 /*
4317 * couldn't upgrade so do a full retry on the fault
4318 * since we dropped the object lock which
4319 * could allow another thread to insert
4320 * a page at this offset
4321 */
4322 vm_map_unlock_read(map);
4323 if (real_map != map)
4324 vm_map_unlock(real_map);
4325
4326 goto RetryFault;
4327 }
4328 }
4329 m = vm_page_alloc(object, offset);
4330
4331 if (m == VM_PAGE_NULL) {
4332 /*
4333 * no free page currently available...
4334 * must take the slow path
4335 */
4336 break;
4337 }
4338
4339 /*
4340 * Now zero fill page...
4341 * the page is probably going to
4342 * be written soon, so don't bother
4343 * to clear the modified bit
4344 *
4345 * NOTE: This code holds the map
4346 * lock across the zero fill.
4347 */
4348 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4349
4350 goto FastPmapEnter;
4351 }
4352 /*
4353 * On to the next level in the shadow chain
4354 */
4355 cur_offset += cur_object->vo_shadow_offset;
4356 new_object = cur_object->shadow;
4357
4358 /*
4359 * take the new_object's lock with the indicated state
4360 */
4361 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4362 vm_object_lock_shared(new_object);
4363 else
4364 vm_object_lock(new_object);
4365
4366 if (cur_object != object)
4367 vm_object_unlock(cur_object);
4368
4369 cur_object = new_object;
4370
4371 continue;
4372 }
4373 }
4374 /*
4375 * Cleanup from fast fault failure. Drop any object
4376 * lock other than original and drop map lock.
4377 */
4378 if (object != cur_object)
4379 vm_object_unlock(cur_object);
4380
4381 /*
4382 * must own the object lock exclusively at this point
4383 */
4384 if (object_lock_type == OBJECT_LOCK_SHARED) {
4385 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4386
4387 if (vm_object_lock_upgrade(object) == FALSE) {
4388 /*
4389 * couldn't upgrade, so explicitly
4390 * take the lock exclusively
4391 * no need to retry the fault at this
4392 * point since "vm_fault_page" will
4393 * completely re-evaluate the state
4394 */
4395 vm_object_lock(object);
4396 }
4397 }
4398
4399 handle_copy_delay:
4400 vm_map_unlock_read(map);
4401 if (real_map != map)
4402 vm_map_unlock(real_map);
4403
4404 /*
4405 * Make a reference to this object to
4406 * prevent its disposal while we are messing with
4407 * it. Once we have the reference, the map is free
4408 * to be diddled. Since objects reference their
4409 * shadows (and copies), they will stay around as well.
4410 */
4411 vm_object_reference_locked(object);
4412 vm_object_paging_begin(object);
4413
4414 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4415
4416 error_code = 0;
4417
4418 result_page = VM_PAGE_NULL;
4419 kr = vm_fault_page(object, offset, fault_type,
4420 (change_wiring && !wired),
4421 FALSE, /* page not looked up */
4422 &prot, &result_page, &top_page,
4423 &type_of_fault,
4424 &error_code, map->no_zero_fill,
4425 FALSE, &fault_info);
4426
4427 /*
4428 * if kr != VM_FAULT_SUCCESS, then the paging reference
4429 * has been dropped and the object unlocked... the ref_count
4430 * is still held
4431 *
4432 * if kr == VM_FAULT_SUCCESS, then the paging reference
4433 * is still held along with the ref_count on the original object
4434 *
4435 * the object is returned locked with a paging reference
4436 *
4437 * if top_page != NULL, then it's BUSY and the
4438 * object it belongs to has a paging reference
4439 * but is returned unlocked
4440 */
4441 if (kr != VM_FAULT_SUCCESS &&
4442 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4443 /*
4444 * we didn't succeed, lose the object reference immediately.
4445 */
4446 vm_object_deallocate(object);
4447
4448 /*
4449 * See why we failed, and take corrective action.
4450 */
4451 switch (kr) {
4452 case VM_FAULT_MEMORY_SHORTAGE:
4453 if (vm_page_wait((change_wiring) ?
4454 THREAD_UNINT :
4455 THREAD_ABORTSAFE))
4456 goto RetryFault;
4457 /*
4458 * fall thru
4459 */
4460 case VM_FAULT_INTERRUPTED:
4461 kr = KERN_ABORTED;
4462 goto done;
4463 case VM_FAULT_RETRY:
4464 goto RetryFault;
4465 case VM_FAULT_MEMORY_ERROR:
4466 if (error_code)
4467 kr = error_code;
4468 else
4469 kr = KERN_MEMORY_ERROR;
4470 goto done;
4471 default:
4472 panic("vm_fault: unexpected error 0x%x from "
4473 "vm_fault_page()\n", kr);
4474 }
4475 }
4476 m = result_page;
4477
4478 if (m != VM_PAGE_NULL) {
4479 assert((change_wiring && !wired) ?
4480 (top_page == VM_PAGE_NULL) :
4481 ((top_page == VM_PAGE_NULL) == (m->object == object)));
4482 }
4483
4484 /*
4485 * What to do with the resulting page from vm_fault_page
4486 * if it doesn't get entered into the physical map:
4487 */
4488 #define RELEASE_PAGE(m) \
4489 MACRO_BEGIN \
4490 PAGE_WAKEUP_DONE(m); \
4491 if (!m->active && !m->inactive && !m->throttled) { \
4492 vm_page_lockspin_queues(); \
4493 if (!m->active && !m->inactive && !m->throttled) \
4494 vm_page_activate(m); \
4495 vm_page_unlock_queues(); \
4496 } \
4497 MACRO_END
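/*
 * Note on RELEASE_PAGE above: the queue state is tested once without the
 * page-queues lock and then re-tested under vm_page_lockspin_queues()
 * before activating, so the common case (page already on a queue) never
 * takes the queues lock at all.
 */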
4498
4499 /*
4500 * We must verify that the maps have not changed
4501 * since our last lookup.
4502 */
4503 if (m != VM_PAGE_NULL) {
4504 old_copy_object = m->object->copy;
4505 vm_object_unlock(m->object);
4506 } else {
4507 old_copy_object = VM_OBJECT_NULL;
4508 vm_object_unlock(object);
4509 }
4510
4511 /*
4512 * no object locks are held at this point
4513 */
4514 if ((map != original_map) || !vm_map_verify(map, &version)) {
4515 vm_object_t retry_object;
4516 vm_object_offset_t retry_offset;
4517 vm_prot_t retry_prot;
4518
4519 /*
4520 * To avoid trying to write_lock the map while another
4521 * thread has it read_locked (in vm_map_pageable), we
4522 * do not try for write permission. If the page is
4523 * still writable, we will get write permission. If it
4524 * is not, or has been marked needs_copy, we enter the
4525 * mapping without write permission, and will merely
4526 * take another fault.
4527 */
4528 map = original_map;
4529 vm_map_lock_read(map);
4530
4531 kr = vm_map_lookup_locked(&map, vaddr,
4532 fault_type & ~VM_PROT_WRITE,
4533 OBJECT_LOCK_EXCLUSIVE, &version,
4534 &retry_object, &retry_offset, &retry_prot,
4535 &wired,
4536 &fault_info,
4537 &real_map);
4538 pmap = real_map->pmap;
4539
4540 if (kr != KERN_SUCCESS) {
4541 vm_map_unlock_read(map);
4542
4543 if (m != VM_PAGE_NULL) {
4544 /*
4545 * retake the lock so that
4546 * we can drop the paging reference
4547 * in vm_fault_cleanup and do the
4548 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4549 */
4550 vm_object_lock(m->object);
4551
4552 RELEASE_PAGE(m);
4553
4554 vm_fault_cleanup(m->object, top_page);
4555 } else {
4556 /*
4557 * retake the lock so that
4558 * we can drop the paging reference
4559 * in vm_fault_cleanup
4560 */
4561 vm_object_lock(object);
4562
4563 vm_fault_cleanup(object, top_page);
4564 }
4565 vm_object_deallocate(object);
4566
4567 goto done;
4568 }
4569 vm_object_unlock(retry_object);
4570
4571 if ((retry_object != object) || (retry_offset != offset)) {
4572
4573 vm_map_unlock_read(map);
4574 if (real_map != map)
4575 vm_map_unlock(real_map);
4576
4577 if (m != VM_PAGE_NULL) {
4578 /*
4579 * retake the lock so that
4580 * we can drop the paging reference
4581 * in vm_fault_cleanup and do the
4582 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4583 */
4584 vm_object_lock(m->object);
4585
4586 RELEASE_PAGE(m);
4587
4588 vm_fault_cleanup(m->object, top_page);
4589 } else {
4590 /*
4591 * retake the lock so that
4592 * we can drop the paging reference
4593 * in vm_fault_cleanup
4594 */
4595 vm_object_lock(object);
4596
4597 vm_fault_cleanup(object, top_page);
4598 }
4599 vm_object_deallocate(object);
4600
4601 goto RetryFault;
4602 }
4603 /*
4604 * Check whether the protection has changed or the object
4605 * has been copied while we left the map unlocked.
4606 */
4607 prot &= retry_prot;
4608 }
4609 if (m != VM_PAGE_NULL) {
4610 vm_object_lock(m->object);
4611
4612 if (m->object->copy != old_copy_object) {
4613 /*
4614 * The copy object changed while the top-level object
4615 * was unlocked, so take away write permission.
4616 */
4617 prot &= ~VM_PROT_WRITE;
4618 }
4619 } else
4620 vm_object_lock(object);
4621
4622 /*
4623 * If we want to wire down this page, but no longer have
4624 * adequate permissions, we must start all over.
4625 */
4626 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4627
4628 vm_map_verify_done(map, &version);
4629 if (real_map != map)
4630 vm_map_unlock(real_map);
4631
4632 if (m != VM_PAGE_NULL) {
4633 RELEASE_PAGE(m);
4634
4635 vm_fault_cleanup(m->object, top_page);
4636 } else
4637 vm_fault_cleanup(object, top_page);
4638
4639 vm_object_deallocate(object);
4640
4641 goto RetryFault;
4642 }
4643 if (m != VM_PAGE_NULL) {
4644 /*
4645 * Put this page into the physical map.
4646 * We had to do the unlock above because pmap_enter
4647 * may cause other faults. The page may be on
4648 * the pageout queues. If the pageout daemon comes
4649 * across the page, it will remove it from the queues.
4650 */
4651 if (caller_pmap) {
4652 kr = vm_fault_enter(m,
4653 caller_pmap,
4654 caller_pmap_addr,
4655 prot,
4656 fault_type,
4657 wired,
4658 change_wiring,
4659 fault_info.no_cache,
4660 fault_info.cs_bypass,
4661 fault_info.user_tag,
4662 fault_info.pmap_options,
4663 NULL,
4664 &type_of_fault);
4665 } else {
4666 kr = vm_fault_enter(m,
4667 pmap,
4668 vaddr,
4669 prot,
4670 fault_type,
4671 wired,
4672 change_wiring,
4673 fault_info.no_cache,
4674 fault_info.cs_bypass,
4675 fault_info.user_tag,
4676 fault_info.pmap_options,
4677 NULL,
4678 &type_of_fault);
4679 }
4680 if (kr != KERN_SUCCESS) {
4681 /* abort this page fault */
4682 vm_map_verify_done(map, &version);
4683 if (real_map != map)
4684 vm_map_unlock(real_map);
4685 PAGE_WAKEUP_DONE(m);
4686 vm_fault_cleanup(m->object, top_page);
4687 vm_object_deallocate(object);
4688 goto done;
4689 }
4690 if (physpage_p != NULL) {
4691 /* for vm_map_wire_and_extract() */
4692 *physpage_p = m->phys_page;
4693 if (prot & VM_PROT_WRITE) {
4694 vm_object_lock_assert_exclusive(m->object);
4695 m->dirty = TRUE;
4696 }
4697 }
4698 } else {
4699
4700 vm_map_entry_t entry;
4701 vm_map_offset_t laddr;
4702 vm_map_offset_t ldelta, hdelta;
4703
4704 /*
4705 * do a pmap block mapping from the physical address
4706 * in the object
4707 */
4708
4709 #ifdef ppc
4710 /* While we do not worry about execution protection in */
4711 /* general, certain pages may have instruction execution */
4712 /* disallowed. We will check here, and if not allowed */
4713 /* to execute, we return with a protection failure. */
4714
4715 if ((fault_type & VM_PROT_EXECUTE) &&
4716 (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4717
4718 vm_map_verify_done(map, &version);
4719
4720 if (real_map != map)
4721 vm_map_unlock(real_map);
4722
4723 vm_fault_cleanup(object, top_page);
4724 vm_object_deallocate(object);
4725
4726 kr = KERN_PROTECTION_FAILURE;
4727 goto done;
4728 }
4729 #endif /* ppc */
4730
4731 if (real_map != map)
4732 vm_map_unlock(real_map);
4733
4734 if (original_map != map) {
4735 vm_map_unlock_read(map);
4736 vm_map_lock_read(original_map);
4737 map = original_map;
4738 }
4739 real_map = map;
4740
4741 laddr = vaddr;
4742 hdelta = 0xFFFFF000;
4743 ldelta = 0xFFFFF000;
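/*
 * ldelta/hdelta start out as very large page-aligned clamps; the loop
 * below narrows them to the distance from "laddr" to the enclosing map
 * entry's start and end, and they ultimately bound the size of the block
 * mapping established around the faulting address.
 */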
4744
4745 while (vm_map_lookup_entry(map, laddr, &entry)) {
4746 if (ldelta > (laddr - entry->vme_start))
4747 ldelta = laddr - entry->vme_start;
4748 if (hdelta > (entry->vme_end - laddr))
4749 hdelta = entry->vme_end - laddr;
4750 if (entry->is_sub_map) {
4751
4752 laddr = (laddr - entry->vme_start)
4753 + entry->offset;
4754 vm_map_lock_read(entry->object.sub_map);
4755
4756 if (map != real_map)
4757 vm_map_unlock_read(map);
4758 if (entry->use_pmap) {
4759 vm_map_unlock_read(real_map);
4760 real_map = entry->object.sub_map;
4761 }
4762 map = entry->object.sub_map;
4763
4764 } else {
4765 break;
4766 }
4767 }
4768
4769 if (vm_map_lookup_entry(map, laddr, &entry) &&
4770 (entry->object.vm_object != NULL) &&
4771 (entry->object.vm_object == object)) {
4772
4773 int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
4774
4775 if (superpage && physpage_p) {
4776 /* for vm_map_wire_and_extract() */
4777 *physpage_p = (ppnum_t) ((((vm_map_offset_t) entry->object.vm_object->vo_shadow_offset)
4778 + entry->offset
4779 + (laddr - entry->vme_start))
4780 >> PAGE_SHIFT);
4781 }
4782
4783 if (caller_pmap) {
4784 /*
4785 * Set up a block mapped area
4786 */
4787 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4788 pmap_map_block(caller_pmap,
4789 (addr64_t)(caller_pmap_addr - ldelta),
4790 (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4791 entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4792 (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4793 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4794 } else {
4795 /*
4796 * Set up a block mapped area
4797 */
4798 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4799 pmap_map_block(real_map->pmap,
4800 (addr64_t)(vaddr - ldelta),
4801 (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4802 entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4803 (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4804 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4805 }
4806 }
4807 }
4808
4809 /*
4810 * Unlock everything, and return
4811 */
4812 vm_map_verify_done(map, &version);
4813 if (real_map != map)
4814 vm_map_unlock(real_map);
4815
4816 if (m != VM_PAGE_NULL) {
4817 PAGE_WAKEUP_DONE(m);
4818
4819 vm_fault_cleanup(m->object, top_page);
4820 } else
4821 vm_fault_cleanup(object, top_page);
4822
4823 vm_object_deallocate(object);
4824
4825 #undef RELEASE_PAGE
4826
4827 kr = KERN_SUCCESS;
4828 done:
4829 thread_interrupt_level(interruptible_state);
4830
4831 /*
4832 * Only throttle on faults which cause a pagein.
4833 */
4834 if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
4835 throttle_lowpri_io(1);
4836 }
4837
4838 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4839 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4840 ((uint64_t)vaddr >> 32),
4841 vaddr,
4842 kr,
4843 type_of_fault,
4844 0);
4845
4846 return (kr);
4847 }
4848
4849 /*
4850 * vm_fault_wire:
4851 *
4852 * Wire down a range of virtual addresses in a map.
4853 */
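/*
 * Illustrative sketch (not part of the original source): one plausible way
 * a wiring path might drive this routine, assuming the caller has already
 * locked "map", marked "entry" as in_transition and picked the pmap/base
 * address pair (typically the map's pmap and the entry's start address):
 *
 *	kern_return_t rc;
 *
 *	rc = vm_fault_wire(map, entry, map->pmap, entry->vme_start, NULL);
 *	if (rc != KERN_SUCCESS) {
 *		... roll back the wiring bookkeeping for this entry ...
 *	}
 */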
4854 kern_return_t
4855 vm_fault_wire(
4856 vm_map_t map,
4857 vm_map_entry_t entry,
4858 pmap_t pmap,
4859 vm_map_offset_t pmap_addr,
4860 ppnum_t *physpage_p)
4861 {
4862
4863 register vm_map_offset_t va;
4864 register vm_map_offset_t end_addr = entry->vme_end;
4865 register kern_return_t rc;
4866
4867 assert(entry->in_transition);
4868
4869 if ((entry->object.vm_object != NULL) &&
4870 !entry->is_sub_map &&
4871 entry->object.vm_object->phys_contiguous) {
4872 return KERN_SUCCESS;
4873 }
4874
4875 /*
4876 * Inform the physical mapping system that the
4877 * range of addresses may not fault, so that
4878 * page tables and such can be locked down as well.
4879 */
4880
4881 pmap_pageable(pmap, pmap_addr,
4882 pmap_addr + (end_addr - entry->vme_start), FALSE);
4883
4884 /*
4885 * We simulate a fault to get the page and enter it
4886 * in the physical map.
4887 */
4888
4889 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4890 rc = vm_fault_wire_fast(map, va, entry, pmap,
4891 pmap_addr + (va - entry->vme_start),
4892 physpage_p);
4893 if (rc != KERN_SUCCESS) {
4894 rc = vm_fault_internal(map, va, VM_PROT_NONE, TRUE,
4895 ((pmap == kernel_pmap)
4896 ? THREAD_UNINT
4897 : THREAD_ABORTSAFE),
4898 pmap,
4899 (pmap_addr +
4900 (va - entry->vme_start)),
4901 physpage_p);
4902 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4903 }
4904
4905 if (rc != KERN_SUCCESS) {
4906 struct vm_map_entry tmp_entry = *entry;
4907
4908 /* unwire wired pages */
4909 tmp_entry.vme_end = va;
4910 vm_fault_unwire(map,
4911 &tmp_entry, FALSE, pmap, pmap_addr);
4912
4913 return rc;
4914 }
4915 }
4916 return KERN_SUCCESS;
4917 }
4918
4919 /*
4920 * vm_fault_unwire:
4921 *
4922 * Unwire a range of virtual addresses in a map.
4923 */
4924 void
4925 vm_fault_unwire(
4926 vm_map_t map,
4927 vm_map_entry_t entry,
4928 boolean_t deallocate,
4929 pmap_t pmap,
4930 vm_map_offset_t pmap_addr)
4931 {
4932 register vm_map_offset_t va;
4933 register vm_map_offset_t end_addr = entry->vme_end;
4934 vm_object_t object;
4935 struct vm_object_fault_info fault_info;
4936
4937 object = (entry->is_sub_map)
4938 ? VM_OBJECT_NULL : entry->object.vm_object;
4939
4940 /*
4941 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4942 * do anything since such memory is wired by default. So we don't have
4943 * anything to undo here.
4944 */
4945
4946 if (object != VM_OBJECT_NULL && object->phys_contiguous)
4947 return;
4948
4949 fault_info.interruptible = THREAD_UNINT;
4950 fault_info.behavior = entry->behavior;
4951 fault_info.user_tag = entry->alias;
4952 fault_info.pmap_options = 0;
4953 if (entry->iokit_acct ||
4954 (!entry->is_sub_map && !entry->use_pmap)) {
4955 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
4956 }
4957 fault_info.lo_offset = entry->offset;
4958 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4959 fault_info.no_cache = entry->no_cache;
4960 fault_info.stealth = TRUE;
4961 fault_info.io_sync = FALSE;
4962 fault_info.cs_bypass = FALSE;
4963 fault_info.mark_zf_absent = FALSE;
4964 fault_info.batch_pmap_op = FALSE;
4965
4966 /*
4967 * Since the pages are wired down, we must be able to
4968 * get their mappings from the physical map system.
4969 */
4970
4971 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4972
4973 if (object == VM_OBJECT_NULL) {
4974 if (pmap) {
4975 pmap_change_wiring(pmap,
4976 pmap_addr + (va - entry->vme_start), FALSE);
4977 }
4978 (void) vm_fault(map, va, VM_PROT_NONE,
4979 TRUE, THREAD_UNINT, pmap, pmap_addr);
4980 } else {
4981 vm_prot_t prot;
4982 vm_page_t result_page;
4983 vm_page_t top_page;
4984 vm_object_t result_object;
4985 vm_fault_return_t result;
4986
4987 if (end_addr - va > (vm_size_t) -1) {
4988 /* 32-bit overflow */
4989 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
4990 } else {
4991 fault_info.cluster_size = (vm_size_t) (end_addr - va);
4992 assert(fault_info.cluster_size == end_addr - va);
4993 }
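/*
 * Example (illustrative): if vm_size_t is 32 bits and the remaining range
 * is 4GB or more, the clamp above sets cluster_size to
 * (vm_size_t)(0 - PAGE_SIZE), the largest page-aligned value that still
 * fits (0xFFFFF000 with 4K pages), instead of letting the cast silently
 * truncate.
 */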
4994
4995 do {
4996 prot = VM_PROT_NONE;
4997
4998 vm_object_lock(object);
4999 vm_object_paging_begin(object);
5000 XPR(XPR_VM_FAULT,
5001 "vm_fault_unwire -> vm_fault_page\n",
5002 0,0,0,0,0);
5003 result_page = VM_PAGE_NULL;
5004 result = vm_fault_page(
5005 object,
5006 entry->offset + (va - entry->vme_start),
5007 VM_PROT_NONE, TRUE,
5008 FALSE, /* page not looked up */
5009 &prot, &result_page, &top_page,
5010 (int *)0,
5011 NULL, map->no_zero_fill,
5012 FALSE, &fault_info);
5013 } while (result == VM_FAULT_RETRY);
5014
5015 /*
5016 * If this was a mapping to a file on a device that has been forcibly
5017 * unmounted, then we won't get a page back from vm_fault_page(). Just
5018 * move on to the next one in case the remaining pages are mapped from
5019 * different objects. During a forced unmount, the object is terminated
5020 * so the alive flag will be false if this happens. A forced unmount
5021 * will occur when an external disk is unplugged before the user does an
5022 * eject, so we don't want to panic in that situation.
5023 */
5024
5025 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5026 continue;
5027
5028 if (result == VM_FAULT_MEMORY_ERROR &&
5029 object == kernel_object) {
5030 /*
5031 * This must have been allocated with
5032 * KMA_KOBJECT and KMA_VAONLY and there's
5033 * no physical page at this offset.
5034 * We're done (no page to free).
5035 */
5036 assert(deallocate);
5037 continue;
5038 }
5039
5040 if (result != VM_FAULT_SUCCESS)
5041 panic("vm_fault_unwire: failure");
5042
5043 result_object = result_page->object;
5044
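/*
 * "deallocate" means the caller wants the wired pages discarded outright
 * (disconnected from every pmap and freed) rather than merely unwired and
 * left resident.
 */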
5045 if (deallocate) {
5046 assert(result_page->phys_page !=
5047 vm_page_fictitious_addr);
5048 pmap_disconnect(result_page->phys_page);
5049 VM_PAGE_FREE(result_page);
5050 } else {
5051 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
5052 pmap_change_wiring(pmap,
5053 pmap_addr + (va - entry->vme_start), FALSE);
5054
5055
5056 if (VM_PAGE_WIRED(result_page)) {
5057 vm_page_lockspin_queues();
5058 vm_page_unwire(result_page, TRUE);
5059 vm_page_unlock_queues();
5060 }
5061 if(entry->zero_wired_pages) {
5062 pmap_zero_page(result_page->phys_page);
5063 entry->zero_wired_pages = FALSE;
5064 }
5065
5066 PAGE_WAKEUP_DONE(result_page);
5067 }
5068 vm_fault_cleanup(result_object, top_page);
5069 }
5070 }
5071
5072 /*
5073 * Inform the physical mapping system that the range
5074 * of addresses may fault, so that page tables and
5075 * such may be unwired themselves.
5076 */
5077
5078 pmap_pageable(pmap, pmap_addr,
5079 pmap_addr + (end_addr - entry->vme_start), TRUE);
5080
5081 }
5082
5083 /*
5084 * vm_fault_wire_fast:
5085 *
5086 * Handle common case of a wire down page fault at the given address.
5087 * If successful, the page is inserted into the associated physical map.
5088 * The map entry is passed in to avoid the overhead of a map lookup.
5089 *
5090 * NOTE: the given address should be truncated to the
5091 * proper page address.
5092 *
5093 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
5094 * a standard error specifying why the fault is fatal is returned.
5095 *
5096 * The map in question must be referenced, and remains so.
5097 * Caller has a read lock on the map.
5098 *
5099 * This is a stripped version of vm_fault() for wiring pages. Anything
5100 * other than the common case will return KERN_FAILURE, and the caller
5101 * is expected to call vm_fault().
5102 */
5103 kern_return_t
5104 vm_fault_wire_fast(
5105 __unused vm_map_t map,
5106 vm_map_offset_t va,
5107 vm_map_entry_t entry,
5108 pmap_t pmap,
5109 vm_map_offset_t pmap_addr,
5110 ppnum_t *physpage_p)
5111 {
5112 vm_object_t object;
5113 vm_object_offset_t offset;
5114 register vm_page_t m;
5115 vm_prot_t prot;
5116 thread_t thread = current_thread();
5117 int type_of_fault;
5118 kern_return_t kr;
5119
5120 VM_STAT_INCR(faults);
5121
5122 if (thread != THREAD_NULL && thread->task != TASK_NULL)
5123 thread->task->faults++;
5124
5125 /*
5126 * Recovery actions
5127 */
5128
5129 #undef RELEASE_PAGE
5130 #define RELEASE_PAGE(m) { \
5131 PAGE_WAKEUP_DONE(m); \
5132 vm_page_lockspin_queues(); \
5133 vm_page_unwire(m, TRUE); \
5134 vm_page_unlock_queues(); \
5135 }
5136
5137
5138 #undef UNLOCK_THINGS
5139 #define UNLOCK_THINGS { \
5140 vm_object_paging_end(object); \
5141 vm_object_unlock(object); \
5142 }
5143
5144 #undef UNLOCK_AND_DEALLOCATE
5145 #define UNLOCK_AND_DEALLOCATE { \
5146 UNLOCK_THINGS; \
5147 vm_object_deallocate(object); \
5148 }
5149 /*
5150 * Give up and have caller do things the hard way.
5151 */
5152
5153 #define GIVE_UP { \
5154 UNLOCK_AND_DEALLOCATE; \
5155 return(KERN_FAILURE); \
5156 }
5157
5158
5159 /*
5160 * If this entry is not directly to a vm_object, bail out.
5161 */
5162 if (entry->is_sub_map) {
5163 assert(physpage_p == NULL);
5164 return(KERN_FAILURE);
5165 }
5166
5167 /*
5168 * Find the backing store object and offset into it.
5169 */
5170
5171 object = entry->object.vm_object;
5172 offset = (va - entry->vme_start) + entry->offset;
5173 prot = entry->protection;
5174
5175 /*
5176 * Make a reference to this object to prevent its
5177 * disposal while we are messing with it.
5178 */
5179
5180 vm_object_lock(object);
5181 vm_object_reference_locked(object);
5182 vm_object_paging_begin(object);
5183
5184 /*
5185 * INVARIANTS (through entire routine):
5186 *
5187 * 1) At all times, we must either have the object
5188 * lock or a busy page in some object to prevent
5189 * some other thread from trying to bring in
5190 * the same page.
5191 *
5192 * 2) Once we have a busy page, we must remove it from
5193 * the pageout queues, so that the pageout daemon
5194 * will not grab it away.
5195 *
5196 */
5197
5198 /*
5199 * Look for page in top-level object. If it's not there or
5200 * there's something going on, give up.
5201 * ENCRYPTED SWAP: use the slow fault path, since we'll need to
5202 * decrypt the page before wiring it down.
5203 */
5204 m = vm_page_lookup(object, offset);
5205 if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
5206 (m->unusual && ( m->error || m->restart || m->absent))) {
5207
5208 GIVE_UP;
5209 }
5210 ASSERT_PAGE_DECRYPTED(m);
5211
5212 if (m->fictitious &&
5213 m->phys_page == vm_page_guard_addr) {
5214 /*
5215 * Guard pages are fictitious pages and are never
5216 * entered into a pmap, so let's say it's been wired...
5217 */
5218 kr = KERN_SUCCESS;
5219 goto done;
5220 }
5221
5222 /*
5223 * Wire the page down now. All bail outs beyond this
5224 * point must unwire the page.
5225 */
5226
5227 vm_page_lockspin_queues();
5228 vm_page_wire(m);
5229 vm_page_unlock_queues();
5230
5231 /*
5232 * Mark page busy for other threads.
5233 */
5234 assert(!m->busy);
5235 m->busy = TRUE;
5236 assert(!m->absent);
5237
5238 /*
5239 * Give up if the page is being written and there's a copy object
5240 */
5241 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5242 RELEASE_PAGE(m);
5243 GIVE_UP;
5244 }
5245
5246 /*
5247 * Put this page into the physical map.
5248 */
5249 type_of_fault = DBG_CACHE_HIT_FAULT;
5250 kr = vm_fault_enter(m,
5251 pmap,
5252 pmap_addr,
5253 prot,
5254 prot,
5255 TRUE,
5256 FALSE,
5257 FALSE,
5258 FALSE,
5259 entry->alias,
5260 ((entry->iokit_acct ||
5261 (!entry->is_sub_map && !entry->use_pmap))
5262 ? PMAP_OPTIONS_ALT_ACCT
5263 : 0),
5264 NULL,
5265 &type_of_fault);
5266
5267 done:
5268 /*
5269 * Unlock everything, and return
5270 */
5271
5272 if (physpage_p) {
5273 /* for vm_map_wire_and_extract() */
5274 if (kr == KERN_SUCCESS) {
5275 *physpage_p = m->phys_page;
5276 if (prot & VM_PROT_WRITE) {
5277 vm_object_lock_assert_exclusive(m->object);
5278 m->dirty = TRUE;
5279 }
5280 } else {
5281 *physpage_p = 0;
5282 }
5283 }
5284
5285 PAGE_WAKEUP_DONE(m);
5286 UNLOCK_AND_DEALLOCATE;
5287
5288 return kr;
5289
5290 }
5291
5292 /*
5293 * Routine: vm_fault_copy_cleanup
5294 * Purpose:
5295 * Release a page used by vm_fault_copy.
5296 */
5297
5298 void
5299 vm_fault_copy_cleanup(
5300 vm_page_t page,
5301 vm_page_t top_page)
5302 {
5303 vm_object_t object = page->object;
5304
5305 vm_object_lock(object);
5306 PAGE_WAKEUP_DONE(page);
5307 if (!page->active && !page->inactive && !page->throttled) {
5308 vm_page_lockspin_queues();
5309 if (!page->active && !page->inactive && !page->throttled)
5310 vm_page_activate(page);
5311 vm_page_unlock_queues();
5312 }
5313 vm_fault_cleanup(object, top_page);
5314 }
5315
5316 void
5317 vm_fault_copy_dst_cleanup(
5318 vm_page_t page)
5319 {
5320 vm_object_t object;
5321
5322 if (page != VM_PAGE_NULL) {
5323 object = page->object;
5324 vm_object_lock(object);
5325 vm_page_lockspin_queues();
5326 vm_page_unwire(page, TRUE);
5327 vm_page_unlock_queues();
5328 vm_object_paging_end(object);
5329 vm_object_unlock(object);
5330 }
5331 }
5332
5333 /*
5334 * Routine: vm_fault_copy
5335 *
5336 * Purpose:
5337 * Copy pages from one virtual memory object to another --
5338 * neither the source nor destination pages need be resident.
5339 *
5340 * Before actually copying a page, the version associated with
5341 * the destination address map will be verified.
5342 *
5343 * In/out conditions:
5344 * The caller must hold a reference, but not a lock, to
5345 * each of the source and destination objects and to the
5346 * destination map.
5347 *
5348 * Results:
5349 * Returns KERN_SUCCESS if no errors were encountered in
5350 * reading or writing the data. Returns KERN_INTERRUPTED if
5351 * the operation was interrupted (only possible if the
5352 * "interruptible" argument is asserted). Other return values
5353 * indicate a permanent error in copying the data.
5354 *
5355 * The actual amount of data copied will be returned in the
5356 * "copy_size" argument. In the event that the destination map
5357 * verification failed, this amount may be less than the amount
5358 * requested.
5359 */
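/*
 * Illustrative sketch (not part of the original source): a minimal call
 * into this routine. The names used below are placeholders, and the
 * destination map version is assumed to have been captured by an earlier
 * lookup of dst_map so that the internal vm_map_verify() can detect
 * concurrent map changes:
 *
 *	vm_map_size_t	copy_size = length;	(IN: requested, OUT: copied)
 *	kern_return_t	kr;
 *
 *	kr = vm_fault_copy(src_object, src_offset,
 *			   &copy_size,
 *			   dst_object, dst_offset,
 *			   dst_map, &dst_version,
 *			   THREAD_UNINT);
 *	if (kr != KERN_SUCCESS || copy_size < length)
 *		... the copy failed, was interrupted, or was cut short ...
 */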
5360 kern_return_t
5361 vm_fault_copy(
5362 vm_object_t src_object,
5363 vm_object_offset_t src_offset,
5364 vm_map_size_t *copy_size, /* INOUT */
5365 vm_object_t dst_object,
5366 vm_object_offset_t dst_offset,
5367 vm_map_t dst_map,
5368 vm_map_version_t *dst_version,
5369 int interruptible)
5370 {
5371 vm_page_t result_page;
5372
5373 vm_page_t src_page;
5374 vm_page_t src_top_page;
5375 vm_prot_t src_prot;
5376
5377 vm_page_t dst_page;
5378 vm_page_t dst_top_page;
5379 vm_prot_t dst_prot;
5380
5381 vm_map_size_t amount_left;
5382 vm_object_t old_copy_object;
5383 kern_return_t error = 0;
5384 vm_fault_return_t result;
5385
5386 vm_map_size_t part_size;
5387 struct vm_object_fault_info fault_info_src;
5388 struct vm_object_fault_info fault_info_dst;
5389
5390 /*
5391 * In order not to confuse the clustered pageins, align
5392 * the different offsets on a page boundary.
5393 */
5394
5395 #define RETURN(x) \
5396 MACRO_BEGIN \
5397 *copy_size -= amount_left; \
5398 MACRO_RETURN(x); \
5399 MACRO_END
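/*
 * RETURN(x) reports back, via *copy_size, how much was actually copied
 * (the original request minus whatever is still left in amount_left)
 * before returning "x".
 */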
5400
5401 amount_left = *copy_size;
5402
5403 fault_info_src.interruptible = interruptible;
5404 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5405 fault_info_src.user_tag = 0;
5406 fault_info_src.pmap_options = 0;
5407 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5408 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5409 fault_info_src.no_cache = FALSE;
5410 fault_info_src.stealth = TRUE;
5411 fault_info_src.io_sync = FALSE;
5412 fault_info_src.cs_bypass = FALSE;
5413 fault_info_src.mark_zf_absent = FALSE;
5414 fault_info_src.batch_pmap_op = FALSE;
5415
5416 fault_info_dst.interruptible = interruptible;
5417 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5418 fault_info_dst.user_tag = 0;
5419 fault_info_dst.pmap_options = 0;
5420 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5421 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5422 fault_info_dst.no_cache = FALSE;
5423 fault_info_dst.stealth = TRUE;
5424 fault_info_dst.io_sync = FALSE;
5425 fault_info_dst.cs_bypass = FALSE;
5426 fault_info_dst.mark_zf_absent = FALSE;
5427 fault_info_dst.batch_pmap_op = FALSE;
5428
5429 do { /* while (amount_left > 0) */
5430 /*
5431 * There may be a deadlock if both source and destination
5432 * pages are the same. To avoid this deadlock, the copy must
5433 * start by getting the destination page in order to apply
5434 * COW semantics if any.
5435 */
5436
5437 RetryDestinationFault: ;
5438
5439 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5440
5441 vm_object_lock(dst_object);
5442 vm_object_paging_begin(dst_object);
5443
5444 if (amount_left > (vm_size_t) -1) {
5445 /* 32-bit overflow */
5446 fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5447 } else {
5448 fault_info_dst.cluster_size = (vm_size_t) amount_left;
5449 assert(fault_info_dst.cluster_size == amount_left);
5450 }
5451
5452 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5453 dst_page = VM_PAGE_NULL;
5454 result = vm_fault_page(dst_object,
5455 vm_object_trunc_page(dst_offset),
5456 VM_PROT_WRITE|VM_PROT_READ,
5457 FALSE,
5458 FALSE, /* page not looked up */
5459 &dst_prot, &dst_page, &dst_top_page,
5460 (int *)0,
5461 &error,
5462 dst_map->no_zero_fill,
5463 FALSE, &fault_info_dst);
5464 switch (result) {
5465 case VM_FAULT_SUCCESS:
5466 break;
5467 case VM_FAULT_RETRY:
5468 goto RetryDestinationFault;
5469 case VM_FAULT_MEMORY_SHORTAGE:
5470 if (vm_page_wait(interruptible))
5471 goto RetryDestinationFault;
5472 /* fall thru */
5473 case VM_FAULT_INTERRUPTED:
5474 RETURN(MACH_SEND_INTERRUPTED);
5475 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5476 /* success but no VM page: fail the copy */
5477 vm_object_paging_end(dst_object);
5478 vm_object_unlock(dst_object);
5479 /*FALLTHROUGH*/
5480 case VM_FAULT_MEMORY_ERROR:
5481 if (error)
5482 return (error);
5483 else
5484 return(KERN_MEMORY_ERROR);
5485 default:
5486 panic("vm_fault_copy: unexpected error 0x%x from "
5487 "vm_fault_page()\n", result);
5488 }
5489 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5490
5491 old_copy_object = dst_page->object->copy;
5492
5493 /*
5494 * There exists the possibility that the source and
5495 * destination page are the same. But we can't
5496 * easily determine that now. If they are the
5497 * same, the call to vm_fault_page() for the
5498 * destination page will deadlock. To prevent this we
5499 * wire the page so we can drop busy without having
5500 * the page daemon steal the page. We clean up the
5501 * top page but keep the paging reference on the object
5502 * holding the dest page so it doesn't go away.
5503 */
5504
5505 vm_page_lockspin_queues();
5506 vm_page_wire(dst_page);
5507 vm_page_unlock_queues();
5508 PAGE_WAKEUP_DONE(dst_page);
5509 vm_object_unlock(dst_page->object);
5510
5511 if (dst_top_page != VM_PAGE_NULL) {
5512 vm_object_lock(dst_object);
5513 VM_PAGE_FREE(dst_top_page);
5514 vm_object_paging_end(dst_object);
5515 vm_object_unlock(dst_object);
5516 }
5517
5518 RetrySourceFault: ;
5519
5520 if (src_object == VM_OBJECT_NULL) {
5521 /*
5522 * No source object. We will just
5523 * zero-fill the page in dst_object.
5524 */
5525 src_page = VM_PAGE_NULL;
5526 result_page = VM_PAGE_NULL;
5527 } else {
5528 vm_object_lock(src_object);
5529 src_page = vm_page_lookup(src_object,
5530 vm_object_trunc_page(src_offset));
5531 if (src_page == dst_page) {
5532 src_prot = dst_prot;
5533 result_page = VM_PAGE_NULL;
5534 } else {
5535 src_prot = VM_PROT_READ;
5536 vm_object_paging_begin(src_object);
5537
5538 if (amount_left > (vm_size_t) -1) {
5539 /* 32-bit overflow */
5540 fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5541 } else {
5542 fault_info_src.cluster_size = (vm_size_t) amount_left;
5543 assert(fault_info_src.cluster_size == amount_left);
5544 }
5545
5546 XPR(XPR_VM_FAULT,
5547 "vm_fault_copy(2) -> vm_fault_page\n",
5548 0,0,0,0,0);
5549 result_page = VM_PAGE_NULL;
5550 result = vm_fault_page(
5551 src_object,
5552 vm_object_trunc_page(src_offset),
5553 VM_PROT_READ, FALSE,
5554 FALSE, /* page not looked up */
5555 &src_prot,
5556 &result_page, &src_top_page,
5557 (int *)0, &error, FALSE,
5558 FALSE, &fault_info_src);
5559
5560 switch (result) {
5561 case VM_FAULT_SUCCESS:
5562 break;
5563 case VM_FAULT_RETRY:
5564 goto RetrySourceFault;
5565 case VM_FAULT_MEMORY_SHORTAGE:
5566 if (vm_page_wait(interruptible))
5567 goto RetrySourceFault;
5568 /* fall thru */
5569 case VM_FAULT_INTERRUPTED:
5570 vm_fault_copy_dst_cleanup(dst_page);
5571 RETURN(MACH_SEND_INTERRUPTED);
5572 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5573 /* success but no VM page: fail */
5574 vm_object_paging_end(src_object);
5575 vm_object_unlock(src_object);
5576 /*FALLTHROUGH*/
5577 case VM_FAULT_MEMORY_ERROR:
5578 vm_fault_copy_dst_cleanup(dst_page);
5579 if (error)
5580 return (error);
5581 else
5582 return(KERN_MEMORY_ERROR);
5583 default:
5584 panic("vm_fault_copy(2): unexpected "
5585 "error 0x%x from "
5586 "vm_fault_page()\n", result);
5587 }
5588
5589
5590 assert((src_top_page == VM_PAGE_NULL) ==
5591 (result_page->object == src_object));
5592 }
5593 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5594 vm_object_unlock(result_page->object);
5595 }
5596
5597 if (!vm_map_verify(dst_map, dst_version)) {
5598 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5599 vm_fault_copy_cleanup(result_page, src_top_page);
5600 vm_fault_copy_dst_cleanup(dst_page);
5601 break;
5602 }
5603
5604 vm_object_lock(dst_page->object);
5605
5606 if (dst_page->object->copy != old_copy_object) {
5607 vm_object_unlock(dst_page->object);
5608 vm_map_verify_done(dst_map, dst_version);
5609 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5610 vm_fault_copy_cleanup(result_page, src_top_page);
5611 vm_fault_copy_dst_cleanup(dst_page);
5612 break;
5613 }
5614 vm_object_unlock(dst_page->object);
5615
5616 /*
5617 * Copy the page, and note that it is dirty
5618 * immediately.
5619 */
5620
5621 if (!page_aligned(src_offset) ||
5622 !page_aligned(dst_offset) ||
5623 !page_aligned(amount_left)) {
5624
5625 vm_object_offset_t src_po,
5626 dst_po;
5627
5628 src_po = src_offset - vm_object_trunc_page(src_offset);
5629 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5630
5631 if (dst_po > src_po) {
5632 part_size = PAGE_SIZE - dst_po;
5633 } else {
5634 part_size = PAGE_SIZE - src_po;
5635 }
5636 if (part_size > (amount_left)){
5637 part_size = amount_left;
5638 }
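/*
 * Example (illustrative): with 4K pages, src_po == 0x200 and
 * dst_po == 0x400, the destination page only has
 * PAGE_SIZE - dst_po == 0xC00 bytes left, so part_size is 0xC00,
 * further clipped to amount_left if less than that remains.
 */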
5639
5640 if (result_page == VM_PAGE_NULL) {
5641 assert((vm_offset_t) dst_po == dst_po);
5642 assert((vm_size_t) part_size == part_size);
5643 vm_page_part_zero_fill(dst_page,
5644 (vm_offset_t) dst_po,
5645 (vm_size_t) part_size);
5646 } else {
5647 assert((vm_offset_t) src_po == src_po);
5648 assert((vm_offset_t) dst_po == dst_po);
5649 assert((vm_size_t) part_size == part_size);
5650 vm_page_part_copy(result_page,
5651 (vm_offset_t) src_po,
5652 dst_page,
5653 (vm_offset_t) dst_po,
5654 (vm_size_t)part_size);
5655 if(!dst_page->dirty){
5656 vm_object_lock(dst_object);
5657 SET_PAGE_DIRTY(dst_page, TRUE);
5658 vm_object_unlock(dst_page->object);
5659 }
5660
5661 }
5662 } else {
5663 part_size = PAGE_SIZE;
5664
5665 if (result_page == VM_PAGE_NULL)
5666 vm_page_zero_fill(dst_page);
5667 else{
5668 vm_object_lock(result_page->object);
5669 vm_page_copy(result_page, dst_page);
5670 vm_object_unlock(result_page->object);
5671
5672 if(!dst_page->dirty){
5673 vm_object_lock(dst_object);
5674 SET_PAGE_DIRTY(dst_page, TRUE);
5675 vm_object_unlock(dst_page->object);
5676 }
5677 }
5678
5679 }
5680
5681 /*
5682 * Unlock everything, and return
5683 */
5684
5685 vm_map_verify_done(dst_map, dst_version);
5686
5687 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5688 vm_fault_copy_cleanup(result_page, src_top_page);
5689 vm_fault_copy_dst_cleanup(dst_page);
5690
5691 amount_left -= part_size;
5692 src_offset += part_size;
5693 dst_offset += part_size;
5694 } while (amount_left > 0);
5695
5696 RETURN(KERN_SUCCESS);
5697 #undef RETURN
5698
5699 /*NOTREACHED*/
5700 }
5701
5702 #if VM_FAULT_CLASSIFY
5703 /*
5704 * Temporary statistics gathering support.
5705 */
5706
5707 /*
5708 * Statistics arrays:
5709 */
5710 #define VM_FAULT_TYPES_MAX 5
5711 #define VM_FAULT_LEVEL_MAX 8
5712
5713 int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5714
5715 #define VM_FAULT_TYPE_ZERO_FILL 0
5716 #define VM_FAULT_TYPE_MAP_IN 1
5717 #define VM_FAULT_TYPE_PAGER 2
5718 #define VM_FAULT_TYPE_COPY 3
5719 #define VM_FAULT_TYPE_OTHER 4
5720
5721
5722 void
5723 vm_fault_classify(vm_object_t object,
5724 vm_object_offset_t offset,
5725 vm_prot_t fault_type)
5726 {
5727 int type, level = 0;
5728 vm_page_t m;
5729
5730 while (TRUE) {
5731 m = vm_page_lookup(object, offset);
5732 if (m != VM_PAGE_NULL) {
5733 if (m->busy || m->error || m->restart || m->absent) {
5734 type = VM_FAULT_TYPE_OTHER;
5735 break;
5736 }
5737 if (((fault_type & VM_PROT_WRITE) == 0) ||
5738 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
5739 type = VM_FAULT_TYPE_MAP_IN;
5740 break;
5741 }
5742 type = VM_FAULT_TYPE_COPY;
5743 break;
5744 }
5745 else {
5746 if (object->pager_created) {
5747 type = VM_FAULT_TYPE_PAGER;
5748 break;
5749 }
5750 if (object->shadow == VM_OBJECT_NULL) {
5751 type = VM_FAULT_TYPE_ZERO_FILL;
5752 break;
5753 }
5754
5755 offset += object->vo_shadow_offset;
5756 object = object->shadow;
5757 level++;
5758 continue;
5759 }
5760 }
5761
5762 if (level > VM_FAULT_LEVEL_MAX)
5763 level = VM_FAULT_LEVEL_MAX;
5764
5765 vm_fault_stats[type][level] += 1;
5766
5767 return;
5768 }
5769
5770 /* cleanup routine to call from debugger */
5771
5772 void
5773 vm_fault_classify_init(void)
5774 {
5775 int type, level;
5776
5777 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
5778 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
5779 vm_fault_stats[type][level] = 0;
5780 }
5781 }
5782
5783 return;
5784 }
5785 #endif /* VM_FAULT_CLASSIFY */
5786
5787
5788 void
5789 vm_page_validate_cs_mapped(
5790 vm_page_t page,
5791 const void *kaddr)
5792 {
5793 vm_object_t object;
5794 vm_object_offset_t offset;
5795 kern_return_t kr;
5796 memory_object_t pager;
5797 void *blobs;
5798 boolean_t validated, tainted;
5799
5800 assert(page->busy);
5801 vm_object_lock_assert_exclusive(page->object);
5802
5803 if (!cs_validation) {
5804 return;
5805 }
5806
5807 if (page->wpmapped && !page->cs_tainted) {
5808 /*
5809 * This page was mapped for "write" access sometime in the
5810 * past and could still be modifiable in the future.
5811 * Consider it tainted.
5812 * [ If the page was already found to be "tainted", no
5813 * need to re-validate. ]
5814 */
5815 page->cs_validated = TRUE;
5816 page->cs_tainted = TRUE;
5817 if (cs_debug) {
5818 printf("CODESIGNING: vm_page_validate_cs: "
5819 "page %p obj %p off 0x%llx "
5820 "was modified\n",
5821 page, page->object, page->offset);
5822 }
5823 vm_cs_validated_dirtied++;
5824 }
5825
5826 if (page->cs_validated) {
5827 return;
5828 }
5829
5830 vm_cs_validates++;
5831
5832 object = page->object;
5833 assert(object->code_signed);
5834 offset = page->offset;
5835
5836 if (!object->alive || object->terminating || object->pager == NULL) {
5837 /*
5838 * The object is terminating and we don't have its pager
5839 * so we can't validate the data...
5840 */
5841 return;
5842 }
5843 /*
5844 * Since we get here to validate a page that was brought in by
5845 * the pager, we know that this pager is all set up and ready
5846 * by now.
5847 */
5848 assert(!object->internal);
5849 assert(object->pager != NULL);
5850 assert(object->pager_ready);
5851
5852 pager = object->pager;
5853 assert(object->paging_in_progress);
5854 kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5855 if (kr != KERN_SUCCESS) {
5856 blobs = NULL;
5857 }
5858
5859 /* verify the SHA1 hash for this page */
5860 validated = cs_validate_page(blobs,
5861 pager,
5862 offset + object->paging_offset,
5863 (const void *)kaddr,
5864 &tainted);
5865
5866 page->cs_validated = validated;
5867 if (validated) {
5868 page->cs_tainted = tainted;
5869 }
5870 }
5871
5872 void
5873 vm_page_validate_cs(
5874 vm_page_t page)
5875 {
5876 vm_object_t object;
5877 vm_object_offset_t offset;
5878 vm_map_offset_t koffset;
5879 vm_map_size_t ksize;
5880 vm_offset_t kaddr;
5881 kern_return_t kr;
5882 boolean_t busy_page;
5883 boolean_t need_unmap;
5884
5885 vm_object_lock_assert_held(page->object);
5886
5887 if (!cs_validation) {
5888 return;
5889 }
5890
5891 if (page->wpmapped && !page->cs_tainted) {
5892 vm_object_lock_assert_exclusive(page->object);
5893
5894 /*
5895 * This page was mapped for "write" access sometime in the
5896 * past and could still be modifiable in the future.
5897 * Consider it tainted.
5898 * [ If the page was already found to be "tainted", no
5899 * need to re-validate. ]
5900 */
5901 page->cs_validated = TRUE;
5902 page->cs_tainted = TRUE;
5903 if (cs_debug) {
5904 printf("CODESIGNING: vm_page_validate_cs: "
5905 "page %p obj %p off 0x%llx "
5906 "was modified\n",
5907 page, page->object, page->offset);
5908 }
5909 vm_cs_validated_dirtied++;
5910 }
5911
5912 if (page->cs_validated) {
5913 return;
5914 }
5915
5916 if (page->slid) {
5917 panic("vm_page_validate_cs(%p): page is slid\n", page);
5918 }
5919 assert(!page->slid);
5920
5921 #if CHECK_CS_VALIDATION_BITMAP
5922 if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5923 page->cs_validated = TRUE;
5924 page->cs_tainted = FALSE;
5925 vm_cs_bitmap_validated++;
5926 return;
5927 }
5928 #endif
5929 vm_object_lock_assert_exclusive(page->object);
5930
5931 object = page->object;
5932 assert(object->code_signed);
5933 offset = page->offset;
5934
5935 busy_page = page->busy;
5936 if (!busy_page) {
5937 /* keep page busy while we map (and unlock) the VM object */
5938 page->busy = TRUE;
5939 }
5940
5941 /*
5942 * Take a paging reference on the VM object
5943 * to protect it from collapse or bypass,
5944 * and keep it from disappearing too.
5945 */
5946 vm_object_paging_begin(object);
5947
5948 /* map the page in the kernel address space */
5949 ksize = PAGE_SIZE_64;
5950 koffset = 0;
5951 need_unmap = FALSE;
5952 kr = vm_paging_map_object(page,
5953 object,
5954 offset,
5955 VM_PROT_READ,
5956 FALSE, /* can't unlock object ! */
5957 &ksize,
5958 &koffset,
5959 &need_unmap);
5960 if (kr != KERN_SUCCESS) {
5961 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5962 }
5963 kaddr = CAST_DOWN(vm_offset_t, koffset);
5964
5965 /* validate the mapped page */
5966 vm_page_validate_cs_mapped(page, (const void *) kaddr);
5967
5968 #if CHECK_CS_VALIDATION_BITMAP
5969 if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5970 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5971 }
5972 #endif
5973 assert(page->busy);
5974 assert(object == page->object);
5975 vm_object_lock_assert_exclusive(object);
5976
5977 if (!busy_page) {
5978 PAGE_WAKEUP_DONE(page);
5979 }
5980 if (need_unmap) {
5981 /* unmap the map from the kernel address space */
5982 vm_paging_unmap_object(object, koffset, koffset + ksize);
5983 koffset = 0;
5984 ksize = 0;
5985 kaddr = 0;
5986 }
5987 vm_object_paging_end(object);
5988 }