osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75                                 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90
  91 #include <vm/vm_compressor.h>
  92 #include <vm/vm_compressor_pager.h>
  93 #include <vm/vm_fault.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_object.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/pmap.h>
  99 #include <vm/vm_pageout.h>
 100 #include <vm/vm_protos.h>
 101 #include <vm/vm_external.h>
 102 #include <vm/memory_object.h>
 103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 104 #include <vm/vm_shared_region.h>
 105
 106 #include <sys/codesign.h>
 107
 108 #include <libsa/sys/timers.h>   /* for struct timespec */
 109
 110 #define VM_FAULT_CLASSIFY       0
 111
 112 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 113
 114 unsigned int    vm_object_pagein_throttle = 16;
 115
 116 /*
 117  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 118  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 119  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 120  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 121  * keep the UI active so that the user has a chance to kill the offending task before the system
 122  * completely hangs.
 123  *
 124  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 125  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 126  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 127  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 128  */
 129
 130 extern void throttle_lowpri_io(int);
 131
 132 uint64_t vm_hard_throttle_threshold;
 133
 134
 135
 136 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
 137                                                  (vm_page_free_count < vm_page_throttle_limit && \
 138                                                   proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
 139
 140
 141 #define HARD_THROTTLE_DELAY     5000    /* 5000 us == 5 ms */
 142 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 143
 144 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 145 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 146
 147
 148 boolean_t current_thread_aborted(void);
 149
 150 /* Forward declarations of internal routines. */
 151 extern kern_return_t vm_fault_wire_fast(
 152                                 vm_map_t        map,
 153                                 vm_map_offset_t va,
 154                                 vm_map_entry_t  entry,
 155                                 pmap_t          pmap,
 156                                 vm_map_offset_t pmap_addr,
 157                                 ppnum_t         *physpage_p);
 158
 159 extern void vm_fault_continue(void);
 160
 161 extern void vm_fault_copy_cleanup(
 162                                 vm_page_t       page,
 163                                 vm_page_t       top_page);
 164
 165 extern void vm_fault_copy_dst_cleanup(
 166                                 vm_page_t       page);
 167
 168 #if     VM_FAULT_CLASSIFY
 169 extern void vm_fault_classify(vm_object_t       object,
 170                           vm_object_offset_t    offset,
 171                           vm_prot_t             fault_type);
 172
 173 extern void vm_fault_classify_init(void);
 174 #endif
 175
 176 unsigned long vm_pmap_enter_blocked = 0;
 177 unsigned long vm_pmap_enter_retried = 0;
 178
 179 unsigned long vm_cs_validates = 0;
 180 unsigned long vm_cs_revalidates = 0;
 181 unsigned long vm_cs_query_modified = 0;
 182 unsigned long vm_cs_validated_dirtied = 0;
 183 unsigned long vm_cs_bitmap_validated = 0;
 184
 185 void vm_pre_fault(vm_map_offset_t);
 186
 187 /*
 188  *      Routine:        vm_fault_init
 189  *      Purpose:
 190  *              Initialize our private data structures.
 191  */
 192 void
 193 vm_fault_init(void)
 194 {
 195         int i, vm_compressor_temp;
 196         boolean_t need_default_val = TRUE;
 197         /*
 198          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 199          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 200          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 201          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 202          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 203          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 204          */
 205
 206         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 207
 208         /*
 209          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 210          */
 211
 212         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
 213                 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
 214                         if (vm_compressor_temp > 0 &&
 215                             ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
 216                                 need_default_val = FALSE;
 217                                 vm_compressor_mode = vm_compressor_temp;
 218                                 break;
 219                         }
 220                 }
 221                 if (need_default_val)
 222                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 223         }
 224         if (need_default_val) {
 225                 /* If no boot arg or incorrect boot arg, try device tree. */
 226                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 227         }
 228         PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
 229         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 230 }
 231
 232 /*
 233  *      Routine:        vm_fault_cleanup
 234  *      Purpose:
 235  *              Clean up the result of vm_fault_page.
 236  *      Results:
 237  *              The paging reference for "object" is released.
 238  *              "object" is unlocked.
 239  *              If "top_page" is not null,  "top_page" is
 240  *              freed and the paging reference for the object
 241  *              containing it is released.
 242  *
 243  *      In/out conditions:
 244  *              "object" must be locked.
 245  */
 246 void
 247 vm_fault_cleanup(
 248         register vm_object_t    object,
 249         register vm_page_t      top_page)
 250 {
 251         vm_object_paging_end(object);
 252         vm_object_unlock(object);
 253
 254         if (top_page != VM_PAGE_NULL) {
 255                 object = top_page->object;
 256
 257                 vm_object_lock(object);
 258                 VM_PAGE_FREE(top_page);
 259                 vm_object_paging_end(object);
 260                 vm_object_unlock(object);
 261         }
 262 }
 263
 264 #if     MACH_CLUSTER_STATS
 265 #define MAXCLUSTERPAGES 16
 266 struct {
 267         unsigned long pages_in_cluster;
 268         unsigned long pages_at_higher_offsets;
 269         unsigned long pages_at_lower_offsets;
 270 } cluster_stats_in[MAXCLUSTERPAGES];
 271 #define CLUSTER_STAT(clause)    clause
 272 #define CLUSTER_STAT_HIGHER(x)  \
 273         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 274 #define CLUSTER_STAT_LOWER(x)   \
 275          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 276 #define CLUSTER_STAT_CLUSTER(x) \
 277         ((cluster_stats_in[(x)].pages_in_cluster)++)
 278 #else   /* MACH_CLUSTER_STATS */
 279 #define CLUSTER_STAT(clause)
 280 #endif  /* MACH_CLUSTER_STATS */
 281
 282 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 283
 284
 285 boolean_t       vm_page_deactivate_behind = TRUE;
 286 /*
 287  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 288  */
 289 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 290 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 291                                                                 /* we use it to size an array on the stack */
 292
 293 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 294
 295 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 296
 297 /*
 298  * vm_page_is_sequential
 299  *
 300  * Determine if sequential access is in progress
 301  * in accordance with the behavior specified.
 302  * Update state to indicate current access pattern.
 303  *
 304  * object must have at least the shared lock held
 305  */
 306 static
 307 void
 308 vm_fault_is_sequential(
 309         vm_object_t             object,
 310         vm_object_offset_t      offset,
 311         vm_behavior_t           behavior)
 312 {
 313         vm_object_offset_t      last_alloc;
 314         int                     sequential;
 315         int                     orig_sequential;
 316
 317         last_alloc = object->last_alloc;
 318         sequential = object->sequential;
 319         orig_sequential = sequential;
 320
 321         switch (behavior) {
 322         case VM_BEHAVIOR_RANDOM:
 323                 /*
 324                  * reset indicator of sequential behavior
 325                  */
 326                 sequential = 0;
 327                 break;
 328
 329         case VM_BEHAVIOR_SEQUENTIAL:
 330                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 331                         /*
 332                          * advance indicator of sequential behavior
 333                          */
 334                         if (sequential < MAX_SEQUENTIAL_RUN)
 335                                 sequential += PAGE_SIZE;
 336                 } else {
 337                         /*
 338                          * reset indicator of sequential behavior
 339                          */
 340                         sequential = 0;
 341                 }
 342                 break;
 343
 344         case VM_BEHAVIOR_RSEQNTL:
 345                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 346                         /*
 347                          * advance indicator of sequential behavior
 348                          */
 349                         if (sequential > -MAX_SEQUENTIAL_RUN)
 350                                 sequential -= PAGE_SIZE;
 351                 } else {
 352                         /*
 353                          * reset indicator of sequential behavior
 354                          */
 355                         sequential = 0;
 356                 }
 357                 break;
 358
 359         case VM_BEHAVIOR_DEFAULT:
 360         default:
 361                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 362                         /*
 363                          * advance indicator of sequential behavior
 364                          */
 365                         if (sequential < 0)
 366                                 sequential = 0;
 367                         if (sequential < MAX_SEQUENTIAL_RUN)
 368                                 sequential += PAGE_SIZE;
 369
 370                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 371                         /*
 372                          * advance indicator of sequential behavior
 373                          */
 374                         if (sequential > 0)
 375                                 sequential = 0;
 376                         if (sequential > -MAX_SEQUENTIAL_RUN)
 377                                 sequential -= PAGE_SIZE;
 378                 } else {
 379                         /*
 380                          * reset indicator of sequential behavior
 381                          */
 382                         sequential = 0;
 383                 }
 384                 break;
 385         }
 386         if (sequential != orig_sequential) {
 387                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 388                         /*
 389                          * if someone else has already updated object->sequential
 390                          * don't bother trying to update it or object->last_alloc
 391                          */
 392                         return;
 393                 }
 394         }
 395         /*
 396          * I'd like to do this with a OSCompareAndSwap64, but that
 397          * doesn't exist for PPC...  however, it shouldn't matter
 398          * that much... last_alloc is maintained so that we can determine
 399          * if a sequential access pattern is taking place... if only
 400          * one thread is banging on this object, no problem with the unprotected
 401          * update... if 2 or more threads are banging away, we run the risk of
 402          * someone seeing a mangled update... however, in the face of multiple
 403          * accesses, no sequential access pattern can develop anyway, so we
 404          * haven't lost any real info.
 405          */
 406         object->last_alloc = offset;
 407 }
 408
 409
 410 int vm_page_deactivate_behind_count = 0;
 411
 412 /*
 413  * vm_page_deactivate_behind
 414  *
 415  * Determine if sequential access is in progress
 416  * in accordance with the behavior specified.  If
 417  * so, compute a potential page to deactivate and
 418  * deactivate it.
 419  *
 420  * object must be locked.
 421  *
 422  * return TRUE if we actually deactivate a page
 423  */
 424 static
 425 boolean_t
 426 vm_fault_deactivate_behind(
 427         vm_object_t             object,
 428         vm_object_offset_t      offset,
 429         vm_behavior_t           behavior)
 430 {
 431         int             n;
 432         int             pages_in_run = 0;
 433         int             max_pages_in_run = 0;
 434         int             sequential_run;
 435         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 436         vm_object_offset_t      run_offset = 0;
 437         vm_object_offset_t      pg_offset = 0;
 438         vm_page_t       m;
 439         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 440
 441         pages_in_run = 0;
 442 #if TRACEFAULTPAGE
 443         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 444 #endif
 445
 446         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 447                 /*
 448                  * Do not deactivate pages from the kernel object: they
 449                  * are not intended to become pageable.
 450                  * or we've disabled the deactivate behind mechanism
 451                  */
 452                 return FALSE;
 453         }
 454         if ((sequential_run = object->sequential)) {
 455                   if (sequential_run < 0) {
 456                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 457                           sequential_run = 0 - sequential_run;
 458                   } else {
 459                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 460                   }
 461         }
 462         switch (behavior) {
 463         case VM_BEHAVIOR_RANDOM:
 464                 break;
 465         case VM_BEHAVIOR_SEQUENTIAL:
 466                 if (sequential_run >= (int)PAGE_SIZE) {
 467                         run_offset = 0 - PAGE_SIZE_64;
 468                         max_pages_in_run = 1;
 469                 }
 470                 break;
 471         case VM_BEHAVIOR_RSEQNTL:
 472                 if (sequential_run >= (int)PAGE_SIZE) {
 473                         run_offset = PAGE_SIZE_64;
 474                         max_pages_in_run = 1;
 475                 }
 476                 break;
 477         case VM_BEHAVIOR_DEFAULT:
 478         default:
 479         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 480
 481                 /*
 482                  * determine if the run of sequential accesss has been
 483                  * long enough on an object with default access behavior
 484                  * to consider it for deactivation
 485                  */
 486                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 487                         /*
 488                          * the comparisons between offset and behind are done
 489                          * in this kind of odd fashion in order to prevent wrap around
 490                          * at the end points
 491                          */
 492                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 493                                 if (offset >= behind) {
 494                                         run_offset = 0 - behind;
 495                                         pg_offset = PAGE_SIZE_64;
 496                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 497                                 }
 498                         } else {
 499                                 if (offset < -behind) {
 500                                         run_offset = behind;
 501                                         pg_offset = 0 - PAGE_SIZE_64;
 502                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 503                                 }
 504                         }
 505                 }
 506                 break;
 507         }
 508         }
 509         for (n = 0; n < max_pages_in_run; n++) {
 510                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 511
 512                 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
 513                         page_run[pages_in_run++] = m;
 514
 515                         /*
 516                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 517                          *
 518                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 519                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 520                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 521                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 522                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 523                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 524                          * have happened before we did the deactivate_behind.
 525                          */
 526                         pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 527                 }
 528         }
 529         if (pages_in_run) {
 530                 vm_page_lockspin_queues();
 531
 532                 for (n = 0; n < pages_in_run; n++) {
 533
 534                         m = page_run[n];
 535
 536                         vm_page_deactivate_internal(m, FALSE);
 537
 538                         vm_page_deactivate_behind_count++;
 539 #if TRACEFAULTPAGE
 540                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 541 #endif
 542                 }
 543                 vm_page_unlock_queues();
 544
 545                 return TRUE;
 546         }
 547         return FALSE;
 548 }
 549
 550
 551 #if (DEVELOPMENT || DEBUG)
 552 uint32_t        vm_page_creation_throttled_hard = 0;
 553 uint32_t        vm_page_creation_throttled_soft = 0;
 554 #endif /* DEVELOPMENT || DEBUG */
 555
 556 static int
 557 vm_page_throttled(boolean_t page_kept)
 558 {
 559         clock_sec_t     elapsed_sec;
 560         clock_sec_t     tv_sec;
 561         clock_usec_t    tv_usec;
 562
 563         thread_t thread = current_thread();
 564
 565         if (thread->options & TH_OPT_VMPRIV)
 566                 return (0);
 567
 568         if (thread->t_page_creation_throttled) {
 569                 thread->t_page_creation_throttled = 0;
 570
 571                 if (page_kept == FALSE)
 572                         goto no_throttle;
 573         }
 574         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 575 #if (DEVELOPMENT || DEBUG)
 576                 thread->t_page_creation_throttled_hard++;
 577                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 578 #endif /* DEVELOPMENT || DEBUG */
 579                 return (HARD_THROTTLE_DELAY);
 580         }
 581
 582         if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 583             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 584
 585                 clock_get_system_microtime(&tv_sec, &tv_usec);
 586
 587                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 588
 589                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 590                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 591
 592                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 593                                 /*
 594                                  * we'll reset our stats to give a well behaved app
 595                                  * that was unlucky enough to accumulate a bunch of pages
 596                                  * over a long period of time a chance to get out of
 597                                  * the throttled state... we reset the counter and timestamp
 598                                  * so that if it stays under the rate limit for the next second
 599                                  * it will be back in our good graces... if it exceeds it, it
 600                                  * will remain in the throttled state
 601                                  */
 602                                 thread->t_page_creation_time = tv_sec;
 603                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 604                         }
 605                         ++vm_page_throttle_count;
 606
 607                         thread->t_page_creation_throttled = 1;
 608
 609                         if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) {
 610 #if (DEVELOPMENT || DEBUG)
 611                                 thread->t_page_creation_throttled_hard++;
 612                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 613 #endif /* DEVELOPMENT || DEBUG */
 614                                 return (HARD_THROTTLE_DELAY);
 615                         } else {
 616 #if (DEVELOPMENT || DEBUG)
 617                                 thread->t_page_creation_throttled_soft++;
 618                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 619 #endif /* DEVELOPMENT || DEBUG */
 620                                 return (SOFT_THROTTLE_DELAY);
 621                         }
 622                 }
 623                 thread->t_page_creation_time = tv_sec;
 624                 thread->t_page_creation_count = 0;
 625         }
 626 no_throttle:
 627         thread->t_page_creation_count++;
 628
 629         return (0);
 630 }
 631
 632 /*
 633  * check for various conditions that would
 634  * prevent us from creating a ZF page...
 635  * cleanup is based on being called from vm_fault_page
 636  *
 637  * object must be locked
 638  * object == m->object
 639  */
 640 static vm_fault_return_t
 641 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
 642 {
 643         int throttle_delay;
 644
 645         if (object->shadow_severed ||
 646             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 647                 /*
 648                  * Either:
 649                  * 1. the shadow chain was severed,
 650                  * 2. the purgeable object is volatile or empty and is marked
 651                  *    to fault on access while volatile.
 652                  * Just have to return an error at this point
 653                  */
 654                 if (m != VM_PAGE_NULL)
 655                         VM_PAGE_FREE(m);
 656                 vm_fault_cleanup(object, first_m);
 657
 658                 thread_interrupt_level(interruptible_state);
 659
 660                 return (VM_FAULT_MEMORY_ERROR);
 661         }
 662         if (vm_backing_store_low) {
 663                 /*
 664                  * are we protecting the system from
 665                  * backing store exhaustion.  If so
 666                  * sleep unless we are privileged.
 667                  */
 668                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 669
 670                         if (m != VM_PAGE_NULL)
 671                                 VM_PAGE_FREE(m);
 672                         vm_fault_cleanup(object, first_m);
 673
 674                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 675
 676                         thread_block(THREAD_CONTINUE_NULL);
 677                         thread_interrupt_level(interruptible_state);
 678
 679                         return (VM_FAULT_RETRY);
 680                 }
 681         }
 682         if (page_throttle == TRUE && (throttle_delay = vm_page_throttled(FALSE))) {
 683                 /*
 684                  * we're throttling zero-fills...
 685                  * treat this as if we couldn't grab a page
 686                  */
 687                 if (m != VM_PAGE_NULL)
 688                         VM_PAGE_FREE(m);
 689                 vm_fault_cleanup(object, first_m);
 690
 691                 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 692
 693                 delay(throttle_delay);
 694
 695                 if (current_thread_aborted()) {
 696                         thread_interrupt_level(interruptible_state);
 697                         return VM_FAULT_INTERRUPTED;
 698                 }
 699                 thread_interrupt_level(interruptible_state);
 700
 701                 return (VM_FAULT_MEMORY_SHORTAGE);
 702         }
 703         return (VM_FAULT_SUCCESS);
 704 }
 705
 706
 707 /*
 708  * Do the work to zero-fill a page and
 709  * inject it into the correct paging queue.
 710  *
 711  * m->object must be locked
 712  * page queue lock must NOT be held
      *      (this routine takes and drops the page queues lock itself)
      *
      * Parameters:
      *      m               page being set up; its pmapped bit is set and
      *                      its cs_validated / cs_tainted bits are cleared
      *      no_zero_fill    when TRUE the page contents are NOT zeroed
      *
      * Returns:
      *      DBG_ZERO_FILL_FAULT     the page was zero-filled
      *      DBG_NZF_PAGE_FAULT      no_zero_fill was TRUE
      *
      * When no_zero_fill is TRUE and the page is both absent and busy,
      * the routine returns before any of the paging queue handling.
      * Otherwise, if dynamic paging is not enabled and the object's
      * purgable state is DENY, NONVOLATILE or VOLATILE, the page is
      * moved onto vm_page_queue_throttled.
 713  */
 714 static int
 715 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 716 {
             /* fault type handed back to the caller; overridden below for no_zero_fill */
 717         int my_fault = DBG_ZERO_FILL_FAULT;
 718
 719         /*
 720          * This is a zero-fill page fault...
 721          *
 722          * Checking the page lock is a waste of
 723          * time;  this page was absent, so
 724          * it can't be page locked by a pager.
 725          *
 726          * we also consider it undefined
 727          * with respect to instruction
 728          * execution.  i.e. it is the responsibility
 729          * of higher layers to call for an instruction
 730          * sync after changing the contents and before
 731          * sending a program into this area.  We
 732          * choose this approach for performance
 733          */
 734         m->pmapped = TRUE;
 735
 736         m->cs_validated = FALSE;
 737         m->cs_tainted = FALSE;
 738
 739         if (no_zero_fill == TRUE) {
                     /* caller does not want the contents zeroed: report a distinct fault type */
 740                 my_fault = DBG_NZF_PAGE_FAULT;
 741
                     /*
                      * a page that is still marked absent and busy is handed
                      * back as is, skipping the asserts and queue handling below
                      */
 742                 if (m->absent && m->busy)
 743                         return (my_fault);
 744         } else {
                     /* zero the page, then account for it (VM statistic + DTrace "zfod" probe) */
 745                 vm_page_zero_fill(m);
 746
 747                 VM_STAT_INCR(zero_fill_count);
 748                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 749         }
 750         assert(!m->laundry);
 751         assert(m->object != kernel_object);
 752         //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
 753
             /*
              * First test is made without the page queues lock; the
              * VM_DYNAMIC_PAGING_ENABLED() part is repeated below once
              * the lock is held.  Only objects whose purgable state is
              * DENY, NONVOLATILE or VOLATILE are considered.
              */
 754         if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
 755                 (m->object->purgable == VM_PURGABLE_DENY ||
 756                  m->object->purgable == VM_PURGABLE_NONVOLATILE ||
 757                  m->object->purgable == VM_PURGABLE_VOLATILE )) {
 758
 759                 vm_page_lockspin_queues();
 760
                     /* re-check under the lock before touching the queues */
 761                 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
 762                         assert(!VM_PAGE_WIRED(m));
 763
 764                         /*
 765                          * can't be on the pageout queue since we don't
 766                          * have a pager to try and clean to
 767                          */
 768                         assert(!m->pageout_queue);
 769
                             /*
                              * take the page off whatever queue it is on and
                              * put it on the throttled queue, keeping the
                              * throttled flag and counter in step
                              */
 770                         VM_PAGE_QUEUES_REMOVE(m);
 771
 772                         queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 773                         m->throttled = TRUE;
 774                         vm_page_throttled_count++;
 775                 }
 776                 vm_page_unlock_queues();
 777         }
 778         return (my_fault);
 779 }
 780
 781
 782 /*
 783  *      Routine:        vm_fault_page
 784  *      Purpose:
 785  *              Find the resident page for the virtual memory
 786  *              specified by the given virtual memory object
 787  *              and offset.
 788  *      Additional arguments:
 789  *              The required permissions for the page are given
 790  *              in "fault_type".  Desired permissions are included
 791  *              in "protection".
 792  *              fault_info is passed along to determine pagein cluster
 793  *              limits... it contains the expected reference pattern,
 794  *              cluster size if available, etc...
 795  *
 796  *              If the desired page is known to be resident (for
 797  *              example, because it was previously wired down), asserting
 798  *              the "unwiring" parameter will speed the search.
 799  *
 800  *              If the operation can be interrupted (by thread_abort
 801  *              or thread_terminate), then the "interruptible"
 802  *              parameter should be asserted.
 803  *
 804  *      Results:
 805  *              The page containing the proper data is returned
 806  *              in "result_page".
 807  *
 808  *      In/out conditions:
 809  *              The source object must be locked and referenced,
 810  *              and must donate one paging reference.  The reference
 811  *              is not affected.  The paging reference and lock are
 812  *              consumed.
 813  *
 814  *              If the call succeeds, the object in which "result_page"
 815  *              resides is left locked and holding a paging reference.
 816  *              If this is not the original object, a busy page in the
 817  *              original object is returned in "top_page", to prevent other
 818  *              callers from pursuing this same data, along with a paging
 819  *              reference for the original object.  The "top_page" should
 820  *              be destroyed when this guarantee is no longer required.
 821  *              The "result_page" is also left busy.  It is not removed
 822  *              from the pageout queues.
 823  *      Special Case:
 824  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 825  *              fault succeeded but there's no VM page (i.e. the VM object
 826  *              does not actually hold VM pages, but device memory or
 827  *              large pages).  The object is still locked and we still hold a
 828  *              paging_in_progress reference.
 829  */
 830 unsigned int vm_fault_page_blocked_access = 0;	/* incremented by vm_fault_page() each time it has finished waiting for an object's blocked_access to clear */
 831 unsigned int vm_fault_page_forced_retry = 0;	/* NOTE(review): not updated in the code visible here; presumably counts fault retries forced by vm_fault_page() -- confirm */
 832
 833 vm_fault_return_t
 834 vm_fault_page(
 835         /* Arguments: */
 836         vm_object_t     first_object,   /* Object to begin search */
 837         vm_object_offset_t first_offset,        /* Offset into object */
 838         vm_prot_t       fault_type,     /* What access is requested */
 839         boolean_t       must_be_resident,/* Must page be resident? */
 840         boolean_t       caller_lookup,  /* caller looked up page */
 841         /* Modifies in place: */
 842         vm_prot_t       *protection,    /* Protection for mapping */
 843         vm_page_t       *result_page,   /* Page found, if successful */
 844         /* Returns: */
 845         vm_page_t       *top_page,      /* Page in top object, if
 846                                          * not result_page.  */
 847         int             *type_of_fault, /* if non-null, fill in with type of fault
 848                                          * COW, zero-fill, etc... returned in trace point */
 849         /* More arguments: */
 850         kern_return_t   *error_code,    /* code if page is in error */
 851         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 852         boolean_t       data_supply,    /* treat as data_supply if
 853                                          * it is a write fault and a full
 854                                          * page is provided */
 855         vm_object_fault_info_t fault_info)
 856 {
 857         vm_page_t               m;
 858         vm_object_t             object;
 859         vm_object_offset_t      offset;
 860         vm_page_t               first_m;
 861         vm_object_t             next_object;
 862         vm_object_t             copy_object;
 863         boolean_t               look_for_page;
 864         boolean_t               force_fault_retry = FALSE;
 865         vm_prot_t               access_required = fault_type;
 866         vm_prot_t               wants_copy_flag;
 867         CLUSTER_STAT(int pages_at_higher_offsets;)
 868         CLUSTER_STAT(int pages_at_lower_offsets;)
 869         kern_return_t           wait_result;
 870         boolean_t               interruptible_state;
 871         boolean_t               data_already_requested = FALSE;
 872         vm_behavior_t           orig_behavior;
 873         vm_size_t               orig_cluster_size;
 874         vm_fault_return_t       error;
 875         int                     my_fault;
 876         uint32_t                try_failed_count;
 877         int                     interruptible; /* how may fault be interrupted? */
 878         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 879         memory_object_t         pager;
 880         vm_fault_return_t       retval;
 881
 882 /*
 883  * MACH page map - an optional optimization where a bit map is maintained
 884  * by the VM subsystem for internal objects to indicate which pages of
 885  * the object currently reside on backing store.  This existence map
 886  * duplicates information maintained by the vnode pager.  It is
 887  * created at the time of the first pageout against the object, i.e.
 888  * at the same time pager for the object is created.  The optimization
 889  * is designed to eliminate pager interaction overhead, if it is
 890  * 'known' that the page does not exist on backing store.
 891  *
 892  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 893  * either marked as paged out in the existence map for the object or no
 894  * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 895  * criteria in the decision to invoke the pager.   It is also used as one
 896  * of the criteria to terminate the scan for adjacent pages in a clustered
 897  * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 898  * permanent objects.  Note also that if the pager for an internal object
 899  * has not been created, the pager is not invoked regardless of the value
 900  * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 901  * for which a pager has been created.
 902  *
 903  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 904  * is marked as paged out in the existence map for the object.
 905  * PAGED_OUT() is used to determine if a page has already been pushed
 906  * into a copy object in order to avoid a redundant page out operation.
 907  */
 908 #if MACH_PAGEMAP
 909 #define MUST_ASK_PAGER(o, f, s)                                 \
 910         ((vm_external_state_get((o)->existence_map, (f))        \
 911           != VM_EXTERNAL_STATE_ABSENT) &&                       \
 912          (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))))        \
 913          != VM_EXTERNAL_STATE_ABSENT)
 914 #define PAGED_OUT(o, f)                                         \
 915         ((vm_external_state_get((o)->existence_map, (f))        \
 916           == VM_EXTERNAL_STATE_EXISTS) ||                       \
 917          (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))               \
 918           == VM_EXTERNAL_STATE_EXISTS))
 919 #else /* MACH_PAGEMAP */
 920 #define MUST_ASK_PAGER(o, f, s)                                 \
 921         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 922 #define PAGED_OUT(o, f) \
 923         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 924 #endif /* MACH_PAGEMAP */
 925
 926 /*
 927  *      Recovery actions
 928  */
 929 #define RELEASE_PAGE(m)                                 \
 930         MACRO_BEGIN                                     \
 931         PAGE_WAKEUP_DONE(m);                            \
 932         if (!m->active && !m->inactive && !m->throttled) {              \
 933                 vm_page_lockspin_queues();                              \
 934                 if (!m->active && !m->inactive && !m->throttled) {      \
 935                         if (COMPRESSED_PAGER_IS_ACTIVE) \
 936                                 vm_page_deactivate(m);                  \
 937                         else                                            \
 938                                 vm_page_activate(m);                    \
 939                 }                                                       \
 940                 vm_page_unlock_queues();                                \
 941         }                                                               \
 942         MACRO_END
 943
 944 #if TRACEFAULTPAGE
 945         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 946 #endif
 947
 948         interruptible = fault_info->interruptible;
 949         interruptible_state = thread_interrupt_level(interruptible);
 950
 951         /*
 952          *      INVARIANTS (through entire routine):
 953          *
 954          *      1)      At all times, we must either have the object
 955          *              lock or a busy page in some object to prevent
 956          *              some other thread from trying to bring in
 957          *              the same page.
 958          *
 959          *              Note that we cannot hold any locks during the
 960          *              pager access or when waiting for memory, so
 961          *              we use a busy page then.
 962          *
 963          *      2)      To prevent another thread from racing us down the
 964          *              shadow chain and entering a new page in the top
 965          *              object before we do, we must keep a busy page in
 966          *              the top object while following the shadow chain.
 967          *
 968          *      3)      We must increment paging_in_progress on any object
 969          *              for which we have a busy page before dropping
 970          *              the object lock
 971          *
 972          *      4)      We leave busy pages on the pageout queues.
 973          *              If the pageout daemon comes across a busy page,
 974          *              it will remove the page from the pageout queues.
 975          */
 976
 977         object = first_object;
 978         offset = first_offset;
 979         first_m = VM_PAGE_NULL;
 980         access_required = fault_type;
 981
 982
 983         XPR(XPR_VM_FAULT,
 984                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 985                 object, offset, fault_type, *protection, 0);
 986
 987         /*
 988          * default type of fault
 989          */
 990         my_fault = DBG_CACHE_HIT_FAULT;
 991
 992         while (TRUE) {
 993 #if TRACEFAULTPAGE
 994                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 995 #endif
 996                 if (!object->alive) {
 997                         /*
 998                          * object is no longer valid
 999                          * clean up and return error
1000                          */
1001                         vm_fault_cleanup(object, first_m);
1002                         thread_interrupt_level(interruptible_state);
1003
1004                         return (VM_FAULT_MEMORY_ERROR);
1005                 }
1006
1007                 if (!object->pager_created && object->phys_contiguous) {
1008                         /*
1009                          * A physically-contiguous object without a pager:
1010                          * must be a "large page" object.  We do not deal
1011                          * with VM pages for this object.
1012                          */
1013                         caller_lookup = FALSE;
1014                         m = VM_PAGE_NULL;
1015                         goto phys_contig_object;
1016                 }
1017
1018                 if (object->blocked_access) {
1019                         /*
1020                          * Access to this VM object has been blocked.
1021                          * Replace our "paging_in_progress" reference with
1022                          * a "activity_in_progress" reference and wait for
1023                          * access to be unblocked.
1024                          */
1025                         caller_lookup = FALSE; /* no longer valid after sleep */
1026                         vm_object_activity_begin(object);
1027                         vm_object_paging_end(object);
1028                         while (object->blocked_access) {
1029                                 vm_object_sleep(object,
1030                                                 VM_OBJECT_EVENT_UNBLOCKED,
1031                                                 THREAD_UNINT);
1032                         }
1033                         vm_fault_page_blocked_access++;
1034                         vm_object_paging_begin(object);
1035                         vm_object_activity_end(object);
1036                 }
1037
1038                 /*
1039                  * See whether the page at 'offset' is resident
1040                  */
1041                 if (caller_lookup == TRUE) {
1042                         /*
1043                          * The caller has already looked up the page
1044                          * and gave us the result in "result_page".
1045                          * We can use this for the first lookup but
1046                          * it loses its validity as soon as we unlock
1047                          * the object.
1048                          */
1049                         m = *result_page;
1050                         caller_lookup = FALSE; /* no longer valid after that */
1051                 } else {
1052                         m = vm_page_lookup(object, offset);
1053                 }
1054 #if TRACEFAULTPAGE
1055                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1056 #endif
1057                 if (m != VM_PAGE_NULL) {
1058
1059                         if (m->busy) {
1060                                 /*
1061                                  * The page is being brought in,
1062                                  * wait for it and then retry.
1063                                  */
1064 #if TRACEFAULTPAGE
1065                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1066 #endif
1067                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1068
1069                                 XPR(XPR_VM_FAULT,
1070                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1071                                     object, offset,
1072                                     m, 0, 0);
1073                                 counter(c_vm_fault_page_block_busy_kernel++);
1074
1075                                 if (wait_result != THREAD_AWAKENED) {
1076                                         vm_fault_cleanup(object, first_m);
1077                                         thread_interrupt_level(interruptible_state);
1078
1079                                         if (wait_result == THREAD_RESTART)
1080                                                 return (VM_FAULT_RETRY);
1081                                         else
1082                                                 return (VM_FAULT_INTERRUPTED);
1083                                 }
1084                                 continue;
1085                         }
1086                         if (m->laundry) {
1087                                 m->pageout = FALSE;
1088
1089                                 if (!m->cleaning)
1090                                         vm_pageout_steal_laundry(m, FALSE);
1091                         }
1092                         if (m->phys_page == vm_page_guard_addr) {
1093                                 /*
1094                                  * Guard page: off limits !
1095                                  */
1096                                 if (fault_type == VM_PROT_NONE) {
1097                                         /*
1098                                          * The fault is not requesting any
1099                                          * access to the guard page, so it must
1100                                          * be just to wire or unwire it.
1101                                          * Let's pretend it succeeded...
1102                                          */
1103                                         m->busy = TRUE;
1104                                         *result_page = m;
1105                                         assert(first_m == VM_PAGE_NULL);
1106                                         *top_page = first_m;
1107                                         if (type_of_fault)
1108                                                 *type_of_fault = DBG_GUARD_FAULT;
1109                                         thread_interrupt_level(interruptible_state);
1110                                         return VM_FAULT_SUCCESS;
1111                                 } else {
1112                                         /*
1113                                          * The fault requests access to the
1114                                          * guard page: let's deny that !
1115                                          */
1116                                         vm_fault_cleanup(object, first_m);
1117                                         thread_interrupt_level(interruptible_state);
1118                                         return VM_FAULT_MEMORY_ERROR;
1119                                 }
1120                         }
1121
1122                         if (m->error) {
1123                                 /*
1124                                  * The page is in error, give up now.
1125                                  */
1126 #if TRACEFAULTPAGE
1127                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1128 #endif
1129                                 if (error_code)
1130                                         *error_code = KERN_MEMORY_ERROR;
1131                                 VM_PAGE_FREE(m);
1132
1133                                 vm_fault_cleanup(object, first_m);
1134                                 thread_interrupt_level(interruptible_state);
1135
1136                                 return (VM_FAULT_MEMORY_ERROR);
1137                         }
1138                         if (m->restart) {
1139                                 /*
1140                                  * The pager wants us to restart
1141                                  * at the top of the chain,
1142                                  * typically because it has moved the
1143                                  * page to another pager, then do so.
1144                                  */
1145 #if TRACEFAULTPAGE
1146                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1147 #endif
1148                                 VM_PAGE_FREE(m);
1149
1150                                 vm_fault_cleanup(object, first_m);
1151                                 thread_interrupt_level(interruptible_state);
1152
1153                                 return (VM_FAULT_RETRY);
1154                         }
1155                         if (m->absent) {
1156                                 /*
1157                                  * The page isn't busy, but is absent,
1158                                  * therefore it's deemed "unavailable".
1159                                  *
1160                                  * Remove the non-existent page (unless it's
1161                                  * in the top object) and move on down to the
1162                                  * next object (if there is one).
1163                                  */
1164 #if TRACEFAULTPAGE
1165                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1166 #endif
1167                                 next_object = object->shadow;
1168
1169                                 if (next_object == VM_OBJECT_NULL) {
1170                                         /*
1171                                          * Absent page at bottom of shadow
1172                                          * chain; zero fill the page we left
1173                                          * busy in the first object, and free
1174                                          * the absent page.
1175                                          */
1176                                         assert(!must_be_resident);
1177
1178                                         /*
1179                                          * check for any conditions that prevent
1180                                          * us from creating a new zero-fill page
1181                                          * vm_fault_check will do all of the
1182                                          * fault cleanup in the case of an error condition
1183                                          * including resetting the thread_interrupt_level
1184                                          */
1185                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1186
1187                                         if (error != VM_FAULT_SUCCESS)
1188                                                 return (error);
1189
1190                                         XPR(XPR_VM_FAULT,
1191                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1192                                                 object, offset,
1193                                                 m,
1194                                                 first_object, 0);
1195
1196                                         if (object != first_object) {
1197                                                 /*
1198                                                  * free the absent page we just found
1199                                                  */
1200                                                 VM_PAGE_FREE(m);
1201
1202                                                 /*
1203                                                  * drop reference and lock on current object
1204                                                  */
1205                                                 vm_object_paging_end(object);
1206                                                 vm_object_unlock(object);
1207
1208                                                 /*
1209                                                  * grab the original page we
1210                                                  * 'soldered' in place and
1211                                                  * retake lock on 'first_object'
1212                                                  */
1213                                                 m = first_m;
1214                                                 first_m = VM_PAGE_NULL;
1215
1216                                                 object = first_object;
1217                                                 offset = first_offset;
1218
1219                                                 vm_object_lock(object);
1220                                         } else {
1221                                                 /*
1222                                                  * we're going to use the absent page we just found
1223                                                  * so convert it to a 'busy' page
1224                                                  */
1225                                                 m->absent = FALSE;
1226                                                 m->busy = TRUE;
1227                                         }
1228                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1229                                                 m->absent = TRUE;
1230                                         /*
1231                                          * zero-fill the page and put it on
1232                                          * the correct paging queue
1233                                          */
1234                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1235
1236                                         break;
1237                                 } else {
1238                                         if (must_be_resident)
1239                                                 vm_object_paging_end(object);
1240                                         else if (object != first_object) {
1241                                                 vm_object_paging_end(object);
1242                                                 VM_PAGE_FREE(m);
1243                                         } else {
1244                                                 first_m = m;
1245                                                 m->absent = FALSE;
1246                                                 m->busy = TRUE;
1247
1248                                                 vm_page_lockspin_queues();
1249
1250                                                 assert(!m->pageout_queue);
1251                                                 VM_PAGE_QUEUES_REMOVE(m);
1252
1253                                                 vm_page_unlock_queues();
1254                                         }
1255                                         XPR(XPR_VM_FAULT,
1256                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1257                                                 object, offset,
1258                                                 next_object,
1259                                                 offset+object->vo_shadow_offset,0);
1260
1261                                         offset += object->vo_shadow_offset;
1262                                         fault_info->lo_offset += object->vo_shadow_offset;
1263                                         fault_info->hi_offset += object->vo_shadow_offset;
1264                                         access_required = VM_PROT_READ;
1265
1266                                         vm_object_lock(next_object);
1267                                         vm_object_unlock(object);
1268                                         object = next_object;
1269                                         vm_object_paging_begin(object);
1270
1271                                         /*
1272                                          * reset to default type of fault
1273                                          */
1274                                         my_fault = DBG_CACHE_HIT_FAULT;
1275
1276                                         continue;
1277                                 }
1278                         }
1279                         if ((m->cleaning)
1280                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1281                             && (fault_type & VM_PROT_WRITE)) {
1282                                 /*
1283                                  * This is a copy-on-write fault that will
1284                                  * cause us to revoke access to this page, but
1285                                  * this page is in the process of being cleaned
1286                                  * in a clustered pageout. We must wait until
1287                                  * the cleaning operation completes before
1288                                  * revoking access to the original page,
1289                                  * otherwise we might attempt to remove a
1290                                  * wired mapping.
1291                                  */
1292 #if TRACEFAULTPAGE
1293                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1294 #endif
1295                                 XPR(XPR_VM_FAULT,
1296                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1297                                         object, offset,
1298                                         m, 0, 0);
1299                                 /*
1300                                  * take an extra ref so that object won't die
1301                                  */
1302                                 vm_object_reference_locked(object);
1303
1304                                 vm_fault_cleanup(object, first_m);
1305
1306                                 counter(c_vm_fault_page_block_backoff_kernel++);
1307                                 vm_object_lock(object);
1308                                 assert(object->ref_count > 0);
1309
1310                                 m = vm_page_lookup(object, offset);
1311
1312                                 if (m != VM_PAGE_NULL && m->cleaning) {
1313                                         PAGE_ASSERT_WAIT(m, interruptible);
1314
1315                                         vm_object_unlock(object);
1316                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1317                                         vm_object_deallocate(object);
1318
1319                                         goto backoff;
1320                                 } else {
1321                                         vm_object_unlock(object);
1322
1323                                         vm_object_deallocate(object);
1324                                         thread_interrupt_level(interruptible_state);
1325
1326                                         return (VM_FAULT_RETRY);
1327                                 }
1328                         }
1329                         if (type_of_fault == NULL && m->speculative &&
1330                             !(fault_info != NULL && fault_info->stealth)) {
1331                                 /*
1332                                  * If we were passed a non-NULL pointer for
1333                                  * "type_of_fault", then we came from
1334                                  * vm_fault... we'll let it deal with
1335                                  * this condition, since it
1336                                  * needs to see m->speculative to correctly
1337                                  * account the pageins, otherwise...
1338                                  * take it off the speculative queue, we'll
1339                                  * let the caller of vm_fault_page deal
1340                                  * with getting it onto the correct queue
1341                                  *
1342                                  * If the caller specified in fault_info that
1343                                  * it wants a "stealth" fault, we also leave
1344                                  * the page in the speculative queue.
1345                                  */
1346                                 vm_page_lockspin_queues();
1347                                 if (m->speculative)
1348                                         VM_PAGE_QUEUES_REMOVE(m);
1349                                 vm_page_unlock_queues();
1350                         }
1351
1352                         if (m->encrypted) {
1353                                 /*
1354                                  * ENCRYPTED SWAP:
1355                                  * the user needs access to a page that we
1356                                  * encrypted before paging it out.
1357                                  * Decrypt the page now.
1358                                  * Keep it busy to prevent anyone from
1359                                  * accessing it during the decryption.
1360                                  */
1361                                 m->busy = TRUE;
1362                                 vm_page_decrypt(m, 0);
1363                                 assert(object == m->object);
1364                                 assert(m->busy);
1365                                 PAGE_WAKEUP_DONE(m);
1366
1367                                 /*
1368                                  * Retry from the top, in case
1369                                  * something changed while we were
1370                                  * decrypting.
1371                                  */
1372                                 continue;
1373                         }
1374                         ASSERT_PAGE_DECRYPTED(m);
1375
1376                         if (m->object->code_signed) {
1377                                 /*
1378                                  * CODE SIGNING:
1379                                  * We just paged in a page from a signed
1380                                  * memory object but we don't need to
1381                                  * validate it now.  We'll validate it if and
1382                                  * when it gets mapped into a user address
1383                                  * space for the first time or when the page
1384                                  * gets copied to another object as a result
1385                                  * of a copy-on-write.
1386                                  */
1387                         }
1388
1389                         /*
1390                          * We mark the page busy and leave it on
1391                          * the pageout queues.  If the pageout
1392                          * daemon comes across it, then it will
1393                          * remove the page from the queue, but not the object
1394                          */
1395 #if TRACEFAULTPAGE
1396                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1397 #endif
1398                         XPR(XPR_VM_FAULT,
1399                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1400                                 object, offset, m, 0, 0);
1401                         assert(!m->busy);
1402                         assert(!m->absent);
1403
1404                         m->busy = TRUE;
1405                         break;
1406                 }
1407
1408
1409                 /*
1410                  * we get here when there is no page present in the object at
1411                  * the offset we're interested in... we'll allocate a page
1412                  * at this point if the pager associated with
1413                  * this object can provide the data or we're the top object...
1414                  * object is locked;  m == NULL
1415                  */
1416                 if (must_be_resident) {
1417                         if (fault_type == VM_PROT_NONE &&
1418                             object == kernel_object) {
1419                                 /*
1420                                  * We've been called from vm_fault_unwire()
1421                                  * while removing a map entry that was allocated
1422                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1423                                  * is not present and there's nothing more to
1424                                  * do here (nothing to unwire).
1425                                  */
1426                                 vm_fault_cleanup(object, first_m);
1427                                 thread_interrupt_level(interruptible_state);
1428
1429                                 return VM_FAULT_MEMORY_ERROR;
1430                         }
1431
1432                         goto dont_look_for_page;
1433                 }
1434
1435 #if !MACH_PAGEMAP
1436                 data_supply = FALSE;
1437 #endif /* !MACH_PAGEMAP */
1438
1439                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1440
1441 #if TRACEFAULTPAGE
1442                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1443 #endif
1444                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1445                         /*
1446                          * Allocate a new page for this object/offset pair as a placeholder
1447                          */
1448                         m = vm_page_grab();
1449 #if TRACEFAULTPAGE
1450                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1451 #endif
1452                         if (m == VM_PAGE_NULL) {
1453
1454                                 vm_fault_cleanup(object, first_m);
1455                                 thread_interrupt_level(interruptible_state);
1456
1457                                 return (VM_FAULT_MEMORY_SHORTAGE);
1458                         }
1459
1460                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1461                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1462                         } else {
1463                                 vm_page_insert(m, object, offset);
1464                         }
1465                 }
1466                 if (look_for_page) {
1467                         kern_return_t   rc;
1468                         int             my_fault_type;
1469
1470                         /*
1471                          *      If the memory manager is not ready, we
1472                          *      cannot make requests.
1473                          */
1474                         if (!object->pager_ready) {
1475 #if TRACEFAULTPAGE
1476                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1477 #endif
1478                                 if (m != VM_PAGE_NULL)
1479                                         VM_PAGE_FREE(m);
1480
1481                                 XPR(XPR_VM_FAULT,
1482                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1483                                         object, offset, 0, 0, 0);
1484
1485                                 /*
1486                                  * take an extra ref so object won't die
1487                                  */
1488                                 vm_object_reference_locked(object);
1489                                 vm_fault_cleanup(object, first_m);
1490                                 counter(c_vm_fault_page_block_backoff_kernel++);
1491
1492                                 vm_object_lock(object);
1493                                 assert(object->ref_count > 0);
1494
1495                                 if (!object->pager_ready) {
1496                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1497
1498                                         vm_object_unlock(object);
1499                                         if (wait_result == THREAD_WAITING)
1500                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1501                                         vm_object_deallocate(object);
1502
1503                                         goto backoff;
1504                                 } else {
1505                                         vm_object_unlock(object);
1506                                         vm_object_deallocate(object);
1507                                         thread_interrupt_level(interruptible_state);
1508
1509                                         return (VM_FAULT_RETRY);
1510                                 }
1511                         }
1512                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1513                                 /*
1514                                  * If there are too many outstanding page
1515                                  * requests pending on this external object, we
1516                                  * wait for them to be resolved now.
1517                                  */
1518 #if TRACEFAULTPAGE
1519                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1520 #endif
1521                                 if (m != VM_PAGE_NULL)
1522                                         VM_PAGE_FREE(m);
1523                                 /*
1524                                  * take an extra ref so object won't die
1525                                  */
1526                                 vm_object_reference_locked(object);
1527
1528                                 vm_fault_cleanup(object, first_m);
1529
1530                                 counter(c_vm_fault_page_block_backoff_kernel++);
1531
1532                                 vm_object_lock(object);
1533                                 assert(object->ref_count > 0);
1534
1535                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1536                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1537
1538                                         vm_object_unlock(object);
1539                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1540                                         vm_object_deallocate(object);
1541
1542                                         goto backoff;
1543                                 } else {
1544                                         vm_object_unlock(object);
1545                                         vm_object_deallocate(object);
1546                                         thread_interrupt_level(interruptible_state);
1547
1548                                         return (VM_FAULT_RETRY);
1549                                 }
1550                         }
1551                         if (object->internal &&
1552                             (COMPRESSED_PAGER_IS_ACTIVE
1553                              || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) {
1554                                 int compressed_count_delta;
1555
1556                                 if (m == VM_PAGE_NULL) {
1557                                         /*
1558                                          * Allocate a new page for this object/offset pair as a placeholder
1559                                          */
1560                                         m = vm_page_grab();
1561 #if TRACEFAULTPAGE
1562                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1563 #endif
1564                                         if (m == VM_PAGE_NULL) {
1565
1566                                                 vm_fault_cleanup(object, first_m);
1567                                                 thread_interrupt_level(interruptible_state);
1568
1569                                                 return (VM_FAULT_MEMORY_SHORTAGE);
1570                                         }
1571
1572                                         m->absent = TRUE;
1573                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1574                                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1575                                         } else {
1576                                                 vm_page_insert(m, object, offset);
1577                                         }
1578                                 }
1579                                 assert(m->busy);
1580
1581                                 m->absent = TRUE;
1582                                 pager = object->pager;
1583
1584                                 assert(object->paging_in_progress > 0);
1585                                 vm_object_unlock(object);
1586
1587                                 rc = vm_compressor_pager_get(
1588                                         pager,
1589                                         offset + object->paging_offset,
1590                                         m->phys_page,
1591                                         &my_fault_type,
1592                                         0,
1593                                         &compressed_count_delta);
1594
1595                                 if (type_of_fault == NULL) {
1596                                         int     throttle_delay;
1597
1598                                         /*
1599                                          * we weren't called from vm_fault, so we
1600                                          * need to apply page creation throttling
1601                                          * do it before we re-acquire any locks
1602                                          */
1603                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1604                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1605                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1606                                                         delay(throttle_delay);
1607                                                 }
1608                                         }
1609                                 }
1610                                 vm_object_lock(object);
1611                                 assert(object->paging_in_progress > 0);
1612
1613                                 vm_compressor_pager_count(
1614                                         pager,
1615                                         compressed_count_delta,
1616                                         FALSE, /* shared_lock */
1617                                         object);
1618
1619                                 switch (rc) {
1620                                 case KERN_SUCCESS:
1621                                         m->absent = FALSE;
1622                                         m->dirty = TRUE;
1623                                         if ((m->object->wimg_bits &
1624                                              VM_WIMG_MASK) !=
1625                                             VM_WIMG_USE_DEFAULT) {
1626                                                 /*
1627                                                  * If the page is not cacheable,
1628                                                  * we can't let its contents
1629                                                  * linger in the data cache
1630                                                  * after the decompression.
1631                                                  */
1632                                                 pmap_sync_page_attributes_phys(
1633                                                         m->phys_page);
1634                                         } else {
1635                                                 m->written_by_kernel = TRUE;
1636                                         }
1637
1638                                         /*
1639                                          * If the object is purgeable, its
1640                                          * owner's purgeable ledgers have been
1641                                          * updated in vm_page_insert() but the
1642                                          * page was also accounted for in a
1643                                          * "compressed purgeable" ledger, so
1644                                          * update that now.
1645                                          */
1646                                         if ((object->purgable !=
1647                                              VM_PURGABLE_DENY) &&
1648                                             (object->vo_purgeable_owner !=
1649                                              NULL)) {
1650                                                 /*
1651                                                  * One less compressed
1652                                                  * purgeable page.
1653                                                  */
1654                                                 vm_purgeable_compressed_update(
1655                                                         object,
1656                                                         -1);
1657                                         }
1658
1659                                         break;
1660                                 case KERN_MEMORY_FAILURE:
1661                                         m->unusual = TRUE;
1662                                         m->error = TRUE;
1663                                         m->absent = FALSE;
1664                                         break;
1665                                 case KERN_MEMORY_ERROR:
1666                                         assert(m->absent);
1667                                         break;
1668                                 default:
1669                                         panic("vm_fault_page(): unexpected "
1670                                               "error %d from "
1671                                               "vm_compressor_pager_get()\n",
1672                                               rc);
1673                                 }
1674                                 PAGE_WAKEUP_DONE(m);
1675
1676                                 rc = KERN_SUCCESS;
1677                                 goto data_requested;
1678                         }
1679                         my_fault_type = DBG_PAGEIN_FAULT;
1680
1681                         if (m != VM_PAGE_NULL) {
1682                                 VM_PAGE_FREE(m);
1683                                 m = VM_PAGE_NULL;
1684                         }
1685
1686 #if TRACEFAULTPAGE
1687                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1688 #endif
1689
1690                         /*
1691                          * It's possible someone called vm_object_destroy while we weren't
1692                          * holding the object lock.  If that has happened, then bail out
1693                          * here.
1694                          */
1695
1696                         pager = object->pager;
1697
1698                         if (pager == MEMORY_OBJECT_NULL) {
1699                                 vm_fault_cleanup(object, first_m);
1700                                 thread_interrupt_level(interruptible_state);
1701                                 return VM_FAULT_MEMORY_ERROR;
1702                         }
1703
1704                         /*
1705                          * We have an absent page in place for the faulting offset,
1706                          * so we can release the object lock.
1707                          */
1708
1709                         vm_object_unlock(object);
1710
1711                         /*
1712                          * If this object uses a copy_call strategy,
1713                          * and we are interested in a copy of this object
1714                          * (having gotten here only by following a
1715                          * shadow chain), then tell the memory manager
1716                          * via a flag added to the desired_access
1717                          * parameter, so that it can detect a race
1718                          * between our walking down the shadow chain
1719                          * and its pushing pages up into a copy of
1720                          * the object that it manages.
1721                          */
1722                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1723                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1724                         else
1725                                 wants_copy_flag = VM_PROT_NONE;
1726
1727                         XPR(XPR_VM_FAULT,
1728                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1729                                 object, offset, m,
1730                                 access_required | wants_copy_flag, 0);
1731
1732                         if (object->copy == first_object) {
1733                                 /*
1734                                  * if we issue the memory_object_data_request in
1735                                  * this state, we are subject to a deadlock with
1736                                  * the underlying filesystem if it is trying to
1737                                  * shrink the file resulting in a push of pages
1738                                  * into the copy object...  that push will stall
1739                                  * on the placeholder page, and if the pushing thread
1740                                  * is holding a lock that is required on the pagein
1741                                  * path (such as a truncate lock), we'll deadlock...
1742                                  * to avoid this potential deadlock, we throw away
1743                                  * our placeholder page before calling memory_object_data_request
1744                                  * and force this thread to retry the vm_fault_page after
1745                                  * we have issued the I/O.  the second time through this path
1746                                  * we will find the page already in the cache (presumably still
1747                                  * busy waiting for the I/O to complete) and then complete
1748                                  * the fault w/o having to go through memory_object_data_request again
1749                                  */
1750                                 assert(first_m != VM_PAGE_NULL);
1751                                 assert(first_m->object == first_object);
1752
1753                                 vm_object_lock(first_object);
1754                                 VM_PAGE_FREE(first_m);
1755                                 vm_object_paging_end(first_object);
1756                                 vm_object_unlock(first_object);
1757
1758                                 first_m = VM_PAGE_NULL;
1759                                 force_fault_retry = TRUE;
1760
1761                                 vm_fault_page_forced_retry++;
1762                         }
1763
1764                         if (data_already_requested == TRUE) {
1765                                 orig_behavior = fault_info->behavior;
1766                                 orig_cluster_size = fault_info->cluster_size;
1767
1768                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1769                                 fault_info->cluster_size = PAGE_SIZE;
1770                         }
1771                         /*
1772                          * Call the memory manager to retrieve the data.
1773                          */
1774                         rc = memory_object_data_request(
1775                                 pager,
1776                                 offset + object->paging_offset,
1777                                 PAGE_SIZE,
1778                                 access_required | wants_copy_flag,
1779                                 (memory_object_fault_info_t)fault_info);
1780
1781                         if (data_already_requested == TRUE) {
1782                                 fault_info->behavior = orig_behavior;
1783                                 fault_info->cluster_size = orig_cluster_size;
1784                         } else
1785                                 data_already_requested = TRUE;
1786
1787                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1788 #if TRACEFAULTPAGE
1789                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1790 #endif
1791                         vm_object_lock(object);
1792
1793                 data_requested:
1794                         if (rc != KERN_SUCCESS) {
1795
1796                                 vm_fault_cleanup(object, first_m);
1797                                 thread_interrupt_level(interruptible_state);
1798
1799                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1800                                         VM_FAULT_INTERRUPTED :
1801                                         VM_FAULT_MEMORY_ERROR);
1802                         } else {
1803                                 clock_sec_t     tv_sec;
1804                                 clock_usec_t    tv_usec;
1805
1806                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1807                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1808                                         current_thread()->t_page_creation_time = tv_sec;
1809                                         current_thread()->t_page_creation_count = 0;
1810                                 }
1811                         }
1812                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1813
1814                                 vm_fault_cleanup(object, first_m);
1815                                 thread_interrupt_level(interruptible_state);
1816
1817                                 return (VM_FAULT_INTERRUPTED);
1818                         }
1819                         if (force_fault_retry == TRUE) {
1820
1821                                 vm_fault_cleanup(object, first_m);
1822                                 thread_interrupt_level(interruptible_state);
1823
1824                                 return (VM_FAULT_RETRY);
1825                         }
1826                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1827                                 /*
1828                                  * No page here means that the object we
1829                                  * initially looked up was "physically
1830                                  * contiguous" (i.e. device memory).  However,
1831                                  * with Virtual VRAM, the object might not
1832                                  * be backed by that device memory anymore,
1833                                  * so we're done here only if the object is
1834                                  * still "phys_contiguous".
1835                                  * Otherwise, if the object is no longer
1836                                  * "phys_contiguous", we need to retry the
1837                                  * page fault against the object's new backing
1838                                  * store (different memory object).
1839                                  */
1840                         phys_contig_object:
1841                                 goto done;
1842                         }
1843                         /*
1844                          * potentially a pagein fault
1845                          * if we make it through the state checks
1846                          * above, then we'll count it as such
1847                          */
1848                         my_fault = my_fault_type;
1849
1850                         /*
1851                          * Retry with same object/offset, since new data may
1852                          * be in a different page (i.e., m is meaningless at
1853                          * this point).
1854                          */
1855                         continue;
1856                 }
1857 dont_look_for_page:
1858                 /*
1859                  * We get here if the object has no pager, or an existence map
1860                  * exists and indicates the page isn't present on the pager
1861                  * or we're unwiring a page.  If a pager exists, but there
1862                  * is no existence map, then the m->absent case above handles
1863                  * the ZF case when the pager can't provide the page
1864                  */
1865 #if TRACEFAULTPAGE
1866                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1867 #endif
1868                 if (object == first_object)
1869                         first_m = m;
1870                 else
1871                         assert(m == VM_PAGE_NULL);
1872
1873                 XPR(XPR_VM_FAULT,
1874                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1875                         object, offset, m,
1876                         object->shadow, 0);
1877
1878                 next_object = object->shadow;
1879
1880                 if (next_object == VM_OBJECT_NULL) {
1881                         /*
1882                          * we've hit the bottom of the shadow chain,
1883                          * fill the page in the top object with zeros.
1884                          */
1885                         assert(!must_be_resident);
1886
1887                         if (object != first_object) {
1888                                 vm_object_paging_end(object);
1889                                 vm_object_unlock(object);
1890
1891                                 object = first_object;
1892                                 offset = first_offset;
1893                                 vm_object_lock(object);
1894                         }
1895                         m = first_m;
1896                         assert(m->object == object);
1897                         first_m = VM_PAGE_NULL;
1898
1899                         /*
1900                          * check for any conditions that prevent
1901                          * us from creating a new zero-fill page
1902                          * vm_fault_check will do all of the
1903                          * fault cleanup in the case of an error condition
1904                          * including resetting the thread_interrupt_level
1905                          */
1906                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1907
1908                         if (error != VM_FAULT_SUCCESS)
1909                                 return (error);
1910
1911                         if (m == VM_PAGE_NULL) {
1912                                 m = vm_page_grab();
1913
1914                                 if (m == VM_PAGE_NULL) {
1915                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1916                                         thread_interrupt_level(interruptible_state);
1917
1918                                         return (VM_FAULT_MEMORY_SHORTAGE);
1919                                 }
1920                                 vm_page_insert(m, object, offset);
1921                         }
1922                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1923                                 m->absent = TRUE;
1924
1925                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1926
1927                         break;
1928
1929                 } else {
1930                         /*
1931                          * Move on to the next object.  Lock the next
1932                          * object before unlocking the current one.
1933                          */
1934                         if ((object != first_object) || must_be_resident)
1935                                 vm_object_paging_end(object);
1936
1937                         offset += object->vo_shadow_offset;
1938                         fault_info->lo_offset += object->vo_shadow_offset;
1939                         fault_info->hi_offset += object->vo_shadow_offset;
1940                         access_required = VM_PROT_READ;
1941
1942                         vm_object_lock(next_object);
1943                         vm_object_unlock(object);
1944
1945                         object = next_object;
1946                         vm_object_paging_begin(object);
1947                 }
1948         }
1949
1950         /*
1951          *      PAGE HAS BEEN FOUND.
1952          *
1953          *      This page (m) is:
1954          *              busy, so that we can play with it;
1955          *              not absent, so that nobody else will fill it;
1956          *              possibly eligible for pageout;
1957          *
1958          *      The top-level page (first_m) is:
1959          *              VM_PAGE_NULL if the page was found in the
1960          *               top-level object;
1961          *              busy, not absent, and ineligible for pageout.
1962          *
1963          *      The current object (object) is locked.  A paging
1964          *      reference is held for the current and top-level
1965          *      objects.
1966          */
1967
1968 #if TRACEFAULTPAGE
1969         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1970 #endif
1971 #if     EXTRA_ASSERTIONS
1972         assert(m->busy && !m->absent);
1973         assert((first_m == VM_PAGE_NULL) ||
1974                (first_m->busy && !first_m->absent &&
1975                 !first_m->active && !first_m->inactive));
1976 #endif  /* EXTRA_ASSERTIONS */
1977
1978         /*
1979          * ENCRYPTED SWAP:
1980          * If we found a page, we must have decrypted it before we
1981          * get here...
1982          */
1983         ASSERT_PAGE_DECRYPTED(m);
1984
1985         XPR(XPR_VM_FAULT,
1986             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1987                 object, offset, m,
1988                 first_object, first_m);
1989
1990         /*
1991          * If the page is being written, but isn't
1992          * already owned by the top-level object,
1993          * we have to copy it into a new page owned
1994          * by the top-level object.
1995          */
1996         if (object != first_object) {
1997
1998 #if TRACEFAULTPAGE
1999                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2000 #endif
2001                 if (fault_type & VM_PROT_WRITE) {
2002                         vm_page_t copy_m;
2003
2004                         /*
2005                          * We only really need to copy if we
2006                          * want to write it.
2007                          */
2008                         assert(!must_be_resident);
2009
2010                         /*
2011                          * Are we protecting the system from
2012                          * backing store exhaustion?  If so,
2013                          * sleep unless we are privileged.
2014                          */
2015                         if (vm_backing_store_low) {
2016                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2017
2018                                         RELEASE_PAGE(m);
2019                                         vm_fault_cleanup(object, first_m);
2020
2021                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2022
2023                                         thread_block(THREAD_CONTINUE_NULL);
2024                                         thread_interrupt_level(interruptible_state);
2025
2026                                         return (VM_FAULT_RETRY);
2027                                 }
2028                         }
2029                         /*
2030                          * If we try to collapse first_object at this
2031                          * point, we may deadlock when we try to get
2032                          * the lock on an intermediate object (since we
2033                          * have the bottom object locked).  We can't
2034                          * unlock the bottom object, because the page
2035                          * we found may move (by collapse) if we do.
2036                          *
2037                          * Instead, we first copy the page.  Then, when
2038                          * we have no more use for the bottom object,
2039                          * we unlock it and try to collapse.
2040                          *
2041                          * Note that we copy the page even if we didn't
2042                          * need to... that's the breaks.
2043                          */
2044
2045                         /*
2046                          * Allocate a page for the copy
2047                          */
2048                         copy_m = vm_page_grab();
2049
2050                         if (copy_m == VM_PAGE_NULL) {
2051                                 RELEASE_PAGE(m);
2052
2053                                 vm_fault_cleanup(object, first_m);
2054                                 thread_interrupt_level(interruptible_state);
2055
2056                                 return (VM_FAULT_MEMORY_SHORTAGE);
2057                         }
2058                         XPR(XPR_VM_FAULT,
2059                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2060                                 object, offset,
2061                                 m, copy_m, 0);
2062
2063                         vm_page_copy(m, copy_m);
2064
2065                         /*
2066                          * If another map is truly sharing this
2067                          * page with us, we have to flush all
2068                          * uses of the original page, since we
2069                          * can't distinguish those which want the
2070                          * original from those which need the
2071                          * new copy.
2072                          *
2073                          * XXXO If we know that only one map has
2074                          * access to this page, then we could
2075                          * avoid the pmap_disconnect() call.
2076                          */
2077                         if (m->pmapped)
2078                                 pmap_disconnect(m->phys_page);
2079
2080                         if (m->clustered) {
2081                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2082                                 VM_PAGE_CONSUME_CLUSTERED(m);
2083                         }
2084                         assert(!m->cleaning);
2085
2086                         /*
2087                          * We no longer need the old page or object.
2088                          */
2089                         RELEASE_PAGE(m);
2090
2091                         vm_object_paging_end(object);
2092                         vm_object_unlock(object);
2093
2094                         my_fault = DBG_COW_FAULT;
2095                         VM_STAT_INCR(cow_faults);
2096                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2097                         current_task()->cow_faults++;
2098
2099                         object = first_object;
2100                         offset = first_offset;
2101
2102                         vm_object_lock(object);
2103                         /*
2104                          * get rid of the place holder
2105                          * page that we soldered in earlier
2106                          */
2107                         VM_PAGE_FREE(first_m);
2108                         first_m = VM_PAGE_NULL;
2109
2110                         /*
2111                          * and replace it with the
2112                          * page we just copied into
2113                          */
2114                         assert(copy_m->busy);
2115                         vm_page_insert(copy_m, object, offset);
2116                         SET_PAGE_DIRTY(copy_m, TRUE);
2117
2118                         m = copy_m;
2119                         /*
2120                          * Now that we've gotten the copy out of the
2121                          * way, let's try to collapse the top object.
2122                          * But we have to play ugly games with
2123                          * paging_in_progress to do that...
2124                          */
2125                         vm_object_paging_end(object);
2126                         vm_object_collapse(object, offset, TRUE);
2127                         vm_object_paging_begin(object);
2128
2129                 } else
2130                         *protection &= (~VM_PROT_WRITE);
2131         }
2132         /*
2133          * Now check whether the page needs to be pushed into the
2134          * copy object.  The use of asymmetric copy on write for
2135          * shared temporary objects means that we may do two copies to
2136          * satisfy the fault; one above to get the page from a
2137          * shadowed object, and one here to push it into the copy.
2138          */
2139         try_failed_count = 0;
2140
2141         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2142                 vm_object_offset_t      copy_offset;
2143                 vm_page_t               copy_m;
2144
2145 #if TRACEFAULTPAGE
2146                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2147 #endif
2148                 /*
2149                  * If the page is being written, but hasn't been
2150                  * copied to the copy-object, we have to copy it there.
2151                  */
2152                 if ((fault_type & VM_PROT_WRITE) == 0) {
2153                         *protection &= ~VM_PROT_WRITE;
2154                         break;
2155                 }
2156
2157                 /*
2158                  * If the page was guaranteed to be resident,
2159                  * we must have already performed the copy.
2160                  */
2161                 if (must_be_resident)
2162                         break;
2163
2164                 /*
2165                  * Try to get the lock on the copy_object.
2166                  */
2167                 if (!vm_object_lock_try(copy_object)) {
2168
2169                         vm_object_unlock(object);
2170                         try_failed_count++;
2171
2172                         mutex_pause(try_failed_count);  /* wait a bit */
2173                         vm_object_lock(object);
2174
2175                         continue;
2176                 }
2177                 try_failed_count = 0;
2178
2179                 /*
2180                  * Make another reference to the copy-object,
2181                  * to keep it from disappearing during the
2182                  * copy.
2183                  */
2184                 vm_object_reference_locked(copy_object);
2185
2186                 /*
2187                  * Does the page exist in the copy?
2188                  */
2189                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2190
2191                 if (copy_object->vo_size <= copy_offset)
2192                         /*
2193                          * Copy object doesn't cover this page -- do nothing.
2194                          */
2195                         ;
2196                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2197                         /*
2198                          * Page currently exists in the copy object
2199                          */
2200                         if (copy_m->busy) {
2201                                 /*
2202                                  * If the page is being brought
2203                                  * in, wait for it and then retry.
2204                                  */
2205                                 RELEASE_PAGE(m);
2206
2207                                 /*
2208                                  * take an extra ref so object won't die
2209                                  */
2210                                 vm_object_reference_locked(copy_object);
2211                                 vm_object_unlock(copy_object);
2212                                 vm_fault_cleanup(object, first_m);
2213                                 counter(c_vm_fault_page_block_backoff_kernel++);
2214
2215                                 vm_object_lock(copy_object);
2216                                 assert(copy_object->ref_count > 0);
2217                                 VM_OBJ_RES_DECR(copy_object);
2218                                 vm_object_lock_assert_exclusive(copy_object);
2219                                 copy_object->ref_count--;
2220                                 assert(copy_object->ref_count > 0);
2221                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2222                                 /*
2223                                  * ENCRYPTED SWAP:
2224                                  * it's OK if the "copy_m" page is encrypted,
2225                                  * because we're not moving it nor handling its
2226                                  * contents.
2227                                  */
2228                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2229                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2230
2231                                         vm_object_unlock(copy_object);
2232                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2233                                         vm_object_deallocate(copy_object);
2234
2235                                         goto backoff;
2236                                 } else {
2237                                         vm_object_unlock(copy_object);
2238                                         vm_object_deallocate(copy_object);
2239                                         thread_interrupt_level(interruptible_state);
2240
2241                                         return (VM_FAULT_RETRY);
2242                                 }
2243                         }
2244                 }
2245                 else if (!PAGED_OUT(copy_object, copy_offset)) {
2246                         /*
2247                          * If PAGED_OUT is TRUE, then the page used to exist
2248                          * in the copy-object, and has already been paged out.
2249                          * We don't need to repeat this. If PAGED_OUT is
2250                          * FALSE, then either we don't know (!pager_created,
2251                          * for example) or it hasn't been paged out.
2252                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2253                          * We must copy the page to the copy object.
2254                          */
2255
2256                         if (vm_backing_store_low) {
2257                                 /*
2258                                  * we are protecting the system from
2259                                  * backing store exhaustion, so
2260                                  * sleep unless we are privileged.
2261                                  */
2262                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2263                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2264
2265                                         RELEASE_PAGE(m);
2266                                         VM_OBJ_RES_DECR(copy_object);
2267                                         vm_object_lock_assert_exclusive(copy_object);
2268                                         copy_object->ref_count--;
2269                                         assert(copy_object->ref_count > 0);
2270
2271                                         vm_object_unlock(copy_object);
2272                                         vm_fault_cleanup(object, first_m);
2273                                         thread_block(THREAD_CONTINUE_NULL);
2274                                         thread_interrupt_level(interruptible_state);
2275
2276                                         return (VM_FAULT_RETRY);
2277                                 }
2278                         }
2279                         /*
2280                          * Allocate a page for the copy
2281                          */
2282                         copy_m = vm_page_alloc(copy_object, copy_offset);
2283
2284                         if (copy_m == VM_PAGE_NULL) {
2285                                 RELEASE_PAGE(m);
2286
2287                                 VM_OBJ_RES_DECR(copy_object);
2288                                 vm_object_lock_assert_exclusive(copy_object);
2289                                 copy_object->ref_count--;
2290                                 assert(copy_object->ref_count > 0);
2291
2292                                 vm_object_unlock(copy_object);
2293                                 vm_fault_cleanup(object, first_m);
2294                                 thread_interrupt_level(interruptible_state);
2295
2296                                 return (VM_FAULT_MEMORY_SHORTAGE);
2297                         }
2298                         /*
2299                          * Must copy page into copy-object.
2300                          */
2301                         vm_page_copy(m, copy_m);
2302
2303                         /*
2304                          * If the old page was in use by any users
2305                          * of the copy-object, it must be removed
2306                          * from all pmaps.  (We can't know which
2307                          * pmaps use it.)
2308                          */
2309                         if (m->pmapped)
2310                                 pmap_disconnect(m->phys_page);
2311
2312                         if (m->clustered) {
2313                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2314                                 VM_PAGE_CONSUME_CLUSTERED(m);
2315                         }
2316                         /*
2317                          * If there's a pager, then immediately
2318                          * page out this page, using the "initialize"
2319                          * option.  Else, we use the copy.
2320                          */
2321                         if ((!copy_object->pager_ready)
2322 #if MACH_PAGEMAP
2323                             || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2324 #endif
2325                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2326                             ) {
2327
2328                                 vm_page_lockspin_queues();
2329                                 assert(!m->cleaning);
2330                                 vm_page_activate(copy_m);
2331                                 vm_page_unlock_queues();
2332
2333                                 SET_PAGE_DIRTY(copy_m, TRUE);
2334                                 PAGE_WAKEUP_DONE(copy_m);
2335
2336                         } else if (copy_object->internal &&
2337                                    (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) {
2338                                 /*
2339                                  * For internal objects check with the pager to see
2340                                  * if the page already exists in the backing store.
2341                                  * If yes, then we can drop the copy page. If not,
2342                                  * then we'll activate it, mark it dirty and keep it
2343                                  * around.
2344                                  */
2345
2346                                 kern_return_t kr = KERN_SUCCESS;
2347
2348                                 memory_object_t copy_pager = copy_object->pager;
2349                                 assert(copy_pager != MEMORY_OBJECT_NULL);
2350                                 vm_object_paging_begin(copy_object);
2351
2352                                 vm_object_unlock(copy_object);
2353
2354                                 kr = memory_object_data_request(
2355                                         copy_pager,
2356                                         copy_offset + copy_object->paging_offset,
2357                                         0, /* Only query the pager. */
2358                                         VM_PROT_READ,
2359                                         NULL);
2360
2361                                 vm_object_lock(copy_object);
2362
2363                                 vm_object_paging_end(copy_object);
2364
2365                                 /*
2366                                  * Since we dropped the copy_object's lock,
2367                                  * check whether we'll have to deallocate
2368                                  * the hard way.
2369                                  */
2370                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2371                                         vm_object_unlock(copy_object);
2372                                         vm_object_deallocate(copy_object);
2373                                         vm_object_lock(object);
2374
2375                                         continue;
2376                                 }
2377                                 if (kr == KERN_SUCCESS) {
2378                                         /*
2379                                          * The pager has the page. We don't want to overwrite
2380                                          * that page by sending this one out to the backing store.
2381                                          * So we drop the copy page.
2382                                          */
2383                                         VM_PAGE_FREE(copy_m);
2384
2385                                 } else {
2386                                         /*
2387                                          * The pager doesn't have the page. We'll keep this one
2388                                          * around in the copy object. It might get sent out to
2389                                          * the backing store under memory pressure.
2390                                          */
2391                                         vm_page_lockspin_queues();
2392                                         assert(!m->cleaning);
2393                                         vm_page_activate(copy_m);
2394                                         vm_page_unlock_queues();
2395
2396                                         SET_PAGE_DIRTY(copy_m, TRUE);
2397                                         PAGE_WAKEUP_DONE(copy_m);
2398                                 }
2399                         } else {
2400
2401                                 assert(copy_m->busy == TRUE);
2402                                 assert(!m->cleaning);
2403
2404                                 /*
2405                                  * dirty is protected by the object lock
2406                                  */
2407                                 SET_PAGE_DIRTY(copy_m, TRUE);
2408
2409                                 /*
2410                                  * The page is already ready for pageout:
2411                                  * not on pageout queues and busy.
2412                                  * Unlock everything except the
2413                                  * copy_object itself.
2414                                  */
2415                                 vm_object_unlock(object);
2416
2417                                 /*
2418                                  * Write the page to the copy-object,
2419                                  * flushing it from the kernel.
2420                                  */
2421                                 vm_pageout_initialize_page(copy_m);
2422
2423                                 /*
2424                                  * Since the pageout may have
2425                                  * temporarily dropped the
2426                                  * copy_object's lock, we
2427                                  * check whether we'll have
2428                                  * to deallocate the hard way.
2429                                  */
2430                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2431                                         vm_object_unlock(copy_object);
2432                                         vm_object_deallocate(copy_object);
2433                                         vm_object_lock(object);
2434
2435                                         continue;
2436                                 }
2437                                 /*
2438                                  * Pick back up the old object's
2439                                  * lock.  [It is safe to do so,
2440                                  * since it must be deeper in the
2441                                  * object tree.]
2442                                  */
2443                                 vm_object_lock(object);
2444                         }
2445
2446                         /*
2447                          * Because we're pushing a page upward
2448                          * in the object tree, we must restart
2449                          * any faults that are waiting here.
2450                          * [Note that this is an expansion of
2451                          * PAGE_WAKEUP that uses the THREAD_RESTART
2452                          * wait result].  Can't turn off the page's
2453                          * busy bit because we're not done with it.
2454                          */
2455                         if (m->wanted) {
2456                                 m->wanted = FALSE;
2457                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2458                         }
2459                 }
2460                 /*
2461                  * The reference count on copy_object must be
2462                  * at least 2: one for our extra reference,
2463                  * and at least one from the outside world
2464                  * (we checked that when we last locked
2465                  * copy_object).
2466                  */
2467                 vm_object_lock_assert_exclusive(copy_object);
2468                 copy_object->ref_count--;
2469                 assert(copy_object->ref_count > 0);
2470
2471                 VM_OBJ_RES_DECR(copy_object);
2472                 vm_object_unlock(copy_object);
2473
2474                 break;
2475         }
2476
2477 done:
2478         *result_page = m;
2479         *top_page = first_m;
2480
2481         XPR(XPR_VM_FAULT,
2482                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2483                 object, offset, m, first_m, 0);
2484
2485         if (m != VM_PAGE_NULL) {
2486                 retval = VM_FAULT_SUCCESS;
2487
2488                 if (my_fault == DBG_PAGEIN_FAULT) {
2489
2490                         VM_PAGE_COUNT_AS_PAGEIN(m);
2491
2492                         if (m->object->internal)
2493                                 my_fault = DBG_PAGEIND_FAULT;
2494                         else
2495                                 my_fault = DBG_PAGEINV_FAULT;
2496
2497                         /*
2498                          * evaluate access pattern and update state
2499                          * vm_fault_deactivate_behind depends on the
2500                          * state being up to date
2501                          */
2502                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2503
2504                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2505                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2506
2507                         VM_STAT_INCR(decompressions);
2508                 }
2509                 if (type_of_fault)
2510                         *type_of_fault = my_fault;
2511         } else {
2512                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2513                 assert(first_m == VM_PAGE_NULL);
2514                 assert(object == first_object);
2515         }
2516
2517         thread_interrupt_level(interruptible_state);
2518
2519 #if TRACEFAULTPAGE
2520         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2521 #endif
2522         return retval;
2523
2524 backoff:
2525         thread_interrupt_level(interruptible_state);
2526
2527         if (wait_result == THREAD_INTERRUPTED)
2528                 return (VM_FAULT_INTERRUPTED);
2529         return (VM_FAULT_RETRY);
2530
2531 #undef  RELEASE_PAGE
2532 }
2533
2534
2535
2536 /*
2537  * CODE SIGNING:
2538  * When soft faulting a page, we have to validate the page if ALL of these hold:
2539  * 1. the page is being mapped in user space ("pmap" is not kernel_pmap)
2540  * 2. the page hasn't already been found to be "tainted" (cs_tainted is clear)
2541  * 3. the page belongs to a code-signed object (page->object->code_signed is set)
2542  * 4. the page has not been validated yet (!cs_validated) or has been mapped for write (wpmapped).
2543  * The macro expands "page" several times, so pass a side-effect-free expression. */
2544 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page)                         \
2545         ((pmap) != kernel_pmap /*1*/ &&                                 \
2546          !(page)->cs_tainted /*2*/ &&                                   \
2547          (page)->object->code_signed /*3*/ &&                           \
2548          (!(page)->cs_validated || (page)->wpmapped /*4*/))
2549
2550
2551 /*
2552  * page queue lock must NOT be held
2553  * m->object must be locked
2554  *
2555  * NOTE: m->object could be locked "shared" only if we are called
2556  * from vm_fault() as part of a soft fault.  If so, we must be
2557  * careful not to modify the VM object in any way that is not
2558  * legal under a shared lock...
2559  */
2560 extern int proc_selfpid(void);             /* used below to get the pid for the rejection log */
2561 extern char *proc_name_address(void *p);   /* used below on task->bsd_info to get the process name */
2562 unsigned long cs_enter_tainted_rejected = 0;       /* bumped each time vm_fault_enter() rejects an invalid page */
2563 unsigned long cs_enter_tainted_accepted = 0;       /* bumped each time vm_fault_enter() proceeds with an invalid page */
2564 kern_return_t
2565 vm_fault_enter(vm_page_t m,
2566                pmap_t pmap,
2567                vm_map_offset_t vaddr,
2568                vm_prot_t prot,
2569                vm_prot_t fault_type,
2570                boolean_t wired,
2571                boolean_t change_wiring,
2572                boolean_t no_cache,
2573                boolean_t cs_bypass,
2574                __unused int      user_tag,
2575                int       pmap_options,
2576                boolean_t *need_retry,
2577                int *type_of_fault)
2578 {
2579         kern_return_t   kr, pe_result;
2580         boolean_t       previously_pmapped = m->pmapped;
2581         boolean_t       must_disconnect = 0;
2582         boolean_t       map_is_switched, map_is_switch_protected;
2583         int             cs_enforcement_enabled;
2584
2585         vm_object_lock_assert_held(m->object);
2586 #if DEBUG
2587         lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2588 #endif /* DEBUG */
2589
2590         if (m->phys_page == vm_page_guard_addr) {
2591                 assert(m->fictitious);
2592                 return KERN_SUCCESS;
2593         }
2594
2595         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2596
2597                 vm_object_lock_assert_exclusive(m->object);
2598
2599         } else if ((fault_type & VM_PROT_WRITE) == 0) {
2600                 /*
2601                  * This is not a "write" fault, so we
2602                  * might not have taken the object lock
2603                  * exclusively and we might not be able
2604                  * to update the "wpmapped" bit in
2605                  * vm_fault_enter().
2606                  * Let's just grant read access to
2607                  * the page for now and we'll
2608                  * soft-fault again if we need write
2609                  * access later...
2610                  */
2611                 prot &= ~VM_PROT_WRITE;
2612         }
2613         if (m->pmapped == FALSE) {
2614
2615                 if (m->clustered) {
2616                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2617                                 /*
2618                                  * found it in the cache, but this
2619                                  * is the first fault-in of the page (m->pmapped == FALSE)
2620                                  * so it must have come in as part of
2621                                  * a cluster... account 1 pagein against it
2622                                  */
2623                                 if (m->object->internal)
2624                                         *type_of_fault = DBG_PAGEIND_FAULT;
2625                                 else
2626                                         *type_of_fault = DBG_PAGEINV_FAULT;
2627
2628                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2629                         }
2630                         VM_PAGE_CONSUME_CLUSTERED(m);
2631                 }
2632         }
2633
2634         if (*type_of_fault != DBG_COW_FAULT) {
2635                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2636
2637                 if (pmap == kernel_pmap) {
2638                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2639                 }
2640         }
2641
2642         /* Validate code signature if necessary. */
2643         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2644                 vm_object_lock_assert_exclusive(m->object);
2645
2646                 if (m->cs_validated) {
2647                         vm_cs_revalidates++;
2648                 }
2649
2650                 /* VM map is locked, so 1 ref will remain on VM object -
2651                  * so no harm if vm_page_validate_cs drops the object lock */
2652                 vm_page_validate_cs(m);
2653         }
2654
2655 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2656
2657         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2658                            (pmap == vm_map_pmap(current_thread()->map)));
2659         map_is_switch_protected = current_thread()->map->switch_protect;
2660
2661         /* If the map is switched, and is switch-protected, we must protect
2662          * some pages from being write-faulted: immutable pages because by
2663          * definition they may not be written, and executable pages because that
2664          * would provide a way to inject unsigned code.
2665          * If the page is immutable, we can simply return. However, we can't
2666          * immediately determine whether a page is executable anywhere. But,
2667          * we can disconnect it everywhere and remove the executable protection
2668          * from the current map. We do that below right before we do the
2669          * PMAP_ENTER.
2670          */
2671         cs_enforcement_enabled = cs_enforcement(NULL);
2672
2673         if(cs_enforcement_enabled && map_is_switched &&
2674            map_is_switch_protected && page_immutable(m, prot) &&
2675            (prot & VM_PROT_WRITE))
2676         {
2677                 return KERN_CODESIGN_ERROR;
2678         }
2679
2680         /* A page could be tainted, or pose a risk of being tainted later.
2681          * Check whether the receiving process wants it, and make it feel
2682          * the consequences (that happens in cs_invalid_page()).
2683          * For CS Enforcement, two other conditions will
2684          * cause that page to be tainted as well:
2685          * - pmapping an unsigned page executable - this means unsigned code;
2686          * - writeable mapping of a validated page - the content of that page
2687          *   can be changed without the kernel noticing, therefore unsigned
2688          *   code can be created
2689          */
2690         if (m->cs_tainted ||
2691             ((cs_enforcement_enabled && !cs_bypass ) &&
2692              (/* The page is unsigned and wants to be executable */
2693               (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2694               /* The page should be immutable, but is in danger of being modified
2695                 * This is the case where we want policy from the code directory -
2696                 * is the page immutable or not? For now we have to assume that
2697                 * code pages will be immutable, data pages not.
2698                 * We'll assume a page is a code page if it has a code directory
2699                 * and we fault for execution.
2700                 * That is good enough since if we faulted the code page for
2701                 * writing in another map before, it is wpmapped; if we fault
2702                 * it for writing in this map later it will also be faulted for executing
2703                 * at the same time; and if we fault for writing in another map
2704                 * later, we will disconnect it from this pmap so we'll notice
2705                 * the change.
2706                 */
2707               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2708               ))
2709                 )
2710         {
2711                 /* We will have a tainted page. Have to handle the special case
2712                  * of a switched map now. If the map is not switched, standard
2713                  * procedure applies - call cs_invalid_page().
2714                  * If the map is switched, the real owner is invalid already.
2715                  * There is no point in invalidating the switching process since
2716                  * it will not be executing from the map. So we don't call
2717                  * cs_invalid_page() in that case. */
2718                 boolean_t reject_page;
2719                 if(map_is_switched) {
2720                         assert(pmap==vm_map_pmap(current_thread()->map));
2721                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2722                         reject_page = FALSE;
2723                 } else {
2724                         if (cs_debug > 5)
2725                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2726                                        m->object->code_signed ? "yes" : "no",
2727                                        m->cs_validated ? "yes" : "no",
2728                                        m->cs_tainted ? "yes" : "no",
2729                                        m->wpmapped ? "yes" : "no",
2730                                        m->slid ? "yes" : "no",
2731                                        (int)prot);
2732                         reject_page = cs_invalid_page((addr64_t) vaddr);
2733                 }
2734
2735                 if (reject_page) {
2736                         /* reject the invalid page: abort the page fault */
2737                         int                     pid;
2738                         const char              *procname;
2739                         task_t                  task;
2740                         vm_object_t             file_object, shadow;
2741                         vm_object_offset_t      file_offset;
2742                         char                    *pathname, *filename;
2743                         vm_size_t               pathname_len, filename_len;
2744                         boolean_t               truncated_path;
2745 #define __PATH_MAX 1024
2746                         struct timespec         mtime, cs_mtime;
2747
2748                         kr = KERN_CODESIGN_ERROR;
2749                         cs_enter_tainted_rejected++;
2750
2751                         /* get process name and pid */
2752                         procname = "?";
2753                         task = current_task();
2754                         pid = proc_selfpid();
2755                         if (task->bsd_info != NULL)
2756                                 procname = proc_name_address(task->bsd_info);
2757
2758                         /* get file's VM object */
2759                         file_object = m->object;
2760                         file_offset = m->offset;
2761                         for (shadow = file_object->shadow;
2762                              shadow != VM_OBJECT_NULL;
2763                              shadow = file_object->shadow) {
2764                                 vm_object_lock_shared(shadow);
2765                                 if (file_object != m->object) {
2766                                         vm_object_unlock(file_object);
2767                                 }
2768                                 file_offset += file_object->vo_shadow_offset;
2769                                 file_object = shadow;
2770                         }
2771
2772                         mtime.tv_sec = 0;
2773                         mtime.tv_nsec = 0;
2774                         cs_mtime.tv_sec = 0;
2775                         cs_mtime.tv_nsec = 0;
2776
2777                         /* get file's pathname and/or filename */
2778                         pathname = NULL;
2779                         filename = NULL;
2780                         pathname_len = 0;
2781                         filename_len = 0;
2782                         truncated_path = FALSE;
2783                         if (file_object->pager == NULL) {
2784                                 /* no pager -> no file -> no pathname */
2785                                 pathname = (char *) "<nil>";
2786                         } else {
2787                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2788                                 if (pathname) {
2789                                         pathname[0] = '\0';
2790                                         pathname_len = __PATH_MAX;
2791                                         filename = pathname + pathname_len;
2792                                         filename_len = __PATH_MAX;
2793                                 }
2794                                 vnode_pager_get_object_name(file_object->pager,
2795                                                             pathname,
2796                                                             pathname_len,
2797                                                             filename,
2798                                                             filename_len,
2799                                                             &truncated_path);
2800                                 vnode_pager_get_object_mtime(file_object->pager,
2801                                                              &mtime,
2802                                                              &cs_mtime);
2803                         }
2804                         printf("CODE SIGNING: process %d[%s]: "
2805                                "rejecting invalid page at address 0x%llx "
2806                                "from offset 0x%llx in file \"%s%s%s\" "
2807                                "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2808                                "(signed:%d validated:%d tainted:%d "
2809                                "wpmapped:%d slid:%d)\n",
2810                                pid, procname, (addr64_t) vaddr,
2811                                file_offset,
2812                                (pathname ? pathname : ""),
2813                                (truncated_path ? "/.../" : ""),
2814                                (truncated_path ? filename : ""),
2815                                cs_mtime.tv_sec, cs_mtime.tv_nsec,
2816                                ((cs_mtime.tv_sec == mtime.tv_sec &&
2817                                  cs_mtime.tv_nsec == mtime.tv_nsec)
2818                                 ? "=="
2819                                 : "!="),
2820                                mtime.tv_sec, mtime.tv_nsec,
2821                                m->object->code_signed,
2822                                m->cs_validated,
2823                                m->cs_tainted,
2824                                m->wpmapped,
2825                                m->slid);
2826                         if (file_object != m->object) {
2827                                 vm_object_unlock(file_object);
2828                         }
2829                         if (pathname_len != 0) {
2830                                 kfree(pathname, __PATH_MAX * 2);
2831                                 pathname = NULL;
2832                                 filename = NULL;
2833                         }
2834                 } else {
2835                         /* proceed with the invalid page */
2836                         kr = KERN_SUCCESS;
2837                         if (!m->cs_validated) {
2838                                 /*
2839                                  * This page has not been validated, so it
2840                                  * must not belong to a code-signed object
2841                                  * and should not be forcefully considered
2842                                  * as tainted.
2843                                  * We're just concerned about it here because
2844                                  * we've been asked to "execute" it but that
2845                                  * does not mean that it should cause other
2846                                  * accesses to fail.
2847                                  * This happens when a debugger sets a
2848                                  * breakpoint and we then execute code in
2849                                  * that page.  Marking the page as "tainted"
2850                                  * would cause any inspection tool ("leaks",
2851                                  * "vmmap", "CrashReporter", ...) to get killed
2852                                  * due to code-signing violation on that page,
2853                                  * even though they're just reading it and not
2854                                  * executing from it.
2855                                  */
2856                                 assert(!m->object->code_signed);
2857                         } else {
2858                                 /*
2859                                  * Page might have been tainted before or not;
2860                                  * now it definitively is. If the page wasn't
2861                                  * tainted, we must disconnect it from all
2862                                  * pmaps later, to force existing mappings
2863                                  * through that code path for re-consideration
2864                                  * of the validity of that page.
2865                                  */
2866                                 must_disconnect = !m->cs_tainted;
2867                                 m->cs_tainted = TRUE;
2868                         }
2869                         cs_enter_tainted_accepted++;
2870                 }
2871                 if (kr != KERN_SUCCESS) {
2872                         if (cs_debug) {
2873                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2874                                        "*** INVALID PAGE ***\n",
2875                                        (long long)vaddr);
2876                         }
2877 #if !SECURE_KERNEL
2878                         if (cs_enforcement_panic) {
2879                                 panic("CODESIGNING: panicking on invalid page\n");
2880                         }
2881 #endif
2882                 }
2883
2884         } else {
2885                 /* proceed with the valid page */
2886                 kr = KERN_SUCCESS;
2887         }
2888
2889         boolean_t       page_queues_locked = FALSE;
2890 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
2891 MACRO_BEGIN                                     \
2892         if (! page_queues_locked) {             \
2893                 page_queues_locked = TRUE;      \
2894                 vm_page_lockspin_queues();      \
2895         }                                       \
2896 MACRO_END
2897 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
2898 MACRO_BEGIN                                     \
2899         if (page_queues_locked) {               \
2900                 page_queues_locked = FALSE;     \
2901                 vm_page_unlock_queues();        \
2902         }                                       \
2903 MACRO_END
2904
2905         /*
2906          * Hold queues lock to manipulate
2907          * the page queues.  Change wiring
2908          * case is obvious.
2909          */
2910         assert(m->compressor || m->object != compressor_object);
2911         if (m->compressor) {
2912                 /*
2913                  * Compressor pages are neither wired
2914                  * nor pageable and should never change.
2915                  */
2916                 assert(m->object == compressor_object);
2917         } else if (change_wiring) {
2918                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2919
2920                 if (wired) {
2921                         if (kr == KERN_SUCCESS) {
2922                                 vm_page_wire(m);
2923                         }
2924                 } else {
2925                         vm_page_unwire(m, TRUE);
2926                 }
2927                 /* we keep the page queues lock, if we need it later */
2928
2929         } else {
2930                 if (kr != KERN_SUCCESS) {
2931                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2932                         vm_page_deactivate(m);
2933                         /* we keep the page queues lock, if we need it later */
2934                 } else if (((!m->active && !m->inactive) ||
2935                             m->clean_queue ||
2936                             no_cache) &&
2937                            !VM_PAGE_WIRED(m) && !m->throttled) {
2938
2939                         if (vm_page_local_q &&
2940                             !no_cache &&
2941                             (*type_of_fault == DBG_COW_FAULT ||
2942                              *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2943                                 struct vpl      *lq;
2944                                 uint32_t        lid;
2945
2946                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2947                                 vm_object_lock_assert_exclusive(m->object);
2948
2949                                 /*
2950                                  * we got a local queue to stuff this
2951                                  * new page on...
2952                                  * it's safe to manipulate local and
2953                                  * local_id at this point since we're
2954                                  * behind an exclusive object lock and
2955                                  * the page is not on any global queue.
2956                                  *
2957                                  * we'll use the current cpu number to
2958                                  * select the queue... note that we don't
2959                                  * need to disable preemption... we're
2960                                  * going to be behind the local queue's
2961                                  * lock to do the real work
2962                                  */
2963                                 lid = cpu_number();
2964
2965                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
2966
2967                                 VPL_LOCK(&lq->vpl_lock);
2968
2969                                 queue_enter(&lq->vpl_queue, m,
2970                                             vm_page_t, pageq);
2971                                 m->local = TRUE;
2972                                 m->local_id = lid;
2973                                 lq->vpl_count++;
2974
2975                                 if (m->object->internal)
2976                                         lq->vpl_internal_count++;
2977                                 else
2978                                         lq->vpl_external_count++;
2979
2980                                 VPL_UNLOCK(&lq->vpl_lock);
2981
2982                                 if (lq->vpl_count > vm_page_local_q_soft_limit)
2983                                 {
2984                                         /*
2985                                          * we're beyond the soft limit
2986                                          * for the local queue
2987                                          * vm_page_reactivate_local will
2988                                          * 'try' to take the global page
2989                                          * queue lock... if it can't
2990                                          * that's ok... we'll let the
2991                                          * queue continue to grow up
2992                                          * to the hard limit... at that
2993                                          * point we'll wait for the
2994                                          * lock... once we've got the
2995                                          * lock, we'll transfer all of
2996                                          * the pages from the local
2997                                          * queue to the global active
2998                                          * queue
2999                                          */
3000                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3001                                 }
3002                         } else {
3003
3004                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3005
3006                                 /*
3007                                  * test again now that we hold the
3008                                  * page queue lock
3009                                  */
3010                                 if (!VM_PAGE_WIRED(m)) {
3011                                         if (m->clean_queue) {
3012                                                 VM_PAGE_QUEUES_REMOVE(m);
3013
3014                                                 vm_pageout_cleaned_reactivated++;
3015                                                 vm_pageout_cleaned_fault_reactivated++;
3016                                         }
3017
3018                                         if ((!m->active &&
3019                                              !m->inactive) ||
3020                                             no_cache) {
3021                                                 /*
3022                                                  * If this is a no_cache mapping
3023                                                  * and the page has never been
3024                                                  * mapped before or was
3025                                                  * previously a no_cache page,
3026                                                  * then we want to leave pages
3027                                                  * in the speculative state so
3028                                                  * that they can be readily
3029                                                  * recycled if free memory runs
3030                                                  * low.  Otherwise the page is
3031                                                  * activated as normal.
3032                                                  */
3033
3034                                                 if (no_cache &&
3035                                                     (!previously_pmapped ||
3036                                                      m->no_cache)) {
3037                                                         m->no_cache = TRUE;
3038
3039                                                         if (!m->speculative)
3040                                                                 vm_page_speculate(m, FALSE);
3041
3042                                                 } else if (!m->active &&
3043                                                            !m->inactive) {
3044
3045                                                         vm_page_activate(m);
3046                                                 }
3047                                         }
3048                                 }
3049                                 /* we keep the page queues lock, if we need it later */
3050                         }
3051                 }
3052         }
3053         /* we're done with the page queues lock, if we ever took it */
3054         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3055
3056
3057         /* If we have a KERN_SUCCESS from the previous checks, we either have
3058          * a good page, or a tainted page that has been accepted by the process.
3059          * In both cases the page will be entered into the pmap.
3060          * If the page is writeable, we need to disconnect it from other pmaps
3061          * now so those processes can take note.
3062          */
3063         if (kr == KERN_SUCCESS) {
3064
3065                 /*
3066                  * NOTE: we may only hold the vm_object lock SHARED
3067                  * at this point, so we need the phys_page lock to
3068                  * properly serialize updating the pmapped and
3069                  * xpmapped bits
3070                  */
3071                 if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) {
3072
3073                         pmap_lock_phys_page(m->phys_page);
3074                         /*
3075                          * go ahead and take the opportunity
3076                          * to set 'pmapped' here so that we don't
3077                          * need to grab this lock a 2nd time
3078                          * just below
3079                          */
3080                         m->pmapped = TRUE;
3081
3082                         if (!m->xpmapped) {
3083
3084                                 m->xpmapped = TRUE;
3085
3086                                 pmap_unlock_phys_page(m->phys_page);
3087
3088                                 if (!m->object->internal)
3089                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3090
3091                                 if ((COMPRESSED_PAGER_IS_ACTIVE) &&
3092                                     m->object->internal &&
3093                                     m->object->pager != NULL) {
3094                                         /*
3095                                          * This page could have been
3096                                          * uncompressed by the
3097                                          * compressor pager and its
3098                                          * contents might be only in
3099                                          * the data cache.
3100                                          * Since it's being mapped for
3101                                          * "execute" for the first time,
3102                                          * make sure the icache is in
3103                                          * sync.
3104                                          */
3105                                         pmap_sync_page_data_phys(m->phys_page);
3106                                 }
3107                         } else
3108                                 pmap_unlock_phys_page(m->phys_page);
3109                 } else {
3110                         if (m->pmapped == FALSE) {
3111                                 pmap_lock_phys_page(m->phys_page);
3112                                 m->pmapped = TRUE;
3113                                 pmap_unlock_phys_page(m->phys_page);
3114                         }
3115                 }
3116                 if (vm_page_is_slideable(m)) {
3117                         boolean_t was_busy = m->busy;
3118
3119                         vm_object_lock_assert_exclusive(m->object);
3120
3121                         m->busy = TRUE;
3122                         kr = vm_page_slide(m, 0);
3123                         assert(m->busy);
3124                         if(!was_busy) {
3125                                 PAGE_WAKEUP_DONE(m);
3126                         }
3127                         if (kr != KERN_SUCCESS) {
3128                                 /*
3129                                  * This page has not been slid correctly,
3130                                  * do not do the pmap_enter() !
3131                                  * Let vm_fault_enter() return the error
3132                                  * so the caller can fail the fault.
3133                                  */
3134                                 goto after_the_pmap_enter;
3135                         }
3136                 }
3137
3138                 if (fault_type & VM_PROT_WRITE) {
3139
3140                         if (m->wpmapped == FALSE) {
3141                                 vm_object_lock_assert_exclusive(m->object);
3142
3143                                 m->wpmapped = TRUE;
3144                         }
3145                         if (must_disconnect) {
3146                                 /*
3147                                  * We can only get here
3148                                  * because of the CSE logic
3149                                  */
3150                                 assert(cs_enforcement_enabled);
3151                                 pmap_disconnect(m->phys_page);
3152                                 /*
3153                                  * If we are faulting for a write, we can clear
3154                                  * the execute bit - that will ensure the page is
3155                                  * checked again before being executable, which
3156                                  * protects against a map switch.
3157                                  * This only happens the first time the page
3158                                  * gets tainted, so we won't get stuck here
3159                                  * to make an already writeable page executable.
3160                                  */
3161                                 if (!cs_bypass){
3162                                         prot &= ~VM_PROT_EXECUTE;
3163                                 }
3164                         }
3165                 }
3166
3167                 /* Prevent a deadlock by not
3168                  * holding the object lock if we need to wait for a page in
3169                  * pmap_enter() - <rdar://problem/7138958> */
3170                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3171                                    wired,
3172                                    pmap_options | PMAP_OPTIONS_NOWAIT,
3173                                    pe_result);
3174
3175                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3176
3177                         if (need_retry) {
3178                                 /*
3179                                  * this will be non-null in the case where we hold the lock
3180                                  * on the top-object in this chain... we can't just drop
3181                                  * the lock on the object we're inserting the page into
3182                                  * and recall the PMAP_ENTER since we can still cause
3183                                  * a deadlock if one of the critical paths tries to
3184                                  * acquire the lock on the top-object and we're blocked
3185                                  * in PMAP_ENTER waiting for memory... our only recourse
3186                                  * is to deal with it at a higher level where we can
3187                                  * drop both locks.
3188                                  */
3189                                 *need_retry = TRUE;
3190                                 vm_pmap_enter_retried++;
3191                                 goto after_the_pmap_enter;
3192                         }
3193                         /* The nonblocking version of pmap_enter did not succeed.
3194                          * and we don't need to drop other locks and retry
3195                          * at the level above us, so
3196                          * use the blocking version instead. Requires marking
3197                          * the page busy and unlocking the object */
3198                         boolean_t was_busy = m->busy;
3199
3200                         vm_object_lock_assert_exclusive(m->object);
3201
3202                         m->busy = TRUE;
3203                         vm_object_unlock(m->object);
3204
3205                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3206                                            0, wired,
3207                                            pmap_options, pe_result);
3208
3209                         /* Take the object lock again. */
3210                         vm_object_lock(m->object);
3211
3212                         /* If the page was busy, someone else will wake it up.
3213                          * Otherwise, we have to do it now. */
3214                         assert(m->busy);
3215                         if(!was_busy) {
3216                                 PAGE_WAKEUP_DONE(m);
3217                         }
3218                         vm_pmap_enter_blocked++;
3219                 }
3220         }
3221
3222 after_the_pmap_enter:
3223         return kr;
3224 }
3225
3226 /*
3227  * vm_pre_fault: if pmap_find_phys() reports 0 for vaddr in the current map's
3228  * pmap, issue a VM_PROT_READ vm_fault() on it (no wiring change, THREAD_UNINT,
3229  * no caller pmap).  The status vm_fault() returns is ignored.
3230  */
3231 void
3232 vm_pre_fault(vm_map_offset_t vaddr)
3233 {
3234         if (pmap_find_phys(current_map()->pmap, vaddr) != 0) {
3235                 return;         /* non-zero lookup result: nothing to do */
3236         }
3237         (void) vm_fault(current_map(), vaddr, VM_PROT_READ,
3238                         FALSE, THREAD_UNINT, NULL, 0);
3239 }
3240
3241
3242 /*
3243  *      Routine:        vm_fault
3244  *      Purpose:
3245  *              Handle page faults, including pseudo-faults
3246  *              used to change the wiring status of pages.
3247  *      Returns:
3248  *              Explicit continuations have been removed.
3249  *      Implementation:
3250  *              vm_fault and vm_fault_page save mucho state
3251  *              in the moral equivalent of a closure.  The state
3252  *              structure is allocated when first entering vm_fault
3253  *              and deallocated when leaving vm_fault.
3254  */
3255
3256 extern int _map_enter_debug;    /* declared here only; definition and uses lie outside this excerpt */
3257 /* zero-initialized counters; NOTE(review): presumably collapse attempts vs. skipped -- confirm where updated */
3258 unsigned long vm_fault_collapse_total = 0;
3259 unsigned long vm_fault_collapse_skipped = 0;
3260
3261
3262 /*
3263  * vm_fault: hands all seven arguments, unchanged, to vm_fault_internal()
3264  * with NULL as its final physpage_p argument, and returns that status.
3265  */
3266 kern_return_t
3267 vm_fault(vm_map_t map, vm_map_offset_t vaddr, vm_prot_t fault_type,
3268          boolean_t change_wiring, int interruptible,
3269          pmap_t caller_pmap, vm_map_offset_t caller_pmap_addr)
3270 {
3271         kern_return_t   status = vm_fault_internal(map, vaddr, fault_type,
3272                                         change_wiring, interruptible,
3273                                         caller_pmap, caller_pmap_addr, NULL);
3274         return status;
3275 }
3276
3277 kern_return_t
3278 vm_fault_internal(
3279         vm_map_t        map,
3280         vm_map_offset_t vaddr,
3281         vm_prot_t       fault_type,
3282         boolean_t       change_wiring,
3283         int             interruptible,
3284         pmap_t          caller_pmap,
3285         vm_map_offset_t caller_pmap_addr,
3286         ppnum_t         *physpage_p)
3287 {
3288         vm_map_version_t        version;        /* Map version for verificiation */
3289         boolean_t               wired;          /* Should mapping be wired down? */
3290         vm_object_t             object;         /* Top-level object */
3291         vm_object_offset_t      offset;         /* Top-level offset */
3292         vm_prot_t               prot;           /* Protection for mapping */
3293         vm_object_t             old_copy_object; /* Saved copy object */
3294         vm_page_t               result_page;    /* Result of vm_fault_page */
3295         vm_page_t               top_page;       /* Placeholder page */
3296         kern_return_t           kr;
3297
3298         vm_page_t               m;      /* Fast access to result_page */
3299         kern_return_t           error_code;
3300         vm_object_t             cur_object;
3301         vm_object_offset_t      cur_offset;
3302         vm_page_t               cur_m;
3303         vm_object_t             new_object;
3304         int                     type_of_fault;
3305         pmap_t                  pmap;
3306         boolean_t               interruptible_state;
3307         vm_map_t                real_map = map;
3308         vm_map_t                original_map = map;
3309         vm_prot_t               original_fault_type;
3310         struct vm_object_fault_info fault_info;
3311         boolean_t               need_collapse = FALSE;
3312         boolean_t               need_retry = FALSE;
3313         boolean_t               *need_retry_ptr = NULL;
3314         int                     object_lock_type = 0;
3315         int                     cur_object_lock_type;
3316         vm_object_t             top_object = VM_OBJECT_NULL;
3317         int                     throttle_delay;
3318         int                     compressed_count_delta;
3319
3320
3321         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3322                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3323                               ((uint64_t)vaddr >> 32),
3324                               vaddr,
3325                               (map == kernel_map),
3326                               0,
3327                               0);
3328
3329         if (get_preemption_level() != 0) {
3330                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3331                                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3332                                       ((uint64_t)vaddr >> 32),
3333                                       vaddr,
3334                                       KERN_FAILURE,
3335                                       0,
3336                                       0);
3337
3338                 return (KERN_FAILURE);
3339         }
3340
3341         interruptible_state = thread_interrupt_level(interruptible);
3342
3343         VM_STAT_INCR(faults);
3344         current_task()->faults++;
3345         original_fault_type = fault_type;
3346
3347         if (fault_type & VM_PROT_WRITE)
3348                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3349         else
3350                 object_lock_type = OBJECT_LOCK_SHARED;
3351
3352         cur_object_lock_type = OBJECT_LOCK_SHARED;
3353
3354 RetryFault:
3355         /*
3356          * assume we will hit a page in the cache
3357          * otherwise, explicitly override with
3358          * the real fault type once we determine it
3359          */
3360         type_of_fault = DBG_CACHE_HIT_FAULT;
3361
3362         /*
3363          *      Find the backing store object and offset into
3364          *      it to begin the search.
3365          */
3366         fault_type = original_fault_type;
3367         map = original_map;
3368         vm_map_lock_read(map);
3369
3370         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3371                                   object_lock_type, &version,
3372                                   &object, &offset, &prot, &wired,
3373                                   &fault_info,
3374                                   &real_map);
3375
3376         if (kr != KERN_SUCCESS) {
3377                 vm_map_unlock_read(map);
3378                 goto done;
3379         }
3380         pmap = real_map->pmap;
3381         fault_info.interruptible = interruptible;
3382         fault_info.stealth = FALSE;
3383         fault_info.io_sync = FALSE;
3384         fault_info.mark_zf_absent = FALSE;
3385         fault_info.batch_pmap_op = FALSE;
3386
3387         /*
3388          * If the page is wired, we must fault for the current protection
3389          * value, to avoid further faults.
3390          */
3391         if (wired) {
3392                 fault_type = prot | VM_PROT_WRITE;
3393                 /*
3394                  * since we're treating this fault as a 'write'
3395                  * we must hold the top object lock exclusively
3396                  */
3397                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3398
3399                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3400
3401                         if (vm_object_lock_upgrade(object) == FALSE) {
3402                                 /*
3403                                  * couldn't upgrade, so explictly
3404                                  * take the lock exclusively
3405                                  */
3406                                 vm_object_lock(object);
3407                         }
3408                 }
3409         }
3410
3411 #if     VM_FAULT_CLASSIFY
3412         /*
3413          *      Temporary data gathering code
3414          */
3415         vm_fault_classify(object, offset, fault_type);
3416 #endif
3417         /*
3418          *      Fast fault code.  The basic idea is to do as much as
3419          *      possible while holding the map lock and object locks.
3420          *      Busy pages are not used until the object lock has to
3421          *      be dropped to do something (copy, zero fill, pmap enter).
3422          *      Similarly, paging references aren't acquired until that
3423          *      point, and object references aren't used.
3424          *
3425          *      If we can figure out what to do
3426          *      (zero fill, copy on write, pmap enter) while holding
3427          *      the locks, then it gets done.  Otherwise, we give up,
3428          *      and use the original fault path (which doesn't hold
3429          *      the map lock, and relies on busy pages).
3430          *      The give up cases include:
3431          *              - Have to talk to pager.
3432          *              - Page is busy, absent or in error.
3433          *              - Pager has locked out desired access.
3434          *              - Fault needs to be restarted.
3435          *              - Have to push page into copy object.
3436          *
3437          *      The code is an infinite loop that moves one level down
3438          *      the shadow chain each time.  cur_object and cur_offset
3439          *      refer to the current object being examined. object and offset
3440          *      are the original object from the map.  The loop is at the
3441          *      top level if and only if object and cur_object are the same.
3442          *
3443          *      Invariants:  Map lock is held throughout.  Lock is held on
3444          *              original object and cur_object (if different) when
3445          *              continuing or exiting loop.
3446          *
3447          */
3448
3449
3450         /*
3451          * If this page is to be inserted in a copy delay object
3452          * for writing, and if the object has a copy, then the
3453          * copy delay strategy is implemented in the slow fault page.
3454          */
3455         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3456             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3457                 goto handle_copy_delay;
3458
3459         cur_object = object;
3460         cur_offset = offset;
3461
3462         while (TRUE) {
3463                 if (!cur_object->pager_created &&
3464                     cur_object->phys_contiguous) /* superpage */
3465                         break;
3466
3467                 if (cur_object->blocked_access) {
3468                         /*
3469                          * Access to this VM object has been blocked.
3470                          * Let the slow path handle it.
3471                          */
3472                         break;
3473                 }
3474
3475                 m = vm_page_lookup(cur_object, cur_offset);
3476
3477                 if (m != VM_PAGE_NULL) {
3478                         if (m->busy) {
3479                                 wait_result_t   result;
3480
3481                                 /*
3482                                  * in order to do the PAGE_ASSERT_WAIT, we must
3483                                  * have object that 'm' belongs to locked exclusively
3484                                  */
3485                                 if (object != cur_object) {
3486
3487                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3488
3489                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3490
3491                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3492                                                         /*
3493                                                          * couldn't upgrade so go do a full retry
3494                                                          * immediately since we can no longer be
3495                                                          * certain about cur_object (since we
3496                                                          * don't hold a reference on it)...
3497                                                          * first drop the top object lock
3498                                                          */
3499                                                         vm_object_unlock(object);
3500
3501                                                         vm_map_unlock_read(map);
3502                                                         if (real_map != map)
3503                                                                 vm_map_unlock(real_map);
3504
3505                                                         goto RetryFault;
3506                                                 }
3507                                         }
3508                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3509
3510                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3511
3512                                         if (vm_object_lock_upgrade(object) == FALSE) {
3513                                                 /*
3514                                                  * couldn't upgrade, so explictly take the lock
3515                                                  * exclusively and go relookup the page since we
3516                                                  * will have dropped the object lock and
3517                                                  * a different thread could have inserted
3518                                                  * a page at this offset
3519                                                  * no need for a full retry since we're
3520                                                  * at the top level of the object chain
3521                                                  */
3522                                                 vm_object_lock(object);
3523
3524                                                 continue;
3525                                         }
3526                                 }
3527                                 if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) {
3528                                         /*
3529                                          * m->busy == TRUE and the object is locked exclusively
3530                                          * if m->pageout_queue == TRUE after we acquire the
3531                                          * queues lock, we are guaranteed that it is stable on
3532                                          * the pageout queue and therefore reclaimable
3533                                          *
3534                                          * NOTE: this is only true for the internal pageout queue
3535                                          * in the compressor world
3536                                          */
3537                                         vm_page_lock_queues();
3538
3539                                         if (m->pageout_queue) {
3540                                                 vm_pageout_throttle_up(m);
3541                                                 vm_page_unlock_queues();
3542
3543                                                 PAGE_WAKEUP_DONE(m);
3544                                                 goto reclaimed_from_pageout;
3545                                         }
3546                                         vm_page_unlock_queues();
3547                                 }
3548                                 if (object != cur_object)
3549                                         vm_object_unlock(object);
3550
3551                                 vm_map_unlock_read(map);
3552                                 if (real_map != map)
3553                                         vm_map_unlock(real_map);
3554
3555                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3556
3557                                 vm_object_unlock(cur_object);
3558
3559                                 if (result == THREAD_WAITING) {
3560                                         result = thread_block(THREAD_CONTINUE_NULL);
3561
3562                                         counter(c_vm_fault_page_block_busy_kernel++);
3563                                 }
3564                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3565                                         goto RetryFault;
3566
3567                                 kr = KERN_ABORTED;
3568                                 goto done;
3569                         }
3570 reclaimed_from_pageout:
3571                         if (m->laundry) {
3572                                 if (object != cur_object) {
3573                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3574                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3575
3576                                                 vm_object_unlock(object);
3577                                                 vm_object_unlock(cur_object);
3578
3579                                                 vm_map_unlock_read(map);
3580                                                 if (real_map != map)
3581                                                         vm_map_unlock(real_map);
3582
3583                                                 goto RetryFault;
3584                                         }
3585
3586                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3587
3588                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3589
3590                                         if (vm_object_lock_upgrade(object) == FALSE) {
3591                                                 /*
3592                                                  * couldn't upgrade, so explictly take the lock
3593                                                  * exclusively and go relookup the page since we
3594                                                  * will have dropped the object lock and
3595                                                  * a different thread could have inserted
3596                                                  * a page at this offset
3597                                                  * no need for a full retry since we're
3598                                                  * at the top level of the object chain
3599                                                  */
3600                                                 vm_object_lock(object);
3601
3602                                                 continue;
3603                                         }
3604                                 }
3605                                 m->pageout = FALSE;
3606
3607                                 vm_pageout_steal_laundry(m, FALSE);
3608                         }
3609
3610                         if (m->phys_page == vm_page_guard_addr) {
3611                                 /*
3612                                  * Guard page: let the slow path deal with it
3613                                  */
3614                                 break;
3615                         }
3616                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3617                                 /*
3618                                  * Unusual case... let the slow path deal with it
3619                                  */
3620                                 break;
3621                         }
3622                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3623                                 if (object != cur_object)
3624                                         vm_object_unlock(object);
3625                                 vm_map_unlock_read(map);
3626                                 if (real_map != map)
3627                                         vm_map_unlock(real_map);
3628                                 vm_object_unlock(cur_object);
3629                                 kr = KERN_MEMORY_ERROR;
3630                                 goto done;
3631                         }
3632
3633                         if (m->encrypted) {
3634                                 /*
3635                                  * ENCRYPTED SWAP:
3636                                  * We've soft-faulted (because it's not in the page
3637                                  * table) on an encrypted page.
3638                                  * Keep the page "busy" so that no one messes with
3639                                  * it during the decryption.
3640                                  * Release the extra locks we're holding, keep only
3641                                  * the page's VM object lock.
3642                                  *
3643                                  * in order to set 'busy' on 'm', we must
3644                                  * have object that 'm' belongs to locked exclusively
3645                                  */
3646                                 if (object != cur_object) {
3647                                         vm_object_unlock(object);
3648
3649                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3650
3651                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3652
3653                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3654                                                         /*
3655                                                          * couldn't upgrade so go do a full retry
3656                                                          * immediately since we've already dropped
3657                                                          * the top object lock associated with this page
3658                                                          * and the current one got dropped due to the
3659                                                          * failed upgrade... the state is no longer valid
3660                                                          */
3661                                                         vm_map_unlock_read(map);
3662                                                         if (real_map != map)
3663                                                                 vm_map_unlock(real_map);
3664
3665                                                         goto RetryFault;
3666                                                 }
3667                                         }
3668                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3669
3670                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3671
3672                                         if (vm_object_lock_upgrade(object) == FALSE) {
3673                                                 /*
3674                                                  * couldn't upgrade, so explictly take the lock
3675                                                  * exclusively and go relookup the page since we
3676                                                  * will have dropped the object lock and
3677                                                  * a different thread could have inserted
3678                                                  * a page at this offset
3679                                                  * no need for a full retry since we're
3680                                                  * at the top level of the object chain
3681                                                  */
3682                                                 vm_object_lock(object);
3683
3684                                                 continue;
3685                                         }
3686                                 }
3687                                 m->busy = TRUE;
3688
3689                                 vm_map_unlock_read(map);
3690                                 if (real_map != map)
3691                                         vm_map_unlock(real_map);
3692
3693                                 vm_page_decrypt(m, 0);
3694
3695                                 assert(m->busy);
3696                                 PAGE_WAKEUP_DONE(m);
3697
3698                                 vm_object_unlock(cur_object);
3699                                 /*
3700                                  * Retry from the top, in case anything
3701                                  * changed while we were decrypting...
3702                                  */
3703                                 goto RetryFault;
3704                         }
3705                         ASSERT_PAGE_DECRYPTED(m);
3706
3707                         if(vm_page_is_slideable(m)) {
3708                                 /*
3709                                  * We might need to slide this page, and so,
3710                                  * we want to hold the VM object exclusively.
3711                                  */
3712                                 if (object != cur_object) {
3713                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3714                                                 vm_object_unlock(object);
3715                                                 vm_object_unlock(cur_object);
3716
3717                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3718
3719                                                 vm_map_unlock_read(map);
3720                                                 if (real_map != map)
3721                                                         vm_map_unlock(real_map);
3722
3723                                                 goto RetryFault;
3724                                         }
3725                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3726
3727                                         vm_object_unlock(object);
3728                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3729                                         vm_map_unlock_read(map);
3730                                         goto RetryFault;
3731                                 }
3732                         }
3733
3734                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m) ||
3735                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3736 upgrade_for_validation:
3737                                 /*
3738                                  * We might need to validate this page
3739                                  * against its code signature, so we
3740                                  * want to hold the VM object exclusively.
3741                                  */
3742                                 if (object != cur_object) {
3743                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3744                                                 vm_object_unlock(object);
3745                                                 vm_object_unlock(cur_object);
3746
3747                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3748
3749                                                 vm_map_unlock_read(map);
3750                                                 if (real_map != map)
3751                                                         vm_map_unlock(real_map);
3752
3753                                                 goto RetryFault;
3754                                         }
3755
3756                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3757
3758                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3759
3760                                         if (vm_object_lock_upgrade(object) == FALSE) {
3761                                                 /*
3762                                                  * couldn't upgrade, so explicitly take the lock
3763                                                  * exclusively and go relookup the page since we
3764                                                  * will have dropped the object lock and
3765                                                  * a different thread could have inserted
3766                                                  * a page at this offset
3767                                                  * no need for a full retry since we're
3768                                                  * at the top level of the object chain
3769                                                  */
3770                                                 vm_object_lock(object);
3771
3772                                                 continue;
3773                                         }
3774                                 }
3775                         }
3776                         /*
3777                          *      Two cases of map in faults:
3778                          *          - At top level w/o copy object.
3779                          *          - Read fault anywhere.
3780                          *              --> must disallow write.
3781                          */
3782
3783                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3784
3785                                 goto FastPmapEnter;
3786                         }
3787
3788                         if ((fault_type & VM_PROT_WRITE) == 0) {
3789
3790                                 if (object != cur_object) {
3791                                         /*
3792                                          * We still need to hold the top object
3793                                          * lock here to prevent a race between
3794                                          * a read fault (taking only "shared"
3795                                          * locks) and a write fault (taking
3796                                          * an "exclusive" lock on the top
3797                                          * object).
3798                                          * Otherwise, as soon as we release the
3799                                          * top lock, the write fault could
3800                                          * proceed and actually complete before
3801                                          * the read fault, and the copied page's
3802                                          * translation could then be overwritten
3803                                          * by the read fault's translation for
3804                                          * the original page.
3805                                          *
3806                                          * Let's just record what the top object
3807                                          * is and we'll release it later.
3808                                          */
3809                                         top_object = object;
3810
3811                                         /*
3812                                          * switch to the object that has the new page
3813                                          */
3814                                         object = cur_object;
3815                                         object_lock_type = cur_object_lock_type;
3816                                 }
3817 FastPmapEnter:
3818                                 /*
3819                                  * prepare for the pmap_enter...
3820                                  * object and map are both locked
3821                                  * m contains valid data
3822                                  * object == m->object
3823                                  * cur_object == NULL or it's been unlocked
3824                                  * no paging references on either object or cur_object
3825                                  */
3826                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3827                                         need_retry_ptr = &need_retry;
3828                                 else
3829                                         need_retry_ptr = NULL;
3830
3831                                 if (caller_pmap) {
3832                                         kr = vm_fault_enter(m,
3833                                                             caller_pmap,
3834                                                             caller_pmap_addr,
3835                                                             prot,
3836                                                             fault_type,
3837                                                             wired,
3838                                                             change_wiring,
3839                                                             fault_info.no_cache,
3840                                                             fault_info.cs_bypass,
3841                                                             fault_info.user_tag,
3842                                                             fault_info.pmap_options,
3843                                                             need_retry_ptr,
3844                                                             &type_of_fault);
3845                                 } else {
3846                                         kr = vm_fault_enter(m,
3847                                                             pmap,
3848                                                             vaddr,
3849                                                             prot,
3850                                                             fault_type,
3851                                                             wired,
3852                                                             change_wiring,
3853                                                             fault_info.no_cache,
3854                                                             fault_info.cs_bypass,
3855                                                             fault_info.user_tag,
3856                                                             fault_info.pmap_options,
3857                                                             need_retry_ptr,
3858                                                             &type_of_fault);
3859                                 }
3860
3861                                 if (kr == KERN_SUCCESS &&
3862                                     physpage_p != NULL) {
3863                                         /* for vm_map_wire_and_extract() */
3864                                         *physpage_p = m->phys_page;
3865                                         if (prot & VM_PROT_WRITE) {
3866                                                 vm_object_lock_assert_exclusive(
3867                                                         m->object);
3868                                                 m->dirty = TRUE;
3869                                         }
3870                                 }
3871
3872                                 if (top_object != VM_OBJECT_NULL) {
3873                                         /*
3874                                          * It's safe to drop the top object
3875                                          * now that we've done our
3876                                          * vm_fault_enter().  Any other fault
3877                                          * in progress for that virtual
3878                                          * address will either find our page
3879                                          * and translation or put in a new page
3880                                          * and translation.
3881                                          */
3882                                         vm_object_unlock(top_object);
3883                                         top_object = VM_OBJECT_NULL;
3884                                 }
3885
3886                                 if (need_collapse == TRUE)
3887                                         vm_object_collapse(object, offset, TRUE);
3888
3889                                 if (need_retry == FALSE &&
3890                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3891                                         /*
3892                                          * evaluate access pattern and update state
3893                                          * vm_fault_deactivate_behind depends on the
3894                                          * state being up to date
3895                                          */
3896                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3897
3898                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3899                                 }
3900                                 /*
3901                                  * That's it, clean up and return.
3902                                  */
3903                                 if (m->busy)
3904                                         PAGE_WAKEUP_DONE(m);
3905
3906                                 vm_object_unlock(object);
3907
3908                                 vm_map_unlock_read(map);
3909                                 if (real_map != map)
3910                                         vm_map_unlock(real_map);
3911
3912                                 if (need_retry == TRUE) {
3913                                         /*
3914                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
3915                                          * at this point we don't hold any locks so it's safe
3916                                          * to ask the pmap layer to expand the page table to
3917                                          * accommodate this mapping... once expanded, we'll
3918                                          * re-drive the fault which should result in vm_fault_enter
3919                                          * being able to successfully enter the mapping this time around
3920                                          */
3921                                         (void)pmap_enter_options(
3922                                                 pmap, vaddr, 0, 0, 0, 0, 0,
3923                                                 PMAP_OPTIONS_NOENTER, NULL);
3924
3925                                         need_retry = FALSE;
3926                                         goto RetryFault;
3927                                 }
3928                                 goto done;
3929                         }
3930                         /*
3931                          * COPY ON WRITE FAULT
3932                          */
3933                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3934
3935                         /*
3936                          * If objects match, then
3937                          * object->copy must not be NULL (else control
3938                          * would be in previous code block), and we
3939                          * have a potential push into the copy object
3940                          * with which we can't cope here.
3941                          */
3942                         if (cur_object == object) {
3943                                 /*
3944                                  * must take the slow path to
3945                                  * deal with the copy push
3946                                  */
3947                                 break;
3948                         }
3949
3950                         /*
3951                          * This is now a shadow based copy on write
3952                          * fault -- it requires a copy up the shadow
3953                          * chain.
3954                          */
3955
3956                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3957                             VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3958                                 goto upgrade_for_validation;
3959                         }
3960
3961                         /*
3962                          * Allocate a page in the original top level
3963                          * object. Give up if allocate fails.  Also
3964                          * need to remember current page, as it's the
3965                          * source of the copy.
3966                          *
3967                          * at this point we hold locks on both
3968                          * object and cur_object... no need to take
3969                          * paging refs or mark pages BUSY since
3970                          * we don't drop either object lock until
3971                          * the page has been copied and inserted
3972                          */
3973                         cur_m = m;
3974                         m = vm_page_grab();
3975
3976                         if (m == VM_PAGE_NULL) {
3977                                 /*
3978                                  * no free page currently available...
3979                                  * must take the slow path
3980                                  */
3981                                 break;
3982                         }
3983                         /*
3984                          * Now do the copy.  Mark the source page busy...
3985                          *
3986                          *      NOTE: This code holds the map lock across
3987                          *      the page copy.
3988                          */
3989                         vm_page_copy(cur_m, m);
3990                         vm_page_insert(m, object, offset);
3991                         SET_PAGE_DIRTY(m, FALSE);
3992
3993                         /*
3994                          * Now cope with the source page and object
3995                          */
3996                         if (object->ref_count > 1 && cur_m->pmapped)
3997                                 pmap_disconnect(cur_m->phys_page);
3998
3999                         if (cur_m->clustered) {
4000                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4001                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4002                         }
4003                         need_collapse = TRUE;
4004
4005                         if (!cur_object->internal &&
4006                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4007                                 /*
4008                                  * The object from which we've just
4009                                  * copied a page is most probably backed
4010                                  * by a vnode.  We don't want to waste too
4011                                  * much time trying to collapse the VM objects
4012                                  * and create a bottleneck when several tasks
4013                                  * map the same file.
4014                                  */
4015                                 if (cur_object->copy == object) {
4016                                         /*
4017                                          * Shared mapping or no COW yet.
4018                                          * We can never collapse a copy
4019                                          * object into its backing object.
4020                                          */
4021                                         need_collapse = FALSE;
4022                                 } else if (cur_object->copy == object->shadow &&
4023                                            object->shadow->resident_page_count == 0) {
4024                                         /*
4025                                          * Shared mapping after a COW occurred.
4026                                          */
4027                                         need_collapse = FALSE;
4028                                 }
4029                         }
4030                         vm_object_unlock(cur_object);
4031
4032                         if (need_collapse == FALSE)
4033                                 vm_fault_collapse_skipped++;
4034                         vm_fault_collapse_total++;
4035
4036                         type_of_fault = DBG_COW_FAULT;
4037                         VM_STAT_INCR(cow_faults);
4038                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4039                         current_task()->cow_faults++;
4040
4041                         goto FastPmapEnter;
4042
4043                 } else {
4044                         /*
4045                          * No page at cur_object, cur_offset... m == NULL
4046                          */
4047                         if (cur_object->pager_created) {
4048                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4049
4050                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4051                                         int             my_fault_type;
4052                                         int             c_flags = C_DONT_BLOCK;
4053                                         boolean_t       insert_cur_object = FALSE;
4054
4055                                         /*
4056                                          * May have to talk to a pager...
4057                                          * if so, take the slow path by
4058                                          * doing a 'break' from the while (TRUE) loop
4059                                          *
4060                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4061                                          * if the compressor is active and the page exists there
4062                                          */
4063                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4064                                                 break;
4065
4066                                         if (map == kernel_map || real_map == kernel_map) {
4067                                                 /*
4068                                                  * can't call into the compressor with the kernel_map
4069                                                  * lock held, since the compressor may try to operate
4070                                                  * on the kernel map in order to return an empty c_segment
4071                                                  */
4072                                                 break;
4073                                         }
4074                                         if (object != cur_object) {
4075                                                 if (fault_type & VM_PROT_WRITE)
4076                                                         c_flags |= C_KEEP;
4077                                                 else
4078                                                         insert_cur_object = TRUE;
4079                                         }
4080                                         if (insert_cur_object == TRUE) {
4081
4082                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4083
4084                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4085
4086                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4087                                                                 /*
4088                                                                  * couldn't upgrade so go do a full retry
4089                                                                  * immediately since we can no longer be
4090                                                                  * certain about cur_object (since we
4091                                                                  * don't hold a reference on it)...
4092                                                                  * first drop the top object lock
4093                                                                  */
4094                                                                 vm_object_unlock(object);
4095
4096                                                                 vm_map_unlock_read(map);
4097                                                                 if (real_map != map)
4098                                                                         vm_map_unlock(real_map);
4099
4100                                                                 goto RetryFault;
4101                                                         }
4102                                                 }
4103                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4104
4105                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4106
4107                                                 if (object != cur_object) {
4108                                                         /*
4109                                                          * we can't go for the upgrade on the top
4110                                                          * lock since the upgrade may block waiting
4111                                                          * for readers to drain... since we hold
4112                                                          * cur_object locked at this point, waiting
4113                                                          * for the readers to drain would represent
4114                                                          * a lock order inversion since the lock order
4115                                                          * for objects is the reference order in the
4116                                                          * shadow chain
4117                                                          */
4118                                                         vm_object_unlock(object);
4119                                                         vm_object_unlock(cur_object);
4120
4121                                                         vm_map_unlock_read(map);
4122                                                         if (real_map != map)
4123                                                                 vm_map_unlock(real_map);
4124
4125                                                         goto RetryFault;
4126                                                 }
4127                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4128                                                         /*
4129                                                          * couldn't upgrade, so explicitly take the lock
4130                                                          * exclusively and go relookup the page since we
4131                                                          * will have dropped the object lock and
4132                                                          * a different thread could have inserted
4133                                                          * a page at this offset
4134                                                          * no need for a full retry since we're
4135                                                          * at the top level of the object chain
4136                                                          */
4137                                                         vm_object_lock(object);
4138
4139                                                         continue;
4140                                                 }
4141                                         }
4142                                         m = vm_page_grab();
4143
4144                                         if (m == VM_PAGE_NULL) {
4145                                                 /*
4146                                                  * no free page currently available...
4147                                                  * must take the slow path
4148                                                  */
4149                                                 break;
4150                                         }
4151
4152                                         /*
4153                                          * The object is and remains locked
4154                                          * so no need to take a
4155                                          * "paging_in_progress" reference.
4156                                          */
4157                                         boolean_t shared_lock;
4158                                         if ((object == cur_object &&
4159                                              object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4160                                             (object != cur_object &&
4161                                              cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4162                                                 shared_lock = FALSE;
4163                                         } else {
4164                                                 shared_lock = TRUE;
4165                                         }
4166
4167                                         kr = vm_compressor_pager_get(
4168                                                 cur_object->pager,
4169                                                 (cur_offset +
4170                                                  cur_object->paging_offset),
4171                                                 m->phys_page,
4172                                                 &my_fault_type,
4173                                                 c_flags,
4174                                                 &compressed_count_delta);
4175
4176                                         vm_compressor_pager_count(
4177                                                 cur_object->pager,
4178                                                 compressed_count_delta,
4179                                                 shared_lock,
4180                                                 cur_object);
4181
4182                                         if (kr != KERN_SUCCESS) {
4183                                                 vm_page_release(m);
4184                                                 break;
4185                                         }
4186                                         m->dirty = TRUE;
4187
4188                                         /*
4189                                          * If the object is purgeable, its
4190                                          * owner's purgeable ledgers will be
4191                                          * updated in vm_page_insert() but the
4192                                          * page was also accounted for in a
4193                                          * "compressed purgeable" ledger, so
4194                                          * update that now.
4195                                          */
4196                                         if (object != cur_object &&
4197                                             !insert_cur_object) {
4198                                                 /*
4199                                                  * We're not going to insert
4200                                                  * the decompressed page into
4201                                                  * the object it came from.
4202                                                  *
4203                                                  * We're dealing with a
4204                                                  * copy-on-write fault on
4205                                                  * "object".
4206                                                  * We're going to decompress
4207                                                  * the page directly into the
4208                                                  * target "object" while
4209                                                  * keeping the compressed
4210                                                  * page for "cur_object", so
4211                                                  * no ledger update in that
4212                                                  * case.
4213                                                  */
4214                                         } else if ((cur_object->purgable ==
4215                                                     VM_PURGABLE_DENY) ||
4216                                                    (cur_object->vo_purgeable_owner ==
4217                                                     NULL)) {
4218                                                 /*
4219                                                  * "cur_object" is not purgeable
4220                                                  * or is not owned, so no
4221                                                  * purgeable ledgers to update.
4222                                                  */
4223                                         } else {
4224                                                 /*
4225                                                  * One less compressed
4226                                                  * purgeable page for
4227                                                  * cur_object's owner.
4228                                                  */
4229                                                 vm_purgeable_compressed_update(
4230                                                         cur_object,
4231                                                         -1);
4232                                         }
4233
4234                                         if (insert_cur_object) {
4235                                                 vm_page_insert(m, cur_object, cur_offset);
4236                                         } else {
4237                                                 vm_page_insert(m, object, offset);
4238                                         }
4239
4240                                         if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4241                                                 /*
4242                                                  * If the page is not cacheable,
4243                                                  * we can't let its contents
4244                                                  * linger in the data cache
4245                                                  * after the decompression.
4246                                                  */
4247                                                 pmap_sync_page_attributes_phys(m->phys_page);
4248                                         }
4249
4250                                         type_of_fault = my_fault_type;
4251
4252                                         VM_STAT_INCR(decompressions);
4253
4254                                         if (cur_object != object) {
4255                                                 if (insert_cur_object) {
4256                                                         top_object = object;
4257                                                         /*
4258                                                          * switch to the object that has the new page
4259                                                          */
4260                                                         object = cur_object;
4261                                                         object_lock_type = cur_object_lock_type;
4262                                                 } else {
4263                                                         vm_object_unlock(cur_object);
4264                                                         cur_object = object;
4265                                                 }
4266                                         }
4267                                         goto FastPmapEnter;
4268                                 }
4269                                 /*
4270                                  * existence map present and indicates
4271                                  * that the pager doesn't have this page
4272                                  */
4273                         }
4274                         if (cur_object->shadow == VM_OBJECT_NULL) {
4275                                 /*
4276                                  * Zero fill fault.  Page gets
4277                                  * inserted into the original object.
4278                                  */
4279                                 if (cur_object->shadow_severed ||
4280                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
4281                                 {
4282                                         if (object != cur_object)
4283                                                 vm_object_unlock(cur_object);
4284                                         vm_object_unlock(object);
4285
4286                                         vm_map_unlock_read(map);
4287                                         if (real_map != map)
4288                                                 vm_map_unlock(real_map);
4289
4290                                         kr = KERN_MEMORY_ERROR;
4291                                         goto done;
4292                                 }
4293                                 if (vm_backing_store_low) {
4294                                         /*
4295                                          * we are protecting the system from
4296                                          * backing store exhaustion...
4297                                          * must take the slow path if we're
4298                                          * not privileged
4299                                          */
4300                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4301                                                 break;
4302                                 }
4303                                 if (cur_object != object) {
4304                                         vm_object_unlock(cur_object);
4305
4306                                         cur_object = object;
4307                                 }
4308                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4309
4310                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4311
4312                                         if (vm_object_lock_upgrade(object) == FALSE) {
4313                                                 /*
4314                                                  * couldn't upgrade so do a full retry on the fault
4315                                                  * since we dropped the object lock which
4316                                                  * could allow another thread to insert
4317                                                  * a page at this offset
4318                                                  */
4319                                                 vm_map_unlock_read(map);
4320                                                 if (real_map != map)
4321                                                         vm_map_unlock(real_map);
4322
4323                                                 goto RetryFault;
4324                                         }
4325                                 }
4326                                 m = vm_page_alloc(object, offset);
4327
4328                                 if (m == VM_PAGE_NULL) {
4329                                         /*
4330                                          * no free page currently available...
4331                                          * must take the slow path
4332                                          */
4333                                         break;
4334                                 }
4335
4336                                 /*
4337                                  * Now zero fill page...
4338                                  * the page is probably going to
4339                                  * be written soon, so don't bother
4340                                  * to clear the modified bit
4341                                  *
4342                                  *   NOTE: This code holds the map
4343                                  *   lock across the zero fill.
4344                                  */
4345                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4346
4347                                 goto FastPmapEnter;
4348                         }
4349                         /*
4350                          * On to the next level in the shadow chain
4351                          */
4352                         cur_offset += cur_object->vo_shadow_offset;
4353                         new_object = cur_object->shadow;
4354
4355                         /*
4356                          * take the new_object's lock with the indicated state
4357                          */
4358                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4359                                 vm_object_lock_shared(new_object);
4360                         else
4361                                 vm_object_lock(new_object);
4362
4363                         if (cur_object != object)
4364                                 vm_object_unlock(cur_object);
4365
4366                         cur_object = new_object;
4367
4368                         continue;
4369                 }
4370         }
4371         /*
4372          * Cleanup from fast fault failure.  Drop any object
4373          * lock other than original and drop map lock.
4374          */
4375         if (object != cur_object)
4376                 vm_object_unlock(cur_object);
4377
4378         /*
4379          * must own the object lock exclusively at this point
4380          */
4381         if (object_lock_type == OBJECT_LOCK_SHARED) {
4382                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4383
4384                 if (vm_object_lock_upgrade(object) == FALSE) {
4385                         /*
4386                          * couldn't upgrade, so explicitly
4387                          * take the lock exclusively
4388                          * no need to retry the fault at this
4389                          * point since "vm_fault_page" will
4390                          * completely re-evaluate the state
4391                          */
4392                         vm_object_lock(object);
4393                 }
4394         }
4395
4396 handle_copy_delay:
4397         vm_map_unlock_read(map);
4398         if (real_map != map)
4399                 vm_map_unlock(real_map);
4400
4401         /*
4402          * Make a reference to this object to
4403          * prevent its disposal while we are messing with
4404          * it.  Once we have the reference, the map is free
4405          * to be diddled.  Since objects reference their
4406          * shadows (and copies), they will stay around as well.
4407          */
4408         vm_object_reference_locked(object);
4409         vm_object_paging_begin(object);
4410
4411         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4412
4413         error_code = 0;
4414
4415         result_page = VM_PAGE_NULL;
4416         kr = vm_fault_page(object, offset, fault_type,
4417                            (change_wiring && !wired),
4418                            FALSE, /* page not looked up */
4419                            &prot, &result_page, &top_page,
4420                            &type_of_fault,
4421                            &error_code, map->no_zero_fill,
4422                            FALSE, &fault_info);
4423
4424         /*
4425          * if kr != VM_FAULT_SUCCESS, then the paging reference
4426          * has been dropped and the object unlocked... the ref_count
4427          * is still held
4428          *
4429          * if kr == VM_FAULT_SUCCESS, then the paging reference
4430          * is still held along with the ref_count on the original object
4431          *
4432          *      the object is returned locked with a paging reference
4433          *
4434          *      if top_page != NULL, then it's BUSY and the
4435          *      object it belongs to has a paging reference
4436          *      but is returned unlocked
4437          */
4438         if (kr != VM_FAULT_SUCCESS &&
4439             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4440                 /*
4441                  * we didn't succeed, lose the object reference immediately.
4442                  */
4443                 vm_object_deallocate(object);
4444
4445                 /*
4446                  * See why we failed, and take corrective action.
4447                  */
4448                 switch (kr) {
4449                 case VM_FAULT_MEMORY_SHORTAGE:
4450                         if (vm_page_wait((change_wiring) ?
4451                                          THREAD_UNINT :
4452                                          THREAD_ABORTSAFE))
4453                                 goto RetryFault;
4454                         /*
4455                          * fall thru
4456                          */
4457                 case VM_FAULT_INTERRUPTED:
4458                         kr = KERN_ABORTED;
4459                         goto done;
4460                 case VM_FAULT_RETRY:
4461                         goto RetryFault;
4462                 case VM_FAULT_MEMORY_ERROR:
4463                         if (error_code)
4464                                 kr = error_code;
4465                         else
4466                                 kr = KERN_MEMORY_ERROR;
4467                         goto done;
4468                 default:
4469                         panic("vm_fault: unexpected error 0x%x from "
4470                               "vm_fault_page()\n", kr);
4471                 }
4472         }
4473         m = result_page;
4474
4475         if (m != VM_PAGE_NULL) {
4476                 assert((change_wiring && !wired) ?
4477                     (top_page == VM_PAGE_NULL) :
4478                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
4479         }
4480
4481         /*
4482          * What to do with the resulting page from vm_fault_page
4483          * if it doesn't get entered into the physical map:
4484          */
4485 #define RELEASE_PAGE(m)                                 \
4486         MACRO_BEGIN                                     \
4487         PAGE_WAKEUP_DONE(m);                            \
4488         if (!m->active && !m->inactive && !m->throttled) {              \
4489                 vm_page_lockspin_queues();                              \
4490                 if (!m->active && !m->inactive && !m->throttled)        \
4491                         vm_page_activate(m);                            \
4492                 vm_page_unlock_queues();                                \
4493         }                                                               \
4494         MACRO_END
4495
4496         /*
4497          * We must verify that the maps have not changed
4498          * since our last lookup.
4499          */
4500         if (m != VM_PAGE_NULL) {
4501                 old_copy_object = m->object->copy;
4502                 vm_object_unlock(m->object);
4503         } else {
4504                 old_copy_object = VM_OBJECT_NULL;
4505                 vm_object_unlock(object);
4506         }
4507
4508         /*
4509          * no object locks are held at this point
4510          */
4511         if ((map != original_map) || !vm_map_verify(map, &version)) {
4512                 vm_object_t             retry_object;
4513                 vm_object_offset_t      retry_offset;
4514                 vm_prot_t               retry_prot;
4515
4516                 /*
4517                  * To avoid trying to write_lock the map while another
4518                  * thread has it read_locked (in vm_map_pageable), we
4519                  * do not try for write permission.  If the page is
4520                  * still writable, we will get write permission.  If it
4521                  * is not, or has been marked needs_copy, we enter the
4522                  * mapping without write permission, and will merely
4523                  * take another fault.
4524                  */
4525                 map = original_map;
4526                 vm_map_lock_read(map);
4527
4528                 kr = vm_map_lookup_locked(&map, vaddr,
4529                                           fault_type & ~VM_PROT_WRITE,
4530                                           OBJECT_LOCK_EXCLUSIVE, &version,
4531                                           &retry_object, &retry_offset, &retry_prot,
4532                                           &wired,
4533                                           &fault_info,
4534                                           &real_map);
4535                 pmap = real_map->pmap;
4536
4537                 if (kr != KERN_SUCCESS) {
4538                         vm_map_unlock_read(map);
4539
4540                         if (m != VM_PAGE_NULL) {
4541                                 /*
4542                                  * retake the lock so that
4543                                  * we can drop the paging reference
4544                                  * in vm_fault_cleanup and do the
4545                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4546                                  */
4547                                 vm_object_lock(m->object);
4548
4549                                 RELEASE_PAGE(m);
4550
4551                                 vm_fault_cleanup(m->object, top_page);
4552                         } else {
4553                                 /*
4554                                  * retake the lock so that
4555                                  * we can drop the paging reference
4556                                  * in vm_fault_cleanup
4557                                  */
4558                                 vm_object_lock(object);
4559
4560                                 vm_fault_cleanup(object, top_page);
4561                         }
4562                         vm_object_deallocate(object);
4563
4564                         goto done;
4565                 }
4566                 vm_object_unlock(retry_object);
4567
4568                 if ((retry_object != object) || (retry_offset != offset)) {
4569
4570                         vm_map_unlock_read(map);
4571                         if (real_map != map)
4572                                 vm_map_unlock(real_map);
4573
4574                         if (m != VM_PAGE_NULL) {
4575                                 /*
4576                                  * retake the lock so that
4577                                  * we can drop the paging reference
4578                                  * in vm_fault_cleanup and do the
4579                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4580                                  */
4581                                 vm_object_lock(m->object);
4582
4583                                 RELEASE_PAGE(m);
4584
4585                                 vm_fault_cleanup(m->object, top_page);
4586                         } else {
4587                                 /*
4588                                  * retake the lock so that
4589                                  * we can drop the paging reference
4590                                  * in vm_fault_cleanup
4591                                  */
4592                                 vm_object_lock(object);
4593
4594                                 vm_fault_cleanup(object, top_page);
4595                         }
4596                         vm_object_deallocate(object);
4597
4598                         goto RetryFault;
4599                 }
4600                 /*
4601                  * Check whether the protection has changed or the object
4602                  * has been copied while we left the map unlocked.
4603                  */
4604                 prot &= retry_prot;
4605         }
4606         if (m != VM_PAGE_NULL) {
4607                 vm_object_lock(m->object);
4608
4609                 if (m->object->copy != old_copy_object) {
4610                         /*
4611                          * The copy object changed while the top-level object
4612                          * was unlocked, so take away write permission.
4613                          */
4614                         prot &= ~VM_PROT_WRITE;
4615                 }
4616         } else
4617                 vm_object_lock(object);
4618
4619         /*
4620          * If we want to wire down this page, but no longer have
4621          * adequate permissions, we must start all over.
4622          */
4623         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4624
4625                 vm_map_verify_done(map, &version);
4626                 if (real_map != map)
4627                         vm_map_unlock(real_map);
4628
4629                 if (m != VM_PAGE_NULL) {
4630                         RELEASE_PAGE(m);
4631
4632                         vm_fault_cleanup(m->object, top_page);
4633                 } else
4634                         vm_fault_cleanup(object, top_page);
4635
4636                 vm_object_deallocate(object);
4637
4638                 goto RetryFault;
4639         }
4640         if (m != VM_PAGE_NULL) {
4641                 /*
4642                  * Put this page into the physical map.
4643                  * We had to do the unlock above because pmap_enter
4644                  * may cause other faults.  The page may be on
4645                  * the pageout queues.  If the pageout daemon comes
4646                  * across the page, it will remove it from the queues.
4647                  */
4648                 if (caller_pmap) {
4649                         kr = vm_fault_enter(m,
4650                                             caller_pmap,
4651                                             caller_pmap_addr,
4652                                             prot,
4653                                             fault_type,
4654                                             wired,
4655                                             change_wiring,
4656                                             fault_info.no_cache,
4657                                             fault_info.cs_bypass,
4658                                             fault_info.user_tag,
4659                                             fault_info.pmap_options,
4660                                             NULL,
4661                                             &type_of_fault);
4662                 } else {
4663                         kr = vm_fault_enter(m,
4664                                             pmap,
4665                                             vaddr,
4666                                             prot,
4667                                             fault_type,
4668                                             wired,
4669                                             change_wiring,
4670                                             fault_info.no_cache,
4671                                             fault_info.cs_bypass,
4672                                             fault_info.user_tag,
4673                                             fault_info.pmap_options,
4674                                             NULL,
4675                                             &type_of_fault);
4676                 }
4677                 if (kr != KERN_SUCCESS) {
4678                         /* abort this page fault */
4679                         vm_map_verify_done(map, &version);
4680                         if (real_map != map)
4681                                 vm_map_unlock(real_map);
4682                         PAGE_WAKEUP_DONE(m);
4683                         vm_fault_cleanup(m->object, top_page);
4684                         vm_object_deallocate(object);
4685                         goto done;
4686                 }
4687                 if (physpage_p != NULL) {
4688                         /* for vm_map_wire_and_extract() */
4689                         *physpage_p = m->phys_page;
4690                         if (prot & VM_PROT_WRITE) {
4691                                 vm_object_lock_assert_exclusive(m->object);
4692                                 m->dirty = TRUE;
4693                         }
4694                 }
4695         } else {
4696
4697                 vm_map_entry_t          entry;
4698                 vm_map_offset_t         laddr;
4699                 vm_map_offset_t         ldelta, hdelta;
4700
4701                 /*
4702                  * do a pmap block mapping from the physical address
4703                  * in the object
4704                  */
4705
4706 #ifdef ppc
4707                 /* While we do not worry about execution protection in   */
4708                 /* general, certain pages may have instruction execution */
4709                 /* disallowed.  We will check here, and if not allowed   */
4710                 /* to execute, we return with a protection failure.      */
4711
4712                 if ((fault_type & VM_PROT_EXECUTE) &&
4713                         (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4714
4715                         vm_map_verify_done(map, &version);
4716
4717                         if (real_map != map)
4718                                 vm_map_unlock(real_map);
4719
4720                         vm_fault_cleanup(object, top_page);
4721                         vm_object_deallocate(object);
4722
4723                         kr = KERN_PROTECTION_FAILURE;
4724                         goto done;
4725                 }
4726 #endif  /* ppc */
4727
4728                 if (real_map != map)
4729                         vm_map_unlock(real_map);
4730
4731                 if (original_map != map) {
4732                         vm_map_unlock_read(map);
4733                         vm_map_lock_read(original_map);
4734                         map = original_map;
4735                 }
4736                 real_map = map;
4737
4738                 laddr = vaddr;
4739                 hdelta = 0xFFFFF000;
4740                 ldelta = 0xFFFFF000;
4741
4742                 while (vm_map_lookup_entry(map, laddr, &entry)) {
4743                         if (ldelta > (laddr - entry->vme_start))
4744                                 ldelta = laddr - entry->vme_start;
4745                         if (hdelta > (entry->vme_end - laddr))
4746                                 hdelta = entry->vme_end - laddr;
4747                         if (entry->is_sub_map) {
4748
4749                                 laddr = (laddr - entry->vme_start)
4750                                                         + entry->offset;
4751                                 vm_map_lock_read(entry->object.sub_map);
4752
4753                                 if (map != real_map)
4754                                         vm_map_unlock_read(map);
4755                                 if (entry->use_pmap) {
4756                                         vm_map_unlock_read(real_map);
4757                                         real_map = entry->object.sub_map;
4758                                 }
4759                                 map = entry->object.sub_map;
4760
4761                         } else {
4762                                 break;
4763                         }
4764                 }
4765
4766                 if (vm_map_lookup_entry(map, laddr, &entry) &&
4767                     (entry->object.vm_object != NULL) &&
4768                     (entry->object.vm_object == object)) {
4769
4770                         int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
4771
4772                         if (superpage && physpage_p) {
4773                                 /* for vm_map_wire_and_extract() */
4774                                 *physpage_p = (ppnum_t) ((((vm_map_offset_t) entry->object.vm_object->vo_shadow_offset)
4775                                                           + entry->offset
4776                                                           + (laddr - entry->vme_start))
4777                                                          >> PAGE_SHIFT);
4778                         }
4779
4780                         if (caller_pmap) {
4781                                 /*
4782                                  * Set up a block mapped area
4783                                  */
4784                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4785                                 pmap_map_block(caller_pmap,
4786                                                (addr64_t)(caller_pmap_addr - ldelta),
4787                                                (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4788                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4789                                                (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4790                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4791                         } else {
4792                                 /*
4793                                  * Set up a block mapped area
4794                                  */
4795                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4796                                 pmap_map_block(real_map->pmap,
4797                                                (addr64_t)(vaddr - ldelta),
4798                                                (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4799                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4800                                                (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4801                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4802                         }
4803                 }
4804         }
4805
4806         /*
4807          * Unlock everything, and return
4808          */
4809         vm_map_verify_done(map, &version);
4810         if (real_map != map)
4811                 vm_map_unlock(real_map);
4812
4813         if (m != VM_PAGE_NULL) {
4814                 PAGE_WAKEUP_DONE(m);
4815
4816                 vm_fault_cleanup(m->object, top_page);
4817         } else
4818                 vm_fault_cleanup(object, top_page);
4819
4820         vm_object_deallocate(object);
4821
4822 #undef  RELEASE_PAGE
4823
4824         kr = KERN_SUCCESS;
4825 done:
4826         thread_interrupt_level(interruptible_state);
4827
4828         /*
4829          * Only I/O throttle on faults which cause a pagein/swapin.
4830          */
4831         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
4832                 throttle_lowpri_io(1);
4833         } else {
4834                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
4835
4836                         if ((throttle_delay = vm_page_throttled(TRUE))) {
4837
4838                                 if (vm_debug_events) {
4839                                         if (type_of_fault == DBG_COMPRESSOR_FAULT)
4840                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4841                                         else if (type_of_fault == DBG_COW_FAULT)
4842                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4843                                         else
4844                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4845                                 }
4846                                 delay(throttle_delay);
4847                         }
4848                 }
4849         }
4850         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4851                               (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4852                               ((uint64_t)vaddr >> 32),
4853                               vaddr,
4854                               kr,
4855                               type_of_fault,
4856                               0);
4857
4858         return (kr);
4859 }
4860
4861 /*
4862  *      vm_fault_wire:
4863  *
4864  *      Wire down a range of virtual addresses in a map.
4865  */
4866 kern_return_t
4867 vm_fault_wire(
4868         vm_map_t        map,
4869         vm_map_entry_t  entry,
4870         pmap_t          pmap,
4871         vm_map_offset_t pmap_addr,
4872         ppnum_t         *physpage_p)
4873 {
4874
4875         register vm_map_offset_t        va;
4876         register vm_map_offset_t        end_addr = entry->vme_end;
4877         register kern_return_t  rc;
4878
4879         assert(entry->in_transition);
4880
4881         if ((entry->object.vm_object != NULL) &&
4882             !entry->is_sub_map &&
4883             entry->object.vm_object->phys_contiguous) {
4884                 return KERN_SUCCESS;
4885         }
4886
4887         /*
4888          *      Inform the physical mapping system that the
4889          *      range of addresses may not fault, so that
4890          *      page tables and such can be locked down as well.
4891          */
4892
4893         pmap_pageable(pmap, pmap_addr,
4894                 pmap_addr + (end_addr - entry->vme_start), FALSE);
4895
4896         /*
4897          *      We simulate a fault to get the page and enter it
4898          *      in the physical map.
4899          */
4900
4901         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4902                 rc = vm_fault_wire_fast(map, va, entry, pmap,
4903                                         pmap_addr + (va - entry->vme_start),
4904                                         physpage_p);
4905                 if (rc != KERN_SUCCESS) {
4906                         rc = vm_fault_internal(map, va, VM_PROT_NONE, TRUE,
4907                                                ((pmap == kernel_pmap)
4908                                                 ? THREAD_UNINT
4909                                                 : THREAD_ABORTSAFE),
4910                                                pmap,
4911                                                (pmap_addr +
4912                                                 (va - entry->vme_start)),
4913                                                physpage_p);
4914                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4915                 }
4916
4917                 if (rc != KERN_SUCCESS) {
4918                         struct vm_map_entry     tmp_entry = *entry;
4919
4920                         /* unwire wired pages */
4921                         tmp_entry.vme_end = va;
4922                         vm_fault_unwire(map,
4923                                 &tmp_entry, FALSE, pmap, pmap_addr);
4924
4925                         return rc;
4926                 }
4927         }
4928         return KERN_SUCCESS;
4929 }
4930
4931 /*
4932  *      vm_fault_unwire:
4933  *
4934  *      Unwire a range of virtual addresses in a map.
      *
      *      Walks [entry->vme_start, entry->vme_end) one PAGE_SIZE step at
      *      a time and undoes the wiring of each page.  Returns nothing;
      *      panics with "vm_fault_unwire: failure" if vm_fault_page()
      *      reports anything other than success, apart from the two
      *      memory-error cases tolerated in the loop below.
      *
      *      map             map the entry belongs to; handed to vm_fault()
      *                      for entries without an object of their own, and
      *                      map->no_zero_fill is forwarded to vm_fault_page()
      *      entry           supplies the address range, the backing object
      *                      and offset, and the flags copied into fault_info;
      *                      entry->zero_wired_pages may be cleared here
      *      deallocate      when TRUE, each page found is passed to
      *                      pmap_disconnect() and VM_PAGE_FREE() instead of
      *                      being unwired
      *      pmap            physical map whose wiring is changed; the
      *                      pmap_change_wiring() calls are skipped when NULL
      *      pmap_addr       address in "pmap" that corresponds to
      *                      entry->vme_start
4935  */
4936 void
4937 vm_fault_unwire(
4938         vm_map_t        map,
4939         vm_map_entry_t  entry,
4940         boolean_t       deallocate,
4941         pmap_t          pmap,
4942         vm_map_offset_t pmap_addr)
4943 {
4944         register vm_map_offset_t        va;
4945         register vm_map_offset_t        end_addr = entry->vme_end;
4946         vm_object_t             object;
4947         struct vm_object_fault_info fault_info;
4948
             /* Submap entries have no object of their own to walk. */
4949         object = (entry->is_sub_map)
4950                         ? VM_OBJECT_NULL : entry->object.vm_object;
4951
4952         /*
4953          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4954          * do anything since such memory is wired by default.  So we don't have
4955          * anything to undo here.
4956          */
4957
4958         if (object != VM_OBJECT_NULL && object->phys_contiguous)
4959                 return;
4960
             /*
              * Fault parameters handed to vm_fault_page() for every page in
              * the range: uninterruptible, with the entry's behavior, alias,
              * offsets and no_cache setting.  cluster_size is filled in per
              * page inside the loop.
              */
4961         fault_info.interruptible = THREAD_UNINT;
4962         fault_info.behavior = entry->behavior;
4963         fault_info.user_tag = entry->alias;
4964         fault_info.pmap_options = 0;
4965         if (entry->iokit_acct ||
4966             (!entry->is_sub_map && !entry->use_pmap)) {
4967                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
4968         }
4969         fault_info.lo_offset = entry->offset;
4970         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4971         fault_info.no_cache = entry->no_cache;
4972         fault_info.stealth = TRUE;
4973         fault_info.io_sync = FALSE;
4974         fault_info.cs_bypass = FALSE;
4975         fault_info.mark_zf_absent = FALSE;
4976         fault_info.batch_pmap_op = FALSE;
4977
4978         /*
4979          *      Since the pages are wired down, we must be able to
4980          *      get their mappings from the physical map system.
4981          */
4982
4983         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4984
4985                 if (object == VM_OBJECT_NULL) {
                             /*
                              * No object to look the page up in: change the
                              * wiring in the pmap (if there is one) and let
                              * vm_fault() handle the rest; its result is
                              * deliberately ignored.
                              *
                              * NOTE(review): vm_fault() receives pmap_addr
                              * unadjusted, whereas pmap_change_wiring() just
                              * above uses pmap_addr + (va - entry->vme_start)
                              * -- confirm which is intended.
                              */
4986                         if (pmap) {
4987                                 pmap_change_wiring(pmap,
4988                                                    pmap_addr + (va - entry->vme_start), FALSE);
4989                         }
4990                         (void) vm_fault(map, va, VM_PROT_NONE,
4991                                         TRUE, THREAD_UNINT, pmap, pmap_addr);
4992                 } else {
4993                         vm_prot_t       prot;
4994                         vm_page_t       result_page;
4995                         vm_page_t       top_page;
4996                         vm_object_t     result_object;
4997                         vm_fault_return_t result;
4998
                             /*
                              * cluster_size is a vm_size_t; if the remaining
                              * length of the range does not fit in one, use
                              * (0 - PAGE_SIZE) converted to vm_size_t instead.
                              */
4999                         if (end_addr - va > (vm_size_t) -1) {
5000                                 /* 32-bit overflow */
5001                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5002                         } else {
5003                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
5004                                 assert(fault_info.cluster_size == end_addr - va);
5005                         }
5006
                             /*
                              * Look the page up through vm_fault_page(),
                              * retrying for as long as it answers
                              * VM_FAULT_RETRY.  The object lock and the paging
                              * reference are taken afresh on every attempt.
                              */
5007                         do {
5008                                 prot = VM_PROT_NONE;
5009
5010                                 vm_object_lock(object);
5011                                 vm_object_paging_begin(object);
5012                                 XPR(XPR_VM_FAULT,
5013                                         "vm_fault_unwire -> vm_fault_page\n",
5014                                         0,0,0,0,0);
5015                                 result_page = VM_PAGE_NULL;
5016                                 result = vm_fault_page(
5017                                         object,
5018                                         entry->offset + (va - entry->vme_start),
5019                                         VM_PROT_NONE, TRUE,
5020                                         FALSE, /* page not looked up */
5021                                         &prot, &result_page, &top_page,
5022                                         (int *)0,
5023                                         NULL, map->no_zero_fill,
5024                                         FALSE, &fault_info);
5025                         } while (result == VM_FAULT_RETRY);
5026
5027                         /*
5028                          * If this was a mapping to a file on a device that has been forcibly
5029                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5030                          * move on to the next one in case the remaining pages are mapped from
5031                          * different objects.  During a forced unmount, the object is terminated
5032                          * so the alive flag will be false if this happens.  A forced unmount will
5033                          * occur when an external disk is unplugged before the user does an
5034                          * eject, so we don't want to panic in that situation.
                              *
                              * Note that this path and the kernel_object path
                              * below skip vm_fault_cleanup() for this page
                              * (presumably vm_fault_page() has already cleaned
                              * up on error -- confirm).
5035                          */
5036
5037                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5038                                 continue;
5039
5040                         if (result == VM_FAULT_MEMORY_ERROR &&
5041                             object == kernel_object) {
5042                                 /*
5043                                  * This must have been allocated with
5044                                  * KMA_KOBJECT and KMA_VAONLY and there's
5045                                  * no physical page at this offset.
5046                                  * We're done (no page to free).
5047                                  */
5048                                 assert(deallocate);
5049                                 continue;
5050                         }
5051
                             /* Any other outcome than success is fatal. */
5052                         if (result != VM_FAULT_SUCCESS)
5053                                 panic("vm_fault_unwire: failure");
5054
                             /*
                              * Remember the page's object before the page may
                              * be freed; vm_fault_cleanup() below needs it.
                              */
5055                         result_object = result_page->object;
5056
5057                         if (deallocate) {
                                     /* Tear the page down instead of unwiring it. */
5058                                 assert(result_page->phys_page !=
5059                                        vm_page_fictitious_addr);
5060                                 pmap_disconnect(result_page->phys_page);
5061                                 VM_PAGE_FREE(result_page);
5062                         } else {
                                     /*
                                      * Change the wiring in the pmap unless there
                                      * is no pmap or the page's phys_page is
                                      * vm_page_guard_addr.
                                      */
5063                                 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
5064                                         pmap_change_wiring(pmap,
5065                                             pmap_addr + (va - entry->vme_start), FALSE);
5066
5067
                                     /* Unwire under the page queues lock. */
5068                                 if (VM_PAGE_WIRED(result_page)) {
5069                                         vm_page_lockspin_queues();
5070                                         vm_page_unwire(result_page, TRUE);
5071                                         vm_page_unlock_queues();
5072                                 }
                                     /*
                                      * NOTE(review): the flag is cleared after
                                      * the first page that gets here, so later
                                      * pages of the same entry are not zeroed by
                                      * this loop -- confirm this is intended.
                                      */
5073                                 if(entry->zero_wired_pages) {
5074                                         pmap_zero_page(result_page->phys_page);
5075                                         entry->zero_wired_pages = FALSE;
5076                                 }
5077
5078                                 PAGE_WAKEUP_DONE(result_page);
5079                         }
5080                         vm_fault_cleanup(result_object, top_page);
5081                 }
5082         }
5083
5084         /*
5085          *      Inform the physical mapping system that the range
5086          *      of addresses may fault, so that page tables and
5087          *      such may be unwired themselves.
5088          */
5089
5090         pmap_pageable(pmap, pmap_addr,
5091                 pmap_addr + (end_addr - entry->vme_start), TRUE);
5092
5093 }
5094
5095 /*
5096  *      vm_fault_wire_fast:
5097  *
5098  *      Handle common case of a wire down page fault at the given address.
5099  *      If successful, the page is inserted into the associated physical map.
5100  *      The map entry is passed in to avoid the overhead of a map lookup.
5101  *
5102  *      NOTE: the given address should be truncated to the
5103  *      proper page address.
5104  *
5105  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
5106  *      a standard error specifying why the fault is fatal is returned.
5107  *
5108  *      The map in question must be referenced, and remains so.
5109  *      Caller has a read lock on the map.
5110  *
5111  *      This is a stripped version of vm_fault() for wiring pages.  Anything
5112  *      other than the common case will return KERN_FAILURE, and the caller
5113  *      is expected to call vm_fault().
      *
      *      A failure reported by vm_fault_enter() is returned unchanged,
      *      after the wiring taken on the page has been dropped again.
      *
      *      map             unused
      *      va              faulting address within "entry"
      *      entry           map entry covering "va"; supplies the object,
      *                      offset, protection, alias and accounting flags
      *      pmap            physical map the page is entered into
      *      pmap_addr       address in "pmap" at which to enter the page
      *      physpage_p      optional; when non-NULL and the common exit is
      *                      reached, receives the page's phys_page on success
      *                      or 0 on failure.  It is left untouched on the
      *                      early KERN_FAILURE returns.  On success with a
      *                      writable entry the page is also marked dirty.
5114  */
5115 kern_return_t
5116 vm_fault_wire_fast(
5117         __unused vm_map_t       map,
5118         vm_map_offset_t va,
5119         vm_map_entry_t  entry,
5120         pmap_t          pmap,
5121         vm_map_offset_t pmap_addr,
5122         ppnum_t         *physpage_p)
5123 {
5124         vm_object_t             object;
5125         vm_object_offset_t      offset;
5126         register vm_page_t      m;
5127         vm_prot_t               prot;
5128         thread_t                thread = current_thread();
5129         int                     type_of_fault;
5130         kern_return_t           kr;
5131
             /* Account for the fault globally and against the current task. */
5132         VM_STAT_INCR(faults);
5133
5134         if (thread != THREAD_NULL && thread->task != TASK_NULL)
5135           thread->task->faults++;
5136
5137 /*
5138  *      Recovery actions
5139  */
5140
5141 #undef  RELEASE_PAGE
5142 #define RELEASE_PAGE(m) {                               \
5143         PAGE_WAKEUP_DONE(m);                            \
5144         vm_page_lockspin_queues();                      \
5145         vm_page_unwire(m, TRUE);                        \
5146         vm_page_unlock_queues();                        \
5147 }
5148
5149
5150 #undef  UNLOCK_THINGS
5151 #define UNLOCK_THINGS   {                               \
5152         vm_object_paging_end(object);                      \
5153         vm_object_unlock(object);                          \
5154 }
5155
5156 #undef  UNLOCK_AND_DEALLOCATE
5157 #define UNLOCK_AND_DEALLOCATE   {                       \
5158         UNLOCK_THINGS;                                  \
5159         vm_object_deallocate(object);                   \
5160 }
5161 /*
5162  *      Give up and have caller do things the hard way.
5163  */
5164
5165 #define GIVE_UP {                                       \
5166         UNLOCK_AND_DEALLOCATE;                          \
5167         return(KERN_FAILURE);                           \
5168 }
5169
5170
5171         /*
5172          *      If this entry is not directly to a vm_object, bail out.
5173          */
5174         if (entry->is_sub_map) {
5175                 assert(physpage_p == NULL);
5176                 return(KERN_FAILURE);
5177         }
5178
5179         /*
5180          *      Find the backing store object and offset into it.
5181          */
5182
5183         object = entry->object.vm_object;
5184         offset = (va - entry->vme_start) + entry->offset;
5185         prot = entry->protection;
5186
5187         /*
5188          *      Make a reference to this object to prevent its
5189          *      disposal while we are messing with it.
5190          */
5191
5192         vm_object_lock(object);
5193         vm_object_reference_locked(object);
5194         vm_object_paging_begin(object);
5195
5196         /*
5197          *      INVARIANTS (through entire routine):
5198          *
5199          *      1)      At all times, we must either have the object
5200          *              lock or a busy page in some object to prevent
5201          *              some other thread from trying to bring in
5202          *              the same page.
5203          *
5204          *      2)      Once we have a busy page, we must remove it from
5205          *              the pageout queues, so that the pageout daemon
5206          *              will not grab it away.
5207          *
5208          */
5209
5210         /*
5211          *      Look for page in top-level object.  If it's not there or
5212          *      there's something going on, give up.
5213          * ENCRYPTED SWAP: use the slow fault path, since we'll need to
5214          * decrypt the page before wiring it down.
5215          */
5216         m = vm_page_lookup(object, offset);
5217         if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
5218             (m->unusual && ( m->error || m->restart || m->absent))) {
5219
5220                 GIVE_UP;
5221         }
5222         ASSERT_PAGE_DECRYPTED(m);
5223
5224         if (m->fictitious &&
5225             m->phys_page == vm_page_guard_addr) {
5226                 /*
5227                  * Guard pages are fictitious pages and are never
5228                  * entered into a pmap, so let's say it's been wired...
5229                  */
5230                 kr = KERN_SUCCESS;
5231                 goto done;
5232         }
5233
5234         /*
5235          *      Wire the page down now.  All bail outs beyond this
5236          *      point must unwire the page.
5237          */
5238
5239         vm_page_lockspin_queues();
5240         vm_page_wire(m);
5241         vm_page_unlock_queues();
5242
5243         /*
5244          *      Mark page busy for other threads.
5245          */
5246         assert(!m->busy);
5247         m->busy = TRUE;
5248         assert(!m->absent);
5249
5250         /*
5251          *      Give up if the page is being written and there's a copy object
5252          */
5253         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5254                 RELEASE_PAGE(m);
5255                 GIVE_UP;
5256         }
5257
5258         /*
5259          *      Put this page into the physical map.
5260          */
5261         type_of_fault = DBG_CACHE_HIT_FAULT;
5262         kr = vm_fault_enter(m,
5263                             pmap,
5264                             pmap_addr,
5265                             prot,
5266                             prot,
5267                             TRUE,
5268                             FALSE,
5269                             FALSE,
5270                             FALSE,
5271                             entry->alias,
5272                             ((entry->iokit_acct ||
5273                               (!entry->is_sub_map && !entry->use_pmap))
5274                              ? PMAP_OPTIONS_ALT_ACCT
5275                              : 0),
5276                             NULL,
5277                             &type_of_fault);
5278
             if (kr != KERN_SUCCESS) {
                     /*
                      *      The mapping was not entered, so this is a
                      *      bail out: drop the wiring taken above, as
                      *      the rule stated there requires.  Otherwise
                      *      the page would keep a wire count that nobody
                      *      releases.  The common exit below still runs
                      *      PAGE_WAKEUP_DONE() and returns
                      *      vm_fault_enter()'s own error code.
                      */
                     vm_page_lockspin_queues();
                     vm_page_unwire(m, TRUE);
                     vm_page_unlock_queues();
             }

5279 done:
5280         /*
5281          *      Unlock everything, and return
5282          */
5283
5284         if (physpage_p) {
5285                 /* for vm_map_wire_and_extract() */
5286                 if (kr == KERN_SUCCESS) {
5287                         *physpage_p = m->phys_page;
5288                         if (prot & VM_PROT_WRITE) {
5289                                 vm_object_lock_assert_exclusive(m->object);
5290                                 m->dirty = TRUE;
5291                         }
5292                 } else {
5293                         *physpage_p = 0;
5294                 }
5295         }
5296
5297         PAGE_WAKEUP_DONE(m);
5298         UNLOCK_AND_DEALLOCATE;
5299
5300         return kr;
5301
5302 }
5303
5304 /*
5305  *      Routine:        vm_fault_copy_cleanup
5306  *      Purpose:
5307  *              Let go of a page that vm_fault_copy was working with:
      *              run PAGE_WAKEUP_DONE() on it, activate it when it is
      *              not flagged active, inactive or throttled, and pass
      *              its object together with top_page to
      *              vm_fault_cleanup().
5308  */
5309
5310 void
5311 vm_fault_copy_cleanup(
5312         vm_page_t       page,
5313         vm_page_t       top_page)
5314 {
5315         vm_object_t     owner;
5316
             owner = page->object;
5317         vm_object_lock(owner);
5318         PAGE_WAKEUP_DONE(page);

             /*
              * Test the queue flags once before taking the page queues
              * lock, and again once it is held, so the lock is only
              * taken for a page that looks unqueued.
              */
5319         if (!(page->active || page->inactive || page->throttled)) {
5320                 vm_page_lockspin_queues();
5321                 if (!(page->active || page->inactive || page->throttled)) {
5322                         vm_page_activate(page);
                     }
5323                 vm_page_unlock_queues();
5324         }

5325         vm_fault_cleanup(owner, top_page);
5326 }
5327
5328 void
5329 vm_fault_copy_dst_cleanup(
5330         vm_page_t       page)
5331 {
             /*
              * Release a destination page: unwire it under the page
              * queues lock, then end the paging operation on its object.
              * A VM_PAGE_NULL page is silently ignored.
              */
5332         vm_object_t     owner;
5333
5334         if (page == VM_PAGE_NULL) {
                     return;
             }

5335         owner = page->object;
5336         vm_object_lock(owner);

5337         vm_page_lockspin_queues();
5338         vm_page_unwire(page, TRUE);
5339         vm_page_unlock_queues();

5340         vm_object_paging_end(owner);
5341         vm_object_unlock(owner);
5343 }
5344
5345 /*
5346  *      Routine:        vm_fault_copy
5347  *
5348  *      Purpose:
5349  *              Copy pages from one virtual memory object to another --
5350  *              neither the source nor destination pages need be resident.
5351  *
5352  *              Before actually copying a page, the version associated with
5353  *              the destination address map will be verified.
5354  *
5355  *      In/out conditions:
5356  *              The caller must hold a reference, but not a lock, to
5357  *              each of the source and destination objects and to the
5358  *              destination map.
5359  *
5360  *      Results:
5361  *              Returns KERN_SUCCESS if no errors were encountered in
5362  *              reading or writing the data.  Returns KERN_INTERRUPTED if
5363  *              the operation was interrupted (only possible if the
5364  *              "interruptible" argument is asserted).  Other return values
5365  *              indicate a permanent error in copying the data.
5366  *
5367  *              The actual amount of data copied will be returned in the
5368  *              "copy_size" argument.  In the event that the destination map
5369  *              verification failed, this amount may be less than the amount
5370  *              requested.
5371  */
5372 kern_return_t
5373 vm_fault_copy(
5374         vm_object_t             src_object,
5375         vm_object_offset_t      src_offset,
5376         vm_map_size_t           *copy_size,             /* INOUT */
5377         vm_object_t             dst_object,
5378         vm_object_offset_t      dst_offset,
5379         vm_map_t                dst_map,
5380         vm_map_version_t         *dst_version,
5381         int                     interruptible)
5382 {
5383         vm_page_t               result_page;
5384
5385         vm_page_t               src_page;
5386         vm_page_t               src_top_page;
5387         vm_prot_t               src_prot;
5388
5389         vm_page_t               dst_page;
5390         vm_page_t               dst_top_page;
5391         vm_prot_t               dst_prot;
5392
5393         vm_map_size_t           amount_left;
5394         vm_object_t             old_copy_object;
5395         kern_return_t           error = 0;
5396         vm_fault_return_t       result;
5397
5398         vm_map_size_t           part_size;
5399         struct vm_object_fault_info fault_info_src;
5400         struct vm_object_fault_info fault_info_dst;
5401
5402         /*
5403          * In order not to confuse the clustered pageins, align
5404          * the different offsets on a page boundary.
5405          */
5406
5407 #define RETURN(x)                                       \
5408         MACRO_BEGIN                                     \
5409         *copy_size -= amount_left;                      \
5410         MACRO_RETURN(x);                                \
5411         MACRO_END
5412
5413         amount_left = *copy_size;
5414
5415         fault_info_src.interruptible = interruptible;
5416         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5417         fault_info_src.user_tag  = 0;
5418         fault_info_src.pmap_options = 0;
5419         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5420         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5421         fault_info_src.no_cache   = FALSE;
5422         fault_info_src.stealth = TRUE;
5423         fault_info_src.io_sync = FALSE;
5424         fault_info_src.cs_bypass = FALSE;
5425         fault_info_src.mark_zf_absent = FALSE;
5426         fault_info_src.batch_pmap_op = FALSE;
5427
5428         fault_info_dst.interruptible = interruptible;
5429         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5430         fault_info_dst.user_tag  = 0;
5431         fault_info_dst.pmap_options = 0;
5432         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5433         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5434         fault_info_dst.no_cache   = FALSE;
5435         fault_info_dst.stealth = TRUE;
5436         fault_info_dst.io_sync = FALSE;
5437         fault_info_dst.cs_bypass = FALSE;
5438         fault_info_dst.mark_zf_absent = FALSE;
5439         fault_info_dst.batch_pmap_op = FALSE;
5440
5441         do { /* while (amount_left > 0) */
5442                 /*
5443                  * There may be a deadlock if both source and destination
5444                  * pages are the same. To avoid this deadlock, the copy must
5445                  * start by getting the destination page in order to apply
5446                  * COW semantics if any.
5447                  */
5448
5449         RetryDestinationFault: ;
5450
5451                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5452
5453                 vm_object_lock(dst_object);
5454                 vm_object_paging_begin(dst_object);
5455
5456                 if (amount_left > (vm_size_t) -1) {
5457                         /* 32-bit overflow */
5458                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5459                 } else {
5460                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
5461                         assert(fault_info_dst.cluster_size == amount_left);
5462                 }
5463
5464                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5465                 dst_page = VM_PAGE_NULL;
5466                 result = vm_fault_page(dst_object,
5467                                        vm_object_trunc_page(dst_offset),
5468                                        VM_PROT_WRITE|VM_PROT_READ,
5469                                        FALSE,
5470                                        FALSE, /* page not looked up */
5471                                        &dst_prot, &dst_page, &dst_top_page,
5472                                        (int *)0,
5473                                        &error,
5474                                        dst_map->no_zero_fill,
5475                                        FALSE, &fault_info_dst);
5476                 switch (result) {
5477                 case VM_FAULT_SUCCESS:
5478                         break;
5479                 case VM_FAULT_RETRY:
5480                         goto RetryDestinationFault;
5481                 case VM_FAULT_MEMORY_SHORTAGE:
5482                         if (vm_page_wait(interruptible))
5483                                 goto RetryDestinationFault;
5484                         /* fall thru */
5485                 case VM_FAULT_INTERRUPTED:
5486                         RETURN(MACH_SEND_INTERRUPTED);
5487                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5488                         /* success but no VM page: fail the copy */
5489                         vm_object_paging_end(dst_object);
5490                         vm_object_unlock(dst_object);
5491                         /*FALLTHROUGH*/
5492                 case VM_FAULT_MEMORY_ERROR:
5493                         if (error)
5494                                 return (error);
5495                         else
5496                                 return(KERN_MEMORY_ERROR);
5497                 default:
5498                         panic("vm_fault_copy: unexpected error 0x%x from "
5499                               "vm_fault_page()\n", result);
5500                 }
5501                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5502
5503                 old_copy_object = dst_page->object->copy;
5504
5505                 /*
5506                  * There exists the possibility that the source and
5507                  * destination page are the same.  But we can't
5508                  * easily determine that now.  If they are the
5509                  * same, the call to vm_fault_page() for the
5510                  * destination page will deadlock.  To prevent this we
5511                  * wire the page so we can drop busy without having
5512                  * the page daemon steal the page.  We clean up the
5513                  * top page  but keep the paging reference on the object
5514                  * holding the dest page so it doesn't go away.
5515                  */
5516
5517                 vm_page_lockspin_queues();
5518                 vm_page_wire(dst_page);
5519                 vm_page_unlock_queues();
5520                 PAGE_WAKEUP_DONE(dst_page);
5521                 vm_object_unlock(dst_page->object);
5522
5523                 if (dst_top_page != VM_PAGE_NULL) {
5524                         vm_object_lock(dst_object);
5525                         VM_PAGE_FREE(dst_top_page);
5526                         vm_object_paging_end(dst_object);
5527                         vm_object_unlock(dst_object);
5528                 }
5529
5530         RetrySourceFault: ;
5531
5532                 if (src_object == VM_OBJECT_NULL) {
5533                         /*
5534                          *      No source object.  We will just
5535                          *      zero-fill the page in dst_object.
5536                          */
5537                         src_page = VM_PAGE_NULL;
5538                         result_page = VM_PAGE_NULL;
5539                 } else {
5540                         vm_object_lock(src_object);
5541                         src_page = vm_page_lookup(src_object,
5542                                                   vm_object_trunc_page(src_offset));
5543                         if (src_page == dst_page) {
5544                                 src_prot = dst_prot;
5545                                 result_page = VM_PAGE_NULL;
5546                         } else {
5547                                 src_prot = VM_PROT_READ;
5548                                 vm_object_paging_begin(src_object);
5549
5550                                 if (amount_left > (vm_size_t) -1) {
5551                                         /* 32-bit overflow */
5552                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5553                                 } else {
5554                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
5555                                         assert(fault_info_src.cluster_size == amount_left);
5556                                 }
5557
5558                                 XPR(XPR_VM_FAULT,
5559                                         "vm_fault_copy(2) -> vm_fault_page\n",
5560                                         0,0,0,0,0);
5561                                 result_page = VM_PAGE_NULL;
5562                                 result = vm_fault_page(
5563                                         src_object,
5564                                         vm_object_trunc_page(src_offset),
5565                                         VM_PROT_READ, FALSE,
5566                                         FALSE, /* page not looked up */
5567                                         &src_prot,
5568                                         &result_page, &src_top_page,
5569                                         (int *)0, &error, FALSE,
5570                                         FALSE, &fault_info_src);
5571
5572                                 switch (result) {
5573                                 case VM_FAULT_SUCCESS:
5574                                         break;
5575                                 case VM_FAULT_RETRY:
5576                                         goto RetrySourceFault;
5577                                 case VM_FAULT_MEMORY_SHORTAGE:
5578                                         if (vm_page_wait(interruptible))
5579                                                 goto RetrySourceFault;
5580                                         /* fall thru */
5581                                 case VM_FAULT_INTERRUPTED:
5582                                         vm_fault_copy_dst_cleanup(dst_page);
5583                                         RETURN(MACH_SEND_INTERRUPTED);
5584                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5585                                         /* success but no VM page: fail */
5586                                         vm_object_paging_end(src_object);
5587                                         vm_object_unlock(src_object);
5588                                         /*FALLTHROUGH*/
5589                                 case VM_FAULT_MEMORY_ERROR:
5590                                         vm_fault_copy_dst_cleanup(dst_page);
5591                                         if (error)
5592                                                 return (error);
5593                                         else
5594                                                 return(KERN_MEMORY_ERROR);
5595                                 default:
5596                                         panic("vm_fault_copy(2): unexpected "
5597                                               "error 0x%x from "
5598                                               "vm_fault_page()\n", result);
5599                                 }
5600
5601
5602                                 assert((src_top_page == VM_PAGE_NULL) ==
5603                                        (result_page->object == src_object));
5604                         }
5605                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5606                         vm_object_unlock(result_page->object);
5607                 }
5608
5609                 if (!vm_map_verify(dst_map, dst_version)) {
5610                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5611                                 vm_fault_copy_cleanup(result_page, src_top_page);
5612                         vm_fault_copy_dst_cleanup(dst_page);
5613                         break;
5614                 }
5615
5616                 vm_object_lock(dst_page->object);
5617
5618                 if (dst_page->object->copy != old_copy_object) {
5619                         vm_object_unlock(dst_page->object);
5620                         vm_map_verify_done(dst_map, dst_version);
5621                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5622                                 vm_fault_copy_cleanup(result_page, src_top_page);
5623                         vm_fault_copy_dst_cleanup(dst_page);
5624                         break;
5625                 }
5626                 vm_object_unlock(dst_page->object);
5627
5628                 /*
5629                  *      Copy the page, and note that it is dirty
5630                  *      immediately.
5631                  */
5632
5633                 if (!page_aligned(src_offset) ||
5634                         !page_aligned(dst_offset) ||
5635                         !page_aligned(amount_left)) {
5636
5637                         vm_object_offset_t      src_po,
5638                                                 dst_po;
5639
5640                         src_po = src_offset - vm_object_trunc_page(src_offset);
5641                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5642
5643                         if (dst_po > src_po) {
5644                                 part_size = PAGE_SIZE - dst_po;
5645                         } else {
5646                                 part_size = PAGE_SIZE - src_po;
5647                         }
5648                         if (part_size > (amount_left)){
5649                                 part_size = amount_left;
5650                         }
5651
5652                         if (result_page == VM_PAGE_NULL) {
5653                                 assert((vm_offset_t) dst_po == dst_po);
5654                                 assert((vm_size_t) part_size == part_size);
5655                                 vm_page_part_zero_fill(dst_page,
5656                                                        (vm_offset_t) dst_po,
5657                                                        (vm_size_t) part_size);
5658                         } else {
5659                                 assert((vm_offset_t) src_po == src_po);
5660                                 assert((vm_offset_t) dst_po == dst_po);
5661                                 assert((vm_size_t) part_size == part_size);
5662                                 vm_page_part_copy(result_page,
5663                                                   (vm_offset_t) src_po,
5664                                                   dst_page,
5665                                                   (vm_offset_t) dst_po,
5666                                                   (vm_size_t)part_size);
5667                                 if(!dst_page->dirty){
5668                                         vm_object_lock(dst_object);
5669                                         SET_PAGE_DIRTY(dst_page, TRUE);
5670                                         vm_object_unlock(dst_page->object);
5671                                 }
5672
5673                         }
5674                 } else {
5675                         part_size = PAGE_SIZE;
5676
5677                         if (result_page == VM_PAGE_NULL)
5678                                 vm_page_zero_fill(dst_page);
5679                         else{
5680                                 vm_object_lock(result_page->object);
5681                                 vm_page_copy(result_page, dst_page);
5682                                 vm_object_unlock(result_page->object);
5683
5684                                 if(!dst_page->dirty){
5685                                         vm_object_lock(dst_object);
5686                                         SET_PAGE_DIRTY(dst_page, TRUE);
5687                                         vm_object_unlock(dst_page->object);
5688                                 }
5689                         }
5690
5691                 }
5692
5693                 /*
5694                  *      Unlock everything, and return
5695                  */
5696
5697                 vm_map_verify_done(dst_map, dst_version);
5698
5699                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5700                         vm_fault_copy_cleanup(result_page, src_top_page);
5701                 vm_fault_copy_dst_cleanup(dst_page);
5702
5703                 amount_left -= part_size;
5704                 src_offset += part_size;
5705                 dst_offset += part_size;
5706         } while (amount_left > 0);
5707
5708         RETURN(KERN_SUCCESS);
5709 #undef  RETURN
5710
5711         /*NOTREACHED*/
5712 }
5713
5714 #if     VM_FAULT_CLASSIFY
5715 /*
5716  *      Temporary statistics gathering support.
5717  */
5718
5719 /*
5720  *      Statistics arrays:
5721  */
/*
 *      Fault classifications recorded by vm_fault_classify().
 *      Each value is used as the first index into vm_fault_stats.
 */
#define VM_FAULT_TYPE_ZERO_FILL 0
#define VM_FAULT_TYPE_MAP_IN    1
#define VM_FAULT_TYPE_PAGER     2
#define VM_FAULT_TYPE_COPY      3
#define VM_FAULT_TYPE_OTHER     4

/*
 *      Table dimensions: number of fault types (rows) and number of
 *      levels tracked per type (columns).
 */
#define VM_FAULT_TYPES_MAX      5
#define VM_FAULT_LEVEL_MAX      8

/*
 *      Counters, indexed as vm_fault_stats[type][level].
 */
int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5732
5733
5734 void
5735 vm_fault_classify(vm_object_t           object,
5736                   vm_object_offset_t    offset,
5737                   vm_prot_t             fault_type)
5738 {
5739         int             type, level = 0;
5740         vm_page_t       m;
5741
5742         while (TRUE) {
5743                 m = vm_page_lookup(object, offset);
5744                 if (m != VM_PAGE_NULL) {
5745                         if (m->busy || m->error || m->restart || m->absent) {
5746                                 type = VM_FAULT_TYPE_OTHER;
5747                                 break;
5748                         }
5749                         if (((fault_type & VM_PROT_WRITE) == 0) ||
5750                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
5751                                 type = VM_FAULT_TYPE_MAP_IN;
5752                                 break;
5753                         }
5754                         type = VM_FAULT_TYPE_COPY;
5755                         break;
5756                 }
5757                 else {
5758                         if (object->pager_created) {
5759                                 type = VM_FAULT_TYPE_PAGER;
5760                                 break;
5761                         }
5762                         if (object->shadow == VM_OBJECT_NULL) {
5763                                 type = VM_FAULT_TYPE_ZERO_FILL;
5764                                 break;
5765                         }
5766
5767                         offset += object->vo_shadow_offset;
5768                         object = object->shadow;
5769                         level++;
5770                         continue;
5771                 }
5772         }
5773
5774         if (level > VM_FAULT_LEVEL_MAX)
5775                 level = VM_FAULT_LEVEL_MAX;
5776
5777         vm_fault_stats[type][level] += 1;
5778
5779         return;
5780 }
5781
5782 /* cleanup routine to call from debugger */
5783
5784 void
5785 vm_fault_classify_init(void)
5786 {
5787         int type, level;
5788
5789         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
5790                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
5791                         vm_fault_stats[type][level] = 0;
5792                 }
5793         }
5794
5795         return;
5796 }
5797 #endif  /* VM_FAULT_CLASSIFY */
5798
5799
5800 void
5801 vm_page_validate_cs_mapped(
5802         vm_page_t       page,
5803         const void      *kaddr)
5804 {
5805         vm_object_t             object;
5806         vm_object_offset_t      offset;
5807         kern_return_t           kr;
5808         memory_object_t         pager;
5809         void                    *blobs;
5810         boolean_t               validated, tainted;
5811
5812         assert(page->busy);
5813         vm_object_lock_assert_exclusive(page->object);
5814
5815         if (!cs_validation) {
5816                 return;
5817         }
5818
5819         if (page->wpmapped && !page->cs_tainted) {
5820                 /*
5821                  * This page was mapped for "write" access sometime in the
5822                  * past and could still be modifiable in the future.
5823                  * Consider it tainted.
5824                  * [ If the page was already found to be "tainted", no
5825                  * need to re-validate. ]
5826                  */
5827                 page->cs_validated = TRUE;
5828                 page->cs_tainted = TRUE;
5829                 if (cs_debug) {
5830                         printf("CODESIGNING: vm_page_validate_cs: "
5831                                "page %p obj %p off 0x%llx "
5832                                "was modified\n",
5833                                page, page->object, page->offset);
5834                 }
5835                 vm_cs_validated_dirtied++;
5836         }
5837
5838         if (page->cs_validated) {
5839                 return;
5840         }
5841
5842         vm_cs_validates++;
5843
5844         object = page->object;
5845         assert(object->code_signed);
5846         offset = page->offset;
5847
5848         if (!object->alive || object->terminating || object->pager == NULL) {
5849                 /*
5850                  * The object is terminating and we don't have its pager
5851                  * so we can't validate the data...
5852                  */
5853                 return;
5854         }
5855         /*
5856          * Since we get here to validate a page that was brought in by
5857          * the pager, we know that this pager is all setup and ready
5858          * by now.
5859          */
5860         assert(!object->internal);
5861         assert(object->pager != NULL);
5862         assert(object->pager_ready);
5863
5864         pager = object->pager;
5865         assert(object->paging_in_progress);
5866         kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5867         if (kr != KERN_SUCCESS) {
5868                 blobs = NULL;
5869         }
5870
5871         /* verify the SHA1 hash for this page */
5872         validated = cs_validate_page(blobs,
5873                                      pager,
5874                                      offset + object->paging_offset,
5875                                      (const void *)kaddr,
5876                                      &tainted);
5877
5878         page->cs_validated = validated;
5879         if (validated) {
5880                 page->cs_tainted = tainted;
5881         }
5882 }
5883
5884 void
5885 vm_page_validate_cs(
5886         vm_page_t       page)
5887 {
5888         vm_object_t             object;
5889         vm_object_offset_t      offset;
5890         vm_map_offset_t         koffset;
5891         vm_map_size_t           ksize;
5892         vm_offset_t             kaddr;
5893         kern_return_t           kr;
5894         boolean_t               busy_page;
5895         boolean_t               need_unmap;
5896
5897         vm_object_lock_assert_held(page->object);
5898
5899         if (!cs_validation) {
5900                 return;
5901         }
5902
5903         if (page->wpmapped && !page->cs_tainted) {
5904                 vm_object_lock_assert_exclusive(page->object);
5905
5906                 /*
5907                  * This page was mapped for "write" access sometime in the
5908                  * past and could still be modifiable in the future.
5909                  * Consider it tainted.
5910                  * [ If the page was already found to be "tainted", no
5911                  * need to re-validate. ]
5912                  */
5913                 page->cs_validated = TRUE;
5914                 page->cs_tainted = TRUE;
5915                 if (cs_debug) {
5916                         printf("CODESIGNING: vm_page_validate_cs: "
5917                                "page %p obj %p off 0x%llx "
5918                                "was modified\n",
5919                                page, page->object, page->offset);
5920                 }
5921                 vm_cs_validated_dirtied++;
5922         }
5923
5924         if (page->cs_validated) {
5925                 return;
5926         }
5927
5928         if (page->slid) {
5929                 panic("vm_page_validate_cs(%p): page is slid\n", page);
5930         }
5931         assert(!page->slid);
5932
5933 #if CHECK_CS_VALIDATION_BITMAP
5934         if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5935                 page->cs_validated = TRUE;
5936                 page->cs_tainted = FALSE;
5937                 vm_cs_bitmap_validated++;
5938                 return;
5939         }
5940 #endif
5941         vm_object_lock_assert_exclusive(page->object);
5942
5943         object = page->object;
5944         assert(object->code_signed);
5945         offset = page->offset;
5946
5947         busy_page = page->busy;
5948         if (!busy_page) {
5949                 /* keep page busy while we map (and unlock) the VM object */
5950                 page->busy = TRUE;
5951         }
5952
5953         /*
5954          * Take a paging reference on the VM object
5955          * to protect it from collapse or bypass,
5956          * and keep it from disappearing too.
5957          */
5958         vm_object_paging_begin(object);
5959
5960         /* map the page in the kernel address space */
5961         ksize = PAGE_SIZE_64;
5962         koffset = 0;
5963         need_unmap = FALSE;
5964         kr = vm_paging_map_object(page,
5965                                   object,
5966                                   offset,
5967                                   VM_PROT_READ,
5968                                   FALSE, /* can't unlock object ! */
5969                                   &ksize,
5970                                   &koffset,
5971                                   &need_unmap);
5972         if (kr != KERN_SUCCESS) {
5973                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5974         }
5975         kaddr = CAST_DOWN(vm_offset_t, koffset);
5976
5977         /* validate the mapped page */
5978         vm_page_validate_cs_mapped(page, (const void *) kaddr);
5979
5980 #if CHECK_CS_VALIDATION_BITMAP
5981         if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5982                 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5983         }
5984 #endif
5985         assert(page->busy);
5986         assert(object == page->object);
5987         vm_object_lock_assert_exclusive(object);
5988
5989         if (!busy_page) {
5990                 PAGE_WAKEUP_DONE(page);
5991         }
5992         if (need_unmap) {
5993                 /* unmap the map from the kernel address space */
5994                 vm_paging_unmap_object(object, koffset, koffset + ksize);
5995                 koffset = 0;
5996                 ksize = 0;
5997                 kaddr = 0;
5998         }
5999         vm_object_paging_end(object);
6000 }