osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75                                 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90
  91 #include <vm/vm_compressor.h>
  92 #include <vm/vm_compressor_pager.h>
  93 #include <vm/vm_fault.h>
  94 #include <vm/vm_map.h>
  95 #include <vm/vm_object.h>
  96 #include <vm/vm_page.h>
  97 #include <vm/vm_kern.h>
  98 #include <vm/pmap.h>
  99 #include <vm/vm_pageout.h>
 100 #include <vm/vm_protos.h>
 101 #include <vm/vm_external.h>
 102 #include <vm/memory_object.h>
 103 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 104 #include <vm/vm_shared_region.h>
 105
 106 #include <sys/codesign.h>
 107
 108 #include <libsa/sys/timers.h>   /* for struct timespec */
 109
 110 #define VM_FAULT_CLASSIFY       0
 111
 112 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 113
 114 unsigned int    vm_object_pagein_throttle = 16;
 115
 116 /*
 117  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 118  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 119  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 120  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 121  * keep the UI active so that the user has a chance to kill the offending task before the system
 122  * completely hangs.
 123  *
 124  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 125  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 126  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 127  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 128  */
 129
 130 extern void throttle_lowpri_io(int);
 131
 132 uint64_t vm_hard_throttle_threshold;
 133
 134
 135
 136 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
 137                                                  (vm_page_free_count < vm_page_throttle_limit && \
 138                                                   proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
 139
 140
 141 #define HARD_THROTTLE_DELAY     5000    /* 5000 us == 5 ms */
 142 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 143
 144 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 145 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 146
 147
 148 boolean_t current_thread_aborted(void);
 149
 150 /* Forward declarations of internal routines. */
 151 extern kern_return_t vm_fault_wire_fast(
 152                                 vm_map_t        map,
 153                                 vm_map_offset_t va,
 154                                 vm_map_entry_t  entry,
 155                                 pmap_t          pmap,
 156                                 vm_map_offset_t pmap_addr,
 157                                 ppnum_t         *physpage_p);
 158
 159 extern void vm_fault_continue(void);
 160
 161 extern void vm_fault_copy_cleanup(
 162                                 vm_page_t       page,
 163                                 vm_page_t       top_page);
 164
 165 extern void vm_fault_copy_dst_cleanup(
 166                                 vm_page_t       page);
 167
 168 #if     VM_FAULT_CLASSIFY
 169 extern void vm_fault_classify(vm_object_t       object,
 170                           vm_object_offset_t    offset,
 171                           vm_prot_t             fault_type);
 172
 173 extern void vm_fault_classify_init(void);
 174 #endif
 175
 176 unsigned long vm_pmap_enter_blocked = 0;
 177 unsigned long vm_pmap_enter_retried = 0;
 178
 179 unsigned long vm_cs_validates = 0;
 180 unsigned long vm_cs_revalidates = 0;
 181 unsigned long vm_cs_query_modified = 0;
 182 unsigned long vm_cs_validated_dirtied = 0;
 183 unsigned long vm_cs_bitmap_validated = 0;
 184
 185 void vm_pre_fault(vm_map_offset_t);
 186
 187 /*
 188  *      Routine:        vm_fault_init
 189  *      Purpose:
 190  *              Initialize our private data structures.
 191  */
 192 void
 193 vm_fault_init(void)
 194 {
 195         int i, vm_compressor_temp;
 196         boolean_t need_default_val = TRUE;
 197         /*
 198          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 199          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 200          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 201          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 202          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 203          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 204          */
 205
 206         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 207
 208         /*
 209          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 210          */
 211
 212         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
 213                 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
 214                         if (vm_compressor_temp > 0 &&
 215                             ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
 216                                 need_default_val = FALSE;
 217                                 vm_compressor_mode = vm_compressor_temp;
 218                                 break;
 219                         }
 220                 }
 221                 if (need_default_val)
 222                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 223         }
 224         if (need_default_val) {
 225                 /* If no boot arg or incorrect boot arg, try device tree. */
 226                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 227         }
 228         PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
 229         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 230 }
 231
 232 /*
 233  *      Routine:        vm_fault_cleanup
 234  *      Purpose:
 235  *              Clean up the result of vm_fault_page.
 236  *      Results:
 237  *              The paging reference for "object" is released.
 238  *              "object" is unlocked.
 239  *              If "top_page" is not null,  "top_page" is
 240  *              freed and the paging reference for the object
 241  *              containing it is released.
 242  *
 243  *      In/out conditions:
 244  *              "object" must be locked.
 245  */
 246 void
 247 vm_fault_cleanup(
 248         register vm_object_t    object,
 249         register vm_page_t      top_page)
 250 {
 251         vm_object_paging_end(object);
 252         vm_object_unlock(object);
 253
 254         if (top_page != VM_PAGE_NULL) {
 255                 object = top_page->object;
 256
 257                 vm_object_lock(object);
 258                 VM_PAGE_FREE(top_page);
 259                 vm_object_paging_end(object);
 260                 vm_object_unlock(object);
 261         }
 262 }
 263
 264 #if     MACH_CLUSTER_STATS
 265 #define MAXCLUSTERPAGES 16
 266 struct {
 267         unsigned long pages_in_cluster;
 268         unsigned long pages_at_higher_offsets;
 269         unsigned long pages_at_lower_offsets;
 270 } cluster_stats_in[MAXCLUSTERPAGES];
 271 #define CLUSTER_STAT(clause)    clause
 272 #define CLUSTER_STAT_HIGHER(x)  \
 273         ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
 274 #define CLUSTER_STAT_LOWER(x)   \
 275          ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
 276 #define CLUSTER_STAT_CLUSTER(x) \
 277         ((cluster_stats_in[(x)].pages_in_cluster)++)
 278 #else   /* MACH_CLUSTER_STATS */
 279 #define CLUSTER_STAT(clause)
 280 #endif  /* MACH_CLUSTER_STATS */
 281
 282 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 283
 284
 285 boolean_t       vm_page_deactivate_behind = TRUE;
 286 /*
 287  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 288  */
 289 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 290 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 291                                                                 /* we use it to size an array on the stack */
 292
 293 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 294
 295 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 296
 297 /*
 298  * vm_page_is_sequential
 299  *
 300  * Determine if sequential access is in progress
 301  * in accordance with the behavior specified.
 302  * Update state to indicate current access pattern.
 303  *
 304  * object must have at least the shared lock held
 305  */
 306 static
 307 void
 308 vm_fault_is_sequential(
 309         vm_object_t             object,
 310         vm_object_offset_t      offset,
 311         vm_behavior_t           behavior)
 312 {
 313         vm_object_offset_t      last_alloc;
 314         int                     sequential;
 315         int                     orig_sequential;
 316
 317         last_alloc = object->last_alloc;
 318         sequential = object->sequential;
 319         orig_sequential = sequential;
 320
 321         switch (behavior) {
 322         case VM_BEHAVIOR_RANDOM:
 323                 /*
 324                  * reset indicator of sequential behavior
 325                  */
 326                 sequential = 0;
 327                 break;
 328
 329         case VM_BEHAVIOR_SEQUENTIAL:
 330                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 331                         /*
 332                          * advance indicator of sequential behavior
 333                          */
 334                         if (sequential < MAX_SEQUENTIAL_RUN)
 335                                 sequential += PAGE_SIZE;
 336                 } else {
 337                         /*
 338                          * reset indicator of sequential behavior
 339                          */
 340                         sequential = 0;
 341                 }
 342                 break;
 343
 344         case VM_BEHAVIOR_RSEQNTL:
 345                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 346                         /*
 347                          * advance indicator of sequential behavior
 348                          */
 349                         if (sequential > -MAX_SEQUENTIAL_RUN)
 350                                 sequential -= PAGE_SIZE;
 351                 } else {
 352                         /*
 353                          * reset indicator of sequential behavior
 354                          */
 355                         sequential = 0;
 356                 }
 357                 break;
 358
 359         case VM_BEHAVIOR_DEFAULT:
 360         default:
 361                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 362                         /*
 363                          * advance indicator of sequential behavior
 364                          */
 365                         if (sequential < 0)
 366                                 sequential = 0;
 367                         if (sequential < MAX_SEQUENTIAL_RUN)
 368                                 sequential += PAGE_SIZE;
 369
 370                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 371                         /*
 372                          * advance indicator of sequential behavior
 373                          */
 374                         if (sequential > 0)
 375                                 sequential = 0;
 376                         if (sequential > -MAX_SEQUENTIAL_RUN)
 377                                 sequential -= PAGE_SIZE;
 378                 } else {
 379                         /*
 380                          * reset indicator of sequential behavior
 381                          */
 382                         sequential = 0;
 383                 }
 384                 break;
 385         }
 386         if (sequential != orig_sequential) {
 387                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 388                         /*
 389                          * if someone else has already updated object->sequential
 390                          * don't bother trying to update it or object->last_alloc
 391                          */
 392                         return;
 393                 }
 394         }
 395         /*
 396          * I'd like to do this with a OSCompareAndSwap64, but that
 397          * doesn't exist for PPC...  however, it shouldn't matter
 398          * that much... last_alloc is maintained so that we can determine
 399          * if a sequential access pattern is taking place... if only
 400          * one thread is banging on this object, no problem with the unprotected
 401          * update... if 2 or more threads are banging away, we run the risk of
 402          * someone seeing a mangled update... however, in the face of multiple
 403          * accesses, no sequential access pattern can develop anyway, so we
 404          * haven't lost any real info.
 405          */
 406         object->last_alloc = offset;
 407 }
 408
 409
 410 int vm_page_deactivate_behind_count = 0;
 411
 412 /*
 413  * vm_page_deactivate_behind
 414  *
 415  * Determine if sequential access is in progress
 416  * in accordance with the behavior specified.  If
 417  * so, compute a potential page to deactivate and
 418  * deactivate it.
 419  *
 420  * object must be locked.
 421  *
 422  * return TRUE if we actually deactivate a page
 423  */
 424 static
 425 boolean_t
 426 vm_fault_deactivate_behind(
 427         vm_object_t             object,
 428         vm_object_offset_t      offset,
 429         vm_behavior_t           behavior)
 430 {
 431         int             n;
 432         int             pages_in_run = 0;
 433         int             max_pages_in_run = 0;
 434         int             sequential_run;
 435         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 436         vm_object_offset_t      run_offset = 0;
 437         vm_object_offset_t      pg_offset = 0;
 438         vm_page_t       m;
 439         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 440
 441         pages_in_run = 0;
 442 #if TRACEFAULTPAGE
 443         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 444 #endif
 445
 446         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 447                 /*
 448                  * Do not deactivate pages from the kernel object: they
 449                  * are not intended to become pageable.
 450                  * or we've disabled the deactivate behind mechanism
 451                  */
 452                 return FALSE;
 453         }
 454         if ((sequential_run = object->sequential)) {
 455                   if (sequential_run < 0) {
 456                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 457                           sequential_run = 0 - sequential_run;
 458                   } else {
 459                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 460                   }
 461         }
 462         switch (behavior) {
 463         case VM_BEHAVIOR_RANDOM:
 464                 break;
 465         case VM_BEHAVIOR_SEQUENTIAL:
 466                 if (sequential_run >= (int)PAGE_SIZE) {
 467                         run_offset = 0 - PAGE_SIZE_64;
 468                         max_pages_in_run = 1;
 469                 }
 470                 break;
 471         case VM_BEHAVIOR_RSEQNTL:
 472                 if (sequential_run >= (int)PAGE_SIZE) {
 473                         run_offset = PAGE_SIZE_64;
 474                         max_pages_in_run = 1;
 475                 }
 476                 break;
 477         case VM_BEHAVIOR_DEFAULT:
 478         default:
 479         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 480
 481                 /*
 482                  * determine if the run of sequential accesss has been
 483                  * long enough on an object with default access behavior
 484                  * to consider it for deactivation
 485                  */
 486                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 487                         /*
 488                          * the comparisons between offset and behind are done
 489                          * in this kind of odd fashion in order to prevent wrap around
 490                          * at the end points
 491                          */
 492                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 493                                 if (offset >= behind) {
 494                                         run_offset = 0 - behind;
 495                                         pg_offset = PAGE_SIZE_64;
 496                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 497                                 }
 498                         } else {
 499                                 if (offset < -behind) {
 500                                         run_offset = behind;
 501                                         pg_offset = 0 - PAGE_SIZE_64;
 502                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 503                                 }
 504                         }
 505                 }
 506                 break;
 507         }
 508         }
 509         for (n = 0; n < max_pages_in_run; n++) {
 510                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 511
 512                 if (m && !m->laundry && !m->busy && !m->no_cache && !m->throttled && !m->fictitious && !m->absent) {
 513                         page_run[pages_in_run++] = m;
 514
 515                         /*
 516                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 517                          *
 518                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 519                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 520                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 521                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 522                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 523                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 524                          * have happened before we did the deactivate_behind.
 525                          */
 526                         pmap_clear_refmod_options(m->phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 527                 }
 528         }
 529         if (pages_in_run) {
 530                 vm_page_lockspin_queues();
 531
 532                 for (n = 0; n < pages_in_run; n++) {
 533
 534                         m = page_run[n];
 535
 536                         vm_page_deactivate_internal(m, FALSE);
 537
 538                         vm_page_deactivate_behind_count++;
 539 #if TRACEFAULTPAGE
 540                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 541 #endif
 542                 }
 543                 vm_page_unlock_queues();
 544
 545                 return TRUE;
 546         }
 547         return FALSE;
 548 }
 549
 550
 551 #if (DEVELOPMENT || DEBUG)
 552 uint32_t        vm_page_creation_throttled_hard = 0;
 553 uint32_t        vm_page_creation_throttled_soft = 0;
 554 #endif /* DEVELOPMENT || DEBUG */
 555
 556 static int
 557 vm_page_throttled(boolean_t page_kept)
 558 {
 559         clock_sec_t     elapsed_sec;
 560         clock_sec_t     tv_sec;
 561         clock_usec_t    tv_usec;
 562
 563         thread_t thread = current_thread();
 564
 565         if (thread->options & TH_OPT_VMPRIV)
 566                 return (0);
 567
 568         if (thread->t_page_creation_throttled) {
 569                 thread->t_page_creation_throttled = 0;
 570
 571                 if (page_kept == FALSE)
 572                         goto no_throttle;
 573         }
 574         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 575 #if (DEVELOPMENT || DEBUG)
 576                 thread->t_page_creation_throttled_hard++;
 577                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 578 #endif /* DEVELOPMENT || DEBUG */
 579                 return (HARD_THROTTLE_DELAY);
 580         }
 581
 582         if ((vm_page_free_count < vm_page_throttle_limit || ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 583             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 584
 585                 clock_get_system_microtime(&tv_sec, &tv_usec);
 586
 587                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 588
 589                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 590                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 591
 592                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 593                                 /*
 594                                  * we'll reset our stats to give a well behaved app
 595                                  * that was unlucky enough to accumulate a bunch of pages
 596                                  * over a long period of time a chance to get out of
 597                                  * the throttled state... we reset the counter and timestamp
 598                                  * so that if it stays under the rate limit for the next second
 599                                  * it will be back in our good graces... if it exceeds it, it
 600                                  * will remain in the throttled state
 601                                  */
 602                                 thread->t_page_creation_time = tv_sec;
 603                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 604                         }
 605                         ++vm_page_throttle_count;
 606
 607                         thread->t_page_creation_throttled = 1;
 608
 609                         if ((COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) && HARD_THROTTLE_LIMIT_REACHED()) {
 610 #if (DEVELOPMENT || DEBUG)
 611                                 thread->t_page_creation_throttled_hard++;
 612                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 613 #endif /* DEVELOPMENT || DEBUG */
 614                                 return (HARD_THROTTLE_DELAY);
 615                         } else {
 616 #if (DEVELOPMENT || DEBUG)
 617                                 thread->t_page_creation_throttled_soft++;
 618                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 619 #endif /* DEVELOPMENT || DEBUG */
 620                                 return (SOFT_THROTTLE_DELAY);
 621                         }
 622                 }
 623                 thread->t_page_creation_time = tv_sec;
 624                 thread->t_page_creation_count = 0;
 625         }
 626 no_throttle:
 627         thread->t_page_creation_count++;
 628
 629         return (0);
 630 }
 631
 632 /*
 633  * check for various conditions that would
 634  * prevent us from creating a ZF page...
 635  * cleanup is based on being called from vm_fault_page
 636  *
 637  * object must be locked
 638  * object == m->object
 639  */
 640 static vm_fault_return_t
 641 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
 642 {
 643         int throttle_delay;
 644
 645         if (object->shadow_severed ||
 646             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 647                 /*
 648                  * Either:
 649                  * 1. the shadow chain was severed,
 650                  * 2. the purgeable object is volatile or empty and is marked
 651                  *    to fault on access while volatile.
 652                  * Just have to return an error at this point
 653                  */
 654                 if (m != VM_PAGE_NULL)
 655                         VM_PAGE_FREE(m);
 656                 vm_fault_cleanup(object, first_m);
 657
 658                 thread_interrupt_level(interruptible_state);
 659
 660                 return (VM_FAULT_MEMORY_ERROR);
 661         }
 662         if (vm_backing_store_low) {
 663                 /*
 664                  * are we protecting the system from
 665                  * backing store exhaustion.  If so
 666                  * sleep unless we are privileged.
 667                  */
 668                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
 669
 670                         if (m != VM_PAGE_NULL)
 671                                 VM_PAGE_FREE(m);
 672                         vm_fault_cleanup(object, first_m);
 673
 674                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
 675
 676                         thread_block(THREAD_CONTINUE_NULL);
 677                         thread_interrupt_level(interruptible_state);
 678
 679                         return (VM_FAULT_RETRY);
 680                 }
 681         }
 682         if (page_throttle == TRUE && (throttle_delay = vm_page_throttled(FALSE))) {
 683                 /*
 684                  * we're throttling zero-fills...
 685                  * treat this as if we couldn't grab a page
 686                  */
 687                 if (m != VM_PAGE_NULL)
 688                         VM_PAGE_FREE(m);
 689                 vm_fault_cleanup(object, first_m);
 690
 691                 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 692
 693                 delay(throttle_delay);
 694
 695                 if (current_thread_aborted()) {
 696                         thread_interrupt_level(interruptible_state);
 697                         return VM_FAULT_INTERRUPTED;
 698                 }
 699                 thread_interrupt_level(interruptible_state);
 700
 701                 return (VM_FAULT_MEMORY_SHORTAGE);
 702         }
 703         return (VM_FAULT_SUCCESS);
 704 }
 705
 706
 707 /*
 708  * Prepare page "m" for a zero-fill fault: zero its contents (skipped when
 709  * no_zero_fill is TRUE) and, when dynamic paging is disabled, move it onto the
 710  * throttled page queue.  Returns DBG_ZERO_FILL_FAULT or DBG_NZF_PAGE_FAULT.
 711  * m->object must be locked
 712  * page queue lock must NOT be held
 713  */
 714 static int
 715 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 716 {
 717         int my_fault = DBG_ZERO_FILL_FAULT;
 718
 719         /*
 720          * This is a zero-fill page fault...
 721          *
 722          * Checking the page lock is a waste of
 723          * time;  this page was absent, so
 724          * it can't be page locked by a pager.
 725          *
 726          * we also consider it undefined
 727          * with respect to instruction
 728          * execution.  i.e. it is the responsibility
 729          * of higher layers to call for an instruction
 730          * sync after changing the contents and before
 731          * sending a program into this area.  We
 732          * choose this approach for performance
 733          */
 734         m->pmapped = TRUE;
 735         /* reset the page's cs_validated, cs_tainted and cs_nx flags */
 736         m->cs_validated = FALSE;
 737         m->cs_tainted = FALSE;
 738         m->cs_nx = FALSE;
 739
 740         if (no_zero_fill == TRUE) {
 741                 my_fault = DBG_NZF_PAGE_FAULT;
 742                 /* an absent+busy page is returned untouched, skipping the asserts and queue handling below */
 743                 if (m->absent && m->busy)
 744                         return (my_fault);
 745         } else {
 746                 vm_page_zero_fill(m);
 747                 /* bump the zero_fill_count statistic and fire the "zfod" DTrace probe */
 748                 VM_STAT_INCR(zero_fill_count);
 749                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 750         }
 751         assert(!m->laundry);
 752         assert(m->object != kernel_object);
 753         //assert(m->pageq.next == NULL && m->pageq.prev == NULL);
 754
 755         if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default) &&
 756                 (m->object->purgable == VM_PURGABLE_DENY ||
 757                  m->object->purgable == VM_PURGABLE_NONVOLATILE ||
 758                  m->object->purgable == VM_PURGABLE_VOLATILE )) {
 759                 /* the test above ran without the page queues lock; the paging state is re-tested once it is held */
 760                 vm_page_lockspin_queues();
 761
 762                 if (!VM_DYNAMIC_PAGING_ENABLED(memory_manager_default)) {
 763                         assert(!VM_PAGE_WIRED(m));
 764
 765                         /*
 766                          * can't be on the pageout queue since we don't
 767                          * have a pager to try and clean to
 768                          */
 769                         assert(!m->pageout_queue);
 770
 771                         VM_PAGE_QUEUES_REMOVE(m);
 772                         /* park the page on the throttled queue and account for it */
 773                         queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
 774                         m->throttled = TRUE;
 775                         vm_page_throttled_count++;
 776                 }
 777                 vm_page_unlock_queues();
 778         }
 779         return (my_fault);
 780 }
 781
 782
 783 /*
 784  *      Routine:        vm_fault_page
 785  *      Purpose:
 786  *              Find the resident page for the virtual memory
 787  *              specified by the given virtual memory object
 788  *              and offset.
 789  *      Additional arguments:
 790  *              The required permissions for the page are given
 791  *              in "fault_type".  Desired permissions are included
 792  *              in "protection".
 793  *              fault_info is passed along to determine pagein cluster
 794  *              limits... it contains the expected reference pattern,
 795  *              cluster size if available, etc...
 796  *
 797  *              If the desired page is known to be resident (for
 798  *              example, because it was previously wired down), asserting
 799  *              the "unwiring" parameter will speed the search.
 800  *
 801  *              If the operation can be interrupted (by thread_abort
 802  *              or thread_terminate), then the "interruptible"
 803  *              field of "fault_info" should be set accordingly.
 804  *
 805  *      Results:
 806  *              The page containing the proper data is returned
 807  *              in "result_page".
 808  *
 809  *      In/out conditions:
 810  *              The source object must be locked and referenced,
 811  *              and must donate one paging reference.  The reference
 812  *              is not affected.  The paging reference and lock are
 813  *              consumed.
 814  *
 815  *              If the call succeeds, the object in which "result_page"
 816  *              resides is left locked and holding a paging reference.
 817  *              If this is not the original object, a busy page in the
 818  *              original object is returned in "top_page", to prevent other
 819  *              callers from pursuing this same data, along with a paging
 820  *              reference for the original object.  The "top_page" should
 821  *              be destroyed when this guarantee is no longer required.
 822  *              The "result_page" is also left busy.  It is not removed
 823  *              from the pageout queues.
 824  *      Special Case:
 825  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 826  *              fault succeeded but there's no VM page (i.e. the VM object
 827  *              does not actually hold VM pages, but device memory or
 828  *              large pages).  The object is still locked and we still hold a
 829  *              paging_in_progress reference.
 830  */
 831 unsigned int vm_fault_page_blocked_access = 0;	/* bumped by vm_fault_page() each time it finishes waiting for object->blocked_access to clear */
 832 unsigned int vm_fault_page_forced_retry = 0;	/* NOTE(review): no update visible in this part of the file; presumably counts forced fault retries -- confirm */
 833
 834 vm_fault_return_t
 835 vm_fault_page(
 836         /* Arguments: */
 837         vm_object_t     first_object,   /* Object to begin search */
 838         vm_object_offset_t first_offset,        /* Offset into object */
 839         vm_prot_t       fault_type,     /* What access is requested */
 840         boolean_t       must_be_resident,/* Must page be resident? */
 841         boolean_t       caller_lookup,  /* caller looked up page */
 842         /* Modifies in place: */
 843         vm_prot_t       *protection,    /* Protection for mapping */
 844         vm_page_t       *result_page,   /* Page found, if successful */
 845         /* Returns: */
 846         vm_page_t       *top_page,      /* Page in top object, if
 847                                          * not result_page.  */
 848         int             *type_of_fault, /* if non-null, fill in with type of fault
 849                                          * COW, zero-fill, etc... returned in trace point */
 850         /* More arguments: */
 851         kern_return_t   *error_code,    /* code if page is in error */
 852         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 853         boolean_t       data_supply,    /* treat as data_supply if
 854                                          * it is a write fault and a full
 855                                          * page is provided */
 856         vm_object_fault_info_t fault_info)
 857 {
 858         vm_page_t               m;
 859         vm_object_t             object;
 860         vm_object_offset_t      offset;
 861         vm_page_t               first_m;
 862         vm_object_t             next_object;
 863         vm_object_t             copy_object;
 864         boolean_t               look_for_page;
 865         boolean_t               force_fault_retry = FALSE;
 866         vm_prot_t               access_required = fault_type;
 867         vm_prot_t               wants_copy_flag;
 868         CLUSTER_STAT(int pages_at_higher_offsets;)
 869         CLUSTER_STAT(int pages_at_lower_offsets;)
 870         kern_return_t           wait_result;
 871         boolean_t               interruptible_state;
 872         boolean_t               data_already_requested = FALSE;
 873         vm_behavior_t           orig_behavior;
 874         vm_size_t               orig_cluster_size;
 875         vm_fault_return_t       error;
 876         int                     my_fault;
 877         uint32_t                try_failed_count;
 878         int                     interruptible; /* how may fault be interrupted? */
 879         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 880         memory_object_t         pager;
 881         vm_fault_return_t       retval;
 882
 883 /*
 884  * MACH page map - an optional optimization where a bit map is maintained
 885  * by the VM subsystem for internal objects to indicate which pages of
 886  * the object currently reside on backing store.  This existence map
 887  * duplicates information maintained by the vnode pager.  It is
 888  * created at the time of the first pageout against the object, i.e.
 889  * at the same time pager for the object is created.  The optimization
 890  * is designed to eliminate pager interaction overhead, if it is
 891  * 'known' that the page does not exist on backing store.
 892  *
 893  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 894  * either marked as paged out in the existence map for the object or no
 895  * existence map exists for the object.  MUST_ASK_PAGER() is one of the
 896  * criteria in the decision to invoke the pager.   It is also used as one
 897  * of the criteria to terminate the scan for adjacent pages in a clustered
 898  * pagein operation.  Note that MUST_ASK_PAGER() always evaluates to TRUE for
 899  * permanent objects.  Note also that if the pager for an internal object
 900  * has not been created, the pager is not invoked regardless of the value
 901  * of MUST_ASK_PAGER() and that clustered pagein scans are only done on an object
 902  * for which a pager has been created.
 903  *
 904  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 905  * is marked as paged out in the existence map for the object.
 906  * PAGED_OUT() is used to determine if a page has already been pushed
 907  * into a copy object in order to avoid a redundant page out operation.
 908  */
 909 #if MACH_PAGEMAP
 910 #define MUST_ASK_PAGER(o, f, s)                                 \
 911         ((vm_external_state_get((o)->existence_map, (f))        \
 912           != VM_EXTERNAL_STATE_ABSENT) &&                       \
 913          (s = (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))))        \
 914          != VM_EXTERNAL_STATE_ABSENT)
 915 #define PAGED_OUT(o, f)                                         \
 916         ((vm_external_state_get((o)->existence_map, (f))        \
 917           == VM_EXTERNAL_STATE_EXISTS) ||                       \
 918          (VM_COMPRESSOR_PAGER_STATE_GET((o), (f))               \
 919           == VM_EXTERNAL_STATE_EXISTS))
 920 #else /* MACH_PAGEMAP */
 921 #define MUST_ASK_PAGER(o, f, s)                                 \
 922         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 923 #define PAGED_OUT(o, f) \
 924         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 925 #endif /* MACH_PAGEMAP */
 926
 927 /*
 928  *      Recovery actions
 929  */
 930 #define RELEASE_PAGE(m)                                 \
 931         MACRO_BEGIN                                     \
 932         PAGE_WAKEUP_DONE(m);                            \
 933         if (!m->active && !m->inactive && !m->throttled) {              \
 934                 vm_page_lockspin_queues();                              \
 935                 if (!m->active && !m->inactive && !m->throttled) {      \
 936                         if (COMPRESSED_PAGER_IS_ACTIVE) \
 937                                 vm_page_deactivate(m);                  \
 938                         else                                            \
 939                                 vm_page_activate(m);                    \
 940                 }                                                       \
 941                 vm_page_unlock_queues();                                \
 942         }                                                               \
 943         MACRO_END
 944
 945 #if TRACEFAULTPAGE
 946         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 947 #endif
 948
 949         interruptible = fault_info->interruptible;
 950         interruptible_state = thread_interrupt_level(interruptible);
 951
 952         /*
 953          *      INVARIANTS (through entire routine):
 954          *
 955          *      1)      At all times, we must either have the object
 956          *              lock or a busy page in some object to prevent
 957          *              some other thread from trying to bring in
 958          *              the same page.
 959          *
 960          *              Note that we cannot hold any locks during the
 961          *              pager access or when waiting for memory, so
 962          *              we use a busy page then.
 963          *
 964          *      2)      To prevent another thread from racing us down the
 965          *              shadow chain and entering a new page in the top
 966          *              object before we do, we must keep a busy page in
 967          *              the top object while following the shadow chain.
 968          *
 969          *      3)      We must increment paging_in_progress on any object
 970          *              for which we have a busy page before dropping
 971          *              the object lock
 972          *
 973          *      4)      We leave busy pages on the pageout queues.
 974          *              If the pageout daemon comes across a busy page,
 975          *              it will remove the page from the pageout queues.
 976          */
 977
 978         object = first_object;
 979         offset = first_offset;
 980         first_m = VM_PAGE_NULL;
 981         access_required = fault_type;
 982
 983
 984         XPR(XPR_VM_FAULT,
 985                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 986                 object, offset, fault_type, *protection, 0);
 987
 988         /*
 989          * default type of fault
 990          */
 991         my_fault = DBG_CACHE_HIT_FAULT;
 992
 993         while (TRUE) {
 994 #if TRACEFAULTPAGE
 995                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 996 #endif
 997                 if (!object->alive) {
 998                         /*
 999                          * object is no longer valid
1000                          * clean up and return error
1001                          */
1002                         vm_fault_cleanup(object, first_m);
1003                         thread_interrupt_level(interruptible_state);
1004
1005                         return (VM_FAULT_MEMORY_ERROR);
1006                 }
1007
1008                 if (!object->pager_created && object->phys_contiguous) {
1009                         /*
1010                          * A physically-contiguous object without a pager:
1011                          * must be a "large page" object.  We do not deal
1012                          * with VM pages for this object.
1013                          */
1014                         caller_lookup = FALSE;
1015                         m = VM_PAGE_NULL;
1016                         goto phys_contig_object;
1017                 }
1018
1019                 if (object->blocked_access) {
1020                         /*
1021                          * Access to this VM object has been blocked.
1022                          * Replace our "paging_in_progress" reference with
1023                          * an "activity_in_progress" reference and wait for
1024                          * access to be unblocked.
1025                          */
1026                         caller_lookup = FALSE; /* no longer valid after sleep */
1027                         vm_object_activity_begin(object);
1028                         vm_object_paging_end(object);
1029                         while (object->blocked_access) {
1030                                 vm_object_sleep(object,
1031                                                 VM_OBJECT_EVENT_UNBLOCKED,
1032                                                 THREAD_UNINT);
1033                         }
1034                         vm_fault_page_blocked_access++;
1035                         vm_object_paging_begin(object);
1036                         vm_object_activity_end(object);
1037                 }
1038
1039                 /*
1040                  * See whether the page at 'offset' is resident
1041                  */
1042                 if (caller_lookup == TRUE) {
1043                         /*
1044                          * The caller has already looked up the page
1045                          * and gave us the result in "result_page".
1046                          * We can use this for the first lookup but
1047                          * it loses its validity as soon as we unlock
1048                          * the object.
1049                          */
1050                         m = *result_page;
1051                         caller_lookup = FALSE; /* no longer valid after that */
1052                 } else {
1053                         m = vm_page_lookup(object, offset);
1054                 }
1055 #if TRACEFAULTPAGE
1056                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1057 #endif
1058                 if (m != VM_PAGE_NULL) {
1059
1060                         if (m->busy) {
1061                                 /*
1062                                  * The page is being brought in,
1063                                  * wait for it and then retry.
1064                                  */
1065 #if TRACEFAULTPAGE
1066                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1067 #endif
1068                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1069
1070                                 XPR(XPR_VM_FAULT,
1071                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1072                                     object, offset,
1073                                     m, 0, 0);
1074                                 counter(c_vm_fault_page_block_busy_kernel++);
1075
1076                                 if (wait_result != THREAD_AWAKENED) {
1077                                         vm_fault_cleanup(object, first_m);
1078                                         thread_interrupt_level(interruptible_state);
1079
1080                                         if (wait_result == THREAD_RESTART)
1081                                                 return (VM_FAULT_RETRY);
1082                                         else
1083                                                 return (VM_FAULT_INTERRUPTED);
1084                                 }
1085                                 continue;
1086                         }
1087                         if (m->laundry) {
1088                                 m->pageout = FALSE;
1089
1090                                 if (!m->cleaning)
1091                                         vm_pageout_steal_laundry(m, FALSE);
1092                         }
1093                         if (m->phys_page == vm_page_guard_addr) {
1094                                 /*
1095                                  * Guard page: off limits !
1096                                  */
1097                                 if (fault_type == VM_PROT_NONE) {
1098                                         /*
1099                                          * The fault is not requesting any
1100                                          * access to the guard page, so it must
1101                                          * be just to wire or unwire it.
1102                                          * Let's pretend it succeeded...
1103                                          */
1104                                         m->busy = TRUE;
1105                                         *result_page = m;
1106                                         assert(first_m == VM_PAGE_NULL);
1107                                         *top_page = first_m;
1108                                         if (type_of_fault)
1109                                                 *type_of_fault = DBG_GUARD_FAULT;
1110                                         thread_interrupt_level(interruptible_state);
1111                                         return VM_FAULT_SUCCESS;
1112                                 } else {
1113                                         /*
1114                                          * The fault requests access to the
1115                                          * guard page: let's deny that !
1116                                          */
1117                                         vm_fault_cleanup(object, first_m);
1118                                         thread_interrupt_level(interruptible_state);
1119                                         return VM_FAULT_MEMORY_ERROR;
1120                                 }
1121                         }
1122
1123                         if (m->error) {
1124                                 /*
1125                                  * The page is in error, give up now.
1126                                  */
1127 #if TRACEFAULTPAGE
1128                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1129 #endif
1130                                 if (error_code)
1131                                         *error_code = KERN_MEMORY_ERROR;
1132                                 VM_PAGE_FREE(m);
1133
1134                                 vm_fault_cleanup(object, first_m);
1135                                 thread_interrupt_level(interruptible_state);
1136
1137                                 return (VM_FAULT_MEMORY_ERROR);
1138                         }
1139                         if (m->restart) {
1140                                 /*
1141                                  * The pager wants us to restart
1142                                  * at the top of the chain,
1143                                  * typically because it has moved the
1144                                  * page to another pager, then do so.
1145                                  */
1146 #if TRACEFAULTPAGE
1147                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1148 #endif
1149                                 VM_PAGE_FREE(m);
1150
1151                                 vm_fault_cleanup(object, first_m);
1152                                 thread_interrupt_level(interruptible_state);
1153
1154                                 return (VM_FAULT_RETRY);
1155                         }
1156                         if (m->absent) {
1157                                 /*
1158                                  * The page isn't busy, but is absent,
1159                                  * therefore it's deemed "unavailable".
1160                                  *
1161                                  * Remove the non-existent page (unless it's
1162                                  * in the top object) and move on down to the
1163                                  * next object (if there is one).
1164                                  */
1165 #if TRACEFAULTPAGE
1166                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1167 #endif
1168                                 next_object = object->shadow;
1169
1170                                 if (next_object == VM_OBJECT_NULL) {
1171                                         /*
1172                                          * Absent page at bottom of shadow
1173                                          * chain; zero fill the page we left
1174                                          * busy in the first object, and free
1175                                          * the absent page.
1176                                          */
1177                                         assert(!must_be_resident);
1178
1179                                         /*
1180                                          * check for any conditions that prevent
1181                                          * us from creating a new zero-fill page
1182                                          * vm_fault_check will do all of the
1183                                          * fault cleanup in the case of an error condition
1184                                          * including resetting the thread_interrupt_level
1185                                          */
1186                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1187
1188                                         if (error != VM_FAULT_SUCCESS)
1189                                                 return (error);
1190
1191                                         XPR(XPR_VM_FAULT,
1192                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1193                                                 object, offset,
1194                                                 m,
1195                                                 first_object, 0);
1196
1197                                         if (object != first_object) {
1198                                                 /*
1199                                                  * free the absent page we just found
1200                                                  */
1201                                                 VM_PAGE_FREE(m);
1202
1203                                                 /*
1204                                                  * drop reference and lock on current object
1205                                                  */
1206                                                 vm_object_paging_end(object);
1207                                                 vm_object_unlock(object);
1208
1209                                                 /*
1210                                                  * grab the original page we
1211                                                  * 'soldered' in place and
1212                                                  * retake lock on 'first_object'
1213                                                  */
1214                                                 m = first_m;
1215                                                 first_m = VM_PAGE_NULL;
1216
1217                                                 object = first_object;
1218                                                 offset = first_offset;
1219
1220                                                 vm_object_lock(object);
1221                                         } else {
1222                                                 /*
1223                                                  * we're going to use the absent page we just found
1224                                                  * so convert it to a 'busy' page
1225                                                  */
1226                                                 m->absent = FALSE;
1227                                                 m->busy = TRUE;
1228                                         }
1229                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1230                                                 m->absent = TRUE;
1231                                         /*
1232                                          * zero-fill the page and put it on
1233                                          * the correct paging queue
1234                                          */
1235                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1236
1237                                         break;
1238                                 } else {
1239                                         if (must_be_resident)
1240                                                 vm_object_paging_end(object);
1241                                         else if (object != first_object) {
1242                                                 vm_object_paging_end(object);
1243                                                 VM_PAGE_FREE(m);
1244                                         } else {
1245                                                 first_m = m;
1246                                                 m->absent = FALSE;
1247                                                 m->busy = TRUE;
1248
1249                                                 vm_page_lockspin_queues();
1250
1251                                                 assert(!m->pageout_queue);
1252                                                 VM_PAGE_QUEUES_REMOVE(m);
1253
1254                                                 vm_page_unlock_queues();
1255                                         }
1256                                         XPR(XPR_VM_FAULT,
1257                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1258                                                 object, offset,
1259                                                 next_object,
1260                                                 offset+object->vo_shadow_offset,0);
1261
1262                                         offset += object->vo_shadow_offset;
1263                                         fault_info->lo_offset += object->vo_shadow_offset;
1264                                         fault_info->hi_offset += object->vo_shadow_offset;
1265                                         access_required = VM_PROT_READ;
1266
1267                                         vm_object_lock(next_object);
1268                                         vm_object_unlock(object);
1269                                         object = next_object;
1270                                         vm_object_paging_begin(object);
1271
1272                                         /*
1273                                          * reset to default type of fault
1274                                          */
1275                                         my_fault = DBG_CACHE_HIT_FAULT;
1276
1277                                         continue;
1278                                 }
1279                         }
1280                         if ((m->cleaning)
1281                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1282                             && (fault_type & VM_PROT_WRITE)) {
1283                                 /*
1284                                  * This is a copy-on-write fault that will
1285                                  * cause us to revoke access to this page, but
1286                                  * this page is in the process of being cleaned
1287                                  * in a clustered pageout. We must wait until
1288                                  * the cleaning operation completes before
1289                                  * revoking access to the original page,
1290                                  * otherwise we might attempt to remove a
1291                                  * wired mapping.
1292                                  */
1293 #if TRACEFAULTPAGE
1294                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1295 #endif
1296                                 XPR(XPR_VM_FAULT,
1297                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1298                                         object, offset,
1299                                         m, 0, 0);
1300                                 /*
1301                                  * take an extra ref so that object won't die
1302                                  */
1303                                 vm_object_reference_locked(object);
1304
1305                                 vm_fault_cleanup(object, first_m);
1306
1307                                 counter(c_vm_fault_page_block_backoff_kernel++);
1308                                 vm_object_lock(object);
1309                                 assert(object->ref_count > 0);
1310
1311                                 m = vm_page_lookup(object, offset);
1312
1313                                 if (m != VM_PAGE_NULL && m->cleaning) {
1314                                         PAGE_ASSERT_WAIT(m, interruptible);
1315
1316                                         vm_object_unlock(object);
1317                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1318                                         vm_object_deallocate(object);
1319
1320                                         goto backoff;
1321                                 } else {
1322                                         vm_object_unlock(object);
1323
1324                                         vm_object_deallocate(object);
1325                                         thread_interrupt_level(interruptible_state);
1326
1327                                         return (VM_FAULT_RETRY);
1328                                 }
1329                         }
1330                         if (type_of_fault == NULL && m->speculative &&
1331                             !(fault_info != NULL && fault_info->stealth)) {
1332                                 /*
1333                                  * If we were passed a non-NULL pointer for
1334                                  * "type_of_fault", then we came from
1335                                  * vm_fault... we'll let it deal with
1336                                  * this condition, since it
1337                                  * needs to see m->speculative to correctly
1338                                  * account the pageins, otherwise...
1339                                  * take it off the speculative queue, we'll
1340                                  * let the caller of vm_fault_page deal
1341                                  * with getting it onto the correct queue
1342                                  *
1343                                  * If the caller specified in fault_info that
1344                                  * it wants a "stealth" fault, we also leave
1345                                  * the page in the speculative queue.
1346                                  */
1347                                 vm_page_lockspin_queues();
1348                                 if (m->speculative)
1349                                         VM_PAGE_QUEUES_REMOVE(m);
1350                                 vm_page_unlock_queues();
1351                         }
1352
1353                         if (m->encrypted) {
1354                                 /*
1355                                  * ENCRYPTED SWAP:
1356                                  * the user needs access to a page that we
1357                                  * encrypted before paging it out.
1358                                  * Decrypt the page now.
1359                                  * Keep it busy to prevent anyone from
1360                                  * accessing it during the decryption.
1361                                  */
1362                                 m->busy = TRUE;
1363                                 vm_page_decrypt(m, 0);
1364                                 assert(object == m->object);
1365                                 assert(m->busy);
1366                                 PAGE_WAKEUP_DONE(m);
1367
1368                                 /*
1369                                  * Retry from the top, in case
1370                                  * something changed while we were
1371                                  * decrypting.
1372                                  */
1373                                 continue;
1374                         }
1375                         ASSERT_PAGE_DECRYPTED(m);
1376
1377                         if (m->object->code_signed) {
1378                                 /*
1379                                  * CODE SIGNING:
1380                                  * We just paged in a page from a signed
1381                                  * memory object but we don't need to
1382                                  * validate it now.  We'll validate it if and
1383                                  * when it gets mapped into a user address
1384                                  * space for the first time or when the page
1385                                  * gets copied to another object as a result
1386                                  * of a copy-on-write.
1387                                  */
1388                         }
1389
1390                         /*
1391                          * We mark the page busy and leave it on
1392                          * the pageout queues.  If the pageout
1393                          * daemon comes across it, then it will
1394                          * remove the page from the queue, but not the object
1395                          */
1396 #if TRACEFAULTPAGE
1397                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1398 #endif
1399                         XPR(XPR_VM_FAULT,
1400                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1401                                 object, offset, m, 0, 0);
1402                         assert(!m->busy);
1403                         assert(!m->absent);
1404
1405                         m->busy = TRUE;
1406                         break;
1407                 }
1408
1409
1410                 /*
1411                  * we get here when there is no page present in the object at
1412                  * the offset we're interested in... we'll allocate a page
1413                  * at this point if the pager associated with
1414                  * this object can provide the data or we're the top object...
1415                  * object is locked;  m == NULL
1416                  */
1417                 if (must_be_resident) {
1418                         if (fault_type == VM_PROT_NONE &&
1419                             object == kernel_object) {
1420                                 /*
1421                                  * We've been called from vm_fault_unwire()
1422                                  * while removing a map entry that was allocated
1423                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1424                                  * is not present and there's nothing more to
1425                                  * do here (nothing to unwire).
1426                                  */
1427                                 vm_fault_cleanup(object, first_m);
1428                                 thread_interrupt_level(interruptible_state);
1429
1430                                 return VM_FAULT_MEMORY_ERROR;
1431                         }
1432
1433                         goto dont_look_for_page;
1434                 }
1435
1436 #if !MACH_PAGEMAP
1437                 data_supply = FALSE;
1438 #endif /* !MACH_PAGEMAP */
1439
1440                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1441
1442 #if TRACEFAULTPAGE
1443                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1444 #endif
1445                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1446                         /*
1447                          * Allocate a new page for this object/offset pair as a placeholder
1448                          */
1449                         m = vm_page_grab();
1450 #if TRACEFAULTPAGE
1451                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1452 #endif
1453                         if (m == VM_PAGE_NULL) {
1454
1455                                 vm_fault_cleanup(object, first_m);
1456                                 thread_interrupt_level(interruptible_state);
1457
1458                                 return (VM_FAULT_MEMORY_SHORTAGE);
1459                         }
1460
1461                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1462                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1463                         } else {
1464                                 vm_page_insert(m, object, offset);
1465                         }
1466                 }
1467                 if (look_for_page) {
1468                         kern_return_t   rc;
1469                         int             my_fault_type;
1470
1471                         /*
1472                          *      If the memory manager is not ready, we
1473                          *      cannot make requests.
1474                          */
1475                         if (!object->pager_ready) {
1476 #if TRACEFAULTPAGE
1477                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1478 #endif
1479                                 if (m != VM_PAGE_NULL)
1480                                         VM_PAGE_FREE(m);
1481
1482                                 XPR(XPR_VM_FAULT,
1483                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1484                                         object, offset, 0, 0, 0);
1485
1486                                 /*
1487                                  * take an extra ref so object won't die
1488                                  */
1489                                 vm_object_reference_locked(object);
1490                                 vm_fault_cleanup(object, first_m);
1491                                 counter(c_vm_fault_page_block_backoff_kernel++);
1492
1493                                 vm_object_lock(object);
1494                                 assert(object->ref_count > 0);
1495
1496                                 if (!object->pager_ready) {
1497                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1498
1499                                         vm_object_unlock(object);
1500                                         if (wait_result == THREAD_WAITING)
1501                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1502                                         vm_object_deallocate(object);
1503
1504                                         goto backoff;
1505                                 } else {
1506                                         vm_object_unlock(object);
1507                                         vm_object_deallocate(object);
1508                                         thread_interrupt_level(interruptible_state);
1509
1510                                         return (VM_FAULT_RETRY);
1511                                 }
1512                         }
1513                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1514                                 /*
1515                                  * If there are too many outstanding page
1516                                  * requests pending on this external object, we
1517                                  * wait for them to be resolved now.
1518                                  */
1519 #if TRACEFAULTPAGE
1520                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1521 #endif
1522                                 if (m != VM_PAGE_NULL)
1523                                         VM_PAGE_FREE(m);
1524                                 /*
1525                                  * take an extra ref so object won't die
1526                                  */
1527                                 vm_object_reference_locked(object);
1528
1529                                 vm_fault_cleanup(object, first_m);
1530
1531                                 counter(c_vm_fault_page_block_backoff_kernel++);
1532
1533                                 vm_object_lock(object);
1534                                 assert(object->ref_count > 0);
1535
1536                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1537                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1538
1539                                         vm_object_unlock(object);
1540                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1541                                         vm_object_deallocate(object);
1542
1543                                         goto backoff;
1544                                 } else {
1545                                         vm_object_unlock(object);
1546                                         vm_object_deallocate(object);
1547                                         thread_interrupt_level(interruptible_state);
1548
1549                                         return (VM_FAULT_RETRY);
1550                                 }
1551                         }
1552                         if (object->internal &&
1553                             (COMPRESSED_PAGER_IS_ACTIVE
1554                              || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE)) {
1555                                 int compressed_count_delta;
1556
1557                                 if (m == VM_PAGE_NULL) {
1558                                         /*
1559                                          * Allocate a new page for this object/offset pair as a placeholder
1560                                          */
1561                                         m = vm_page_grab();
1562 #if TRACEFAULTPAGE
1563                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1564 #endif
1565                                         if (m == VM_PAGE_NULL) {
1566
1567                                                 vm_fault_cleanup(object, first_m);
1568                                                 thread_interrupt_level(interruptible_state);
1569
1570                                                 return (VM_FAULT_MEMORY_SHORTAGE);
1571                                         }
1572
1573                                         m->absent = TRUE;
1574                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1575                                                 vm_page_insert_internal(m, object, offset, FALSE, TRUE, TRUE);
1576                                         } else {
1577                                                 vm_page_insert(m, object, offset);
1578                                         }
1579                                 }
1580                                 assert(m->busy);
1581
1582                                 m->absent = TRUE;
1583                                 pager = object->pager;
1584
1585                                 assert(object->paging_in_progress > 0);
1586                                 vm_object_unlock(object);
1587
1588                                 rc = vm_compressor_pager_get(
1589                                         pager,
1590                                         offset + object->paging_offset,
1591                                         m->phys_page,
1592                                         &my_fault_type,
1593                                         0,
1594                                         &compressed_count_delta);
1595
1596                                 if (type_of_fault == NULL) {
1597                                         int     throttle_delay;
1598
1599                                         /*
1600                                          * we weren't called from vm_fault, so we
1601                                          * need to apply page creation throttling;
1602                                          * do it before we re-acquire any locks
1603                                          */
1604                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1605                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1606                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1607                                                         delay(throttle_delay);
1608                                                 }
1609                                         }
1610                                 }
1611                                 vm_object_lock(object);
1612                                 assert(object->paging_in_progress > 0);
1613
1614                                 vm_compressor_pager_count(
1615                                         pager,
1616                                         compressed_count_delta,
1617                                         FALSE, /* shared_lock */
1618                                         object);
1619
1620                                 switch (rc) {
1621                                 case KERN_SUCCESS:
1622                                         m->absent = FALSE;
1623                                         m->dirty = TRUE;
1624                                         if ((m->object->wimg_bits &
1625                                              VM_WIMG_MASK) !=
1626                                             VM_WIMG_USE_DEFAULT) {
1627                                                 /*
1628                                                  * If the page is not cacheable,
1629                                                  * we can't let its contents
1630                                                  * linger in the data cache
1631                                                  * after the decompression.
1632                                                  */
1633                                                 pmap_sync_page_attributes_phys(
1634                                                         m->phys_page);
1635                                         } else {
1636                                                 m->written_by_kernel = TRUE;
1637                                         }
1638
1639                                         /*
1640                                          * If the object is purgeable, its
1641                                          * owner's purgeable ledgers have been
1642                                          * updated in vm_page_insert() but the
1643                                          * page was also accounted for in a
1644                                          * "compressed purgeable" ledger, so
1645                                          * update that now.
1646                                          */
1647                                         if ((object->purgable !=
1648                                              VM_PURGABLE_DENY) &&
1649                                             (object->vo_purgeable_owner !=
1650                                              NULL)) {
1651                                                 /*
1652                                                  * One less compressed
1653                                                  * purgeable page.
1654                                                  */
1655                                                 vm_purgeable_compressed_update(
1656                                                         object,
1657                                                         -1);
1658                                         }
1659
1660                                         break;
1661                                 case KERN_MEMORY_FAILURE:
1662                                         m->unusual = TRUE;
1663                                         m->error = TRUE;
1664                                         m->absent = FALSE;
1665                                         break;
1666                                 case KERN_MEMORY_ERROR:
1667                                         assert(m->absent);
1668                                         break;
1669                                 default:
1670                                         panic("vm_fault_page(): unexpected "
1671                                               "error %d from "
1672                                               "vm_compressor_pager_get()\n",
1673                                               rc);
1674                                 }
1675                                 PAGE_WAKEUP_DONE(m);
1676
1677                                 rc = KERN_SUCCESS;
1678                                 goto data_requested;
1679                         }
1680                         my_fault_type = DBG_PAGEIN_FAULT;
1681
1682                         if (m != VM_PAGE_NULL) {
1683                                 VM_PAGE_FREE(m);
1684                                 m = VM_PAGE_NULL;
1685                         }
1686
1687 #if TRACEFAULTPAGE
1688                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1689 #endif
1690
1691                         /*
1692                          * It's possible someone called vm_object_destroy while we weren't
1693                          * holding the object lock.  If that has happened, then bail out
1694                          * here.
1695                          */
1696
1697                         pager = object->pager;
1698
1699                         if (pager == MEMORY_OBJECT_NULL) {
1700                                 vm_fault_cleanup(object, first_m);
1701                                 thread_interrupt_level(interruptible_state);
1702                                 return VM_FAULT_MEMORY_ERROR;
1703                         }
1704
1705                         /*
1706                          * We have an absent page in place for the faulting offset,
1707                          * so we can release the object lock.
1708                          */
1709
1710                         vm_object_unlock(object);
1711
1712                         /*
1713                          * If this object uses a copy_call strategy,
1714                          * and we are interested in a copy of this object
1715                          * (having gotten here only by following a
1716                          * shadow chain), then tell the memory manager
1717                          * via a flag added to the desired_access
1718                          * parameter, so that it can detect a race
1719                          * between our walking down the shadow chain
1720                          * and its pushing pages up into a copy of
1721                          * the object that it manages.
1722                          */
1723                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1724                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1725                         else
1726                                 wants_copy_flag = VM_PROT_NONE;
1727
1728                         XPR(XPR_VM_FAULT,
1729                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1730                                 object, offset, m,
1731                                 access_required | wants_copy_flag, 0);
1732
1733                         if (object->copy == first_object) {
1734                                 /*
1735                                  * if we issue the memory_object_data_request in
1736                                  * this state, we are subject to a deadlock with
1737                                  * the underlying filesystem if it is trying to
1738                                  * shrink the file resulting in a push of pages
1739                                  * into the copy object...  that push will stall
1740                                  * on the placeholder page, and if the pushing thread
1741                                  * is holding a lock that is required on the pagein
1742                                  * path (such as a truncate lock), we'll deadlock...
1743                                  * to avoid this potential deadlock, we throw away
1744                                  * our placeholder page before calling memory_object_data_request
1745                                  * and force this thread to retry the vm_fault_page after
1746                                  * we have issued the I/O.  the second time through this path
1747                                  * we will find the page already in the cache (presumably still
1748                                  * busy waiting for the I/O to complete) and then complete
1749                                  * the fault w/o having to go through memory_object_data_request again
1750                                  */
1751                                 assert(first_m != VM_PAGE_NULL);
1752                                 assert(first_m->object == first_object);
1753
1754                                 vm_object_lock(first_object);
1755                                 VM_PAGE_FREE(first_m);
1756                                 vm_object_paging_end(first_object);
1757                                 vm_object_unlock(first_object);
1758
1759                                 first_m = VM_PAGE_NULL;
1760                                 force_fault_retry = TRUE;
1761
1762                                 vm_fault_page_forced_retry++;
1763                         }
1764
1765                         if (data_already_requested == TRUE) {
1766                                 orig_behavior = fault_info->behavior;
1767                                 orig_cluster_size = fault_info->cluster_size;
1768
1769                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1770                                 fault_info->cluster_size = PAGE_SIZE;
1771                         }
1772                         /*
1773                          * Call the memory manager to retrieve the data.
1774                          */
1775                         rc = memory_object_data_request(
1776                                 pager,
1777                                 offset + object->paging_offset,
1778                                 PAGE_SIZE,
1779                                 access_required | wants_copy_flag,
1780                                 (memory_object_fault_info_t)fault_info);
1781
1782                         if (data_already_requested == TRUE) {
1783                                 fault_info->behavior = orig_behavior;
1784                                 fault_info->cluster_size = orig_cluster_size;
1785                         } else
1786                                 data_already_requested = TRUE;
1787
1788                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1789 #if TRACEFAULTPAGE
1790                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1791 #endif
1792                         vm_object_lock(object);
1793
1794                 data_requested:
1795                         if (rc != KERN_SUCCESS) {
1796
1797                                 vm_fault_cleanup(object, first_m);
1798                                 thread_interrupt_level(interruptible_state);
1799
1800                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1801                                         VM_FAULT_INTERRUPTED :
1802                                         VM_FAULT_MEMORY_ERROR);
1803                         } else {
1804                                 clock_sec_t     tv_sec;
1805                                 clock_usec_t    tv_usec;
1806
1807                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1808                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1809                                         current_thread()->t_page_creation_time = tv_sec;
1810                                         current_thread()->t_page_creation_count = 0;
1811                                 }
1812                         }
1813                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1814
1815                                 vm_fault_cleanup(object, first_m);
1816                                 thread_interrupt_level(interruptible_state);
1817
1818                                 return (VM_FAULT_INTERRUPTED);
1819                         }
1820                         if (force_fault_retry == TRUE) {
1821
1822                                 vm_fault_cleanup(object, first_m);
1823                                 thread_interrupt_level(interruptible_state);
1824
1825                                 return (VM_FAULT_RETRY);
1826                         }
1827                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1828                                 /*
1829                                  * No page here means that the object we
1830                                  * initially looked up was "physically
1831                                  * contiguous" (i.e. device memory).  However,
1832                                  * with Virtual VRAM, the object might not
1833                                  * be backed by that device memory anymore,
1834                                  * so we're done here only if the object is
1835                                  * still "phys_contiguous".
1836                                  * Otherwise, if the object is no longer
1837                                  * "phys_contiguous", we need to retry the
1838                                  * page fault against the object's new backing
1839                                  * store (different memory object).
1840                                  */
1841                         phys_contig_object:
1842                                 goto done;
1843                         }
1844                         /*
1845                          * potentially a pagein fault
1846                          * if we make it through the state checks
1847                          * above, then we'll count it as such
1848                          */
1849                         my_fault = my_fault_type;
1850
1851                         /*
1852                          * Retry with same object/offset, since new data may
1853                          * be in a different page (i.e., m is meaningless at
1854                          * this point).
1855                          */
1856                         continue;
1857                 }
1858 dont_look_for_page:
1859                 /*
1860                  * We get here if the object has no pager, or an existence map
1861                  * exists and indicates the page isn't present on the pager
1862                  * or we're unwiring a page.  If a pager exists, but there
1863                  * is no existence map, then the m->absent case above handles
1864                  * the ZF case when the pager can't provide the page
1865                  */
1866 #if TRACEFAULTPAGE
1867                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1868 #endif
1869                 if (object == first_object)
1870                         first_m = m;
1871                 else
1872                         assert(m == VM_PAGE_NULL);
1873
1874                 XPR(XPR_VM_FAULT,
1875                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1876                         object, offset, m,
1877                         object->shadow, 0);
1878
1879                 next_object = object->shadow;
1880
1881                 if (next_object == VM_OBJECT_NULL) {
1882                         /*
1883                          * we've hit the bottom of the shadow chain,
1884                          * fill the page in the top object with zeros.
1885                          */
1886                         assert(!must_be_resident);
1887
1888                         if (object != first_object) {
1889                                 vm_object_paging_end(object);
1890                                 vm_object_unlock(object);
1891
1892                                 object = first_object;
1893                                 offset = first_offset;
1894                                 vm_object_lock(object);
1895                         }
1896                         m = first_m;
1897                         assert(m->object == object);
1898                         first_m = VM_PAGE_NULL;
1899
1900                         /*
1901                          * check for any conditions that prevent
1902                          * us from creating a new zero-fill page
1903                          * vm_fault_check will do all of the
1904                          * fault cleanup in the case of an error condition
1905                          * including resetting the thread_interrupt_level
1906                          */
1907                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1908
1909                         if (error != VM_FAULT_SUCCESS)
1910                                 return (error);
1911
1912                         if (m == VM_PAGE_NULL) {
1913                                 m = vm_page_grab();
1914
1915                                 if (m == VM_PAGE_NULL) {
1916                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1917                                         thread_interrupt_level(interruptible_state);
1918
1919                                         return (VM_FAULT_MEMORY_SHORTAGE);
1920                                 }
1921                                 vm_page_insert(m, object, offset);
1922                         }
1923                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1924                                 m->absent = TRUE;
1925
1926                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1927
1928                         break;
1929
1930                 } else {
1931                         /*
1932                          * Move on to the next object.  Lock the next
1933                          * object before unlocking the current one.
1934                          */
1935                         if ((object != first_object) || must_be_resident)
1936                                 vm_object_paging_end(object);
1937
1938                         offset += object->vo_shadow_offset;
1939                         fault_info->lo_offset += object->vo_shadow_offset;
1940                         fault_info->hi_offset += object->vo_shadow_offset;
1941                         access_required = VM_PROT_READ;
1942
1943                         vm_object_lock(next_object);
1944                         vm_object_unlock(object);
1945
1946                         object = next_object;
1947                         vm_object_paging_begin(object);
1948                 }
1949         }
1950
1951         /*
1952          *      PAGE HAS BEEN FOUND.
1953          *
1954          *      This page (m) is:
1955          *              busy, so that we can play with it;
1956          *              not absent, so that nobody else will fill it;
1957          *              possibly eligible for pageout;
1958          *
1959          *      The top-level page (first_m) is:
1960          *              VM_PAGE_NULL if the page was found in the
1961          *               top-level object;
1962          *              busy, not absent, and ineligible for pageout.
1963          *
1964          *      The current object (object) is locked.  A paging
1965          *      reference is held for the current and top-level
1966          *      objects.
1967          */
1968
1969 #if TRACEFAULTPAGE
1970         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1971 #endif
1972 #if     EXTRA_ASSERTIONS
1973         assert(m->busy && !m->absent);
1974         assert((first_m == VM_PAGE_NULL) ||
1975                (first_m->busy && !first_m->absent &&
1976                 !first_m->active && !first_m->inactive));
1977 #endif  /* EXTRA_ASSERTIONS */
1978
1979         /*
1980          * ENCRYPTED SWAP:
1981          * If we found a page, we must have decrypted it before we
1982          * get here...
1983          */
1984         ASSERT_PAGE_DECRYPTED(m);
1985
1986         XPR(XPR_VM_FAULT,
1987             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1988                 object, offset, m,
1989                 first_object, first_m);
1990
1991         /*
1992          * If the page is being written, but isn't
1993          * already owned by the top-level object,
1994          * we have to copy it into a new page owned
1995          * by the top-level object.
1996          */
1997         if (object != first_object) {
1998
1999 #if TRACEFAULTPAGE
2000                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2001 #endif
2002                 if (fault_type & VM_PROT_WRITE) {
2003                         vm_page_t copy_m;
2004
2005                         /*
2006                          * We only really need to copy if we
2007                          * want to write it.
2008                          */
2009                         assert(!must_be_resident);
2010
2011                         /*
2012                          * Are we protecting the system from
2013                          * backing store exhaustion?  If so,
2014                          * sleep unless we are privileged.
2015                          */
2016                         if (vm_backing_store_low) {
2017                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2018
2019                                         RELEASE_PAGE(m);
2020                                         vm_fault_cleanup(object, first_m);
2021
2022                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2023
2024                                         thread_block(THREAD_CONTINUE_NULL);
2025                                         thread_interrupt_level(interruptible_state);
2026
2027                                         return (VM_FAULT_RETRY);
2028                                 }
2029                         }
2030                         /*
2031                          * If we try to collapse first_object at this
2032                          * point, we may deadlock when we try to get
2033                          * the lock on an intermediate object (since we
2034                          * have the bottom object locked).  We can't
2035                          * unlock the bottom object, because the page
2036                          * we found may move (by collapse) if we do.
2037                          *
2038                          * Instead, we first copy the page.  Then, when
2039                          * we have no more use for the bottom object,
2040                          * we unlock it and try to collapse.
2041                          *
2042                          * Note that we copy the page even if we didn't
2043                          * need to... that's the breaks.
2044                          */
2045
2046                         /*
2047                          * Allocate a page for the copy
2048                          */
2049                         copy_m = vm_page_grab();
2050
2051                         if (copy_m == VM_PAGE_NULL) {
2052                                 RELEASE_PAGE(m);
2053
2054                                 vm_fault_cleanup(object, first_m);
2055                                 thread_interrupt_level(interruptible_state);
2056
2057                                 return (VM_FAULT_MEMORY_SHORTAGE);
2058                         }
2059                         XPR(XPR_VM_FAULT,
2060                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2061                                 object, offset,
2062                                 m, copy_m, 0);
2063
2064                         vm_page_copy(m, copy_m);
2065
2066                         /*
2067                          * If another map is truly sharing this
2068                          * page with us, we have to flush all
2069                          * uses of the original page, since we
2070                          * can't distinguish those which want the
2071                          * original from those which need the
2072                          * new copy.
2073                          *
2074                          * XXXO If we know that only one map has
2075                          * access to this page, then we could
2076                          * avoid the pmap_disconnect() call.
2077                          */
2078                         if (m->pmapped)
2079                                 pmap_disconnect(m->phys_page);
2080
2081                         if (m->clustered) {
2082                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2083                                 VM_PAGE_CONSUME_CLUSTERED(m);
2084                         }
2085                         assert(!m->cleaning);
2086
2087                         /*
2088                          * We no longer need the old page or object.
2089                          */
2090                         RELEASE_PAGE(m);
2091
2092                         vm_object_paging_end(object);
2093                         vm_object_unlock(object);
2094
2095                         my_fault = DBG_COW_FAULT;
2096                         VM_STAT_INCR(cow_faults);
2097                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2098                         current_task()->cow_faults++;
2099
2100                         object = first_object;
2101                         offset = first_offset;
2102
2103                         vm_object_lock(object);
2104                         /*
2105                          * get rid of the place holder
2106                          * page that we soldered in earlier
2107                          */
2108                         VM_PAGE_FREE(first_m);
2109                         first_m = VM_PAGE_NULL;
2110
2111                         /*
2112                          * and replace it with the
2113                          * page we just copied into
2114                          */
2115                         assert(copy_m->busy);
2116                         vm_page_insert(copy_m, object, offset);
2117                         SET_PAGE_DIRTY(copy_m, TRUE);
2118
2119                         m = copy_m;
2120                         /*
2121                          * Now that we've gotten the copy out of the
2122                          * way, let's try to collapse the top object.
2123                          * But we have to play ugly games with
2124                          * paging_in_progress to do that...
2125                          */
2126                         vm_object_paging_end(object);
2127                         vm_object_collapse(object, offset, TRUE);
2128                         vm_object_paging_begin(object);
2129
2130                 } else
2131                         *protection &= (~VM_PROT_WRITE);
2132         }
2133         /*
2134          * Now check whether the page needs to be pushed into the
2135          * copy object.  The use of asymmetric copy on write for
2136          * shared temporary objects means that we may do two copies to
2137          * satisfy the fault; one above to get the page from a
2138          * shadowed object, and one here to push it into the copy.
2139          */
2140         try_failed_count = 0;
2141
2142         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2143                 vm_object_offset_t      copy_offset;
2144                 vm_page_t               copy_m;
2145
2146 #if TRACEFAULTPAGE
2147                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2148 #endif
2149                 /*
2150                  * If the page is being written, but hasn't been
2151                  * copied to the copy-object, we have to copy it there.
2152                  */
2153                 if ((fault_type & VM_PROT_WRITE) == 0) {
2154                         *protection &= ~VM_PROT_WRITE;
2155                         break;
2156                 }
2157
2158                 /*
2159                  * If the page was guaranteed to be resident,
2160                  * we must have already performed the copy.
2161                  */
2162                 if (must_be_resident)
2163                         break;
2164
2165                 /*
2166                  * Try to get the lock on the copy_object.
2167                  */
2168                 if (!vm_object_lock_try(copy_object)) {
2169
2170                         vm_object_unlock(object);
2171                         try_failed_count++;
2172
2173                         mutex_pause(try_failed_count);  /* wait a bit */
2174                         vm_object_lock(object);
2175
2176                         continue;
2177                 }
2178                 try_failed_count = 0;
2179
2180                 /*
2181                  * Make another reference to the copy-object,
2182                  * to keep it from disappearing during the
2183                  * copy.
2184                  */
2185                 vm_object_reference_locked(copy_object);
2186
2187                 /*
2188                  * Does the page exist in the copy?
2189                  */
2190                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2191
2192                 if (copy_object->vo_size <= copy_offset)
2193                         /*
2194                          * Copy object doesn't cover this page -- do nothing.
2195                          */
2196                         ;
2197                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2198                         /*
2199                          * Page currently exists in the copy object
2200                          */
2201                         if (copy_m->busy) {
2202                                 /*
2203                                  * If the page is being brought
2204                                  * in, wait for it and then retry.
2205                                  */
2206                                 RELEASE_PAGE(m);
2207
2208                                 /*
2209                                  * take an extra ref so object won't die
2210                                  */
2211                                 vm_object_reference_locked(copy_object);
2212                                 vm_object_unlock(copy_object);
2213                                 vm_fault_cleanup(object, first_m);
2214                                 counter(c_vm_fault_page_block_backoff_kernel++);
2215
2216                                 vm_object_lock(copy_object);
2217                                 assert(copy_object->ref_count > 0);
2218                                 VM_OBJ_RES_DECR(copy_object);
2219                                 vm_object_lock_assert_exclusive(copy_object);
2220                                 copy_object->ref_count--;
2221                                 assert(copy_object->ref_count > 0);
2222                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2223                                 /*
2224                                  * ENCRYPTED SWAP:
2225                                  * it's OK if the "copy_m" page is encrypted,
2226                                  * because we're not moving it nor handling its
2227                                  * contents.
2228                                  */
2229                                 if (copy_m != VM_PAGE_NULL && copy_m->busy) {
2230                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2231
2232                                         vm_object_unlock(copy_object);
2233                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2234                                         vm_object_deallocate(copy_object);
2235
2236                                         goto backoff;
2237                                 } else {
2238                                         vm_object_unlock(copy_object);
2239                                         vm_object_deallocate(copy_object);
2240                                         thread_interrupt_level(interruptible_state);
2241
2242                                         return (VM_FAULT_RETRY);
2243                                 }
2244                         }
2245                 }
2246                 else if (!PAGED_OUT(copy_object, copy_offset)) {
2247                         /*
2248                          * If PAGED_OUT is TRUE, then the page used to exist
2249                          * in the copy-object, and has already been paged out.
2250                          * We don't need to repeat this. If PAGED_OUT is
2251                          * FALSE, then either we don't know (!pager_created,
2252                          * for example) or it hasn't been paged out.
2253                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2254                          * We must copy the page to the copy object.
2255                          */
2256
2257                         if (vm_backing_store_low) {
2258                                 /*
2259                                  * We are protecting the system from
2260                                  * backing store exhaustion, so
2261                                  * sleep unless we are privileged.
2262                                  */
2263                                 if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
2264                                         assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
2265
2266                                         RELEASE_PAGE(m);
2267                                         VM_OBJ_RES_DECR(copy_object);
2268                                         vm_object_lock_assert_exclusive(copy_object);
2269                                         copy_object->ref_count--;
2270                                         assert(copy_object->ref_count > 0);
2271
2272                                         vm_object_unlock(copy_object);
2273                                         vm_fault_cleanup(object, first_m);
2274                                         thread_block(THREAD_CONTINUE_NULL);
2275                                         thread_interrupt_level(interruptible_state);
2276
2277                                         return (VM_FAULT_RETRY);
2278                                 }
2279                         }
2280                         /*
2281                          * Allocate a page for the copy
2282                          */
2283                         copy_m = vm_page_alloc(copy_object, copy_offset);
2284
2285                         if (copy_m == VM_PAGE_NULL) {
2286                                 RELEASE_PAGE(m);
2287
2288                                 VM_OBJ_RES_DECR(copy_object);
2289                                 vm_object_lock_assert_exclusive(copy_object);
2290                                 copy_object->ref_count--;
2291                                 assert(copy_object->ref_count > 0);
2292
2293                                 vm_object_unlock(copy_object);
2294                                 vm_fault_cleanup(object, first_m);
2295                                 thread_interrupt_level(interruptible_state);
2296
2297                                 return (VM_FAULT_MEMORY_SHORTAGE);
2298                         }
2299                         /*
2300                          * Must copy page into copy-object.
2301                          */
2302                         vm_page_copy(m, copy_m);
2303
2304                         /*
2305                          * If the old page was in use by any users
2306                          * of the copy-object, it must be removed
2307                          * from all pmaps.  (We can't know which
2308                          * pmaps use it.)
2309                          */
2310                         if (m->pmapped)
2311                                 pmap_disconnect(m->phys_page);
2312
2313                         if (m->clustered) {
2314                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2315                                 VM_PAGE_CONSUME_CLUSTERED(m);
2316                         }
2317                         /*
2318                          * If there's a pager, then immediately
2319                          * page out this page, using the "initialize"
2320                          * option.  Else, we use the copy.
2321                          */
2322                         if ((!copy_object->pager_ready)
2323 #if MACH_PAGEMAP
2324                             || vm_external_state_get(copy_object->existence_map, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2325 #endif
2326                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2327                             ) {
2328
2329                                 vm_page_lockspin_queues();
2330                                 assert(!m->cleaning);
2331                                 vm_page_activate(copy_m);
2332                                 vm_page_unlock_queues();
2333
2334                                 SET_PAGE_DIRTY(copy_m, TRUE);
2335                                 PAGE_WAKEUP_DONE(copy_m);
2336
2337                         } else if (copy_object->internal &&
2338                                    (DEFAULT_PAGER_IS_ACTIVE || DEFAULT_FREEZER_IS_ACTIVE)) {
2339                                 /*
2340                                  * For internal objects check with the pager to see
2341                                  * if the page already exists in the backing store.
2342                                  * If yes, then we can drop the copy page. If not,
2343                                  * then we'll activate it, mark it dirty and keep it
2344                                  * around.
2345                                  */
2346
2347                                 kern_return_t kr = KERN_SUCCESS;
2348
2349                                 memory_object_t copy_pager = copy_object->pager;
2350                                 assert(copy_pager != MEMORY_OBJECT_NULL);
2351                                 vm_object_paging_begin(copy_object);
2352
2353                                 vm_object_unlock(copy_object);
2354
2355                                 kr = memory_object_data_request(
2356                                         copy_pager,
2357                                         copy_offset + copy_object->paging_offset,
2358                                         0, /* Only query the pager. */
2359                                         VM_PROT_READ,
2360                                         NULL);
2361
2362                                 vm_object_lock(copy_object);
2363
2364                                 vm_object_paging_end(copy_object);
2365
2366                                 /*
2367                                  * Since we dropped the copy_object's lock,
2368                                  * check whether we'll have to deallocate
2369                                  * the hard way.
2370                                  */
2371                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2372                                         vm_object_unlock(copy_object);
2373                                         vm_object_deallocate(copy_object);
2374                                         vm_object_lock(object);
2375
2376                                         continue;
2377                                 }
2378                                 if (kr == KERN_SUCCESS) {
2379                                         /*
2380                                          * The pager has the page. We don't want to overwrite
2381                                          * that page by sending this one out to the backing store.
2382                                          * So we drop the copy page.
2383                                          */
2384                                         VM_PAGE_FREE(copy_m);
2385
2386                                 } else {
2387                                         /*
2388                                          * The pager doesn't have the page. We'll keep this one
2389                                          * around in the copy object. It might get sent out to
2390                                          * the backing store under memory pressure.
2391                                          */
2392                                         vm_page_lockspin_queues();
2393                                         assert(!m->cleaning);
2394                                         vm_page_activate(copy_m);
2395                                         vm_page_unlock_queues();
2396
2397                                         SET_PAGE_DIRTY(copy_m, TRUE);
2398                                         PAGE_WAKEUP_DONE(copy_m);
2399                                 }
2400                         } else {
2401
2402                                 assert(copy_m->busy == TRUE);
2403                                 assert(!m->cleaning);
2404
2405                                 /*
2406                                  * dirty is protected by the object lock
2407                                  */
2408                                 SET_PAGE_DIRTY(copy_m, TRUE);
2409
2410                                 /*
2411                                  * The page is already ready for pageout:
2412                                  * not on pageout queues and busy.
2413                                  * Unlock everything except the
2414                                  * copy_object itself.
2415                                  */
2416                                 vm_object_unlock(object);
2417
2418                                 /*
2419                                  * Write the page to the copy-object,
2420                                  * flushing it from the kernel.
2421                                  */
2422                                 vm_pageout_initialize_page(copy_m);
2423
2424                                 /*
2425                                  * Since the pageout may have
2426                                  * temporarily dropped the
2427                                  * copy_object's lock, we
2428                                  * check whether we'll have
2429                                  * to deallocate the hard way.
2430                                  */
2431                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2432                                         vm_object_unlock(copy_object);
2433                                         vm_object_deallocate(copy_object);
2434                                         vm_object_lock(object);
2435
2436                                         continue;
2437                                 }
2438                                 /*
2439                                  * Pick back up the old object's
2440                                  * lock.  [It is safe to do so,
2441                                  * since it must be deeper in the
2442                                  * object tree.]
2443                                  */
2444                                 vm_object_lock(object);
2445                         }
2446
2447                         /*
2448                          * Because we're pushing a page upward
2449                          * in the object tree, we must restart
2450                          * any faults that are waiting here.
2451                          * [Note that this is an expansion of
2452                          * PAGE_WAKEUP that uses the THREAD_RESTART
2453                          * wait result].  Can't turn off the page's
2454                          * busy bit because we're not done with it.
2455                          */
2456                         if (m->wanted) {
2457                                 m->wanted = FALSE;
2458                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2459                         }
2460                 }
2461                 /*
2462                  * The reference count on copy_object must be
2463                  * at least 2: one for our extra reference,
2464                  * and at least one from the outside world
2465                  * (we checked that when we last locked
2466                  * copy_object).
2467                  */
2468                 vm_object_lock_assert_exclusive(copy_object);
2469                 copy_object->ref_count--;
2470                 assert(copy_object->ref_count > 0);
2471
2472                 VM_OBJ_RES_DECR(copy_object);
2473                 vm_object_unlock(copy_object);
2474
2475                 break;
2476         }
2477
2478 done:
2479         *result_page = m;
2480         *top_page = first_m;
2481
2482         XPR(XPR_VM_FAULT,
2483                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2484                 object, offset, m, first_m, 0);
2485
2486         if (m != VM_PAGE_NULL) {
2487                 retval = VM_FAULT_SUCCESS;
2488
2489                 if (my_fault == DBG_PAGEIN_FAULT) {
2490
2491                         VM_PAGE_COUNT_AS_PAGEIN(m);
2492
2493                         if (m->object->internal)
2494                                 my_fault = DBG_PAGEIND_FAULT;
2495                         else
2496                                 my_fault = DBG_PAGEINV_FAULT;
2497
2498                         /*
2499                          * evaluate access pattern and update state
2500                          * vm_fault_deactivate_behind depends on the
2501                          * state being up to date
2502                          */
2503                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2504
2505                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2506                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2507
2508                         VM_STAT_INCR(decompressions);
2509                 }
2510                 if (type_of_fault)
2511                         *type_of_fault = my_fault;
2512         } else {
2513                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2514                 assert(first_m == VM_PAGE_NULL);
2515                 assert(object == first_object);
2516         }
2517
2518         thread_interrupt_level(interruptible_state);
2519
2520 #if TRACEFAULTPAGE
2521         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2522 #endif
2523         return retval;
2524
2525 backoff:
2526         thread_interrupt_level(interruptible_state);
2527
2528         if (wait_result == THREAD_INTERRUPTED)
2529                 return (VM_FAULT_INTERRUPTED);
2530         return (VM_FAULT_RETRY);
2531
2532 #undef  RELEASE_PAGE
2533 }
2534
2535
2536
2537 /*
2538  * CODE SIGNING:
2539  * When soft faulting a page, we have to validate the page if:
2540  * 1. the page is being mapped in user space
      *    (tested as: the target pmap is not kernel_pmap)
2541  * 2. the page hasn't already been found to be "tainted"
2542  * 3. the page belongs to a code-signed object
2543  * 4. the page has not been validated yet or has been mapped for write.
      *
      * The macro evaluates to non-zero only when all four conditions hold.
      * They are tested in the order listed, joined with &&, so a failing
      * earlier condition short-circuits the later ones.  The numbered
      * markers in the macro body refer to the conditions above.
      *
      * "page" is expanded several times and "page->object" is dereferenced,
      * so pass a side-effect-free expression that names a valid page.
2544  */
2545 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page)                         \
2546         ((pmap) != kernel_pmap /*1*/ &&                                 \
2547          !(page)->cs_tainted /*2*/ &&                                   \
2548          (page)->object->code_signed /*3*/ &&                           \
2549          (!(page)->cs_validated || (page)->wpmapped /*4*/))
2550
2551
2552 /*
2553  * page queue lock must NOT be held
2554  * m->object must be locked
2555  *
2556  * NOTE: m->object could be locked "shared" only if we are called
2557  * from vm_fault() as part of a soft fault.  If so, we must be
2558  * careful not to modify the VM object in any way that is not
2559  * legal under a shared lock...
2560  */
2561 extern int proc_selfpid(void);                  /* supplies the pid printed in vm_fault_enter()'s "rejecting invalid page" log */
2562 extern char *proc_name_address(void *p);        /* called with task->bsd_info to get the process name for that same log */
2563 unsigned long cs_enter_tainted_rejected = 0;    /* # of faults vm_fault_enter() aborted with KERN_CODESIGN_ERROR on an invalid page */
2564 unsigned long cs_enter_tainted_accepted = 0;    /* # of faults vm_fault_enter() let proceed despite an invalid page */
2565 kern_return_t
2566 vm_fault_enter(vm_page_t m,
2567                pmap_t pmap,
2568                vm_map_offset_t vaddr,
2569                vm_prot_t prot,
2570                vm_prot_t fault_type,
2571                boolean_t wired,
2572                boolean_t change_wiring,
2573                boolean_t no_cache,
2574                boolean_t cs_bypass,
2575                __unused int      user_tag,
2576                int       pmap_options,
2577                boolean_t *need_retry,
2578                int *type_of_fault)
2579 {
2580         kern_return_t   kr, pe_result;
2581         boolean_t       previously_pmapped = m->pmapped;
2582         boolean_t       must_disconnect = 0;
2583         boolean_t       map_is_switched, map_is_switch_protected;
2584         int             cs_enforcement_enabled;
2585
2586         vm_object_lock_assert_held(m->object);
2587 #if DEBUG
2588         lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2589 #endif /* DEBUG */
2590
2591         if (m->phys_page == vm_page_guard_addr) {
2592                 assert(m->fictitious);
2593                 return KERN_SUCCESS;
2594         }
2595
2596         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2597
2598                 vm_object_lock_assert_exclusive(m->object);
2599
2600         } else if ((fault_type & VM_PROT_WRITE) == 0) {
2601                 /*
2602                  * This is not a "write" fault, so we
2603                  * might not have taken the object lock
2604                  * exclusively and we might not be able
2605                  * to update the "wpmapped" bit in
2606                  * vm_fault_enter().
2607                  * Let's just grant read access to
2608                  * the page for now and we'll
2609                  * soft-fault again if we need write
2610                  * access later...
2611                  */
2612                 prot &= ~VM_PROT_WRITE;
2613         }
2614         if (m->pmapped == FALSE) {
2615
2616                 if (m->clustered) {
2617                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2618                                 /*
2619                                  * found it in the cache, but this
2620                                  * is the first fault-in of the page (m->pmapped == FALSE)
2621                                  * so it must have come in as part of
2622                                  * a cluster... account 1 pagein against it
2623                                  */
2624                                 if (m->object->internal)
2625                                         *type_of_fault = DBG_PAGEIND_FAULT;
2626                                 else
2627                                         *type_of_fault = DBG_PAGEINV_FAULT;
2628
2629                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2630                         }
2631                         VM_PAGE_CONSUME_CLUSTERED(m);
2632                 }
2633         }
2634
2635         if (*type_of_fault != DBG_COW_FAULT) {
2636                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2637
2638                 if (pmap == kernel_pmap) {
2639                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2640                 }
2641         }
2642
2643         /* Validate code signature if necessary. */
2644         if (VM_FAULT_NEED_CS_VALIDATION(pmap, m)) {
2645                 vm_object_lock_assert_exclusive(m->object);
2646
2647                 if (m->cs_validated) {
2648                         vm_cs_revalidates++;
2649                 }
2650
2651                 /* VM map is locked, so 1 ref will remain on VM object -
2652                  * so no harm if vm_page_validate_cs drops the object lock */
2653                 vm_page_validate_cs(m);
2654         }
2655
2656 #define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2657 #define page_nx(m) ((m)->cs_nx)
2658
2659         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2660                            (pmap == vm_map_pmap(current_thread()->map)));
2661         map_is_switch_protected = current_thread()->map->switch_protect;
2662
2663         /* If the map is switched, and is switch-protected, we must protect
2664          * some pages from being write-faulted: immutable pages because by
2665          * definition they may not be written, and executable pages because that
2666          * would provide a way to inject unsigned code.
2667          * If the page is immutable, we can simply return. However, we can't
2668          * immediately determine whether a page is executable anywhere. But,
2669          * we can disconnect it everywhere and remove the executable protection
2670          * from the current map. We do that below right before we do the
2671          * PMAP_ENTER.
2672          */
2673         cs_enforcement_enabled = cs_enforcement(NULL);
2674
2675         if(cs_enforcement_enabled && map_is_switched &&
2676            map_is_switch_protected && page_immutable(m, prot) &&
2677            (prot & VM_PROT_WRITE))
2678         {
2679                 return KERN_CODESIGN_ERROR;
2680         }
2681
2682         if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2683                 if (cs_debug)
2684                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2685                 return KERN_CODESIGN_ERROR;
2686         }
2687
2688         /* A page could be tainted, or pose a risk of being tainted later.
2689          * Check whether the receiving process wants it, and make it feel
2690          * the consequences (that happens in cs_invalid_page()).
2691          * For CS Enforcement, two other conditions will
2692          * cause that page to be tainted as well:
2693          * - pmapping an unsigned page executable - this means unsigned code;
2694          * - writeable mapping of a validated page - the content of that page
2695          *   can be changed without the kernel noticing, therefore unsigned
2696          *   code can be created
2697          */
2698         if (m->cs_tainted ||
2699             ((cs_enforcement_enabled && !cs_bypass ) &&
2700              (/* The page is unsigned and wants to be executable */
2701               (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
2702               /* The page should be immutable, but is in danger of being modified
2703                 * This is the case where we want policy from the code directory -
2704                 * is the page immutable or not? For now we have to assume that
2705                 * code pages will be immutable, data pages not.
2706                 * We'll assume a page is a code page if it has a code directory
2707                 * and we fault for execution.
2708                 * That is good enough since if we faulted the code page for
2709                 * writing in another map before, it is wpmapped; if we fault
2710                 * it for writing in this map later it will also be faulted for executing
2711                 * at the same time; and if we fault for writing in another map
2712                 * later, we will disconnect it from this pmap so we'll notice
2713                 * the change.
2714                 */
2715               (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
2716               ))
2717                 )
2718         {
2719                 /* We will have a tainted page. Have to handle the special case
2720                  * of a switched map now. If the map is not switched, standard
2721                  * procedure applies - call cs_invalid_page().
2722                  * If the map is switched, the real owner is invalid already.
2723                  * There is no point in invalidating the switching process since
2724                  * it will not be executing from the map. So we don't call
2725                  * cs_invalid_page() in that case. */
2726                 boolean_t reject_page;
2727                 if(map_is_switched) {
2728                         assert(pmap==vm_map_pmap(current_thread()->map));
2729                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2730                         reject_page = FALSE;
2731                 } else {
2732                         if (cs_debug > 5)
2733                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
2734                                        m->object->code_signed ? "yes" : "no",
2735                                        m->cs_validated ? "yes" : "no",
2736                                        m->cs_tainted ? "yes" : "no",
2737                                        m->wpmapped ? "yes" : "no",
2738                                        m->slid ? "yes" : "no",
2739                                        (int)prot);
2740                         reject_page = cs_invalid_page((addr64_t) vaddr);
2741                 }
2742
2743                 if (reject_page) {
2744                         /* reject the invalid page: abort the page fault */
2745                         int                     pid;
2746                         const char              *procname;
2747                         task_t                  task;
2748                         vm_object_t             file_object, shadow;
2749                         vm_object_offset_t      file_offset;
2750                         char                    *pathname, *filename;
2751                         vm_size_t               pathname_len, filename_len;
2752                         boolean_t               truncated_path;
2753 #define __PATH_MAX 1024
2754                         struct timespec         mtime, cs_mtime;
2755
2756                         kr = KERN_CODESIGN_ERROR;
2757                         cs_enter_tainted_rejected++;
2758
2759                         /* get process name and pid */
2760                         procname = "?";
2761                         task = current_task();
2762                         pid = proc_selfpid();
2763                         if (task->bsd_info != NULL)
2764                                 procname = proc_name_address(task->bsd_info);
2765
2766                         /* get file's VM object */
2767                         file_object = m->object;
2768                         file_offset = m->offset;
2769                         for (shadow = file_object->shadow;
2770                              shadow != VM_OBJECT_NULL;
2771                              shadow = file_object->shadow) {
2772                                 vm_object_lock_shared(shadow);
2773                                 if (file_object != m->object) {
2774                                         vm_object_unlock(file_object);
2775                                 }
2776                                 file_offset += file_object->vo_shadow_offset;
2777                                 file_object = shadow;
2778                         }
2779
2780                         mtime.tv_sec = 0;
2781                         mtime.tv_nsec = 0;
2782                         cs_mtime.tv_sec = 0;
2783                         cs_mtime.tv_nsec = 0;
2784
2785                         /* get file's pathname and/or filename */
2786                         pathname = NULL;
2787                         filename = NULL;
2788                         pathname_len = 0;
2789                         filename_len = 0;
2790                         truncated_path = FALSE;
2791                         if (file_object->pager == NULL) {
2792                                 /* no pager -> no file -> no pathname */
2793                                 pathname = (char *) "<nil>";
2794                         } else {
2795                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2796                                 if (pathname) {
2797                                         pathname[0] = '\0';
2798                                         pathname_len = __PATH_MAX;
2799                                         filename = pathname + pathname_len;
2800                                         filename_len = __PATH_MAX;
2801                                 }
2802                                 vnode_pager_get_object_name(file_object->pager,
2803                                                             pathname,
2804                                                             pathname_len,
2805                                                             filename,
2806                                                             filename_len,
2807                                                             &truncated_path);
2808                                 vnode_pager_get_object_mtime(file_object->pager,
2809                                                              &mtime,
2810                                                              &cs_mtime);
2811                         }
2812                         printf("CODE SIGNING: process %d[%s]: "
2813                                "rejecting invalid page at address 0x%llx "
2814                                "from offset 0x%llx in file \"%s%s%s\" "
2815                                "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2816                                "(signed:%d validated:%d tainted:%d "
2817                                "wpmapped:%d slid:%d)\n",
2818                                pid, procname, (addr64_t) vaddr,
2819                                file_offset,
2820                                (pathname ? pathname : ""),
2821                                (truncated_path ? "/.../" : ""),
2822                                (truncated_path ? filename : ""),
2823                                cs_mtime.tv_sec, cs_mtime.tv_nsec,
2824                                ((cs_mtime.tv_sec == mtime.tv_sec &&
2825                                  cs_mtime.tv_nsec == mtime.tv_nsec)
2826                                 ? "=="
2827                                 : "!="),
2828                                mtime.tv_sec, mtime.tv_nsec,
2829                                m->object->code_signed,
2830                                m->cs_validated,
2831                                m->cs_tainted,
2832                                m->wpmapped,
2833                                m->slid);
2834                         if (file_object != m->object) {
2835                                 vm_object_unlock(file_object);
2836                         }
2837                         if (pathname_len != 0) {
2838                                 kfree(pathname, __PATH_MAX * 2);
2839                                 pathname = NULL;
2840                                 filename = NULL;
2841                         }
2842                 } else {
2843                         /* proceed with the invalid page */
2844                         kr = KERN_SUCCESS;
2845                         if (!m->cs_validated) {
2846                                 /*
2847                                  * This page has not been validated, so it
2848                                  * must not belong to a code-signed object
2849                                  * and should not be forcefully considered
2850                                  * as tainted.
2851                                  * We're just concerned about it here because
2852                                  * we've been asked to "execute" it but that
2853                                  * does not mean that it should cause other
2854                                  * accesses to fail.
2855                                  * This happens when a debugger sets a
2856                                  * breakpoint and we then execute code in
2857                                  * that page.  Marking the page as "tainted"
2858                                  * would cause any inspection tool ("leaks",
2859                                  * "vmmap", "CrashReporter", ...) to get killed
2860                                  * due to code-signing violation on that page,
2861                                  * even though they're just reading it and not
2862                                  * executing from it.
2863                                  */
2864                                 assert(!m->object->code_signed);
2865                         } else {
2866                                 /*
2867                                  * Page might have been tainted before or not;
2868                                  * now it definitively is. If the page wasn't
2869                                  * tainted, we must disconnect it from all
2870                                  * pmaps later, to force existing mappings
2871                                  * through that code path for re-consideration
2872                                  * of the validity of that page.
2873                                  */
2874                                 must_disconnect = !m->cs_tainted;
2875                                 m->cs_tainted = TRUE;
2876                         }
2877                         cs_enter_tainted_accepted++;
2878                 }
2879                 if (kr != KERN_SUCCESS) {
2880                         if (cs_debug) {
2881                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2882                                        "*** INVALID PAGE ***\n",
2883                                        (long long)vaddr);
2884                         }
2885 #if !SECURE_KERNEL
2886                         if (cs_enforcement_panic) {
2887                                 panic("CODESIGNING: panicking on invalid page\n");
2888                         }
2889 #endif
2890                 }
2891
2892         } else {
2893                 /* proceed with the valid page */
2894                 kr = KERN_SUCCESS;
2895         }
2896
2897         boolean_t       page_queues_locked = FALSE;
2898 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
2899 MACRO_BEGIN                                     \
2900         if (! page_queues_locked) {             \
2901                 page_queues_locked = TRUE;      \
2902                 vm_page_lockspin_queues();      \
2903         }                                       \
2904 MACRO_END
2905 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
2906 MACRO_BEGIN                                     \
2907         if (page_queues_locked) {               \
2908                 page_queues_locked = FALSE;     \
2909                 vm_page_unlock_queues();        \
2910         }                                       \
2911 MACRO_END
2912
2913         /*
2914          * Hold queues lock to manipulate
2915          * the page queues.  Change wiring
2916          * case is obvious.
2917          */
2918         assert(m->compressor || m->object != compressor_object);
2919         if (m->compressor) {
2920                 /*
2921                  * Compressor pages are neither wired
2922                  * nor pageable and should never change.
2923                  */
2924                 assert(m->object == compressor_object);
2925         } else if (change_wiring) {
2926                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2927
2928                 if (wired) {
2929                         if (kr == KERN_SUCCESS) {
2930                                 vm_page_wire(m);
2931                         }
2932                 } else {
2933                         vm_page_unwire(m, TRUE);
2934                 }
2935                 /* we keep the page queues lock, if we need it later */
2936
2937         } else {
2938                 if (kr != KERN_SUCCESS) {
2939                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2940                         vm_page_deactivate(m);
2941                         /* we keep the page queues lock, if we need it later */
2942                 } else if (((!m->active && !m->inactive) ||
2943                             m->clean_queue ||
2944                             no_cache) &&
2945                            !VM_PAGE_WIRED(m) && !m->throttled) {
2946
2947                         if (vm_page_local_q &&
2948                             !no_cache &&
2949                             (*type_of_fault == DBG_COW_FAULT ||
2950                              *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
2951                                 struct vpl      *lq;
2952                                 uint32_t        lid;
2953
2954                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
2955                                 vm_object_lock_assert_exclusive(m->object);
2956
2957                                 /*
2958                                  * we got a local queue to stuff this
2959                                  * new page on...
2960                                  * it's safe to manipulate local and
2961                                  * local_id at this point since we're
2962                                  * behind an exclusive object lock and
2963                                  * the page is not on any global queue.
2964                                  *
2965                                  * we'll use the current cpu number to
2966                                  * select the queue... note that we don't
2967                                  * need to disable preemption... we're
2968                                  * going to be behind the local queue's
2969                                  * lock to do the real work
2970                                  */
2971                                 lid = cpu_number();
2972
2973                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
2974
2975                                 VPL_LOCK(&lq->vpl_lock);
2976
2977                                 queue_enter(&lq->vpl_queue, m,
2978                                             vm_page_t, pageq);
2979                                 m->local = TRUE;
2980                                 m->local_id = lid;
2981                                 lq->vpl_count++;
2982
2983                                 if (m->object->internal)
2984                                         lq->vpl_internal_count++;
2985                                 else
2986                                         lq->vpl_external_count++;
2987
2988                                 VPL_UNLOCK(&lq->vpl_lock);
2989
2990                                 if (lq->vpl_count > vm_page_local_q_soft_limit)
2991                                 {
2992                                         /*
2993                                          * we're beyond the soft limit
2994                                          * for the local queue
2995                                          * vm_page_reactivate_local will
2996                                          * 'try' to take the global page
2997                                          * queue lock... if it can't
2998                                          * that's ok... we'll let the
2999                                          * queue continue to grow up
3000                                          * to the hard limit... at that
3001                                          * point we'll wait for the
3002                                          * lock... once we've got the
3003                                          * lock, we'll transfer all of
3004                                          * the pages from the local
3005                                          * queue to the global active
3006                                          * queue
3007                                          */
3008                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3009                                 }
3010                         } else {
3011
3012                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3013
3014                                 /*
3015                                  * test again now that we hold the
3016                                  * page queue lock
3017                                  */
3018                                 if (!VM_PAGE_WIRED(m)) {
3019                                         if (m->clean_queue) {
3020                                                 VM_PAGE_QUEUES_REMOVE(m);
3021
3022                                                 vm_pageout_cleaned_reactivated++;
3023                                                 vm_pageout_cleaned_fault_reactivated++;
3024                                         }
3025
3026                                         if ((!m->active &&
3027                                              !m->inactive) ||
3028                                             no_cache) {
3029                                                 /*
3030                                                  * If this is a no_cache mapping
3031                                                  * and the page has never been
3032                                                  * mapped before or was
3033                                                  * previously a no_cache page,
3034                                                  * then we want to leave pages
3035                                                  * in the speculative state so
3036                                                  * that they can be readily
3037                                                  * recycled if free memory runs
3038                                                  * low.  Otherwise the page is
3039                                                  * activated as normal.
3040                                                  */
3041
3042                                                 if (no_cache &&
3043                                                     (!previously_pmapped ||
3044                                                      m->no_cache)) {
3045                                                         m->no_cache = TRUE;
3046
3047                                                         if (!m->speculative)
3048                                                                 vm_page_speculate(m, FALSE);
3049
3050                                                 } else if (!m->active &&
3051                                                            !m->inactive) {
3052
3053                                                         vm_page_activate(m);
3054                                                 }
3055                                         }
3056                                 }
3057                                 /* we keep the page queues lock, if we need it later */
3058                         }
3059                 }
3060         }
3061         /* we're done with the page queues lock, if we ever took it */
3062         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3063
3064
3065         /* If we have a KERN_SUCCESS from the previous checks, we either have
3066          * a good page, or a tainted page that has been accepted by the process.
3067          * In both cases the page will be entered into the pmap.
3068          * If the page is writeable, we need to disconnect it from other pmaps
3069          * now so those processes can take note.
3070          */
3071         if (kr == KERN_SUCCESS) {
3072
3073                 /*
3074                  * NOTE: we may only hold the vm_object lock SHARED
3075                  * at this point, so we need the phys_page lock to
3076                  * properly serialize updating the pmapped and
3077                  * xpmapped bits
3078                  */
3079                 if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) {
3080
3081                         pmap_lock_phys_page(m->phys_page);
3082                         /*
3083                          * go ahead and take the opportunity
3084                          * to set 'pmapped' here so that we don't
3085                          * need to grab this lock a 2nd time
3086                          * just below
3087                          */
3088                         m->pmapped = TRUE;
3089
3090                         if (!m->xpmapped) {
3091
3092                                 m->xpmapped = TRUE;
3093
3094                                 pmap_unlock_phys_page(m->phys_page);
3095
3096                                 if (!m->object->internal)
3097                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3098
3099                                 if ((COMPRESSED_PAGER_IS_ACTIVE) &&
3100                                     m->object->internal &&
3101                                     m->object->pager != NULL) {
3102                                         /*
3103                                          * This page could have been
3104                                          * uncompressed by the
3105                                          * compressor pager and its
3106                                          * contents might be only in
3107                                          * the data cache.
3108                                          * Since it's being mapped for
3109                                          * "execute" for the first time,
3110                                          * make sure the icache is in
3111                                          * sync.
3112                                          */
3113                                         pmap_sync_page_data_phys(m->phys_page);
3114                                 }
3115                         } else
3116                                 pmap_unlock_phys_page(m->phys_page);
3117                 } else {
3118                         if (m->pmapped == FALSE) {
3119                                 pmap_lock_phys_page(m->phys_page);
3120                                 m->pmapped = TRUE;
3121                                 pmap_unlock_phys_page(m->phys_page);
3122                         }
3123                 }
3124                 if (vm_page_is_slideable(m)) {
3125                         boolean_t was_busy = m->busy;
3126
3127                         vm_object_lock_assert_exclusive(m->object);
3128
3129                         m->busy = TRUE;
3130                         kr = vm_page_slide(m, 0);
3131                         assert(m->busy);
3132                         if(!was_busy) {
3133                                 PAGE_WAKEUP_DONE(m);
3134                         }
3135                         if (kr != KERN_SUCCESS) {
3136                                 /*
3137                                  * This page has not been slid correctly,
3138                                  * do not do the pmap_enter() !
3139                                  * Let vm_fault_enter() return the error
3140                                  * so the caller can fail the fault.
3141                                  */
3142                                 goto after_the_pmap_enter;
3143                         }
3144                 }
3145
3146                 if (fault_type & VM_PROT_WRITE) {
3147
3148                         if (m->wpmapped == FALSE) {
3149                                 vm_object_lock_assert_exclusive(m->object);
3150
3151                                 m->wpmapped = TRUE;
3152                         }
3153                         if (must_disconnect) {
3154                                 /*
3155                                  * We can only get here
3156                                  * because of the CSE logic
3157                                  */
3158                                 assert(cs_enforcement_enabled);
3159                                 pmap_disconnect(m->phys_page);
3160                                 /*
3161                                  * If we are faulting for a write, we can clear
3162                                  * the execute bit - that will ensure the page is
3163                                  * checked again before being executable, which
3164                                  * protects against a map switch.
3165                                  * This only happens the first time the page
3166                                  * gets tainted, so we won't get stuck here
3167                                  * to make an already writeable page executable.
3168                                  */
3169                                 if (!cs_bypass){
3170                                         prot &= ~VM_PROT_EXECUTE;
3171                                 }
3172                         }
3173                 }
3174
3175                 /* Prevent a deadlock by not
3176                  * holding the object lock if we need to wait for a page in
3177                  * pmap_enter() - <rdar://problem/7138958> */
3178                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3179                                    wired,
3180                                    pmap_options | PMAP_OPTIONS_NOWAIT,
3181                                    pe_result);
3182
3183                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3184
3185                         if (need_retry) {
3186                                 /*
3187                                  * this will be non-null in the case where we hold the lock
3188                                  * on the top-object in this chain... we can't just drop
3189                                  * the lock on the object we're inserting the page into
3190                                  * and recall the PMAP_ENTER since we can still cause
3191                                  * a deadlock if one of the critical paths tries to
3192                                  * acquire the lock on the top-object and we're blocked
3193                                  * in PMAP_ENTER waiting for memory... our only recourse
3194                                  * is to deal with it at a higher level where we can
3195                                  * drop both locks.
3196                                  */
3197                                 *need_retry = TRUE;
3198                                 vm_pmap_enter_retried++;
3199                                 goto after_the_pmap_enter;
3200                         }
3201                         /* The nonblocking version of pmap_enter did not succeed.
3202                          * and we don't need to drop other locks and retry
3203                          * at the level above us, so
3204                          * use the blocking version instead. Requires marking
3205                          * the page busy and unlocking the object */
3206                         boolean_t was_busy = m->busy;
3207
3208                         vm_object_lock_assert_exclusive(m->object);
3209
3210                         m->busy = TRUE;
3211                         vm_object_unlock(m->object);
3212
3213                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3214                                            0, wired,
3215                                            pmap_options, pe_result);
3216
3217                         /* Take the object lock again. */
3218                         vm_object_lock(m->object);
3219
3220                         /* If the page was busy, someone else will wake it up.
3221                          * Otherwise, we have to do it now. */
3222                         assert(m->busy);
3223                         if(!was_busy) {
3224                                 PAGE_WAKEUP_DONE(m);
3225                         }
3226                         vm_pmap_enter_blocked++;
3227                 }
3228         }
3229
3230 after_the_pmap_enter:
3231         return kr;
3232 }
3233
3234 void
3235 vm_pre_fault(vm_map_offset_t vaddr)
3236 {
3237         /*
3238          * If pmap_find_phys() returns 0 for "vaddr" in the current map's
3239          * pmap, issue a VM_PROT_READ vm_fault() on it with THREAD_UNINT.
3240          * The fault's return value is not examined.
3241          */
3242         if (pmap_find_phys(current_map()->pmap, vaddr) != 0)
3243                 return;
3244
3245         (void) vm_fault(current_map(), vaddr, VM_PROT_READ, FALSE,
3246                         THREAD_UNINT, NULL, 0);
3247 }
3248
3249
3250 /*
3251  *      Routine:        vm_fault
3252  *      Purpose:
3253  *              Handle page faults, including pseudo-faults
3254  *              used to change the wiring status of pages.
3255  *      Returns:
3256  *              A kern_return_t status.  Explicit continuations have been removed.
3257  *      Implementation:
3258  *              vm_fault and vm_fault_page save a lot of state
3259  *              in the moral equivalent of a closure.  The state
3260  *              structure is allocated when first entering vm_fault
3261  *              and deallocated when leaving vm_fault.
3262  */
3263
3264 extern int _map_enter_debug;    /* declared extern: the definition lives outside this listing */
3265
3266 unsigned long vm_fault_collapse_total = 0;      /* NOTE(review): presumably a count of collapse attempts made while faulting -- confirm at use sites */
3267 unsigned long vm_fault_collapse_skipped = 0;    /* NOTE(review): presumably a count of collapses that were skipped -- confirm at use sites */
3268
3269
3270 kern_return_t
3271 vm_fault(vm_map_t map, vm_map_offset_t vaddr, vm_prot_t fault_type,
3272          boolean_t change_wiring, int interruptible,
3273          pmap_t caller_pmap, vm_map_offset_t caller_pmap_addr)
3274 {
3275         kern_return_t   result;
3276
3277         /* Forward every argument unchanged; physpage_p is always NULL. */
3278         result = vm_fault_internal(map, vaddr, fault_type,
3279                                    change_wiring, interruptible,
3280                                    caller_pmap, caller_pmap_addr,
3281                                    NULL /* physpage_p */);
3282         return result;
3283 }
3284
3285 kern_return_t
3286 vm_fault_internal(
3287         vm_map_t        map,
3288         vm_map_offset_t vaddr,
3289         vm_prot_t       fault_type,
3290         boolean_t       change_wiring,
3291         int             interruptible,
3292         pmap_t          caller_pmap,
3293         vm_map_offset_t caller_pmap_addr,
3294         ppnum_t         *physpage_p)
3295 {
3296         vm_map_version_t        version;        /* Map version for verificiation */
3297         boolean_t               wired;          /* Should mapping be wired down? */
3298         vm_object_t             object;         /* Top-level object */
3299         vm_object_offset_t      offset;         /* Top-level offset */
3300         vm_prot_t               prot;           /* Protection for mapping */
3301         vm_object_t             old_copy_object; /* Saved copy object */
3302         vm_page_t               result_page;    /* Result of vm_fault_page */
3303         vm_page_t               top_page;       /* Placeholder page */
3304         kern_return_t           kr;
3305
3306         vm_page_t               m;      /* Fast access to result_page */
3307         kern_return_t           error_code;
3308         vm_object_t             cur_object;
3309         vm_object_offset_t      cur_offset;
3310         vm_page_t               cur_m;
3311         vm_object_t             new_object;
3312         int                     type_of_fault;
3313         pmap_t                  pmap;
3314         boolean_t               interruptible_state;
3315         vm_map_t                real_map = map;
3316         vm_map_t                original_map = map;
3317         vm_prot_t               original_fault_type;
3318         struct vm_object_fault_info fault_info;
3319         boolean_t               need_collapse = FALSE;
3320         boolean_t               need_retry = FALSE;
3321         boolean_t               *need_retry_ptr = NULL;
3322         int                     object_lock_type = 0;
3323         int                     cur_object_lock_type;
3324         vm_object_t             top_object = VM_OBJECT_NULL;
3325         int                     throttle_delay;
3326         int                     compressed_count_delta;
3327
3328
3329         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3330                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3331                               ((uint64_t)vaddr >> 32),
3332                               vaddr,
3333                               (map == kernel_map),
3334                               0,
3335                               0);
3336
3337         if (get_preemption_level() != 0) {
3338                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3339                                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3340                                       ((uint64_t)vaddr >> 32),
3341                                       vaddr,
3342                                       KERN_FAILURE,
3343                                       0,
3344                                       0);
3345
3346                 return (KERN_FAILURE);
3347         }
3348
3349         interruptible_state = thread_interrupt_level(interruptible);
3350
3351         VM_STAT_INCR(faults);
3352         current_task()->faults++;
3353         original_fault_type = fault_type;
3354
3355         if (fault_type & VM_PROT_WRITE)
3356                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3357         else
3358                 object_lock_type = OBJECT_LOCK_SHARED;
3359
3360         cur_object_lock_type = OBJECT_LOCK_SHARED;
3361
3362 RetryFault:
3363         /*
3364          * assume we will hit a page in the cache
3365          * otherwise, explicitly override with
3366          * the real fault type once we determine it
3367          */
3368         type_of_fault = DBG_CACHE_HIT_FAULT;
3369
3370         /*
3371          *      Find the backing store object and offset into
3372          *      it to begin the search.
3373          */
3374         fault_type = original_fault_type;
3375         map = original_map;
3376         vm_map_lock_read(map);
3377
3378         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3379                                   object_lock_type, &version,
3380                                   &object, &offset, &prot, &wired,
3381                                   &fault_info,
3382                                   &real_map);
3383
3384         if (kr != KERN_SUCCESS) {
3385                 vm_map_unlock_read(map);
3386                 goto done;
3387         }
3388         pmap = real_map->pmap;
3389         fault_info.interruptible = interruptible;
3390         fault_info.stealth = FALSE;
3391         fault_info.io_sync = FALSE;
3392         fault_info.mark_zf_absent = FALSE;
3393         fault_info.batch_pmap_op = FALSE;
3394
3395         /*
3396          * If the page is wired, we must fault for the current protection
3397          * value, to avoid further faults.
3398          */
3399         if (wired) {
3400                 fault_type = prot | VM_PROT_WRITE;
3401                 /*
3402                  * since we're treating this fault as a 'write'
3403                  * we must hold the top object lock exclusively
3404                  */
3405                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3406
3407                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3408
3409                         if (vm_object_lock_upgrade(object) == FALSE) {
3410                                 /*
3411                                  * couldn't upgrade, so explictly
3412                                  * take the lock exclusively
3413                                  */
3414                                 vm_object_lock(object);
3415                         }
3416                 }
3417         }
3418
3419 #if     VM_FAULT_CLASSIFY
3420         /*
3421          *      Temporary data gathering code
3422          */
3423         vm_fault_classify(object, offset, fault_type);
3424 #endif
3425         /*
3426          *      Fast fault code.  The basic idea is to do as much as
3427          *      possible while holding the map lock and object locks.
3428          *      Busy pages are not used until the object lock has to
3429          *      be dropped to do something (copy, zero fill, pmap enter).
3430          *      Similarly, paging references aren't acquired until that
3431          *      point, and object references aren't used.
3432          *
3433          *      If we can figure out what to do
3434          *      (zero fill, copy on write, pmap enter) while holding
3435          *      the locks, then it gets done.  Otherwise, we give up,
3436          *      and use the original fault path (which doesn't hold
3437          *      the map lock, and relies on busy pages).
3438          *      The give up cases include:
3439          *              - Have to talk to pager.
3440          *              - Page is busy, absent or in error.
3441          *              - Pager has locked out desired access.
3442          *              - Fault needs to be restarted.
3443          *              - Have to push page into copy object.
3444          *
3445          *      The code is an infinite loop that moves one level down
3446          *      the shadow chain each time.  cur_object and cur_offset
3447          *      refer to the current object being examined. object and offset
3448          *      are the original object from the map.  The loop is at the
3449          *      top level if and only if object and cur_object are the same.
3450          *
3451          *      Invariants:  Map lock is held throughout.  Lock is held on
3452          *              original object and cur_object (if different) when
3453          *              continuing or exiting loop.
3454          *
3455          */
3456
3457
3458         /*
3459          * If this page is to be inserted in a copy delay object
3460          * for writing, and if the object has a copy, then the
3461          * copy delay strategy is implemented in the slow fault page.
3462          */
3463         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3464             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3465                 goto handle_copy_delay;
3466
3467         cur_object = object;
3468         cur_offset = offset;
3469
3470         while (TRUE) {
3471                 if (!cur_object->pager_created &&
3472                     cur_object->phys_contiguous) /* superpage */
3473                         break;
3474
3475                 if (cur_object->blocked_access) {
3476                         /*
3477                          * Access to this VM object has been blocked.
3478                          * Let the slow path handle it.
3479                          */
3480                         break;
3481                 }
3482
3483                 m = vm_page_lookup(cur_object, cur_offset);
3484
3485                 if (m != VM_PAGE_NULL) {
3486                         if (m->busy) {
3487                                 wait_result_t   result;
3488
3489                                 /*
3490                                  * in order to do the PAGE_ASSERT_WAIT, we must
3491                                  * have object that 'm' belongs to locked exclusively
3492                                  */
3493                                 if (object != cur_object) {
3494
3495                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3496
3497                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3498
3499                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3500                                                         /*
3501                                                          * couldn't upgrade so go do a full retry
3502                                                          * immediately since we can no longer be
3503                                                          * certain about cur_object (since we
3504                                                          * don't hold a reference on it)...
3505                                                          * first drop the top object lock
3506                                                          */
3507                                                         vm_object_unlock(object);
3508
3509                                                         vm_map_unlock_read(map);
3510                                                         if (real_map != map)
3511                                                                 vm_map_unlock(real_map);
3512
3513                                                         goto RetryFault;
3514                                                 }
3515                                         }
3516                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3517
3518                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3519
3520                                         if (vm_object_lock_upgrade(object) == FALSE) {
3521                                                 /*
3522                                                  * couldn't upgrade, so explictly take the lock
3523                                                  * exclusively and go relookup the page since we
3524                                                  * will have dropped the object lock and
3525                                                  * a different thread could have inserted
3526                                                  * a page at this offset
3527                                                  * no need for a full retry since we're
3528                                                  * at the top level of the object chain
3529                                                  */
3530                                                 vm_object_lock(object);
3531
3532                                                 continue;
3533                                         }
3534                                 }
3535                                 if (m->pageout_queue && m->object->internal && COMPRESSED_PAGER_IS_ACTIVE) {
3536                                         /*
3537                                          * m->busy == TRUE and the object is locked exclusively
3538                                          * if m->pageout_queue == TRUE after we acquire the
3539                                          * queues lock, we are guaranteed that it is stable on
3540                                          * the pageout queue and therefore reclaimable
3541                                          *
3542                                          * NOTE: this is only true for the internal pageout queue
3543                                          * in the compressor world
3544                                          */
3545                                         vm_page_lock_queues();
3546
3547                                         if (m->pageout_queue) {
3548                                                 vm_pageout_throttle_up(m);
3549                                                 vm_page_unlock_queues();
3550
3551                                                 PAGE_WAKEUP_DONE(m);
3552                                                 goto reclaimed_from_pageout;
3553                                         }
3554                                         vm_page_unlock_queues();
3555                                 }
3556                                 if (object != cur_object)
3557                                         vm_object_unlock(object);
3558
3559                                 vm_map_unlock_read(map);
3560                                 if (real_map != map)
3561                                         vm_map_unlock(real_map);
3562
3563                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3564
3565                                 vm_object_unlock(cur_object);
3566
3567                                 if (result == THREAD_WAITING) {
3568                                         result = thread_block(THREAD_CONTINUE_NULL);
3569
3570                                         counter(c_vm_fault_page_block_busy_kernel++);
3571                                 }
3572                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3573                                         goto RetryFault;
3574
3575                                 kr = KERN_ABORTED;
3576                                 goto done;
3577                         }
3578 reclaimed_from_pageout:
3579                         if (m->laundry) {
3580                                 if (object != cur_object) {
3581                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3582                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3583
3584                                                 vm_object_unlock(object);
3585                                                 vm_object_unlock(cur_object);
3586
3587                                                 vm_map_unlock_read(map);
3588                                                 if (real_map != map)
3589                                                         vm_map_unlock(real_map);
3590
3591                                                 goto RetryFault;
3592                                         }
3593
3594                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3595
3596                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3597
3598                                         if (vm_object_lock_upgrade(object) == FALSE) {
3599                                                 /*
3600                                                  * couldn't upgrade, so explictly take the lock
3601                                                  * exclusively and go relookup the page since we
3602                                                  * will have dropped the object lock and
3603                                                  * a different thread could have inserted
3604                                                  * a page at this offset
3605                                                  * no need for a full retry since we're
3606                                                  * at the top level of the object chain
3607                                                  */
3608                                                 vm_object_lock(object);
3609
3610                                                 continue;
3611                                         }
3612                                 }
3613                                 m->pageout = FALSE;
3614
3615                                 vm_pageout_steal_laundry(m, FALSE);
3616                         }
3617
3618                         if (m->phys_page == vm_page_guard_addr) {
3619                                 /*
3620                                  * Guard page: let the slow path deal with it
3621                                  */
3622                                 break;
3623                         }
3624                         if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
3625                                 /*
3626                                  * Unusual case... let the slow path deal with it
3627                                  */
3628                                 break;
3629                         }
3630                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m->object)) {
3631                                 if (object != cur_object)
3632                                         vm_object_unlock(object);
3633                                 vm_map_unlock_read(map);
3634                                 if (real_map != map)
3635                                         vm_map_unlock(real_map);
3636                                 vm_object_unlock(cur_object);
3637                                 kr = KERN_MEMORY_ERROR;
3638                                 goto done;
3639                         }
3640
3641                         if (m->encrypted) {
3642                                 /*
3643                                  * ENCRYPTED SWAP:
3644                                  * We've soft-faulted (because it's not in the page
3645                                  * table) on an encrypted page.
3646                                  * Keep the page "busy" so that no one messes with
3647                                  * it during the decryption.
3648                                  * Release the extra locks we're holding, keep only
3649                                  * the page's VM object lock.
3650                                  *
3651                                  * in order to set 'busy' on 'm', we must
3652                                  * have object that 'm' belongs to locked exclusively
3653                                  */
3654                                 if (object != cur_object) {
3655                                         vm_object_unlock(object);
3656
3657                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3658
3659                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3660
3661                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3662                                                         /*
3663                                                          * couldn't upgrade so go do a full retry
3664                                                          * immediately since we've already dropped
3665                                                          * the top object lock associated with this page
3666                                                          * and the current one got dropped due to the
3667                                                          * failed upgrade... the state is no longer valid
3668                                                          */
3669                                                         vm_map_unlock_read(map);
3670                                                         if (real_map != map)
3671                                                                 vm_map_unlock(real_map);
3672
3673                                                         goto RetryFault;
3674                                                 }
3675                                         }
3676                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3677
3678                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3679
3680                                         if (vm_object_lock_upgrade(object) == FALSE) {
3681                                                 /*
3682                                                  * couldn't upgrade, so explictly take the lock
3683                                                  * exclusively and go relookup the page since we
3684                                                  * will have dropped the object lock and
3685                                                  * a different thread could have inserted
3686                                                  * a page at this offset
3687                                                  * no need for a full retry since we're
3688                                                  * at the top level of the object chain
3689                                                  */
3690                                                 vm_object_lock(object);
3691
3692                                                 continue;
3693                                         }
3694                                 }
3695                                 m->busy = TRUE;
3696
3697                                 vm_map_unlock_read(map);
3698                                 if (real_map != map)
3699                                         vm_map_unlock(real_map);
3700
3701                                 vm_page_decrypt(m, 0);
3702
3703                                 assert(m->busy);
3704                                 PAGE_WAKEUP_DONE(m);
3705
3706                                 vm_object_unlock(cur_object);
3707                                 /*
3708                                  * Retry from the top, in case anything
3709                                  * changed while we were decrypting...
3710                                  */
3711                                 goto RetryFault;
3712                         }
3713                         ASSERT_PAGE_DECRYPTED(m);
3714
3715                         if(vm_page_is_slideable(m)) {
3716                                 /*
3717                                  * We might need to slide this page, and so,
3718                                  * we want to hold the VM object exclusively.
3719                                  */
3720                                 if (object != cur_object) {
3721                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3722                                                 vm_object_unlock(object);
3723                                                 vm_object_unlock(cur_object);
3724
3725                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3726
3727                                                 vm_map_unlock_read(map);
3728                                                 if (real_map != map)
3729                                                         vm_map_unlock(real_map);
3730
3731                                                 goto RetryFault;
3732                                         }
3733                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3734
3735                                         vm_object_unlock(object);
3736                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3737                                         vm_map_unlock_read(map);
3738                                         goto RetryFault;
3739                                 }
3740                         }
3741
3742                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m) ||
3743                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3744 upgrade_for_validation:
3745                                 /*
3746                                  * We might need to validate this page
3747                                  * against its code signature, so we
3748                                  * want to hold the VM object exclusively.
3749                                  */
3750                                 if (object != cur_object) {
3751                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3752                                                 vm_object_unlock(object);
3753                                                 vm_object_unlock(cur_object);
3754
3755                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3756
3757                                                 vm_map_unlock_read(map);
3758                                                 if (real_map != map)
3759                                                         vm_map_unlock(real_map);
3760
3761                                                 goto RetryFault;
3762                                         }
3763
3764                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3765
3766                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3767
3768                                         if (vm_object_lock_upgrade(object) == FALSE) {
3769                                                 /*
3770                                                  * couldn't upgrade, so explicitly take the lock
3771                                                  * exclusively and go relookup the page since we
3772                                                  * will have dropped the object lock and
3773                                                  * a different thread could have inserted
3774                                                  * a page at this offset
3775                                                  * no need for a full retry since we're
3776                                                  * at the top level of the object chain
3777                                                  */
3778                                                 vm_object_lock(object);
3779
3780                                                 continue;
3781                                         }
3782                                 }
3783                         }
3784                         /*
3785                          *      Two cases of map in faults:
3786                          *          - At top level w/o copy object.
3787                          *          - Read fault anywhere.
3788                          *              --> must disallow write.
3789                          */
3790
3791                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3792
3793                                 goto FastPmapEnter;
3794                         }
3795
3796                         if ((fault_type & VM_PROT_WRITE) == 0) {
3797
3798                                 if (object != cur_object) {
3799                                         /*
3800                                          * We still need to hold the top object
3801                                          * lock here to prevent a race between
3802                                          * a read fault (taking only "shared"
3803                                          * locks) and a write fault (taking
3804                                          * an "exclusive" lock on the top
3805                                          * object).
3806                                          * Otherwise, as soon as we release the
3807                                          * top lock, the write fault could
3808                                          * proceed and actually complete before
3809                                          * the read fault, and the copied page's
3810                                          * translation could then be overwritten
3811                                          * by the read fault's translation for
3812                                          * the original page.
3813                                          *
3814                                          * Let's just record what the top object
3815                                          * is and we'll release it later.
3816                                          */
3817                                         top_object = object;
3818
3819                                         /*
3820                                          * switch to the object that has the new page
3821                                          */
3822                                         object = cur_object;
3823                                         object_lock_type = cur_object_lock_type;
3824                                 }
3825 FastPmapEnter:
3826                                 /*
3827                                  * prepare for the pmap_enter...
3828                                  * object and map are both locked
3829                                  * m contains valid data
3830                                  * object == m->object
3831                                  * cur_object == NULL or it's been unlocked
3832                                  * no paging references on either object or cur_object
3833                                  */
3834                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3835                                         need_retry_ptr = &need_retry;
3836                                 else
3837                                         need_retry_ptr = NULL;
3838
3839                                 if (caller_pmap) {
3840                                         kr = vm_fault_enter(m,
3841                                                             caller_pmap,
3842                                                             caller_pmap_addr,
3843                                                             prot,
3844                                                             fault_type,
3845                                                             wired,
3846                                                             change_wiring,
3847                                                             fault_info.no_cache,
3848                                                             fault_info.cs_bypass,
3849                                                             fault_info.user_tag,
3850                                                             fault_info.pmap_options,
3851                                                             need_retry_ptr,
3852                                                             &type_of_fault);
3853                                 } else {
3854                                         kr = vm_fault_enter(m,
3855                                                             pmap,
3856                                                             vaddr,
3857                                                             prot,
3858                                                             fault_type,
3859                                                             wired,
3860                                                             change_wiring,
3861                                                             fault_info.no_cache,
3862                                                             fault_info.cs_bypass,
3863                                                             fault_info.user_tag,
3864                                                             fault_info.pmap_options,
3865                                                             need_retry_ptr,
3866                                                             &type_of_fault);
3867                                 }
3868
3869                                 if (kr == KERN_SUCCESS &&
3870                                     physpage_p != NULL) {
3871                                         /* for vm_map_wire_and_extract() */
3872                                         *physpage_p = m->phys_page;
3873                                         if (prot & VM_PROT_WRITE) {
3874                                                 vm_object_lock_assert_exclusive(
3875                                                         m->object);
3876                                                 m->dirty = TRUE;
3877                                         }
3878                                 }
3879
3880                                 if (top_object != VM_OBJECT_NULL) {
3881                                         /*
3882                                          * It's safe to drop the top object
3883                                          * now that we've done our
3884                                          * vm_fault_enter().  Any other fault
3885                                          * in progress for that virtual
3886                                          * address will either find our page
3887                                          * and translation or put in a new page
3888                                          * and translation.
3889                                          */
3890                                         vm_object_unlock(top_object);
3891                                         top_object = VM_OBJECT_NULL;
3892                                 }
3893
3894                                 if (need_collapse == TRUE)
3895                                         vm_object_collapse(object, offset, TRUE);
3896
3897                                 if (need_retry == FALSE &&
3898                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
3899                                         /*
3900                                          * evaluate access pattern and update state
3901                                          * vm_fault_deactivate_behind depends on the
3902                                          * state being up to date
3903                                          */
3904                                         vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
3905
3906                                         vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
3907                                 }
3908                                 /*
3909                                  * That's it, clean up and return.
3910                                  */
3911                                 if (m->busy)
3912                                         PAGE_WAKEUP_DONE(m);
3913
3914                                 vm_object_unlock(object);
3915
3916                                 vm_map_unlock_read(map);
3917                                 if (real_map != map)
3918                                         vm_map_unlock(real_map);
3919
3920                                 if (need_retry == TRUE) {
3921                                         /*
3922                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
3923                                          * at this point we don't hold any locks so it's safe
3924                                          * to ask the pmap layer to expand the page table to
3925                                          * accommodate this mapping... once expanded, we'll
3926                                          * re-drive the fault which should result in vm_fault_enter
3927                                          * being able to successfully enter the mapping this time around
3928                                          */
3929                                         (void)pmap_enter_options(
3930                                                 pmap, vaddr, 0, 0, 0, 0, 0,
3931                                                 PMAP_OPTIONS_NOENTER, NULL);
3932
3933                                         need_retry = FALSE;
3934                                         goto RetryFault;
3935                                 }
3936                                 goto done;
3937                         }
3938                         /*
3939                          * COPY ON WRITE FAULT
3940                          */
3941                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
3942
3943                         /*
3944                          * If objects match, then
3945                          * object->copy must not be NULL (else control
3946                          * would be in previous code block), and we
3947                          * have a potential push into the copy object
3948                          * with which we can't cope here.
3949                          */
3950                         if (cur_object == object) {
3951                                 /*
3952                                  * must take the slow path to
3953                                  * deal with the copy push
3954                                  */
3955                                 break;
3956                         }
3957
3958                         /*
3959                          * This is now a shadow based copy on write
3960                          * fault -- it requires a copy up the shadow
3961                          * chain.
3962                          */
3963
3964                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
3965                             VM_FAULT_NEED_CS_VALIDATION(NULL, m)) {
3966                                 goto upgrade_for_validation;
3967                         }
3968
3969                         /*
3970                          * Allocate a page in the original top level
3971                          * object. Give up if allocate fails.  Also
3972                          * need to remember current page, as it's the
3973                          * source of the copy.
3974                          *
3975                          * at this point we hold locks on both
3976                          * object and cur_object... no need to take
3977                          * paging refs or mark pages BUSY since
3978                          * we don't drop either object lock until
3979                          * the page has been copied and inserted
3980                          */
3981                         cur_m = m;
3982                         m = vm_page_grab();
3983
3984                         if (m == VM_PAGE_NULL) {
3985                                 /*
3986                                  * no free page currently available...
3987                                  * must take the slow path
3988                                  */
3989                                 break;
3990                         }
3991                         /*
3992                          * Now do the copy.  Mark the source page busy...
3993                          *
3994                          *      NOTE: This code holds the map lock across
3995                          *      the page copy.
3996                          */
3997                         vm_page_copy(cur_m, m);
3998                         vm_page_insert(m, object, offset);
3999                         SET_PAGE_DIRTY(m, FALSE);
4000
4001                         /*
4002                          * Now cope with the source page and object
4003                          */
4004                         if (object->ref_count > 1 && cur_m->pmapped)
4005                                 pmap_disconnect(cur_m->phys_page);
4006
4007                         if (cur_m->clustered) {
4008                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4009                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4010                         }
4011                         need_collapse = TRUE;
4012
4013                         if (!cur_object->internal &&
4014                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4015                                 /*
4016                                  * The object from which we've just
4017                                  * copied a page is most probably backed
4018                                  * by a vnode.  We don't want to waste too
4019                                  * much time trying to collapse the VM objects
4020                                  * and create a bottleneck when several tasks
4021                                  * map the same file.
4022                                  */
4023                                 if (cur_object->copy == object) {
4024                                         /*
4025                                          * Shared mapping or no COW yet.
4026                                          * We can never collapse a copy
4027                                          * object into its backing object.
4028                                          */
4029                                         need_collapse = FALSE;
4030                                 } else if (cur_object->copy == object->shadow &&
4031                                            object->shadow->resident_page_count == 0) {
4032                                         /*
4033                                          * Shared mapping after a COW occurred.
4034                                          */
4035                                         need_collapse = FALSE;
4036                                 }
4037                         }
4038                         vm_object_unlock(cur_object);
4039
4040                         if (need_collapse == FALSE)
4041                                 vm_fault_collapse_skipped++;
4042                         vm_fault_collapse_total++;
4043
4044                         type_of_fault = DBG_COW_FAULT;
4045                         VM_STAT_INCR(cow_faults);
4046                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4047                         current_task()->cow_faults++;
4048
4049                         goto FastPmapEnter;
4050
4051                 } else {
4052                         /*
4053                          * No page at cur_object, cur_offset... m == NULL
4054                          */
4055                         if (cur_object->pager_created) {
4056                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4057
4058                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4059                                         int             my_fault_type;
4060                                         int             c_flags = C_DONT_BLOCK;
4061                                         boolean_t       insert_cur_object = FALSE;
4062
4063                                         /*
4064                                          * May have to talk to a pager...
4065                                          * if so, take the slow path by
4066                                          * doing a 'break' from the while (TRUE) loop
4067                                          *
4068                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4069                                          * if the compressor is active and the page exists there
4070                                          */
4071                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4072                                                 break;
4073
4074                                         if (map == kernel_map || real_map == kernel_map) {
4075                                                 /*
4076                                                  * can't call into the compressor with the kernel_map
4077                                                  * lock held, since the compressor may try to operate
4078                                                  * on the kernel map in order to return an empty c_segment
4079                                                  */
4080                                                 break;
4081                                         }
4082                                         if (object != cur_object) {
4083                                                 if (fault_type & VM_PROT_WRITE)
4084                                                         c_flags |= C_KEEP;
4085                                                 else
4086                                                         insert_cur_object = TRUE;
4087                                         }
4088                                         if (insert_cur_object == TRUE) {
4089
4090                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4091
4092                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4093
4094                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4095                                                                 /*
4096                                                                  * couldn't upgrade so go do a full retry
4097                                                                  * immediately since we can no longer be
4098                                                                  * certain about cur_object (since we
4099                                                                  * don't hold a reference on it)...
4100                                                                  * first drop the top object lock
4101                                                                  */
4102                                                                 vm_object_unlock(object);
4103
4104                                                                 vm_map_unlock_read(map);
4105                                                                 if (real_map != map)
4106                                                                         vm_map_unlock(real_map);
4107
4108                                                                 goto RetryFault;
4109                                                         }
4110                                                 }
4111                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4112
4113                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4114
4115                                                 if (object != cur_object) {
4116                                                         /*
4117                                                          * we can't go for the upgrade on the top
4118                                                          * lock since the upgrade may block waiting
4119                                                          * for readers to drain... since we hold
4120                                                          * cur_object locked at this point, waiting
4121                                                          * for the readers to drain would represent
4122                                                          * a lock order inversion since the lock order
4123                                                          * for objects is the reference order in the
4124                                                          * shadow chain
4125                                                          */
4126                                                         vm_object_unlock(object);
4127                                                         vm_object_unlock(cur_object);
4128
4129                                                         vm_map_unlock_read(map);
4130                                                         if (real_map != map)
4131                                                                 vm_map_unlock(real_map);
4132
4133                                                         goto RetryFault;
4134                                                 }
4135                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4136                                                         /*
4137                                                          * couldn't upgrade, so explicitly take the lock
4138                                                          * exclusively and go relookup the page since we
4139                                                          * will have dropped the object lock and
4140                                                          * a different thread could have inserted
4141                                                          * a page at this offset
4142                                                          * no need for a full retry since we're
4143                                                          * at the top level of the object chain
4144                                                          */
4145                                                         vm_object_lock(object);
4146
4147                                                         continue;
4148                                                 }
4149                                         }
4150                                         m = vm_page_grab();
4151
4152                                         if (m == VM_PAGE_NULL) {
4153                                                 /*
4154                                                  * no free page currently available...
4155                                                  * must take the slow path
4156                                                  */
4157                                                 break;
4158                                         }
4159
4160                                         /*
4161                                          * The object is and remains locked
4162                                          * so no need to take a
4163                                          * "paging_in_progress" reference.
4164                                          */
4165                                         boolean_t shared_lock;
4166                                         if ((object == cur_object &&
4167                                              object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4168                                             (object != cur_object &&
4169                                              cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4170                                                 shared_lock = FALSE;
4171                                         } else {
4172                                                 shared_lock = TRUE;
4173                                         }
4174
4175                                         kr = vm_compressor_pager_get(
4176                                                 cur_object->pager,
4177                                                 (cur_offset +
4178                                                  cur_object->paging_offset),
4179                                                 m->phys_page,
4180                                                 &my_fault_type,
4181                                                 c_flags,
4182                                                 &compressed_count_delta);
4183
4184                                         vm_compressor_pager_count(
4185                                                 cur_object->pager,
4186                                                 compressed_count_delta,
4187                                                 shared_lock,
4188                                                 cur_object);
4189
4190                                         if (kr != KERN_SUCCESS) {
4191                                                 vm_page_release(m);
4192                                                 break;
4193                                         }
4194                                         m->dirty = TRUE;
4195
4196                                         /*
4197                                          * If the object is purgeable, its
4198                                          * owner's purgeable ledgers will be
4199                                          * updated in vm_page_insert() but the
4200                                          * page was also accounted for in a
4201                                          * "compressed purgeable" ledger, so
4202                                          * update that now.
4203                                          */
4204                                         if (object != cur_object &&
4205                                             !insert_cur_object) {
4206                                                 /*
4207                                                  * We're not going to insert
4208                                                  * the decompressed page into
4209                                                  * the object it came from.
4210                                                  *
4211                                                  * We're dealing with a
4212                                                  * copy-on-write fault on
4213                                                  * "object".
4214                                                  * We're going to decompress
4215                                                  * the page directly into the
4216                                                  * target "object" while
4217                                                  * keeping the compressed
4218                                                  * page for "cur_object", so
4219                                                  * no ledger update in that
4220                                                  * case.
4221                                                  */
4222                                         } else if ((cur_object->purgable ==
4223                                                     VM_PURGABLE_DENY) ||
4224                                                    (cur_object->vo_purgeable_owner ==
4225                                                     NULL)) {
4226                                                 /*
4227                                                  * "cur_object" is not purgeable
4228                                                  * or is not owned, so no
4229                                                  * purgeable ledgers to update.
4230                                                  */
4231                                         } else {
4232                                                 /*
4233                                                  * One less compressed
4234                                                  * purgeable page for
4235                                                  * cur_object's owner.
4236                                                  */
4237                                                 vm_purgeable_compressed_update(
4238                                                         cur_object,
4239                                                         -1);
4240                                         }
4241
4242                                         if (insert_cur_object) {
4243                                                 vm_page_insert(m, cur_object, cur_offset);
4244                                         } else {
4245                                                 vm_page_insert(m, object, offset);
4246                                         }
4247
4248                                         if ((m->object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4249                                                 /*
4250                                                  * If the page is not cacheable,
4251                                                  * we can't let its contents
4252                                                  * linger in the data cache
4253                                                  * after the decompression.
4254                                                  */
4255                                                 pmap_sync_page_attributes_phys(m->phys_page);
4256                                         }
4257
4258                                         type_of_fault = my_fault_type;
4259
4260                                         VM_STAT_INCR(decompressions);
4261
4262                                         if (cur_object != object) {
4263                                                 if (insert_cur_object) {
4264                                                         top_object = object;
4265                                                         /*
4266                                                          * switch to the object that has the new page
4267                                                          */
4268                                                         object = cur_object;
4269                                                         object_lock_type = cur_object_lock_type;
4270                                                 } else {
4271                                                         vm_object_unlock(cur_object);
4272                                                         cur_object = object;
4273                                                 }
4274                                         }
4275                                         goto FastPmapEnter;
4276                                 }
4277                                 /*
4278                                  * existence map present and indicates
4279                                  * that the pager doesn't have this page
4280                                  */
4281                         }
4282                         if (cur_object->shadow == VM_OBJECT_NULL) {
4283                                 /*
4284                                  * Zero fill fault.  Page gets
4285                                  * inserted into the original object.
4286                                  */
4287                                 if (cur_object->shadow_severed ||
4288                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object))
4289                                 {
4290                                         if (object != cur_object)
4291                                                 vm_object_unlock(cur_object);
4292                                         vm_object_unlock(object);
4293
4294                                         vm_map_unlock_read(map);
4295                                         if (real_map != map)
4296                                                 vm_map_unlock(real_map);
4297
4298                                         kr = KERN_MEMORY_ERROR;
4299                                         goto done;
4300                                 }
4301                                 if (vm_backing_store_low) {
4302                                         /*
4303                                          * we are protecting the system from
4304                                          * backing store exhaustion...
4305                                          * must take the slow path if we're
4306                                          * not privileged
4307                                          */
4308                                         if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
4309                                                 break;
4310                                 }
4311                                 if (cur_object != object) {
4312                                         vm_object_unlock(cur_object);
4313
4314                                         cur_object = object;
4315                                 }
4316                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4317
4318                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4319
4320                                         if (vm_object_lock_upgrade(object) == FALSE) {
4321                                                 /*
4322                                                  * couldn't upgrade so do a full retry on the fault
4323                                                  * since we dropped the object lock which
4324                                                  * could allow another thread to insert
4325                                                  * a page at this offset
4326                                                  */
4327                                                 vm_map_unlock_read(map);
4328                                                 if (real_map != map)
4329                                                         vm_map_unlock(real_map);
4330
4331                                                 goto RetryFault;
4332                                         }
4333                                 }
4334                                 m = vm_page_alloc(object, offset);
4335
4336                                 if (m == VM_PAGE_NULL) {
4337                                         /*
4338                                          * no free page currently available...
4339                                          * must take the slow path
4340                                          */
4341                                         break;
4342                                 }
4343
4344                                 /*
4345                                  * Now zero fill page...
4346                                  * the page is probably going to
4347                                  * be written soon, so don't bother
4348                                  * to clear the modified bit
4349                                  *
4350                                  *   NOTE: This code holds the map
4351                                  *   lock across the zero fill.
4352                                  */
4353                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4354
4355                                 goto FastPmapEnter;
4356                         }
4357                         /*
4358                          * On to the next level in the shadow chain
4359                          */
4360                         cur_offset += cur_object->vo_shadow_offset;
4361                         new_object = cur_object->shadow;
4362
4363                         /*
4364                          * take the new_object's lock with the indicated state
4365                          */
4366                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4367                                 vm_object_lock_shared(new_object);
4368                         else
4369                                 vm_object_lock(new_object);
4370
4371                         if (cur_object != object)
4372                                 vm_object_unlock(cur_object);
4373
4374                         cur_object = new_object;
4375
4376                         continue;
4377                 }
4378         }
4379         /*
4380          * Cleanup from fast fault failure.  Drop any object
4381          * lock other than original and drop map lock.
4382          */
4383         if (object != cur_object)
4384                 vm_object_unlock(cur_object);
4385
4386         /*
4387          * must own the object lock exclusively at this point
4388          */
4389         if (object_lock_type == OBJECT_LOCK_SHARED) {
4390                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4391
4392                 if (vm_object_lock_upgrade(object) == FALSE) {
4393                         /*
4394                          * couldn't upgrade, so explicitly
4395                          * take the lock exclusively
4396                          * no need to retry the fault at this
4397                          * point since "vm_fault_page" will
4398                          * completely re-evaluate the state
4399                          */
4400                         vm_object_lock(object);
4401                 }
4402         }
4403
4404 handle_copy_delay:
4405         vm_map_unlock_read(map);
4406         if (real_map != map)
4407                 vm_map_unlock(real_map);
4408
4409         /*
4410          * Make a reference to this object to
4411          * prevent its disposal while we are messing with
4412          * it.  Once we have the reference, the map is free
4413          * to be diddled.  Since objects reference their
4414          * shadows (and copies), they will stay around as well.
4415          */
4416         vm_object_reference_locked(object);
4417         vm_object_paging_begin(object);
4418
4419         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4420
4421         error_code = 0;
4422
4423         result_page = VM_PAGE_NULL;
4424         kr = vm_fault_page(object, offset, fault_type,
4425                            (change_wiring && !wired),
4426                            FALSE, /* page not looked up */
4427                            &prot, &result_page, &top_page,
4428                            &type_of_fault,
4429                            &error_code, map->no_zero_fill,
4430                            FALSE, &fault_info);
4431
4432         /*
4433          * if kr != VM_FAULT_SUCCESS, then the paging reference
4434          * has been dropped and the object unlocked... the ref_count
4435          * is still held
4436          *
4437          * if kr == VM_FAULT_SUCCESS, then the paging reference
4438          * is still held along with the ref_count on the original object
4439          *
4440          *      the object is returned locked with a paging reference
4441          *
4442          *      if top_page != NULL, then it's BUSY and the
4443          *      object it belongs to has a paging reference
4444          *      but is returned unlocked
4445          */
4446         if (kr != VM_FAULT_SUCCESS &&
4447             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4448                 /*
4449                  * we didn't succeed, lose the object reference immediately.
4450                  */
4451                 vm_object_deallocate(object);
4452
4453                 /*
4454                  * See why we failed, and take corrective action.
4455                  */
4456                 switch (kr) {
4457                 case VM_FAULT_MEMORY_SHORTAGE:
4458                         if (vm_page_wait((change_wiring) ?
4459                                          THREAD_UNINT :
4460                                          THREAD_ABORTSAFE))
4461                                 goto RetryFault;
4462                         /*
4463                          * fall thru
4464                          */
4465                 case VM_FAULT_INTERRUPTED:
4466                         kr = KERN_ABORTED;
4467                         goto done;
4468                 case VM_FAULT_RETRY:
4469                         goto RetryFault;
4470                 case VM_FAULT_MEMORY_ERROR:
4471                         if (error_code)
4472                                 kr = error_code;
4473                         else
4474                                 kr = KERN_MEMORY_ERROR;
4475                         goto done;
4476                 default:
4477                         panic("vm_fault: unexpected error 0x%x from "
4478                               "vm_fault_page()\n", kr);
4479                 }
4480         }
4481         m = result_page;
4482
4483         if (m != VM_PAGE_NULL) {
4484                 assert((change_wiring && !wired) ?
4485                     (top_page == VM_PAGE_NULL) :
4486                     ((top_page == VM_PAGE_NULL) == (m->object == object)));
4487         }
4488
4489         /*
4490          * What to do with the resulting page from vm_fault_page
4491          * if it doesn't get entered into the physical map:
4492          */
4493 #define RELEASE_PAGE(m)                                 \
4494         MACRO_BEGIN                                     \
4495         PAGE_WAKEUP_DONE(m);                            \
4496         if (!m->active && !m->inactive && !m->throttled) {              \
4497                 vm_page_lockspin_queues();                              \
4498                 if (!m->active && !m->inactive && !m->throttled)        \
4499                         vm_page_activate(m);                            \
4500                 vm_page_unlock_queues();                                \
4501         }                                                               \
4502         MACRO_END
4503
4504         /*
4505          * We must verify that the maps have not changed
4506          * since our last lookup.
4507          */
4508         if (m != VM_PAGE_NULL) {
4509                 old_copy_object = m->object->copy;
4510                 vm_object_unlock(m->object);
4511         } else {
4512                 old_copy_object = VM_OBJECT_NULL;
4513                 vm_object_unlock(object);
4514         }
4515
4516         /*
4517          * no object locks are held at this point
4518          */
4519         if ((map != original_map) || !vm_map_verify(map, &version)) {
4520                 vm_object_t             retry_object;
4521                 vm_object_offset_t      retry_offset;
4522                 vm_prot_t               retry_prot;
4523
4524                 /*
4525                  * To avoid trying to write_lock the map while another
4526                  * thread has it read_locked (in vm_map_pageable), we
4527                  * do not try for write permission.  If the page is
4528                  * still writable, we will get write permission.  If it
4529                  * is not, or has been marked needs_copy, we enter the
4530                  * mapping without write permission, and will merely
4531                  * take another fault.
4532                  */
4533                 map = original_map;
4534                 vm_map_lock_read(map);
4535
4536                 kr = vm_map_lookup_locked(&map, vaddr,
4537                                           fault_type & ~VM_PROT_WRITE,
4538                                           OBJECT_LOCK_EXCLUSIVE, &version,
4539                                           &retry_object, &retry_offset, &retry_prot,
4540                                           &wired,
4541                                           &fault_info,
4542                                           &real_map);
4543                 pmap = real_map->pmap;
4544
4545                 if (kr != KERN_SUCCESS) {
4546                         vm_map_unlock_read(map);
4547
4548                         if (m != VM_PAGE_NULL) {
4549                                 /*
4550                                  * retake the lock so that
4551                                  * we can drop the paging reference
4552                                  * in vm_fault_cleanup and do the
4553                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4554                                  */
4555                                 vm_object_lock(m->object);
4556
4557                                 RELEASE_PAGE(m);
4558
4559                                 vm_fault_cleanup(m->object, top_page);
4560                         } else {
4561                                 /*
4562                                  * retake the lock so that
4563                                  * we can drop the paging reference
4564                                  * in vm_fault_cleanup
4565                                  */
4566                                 vm_object_lock(object);
4567
4568                                 vm_fault_cleanup(object, top_page);
4569                         }
4570                         vm_object_deallocate(object);
4571
4572                         goto done;
4573                 }
4574                 vm_object_unlock(retry_object);
4575
4576                 if ((retry_object != object) || (retry_offset != offset)) {
4577
4578                         vm_map_unlock_read(map);
4579                         if (real_map != map)
4580                                 vm_map_unlock(real_map);
4581
4582                         if (m != VM_PAGE_NULL) {
4583                                 /*
4584                                  * retake the lock so that
4585                                  * we can drop the paging reference
4586                                  * in vm_fault_cleanup and do the
4587                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4588                                  */
4589                                 vm_object_lock(m->object);
4590
4591                                 RELEASE_PAGE(m);
4592
4593                                 vm_fault_cleanup(m->object, top_page);
4594                         } else {
4595                                 /*
4596                                  * retake the lock so that
4597                                  * we can drop the paging reference
4598                                  * in vm_fault_cleanup
4599                                  */
4600                                 vm_object_lock(object);
4601
4602                                 vm_fault_cleanup(object, top_page);
4603                         }
4604                         vm_object_deallocate(object);
4605
4606                         goto RetryFault;
4607                 }
4608                 /*
4609                  * Check whether the protection has changed or the object
4610                  * has been copied while we left the map unlocked.
4611                  */
4612                 prot &= retry_prot;
4613         }
4614         if (m != VM_PAGE_NULL) {
4615                 vm_object_lock(m->object);
4616
4617                 if (m->object->copy != old_copy_object) {
4618                         /*
4619                          * The copy object changed while the top-level object
4620                          * was unlocked, so take away write permission.
4621                          */
4622                         prot &= ~VM_PROT_WRITE;
4623                 }
4624         } else
4625                 vm_object_lock(object);
4626
4627         /*
4628          * If we want to wire down this page, but no longer have
4629          * adequate permissions, we must start all over.
4630          */
4631         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4632
4633                 vm_map_verify_done(map, &version);
4634                 if (real_map != map)
4635                         vm_map_unlock(real_map);
4636
4637                 if (m != VM_PAGE_NULL) {
4638                         RELEASE_PAGE(m);
4639
4640                         vm_fault_cleanup(m->object, top_page);
4641                 } else
4642                         vm_fault_cleanup(object, top_page);
4643
4644                 vm_object_deallocate(object);
4645
4646                 goto RetryFault;
4647         }
4648         if (m != VM_PAGE_NULL) {
4649                 /*
4650                  * Put this page into the physical map.
4651                  * We had to do the unlock above because pmap_enter
4652                  * may cause other faults.  The page may be on
4653                  * the pageout queues.  If the pageout daemon comes
4654                  * across the page, it will remove it from the queues.
4655                  */
4656                 if (caller_pmap) {
4657                         kr = vm_fault_enter(m,
4658                                             caller_pmap,
4659                                             caller_pmap_addr,
4660                                             prot,
4661                                             fault_type,
4662                                             wired,
4663                                             change_wiring,
4664                                             fault_info.no_cache,
4665                                             fault_info.cs_bypass,
4666                                             fault_info.user_tag,
4667                                             fault_info.pmap_options,
4668                                             NULL,
4669                                             &type_of_fault);
4670                 } else {
4671                         kr = vm_fault_enter(m,
4672                                             pmap,
4673                                             vaddr,
4674                                             prot,
4675                                             fault_type,
4676                                             wired,
4677                                             change_wiring,
4678                                             fault_info.no_cache,
4679                                             fault_info.cs_bypass,
4680                                             fault_info.user_tag,
4681                                             fault_info.pmap_options,
4682                                             NULL,
4683                                             &type_of_fault);
4684                 }
4685                 if (kr != KERN_SUCCESS) {
4686                         /* abort this page fault */
4687                         vm_map_verify_done(map, &version);
4688                         if (real_map != map)
4689                                 vm_map_unlock(real_map);
4690                         PAGE_WAKEUP_DONE(m);
4691                         vm_fault_cleanup(m->object, top_page);
4692                         vm_object_deallocate(object);
4693                         goto done;
4694                 }
4695                 if (physpage_p != NULL) {
4696                         /* for vm_map_wire_and_extract() */
4697                         *physpage_p = m->phys_page;
4698                         if (prot & VM_PROT_WRITE) {
4699                                 vm_object_lock_assert_exclusive(m->object);
4700                                 m->dirty = TRUE;
4701                         }
4702                 }
4703         } else {
4704
4705                 vm_map_entry_t          entry;
4706                 vm_map_offset_t         laddr;
4707                 vm_map_offset_t         ldelta, hdelta;
4708
4709                 /*
4710                  * do a pmap block mapping from the physical address
4711                  * in the object
4712                  */
4713
4714 #ifdef ppc
4715                 /* While we do not worry about execution protection in   */
4716                 /* general, certain pages may have instruction execution */
4717                 /* disallowed.  We will check here, and if not allowed   */
4718                 /* to execute, we return with a protection failure.      */
4719
4720                 if ((fault_type & VM_PROT_EXECUTE) &&
4721                         (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
4722
4723                         vm_map_verify_done(map, &version);
4724
4725                         if (real_map != map)
4726                                 vm_map_unlock(real_map);
4727
4728                         vm_fault_cleanup(object, top_page);
4729                         vm_object_deallocate(object);
4730
4731                         kr = KERN_PROTECTION_FAILURE;
4732                         goto done;
4733                 }
4734 #endif  /* ppc */
4735
4736                 if (real_map != map)
4737                         vm_map_unlock(real_map);
4738
4739                 if (original_map != map) {
4740                         vm_map_unlock_read(map);
4741                         vm_map_lock_read(original_map);
4742                         map = original_map;
4743                 }
4744                 real_map = map;
4745
4746                 laddr = vaddr;
4747                 hdelta = 0xFFFFF000;
4748                 ldelta = 0xFFFFF000;
4749
4750                 while (vm_map_lookup_entry(map, laddr, &entry)) {
4751                         if (ldelta > (laddr - entry->vme_start))
4752                                 ldelta = laddr - entry->vme_start;
4753                         if (hdelta > (entry->vme_end - laddr))
4754                                 hdelta = entry->vme_end - laddr;
4755                         if (entry->is_sub_map) {
4756
4757                                 laddr = (laddr - entry->vme_start)
4758                                                         + entry->offset;
4759                                 vm_map_lock_read(entry->object.sub_map);
4760
4761                                 if (map != real_map)
4762                                         vm_map_unlock_read(map);
4763                                 if (entry->use_pmap) {
4764                                         vm_map_unlock_read(real_map);
4765                                         real_map = entry->object.sub_map;
4766                                 }
4767                                 map = entry->object.sub_map;
4768
4769                         } else {
4770                                 break;
4771                         }
4772                 }
4773
4774                 if (vm_map_lookup_entry(map, laddr, &entry) &&
4775                     (entry->object.vm_object != NULL) &&
4776                     (entry->object.vm_object == object)) {
4777
4778                         int superpage = (!object->pager_created && object->phys_contiguous)? VM_MEM_SUPERPAGE : 0;
4779
4780                         if (superpage && physpage_p) {
4781                                 /* for vm_map_wire_and_extract() */
4782                                 *physpage_p = (ppnum_t) ((((vm_map_offset_t) entry->object.vm_object->vo_shadow_offset)
4783                                                           + entry->offset
4784                                                           + (laddr - entry->vme_start))
4785                                                          >> PAGE_SHIFT);
4786                         }
4787
4788                         if (caller_pmap) {
4789                                 /*
4790                                  * Set up a block mapped area
4791                                  */
4792                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4793                                 pmap_map_block(caller_pmap,
4794                                                (addr64_t)(caller_pmap_addr - ldelta),
4795                                                (ppnum_t)((((vm_map_offset_t) (entry->object.vm_object->vo_shadow_offset)) +
4796                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4797                                                (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4798                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4799                         } else {
4800                                 /*
4801                                  * Set up a block mapped area
4802                                  */
4803                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
4804                                 pmap_map_block(real_map->pmap,
4805                                                (addr64_t)(vaddr - ldelta),
4806                                                (ppnum_t)((((vm_map_offset_t)(entry->object.vm_object->vo_shadow_offset)) +
4807                                                           entry->offset + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
4808                                                (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
4809                                                (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
4810                         }
4811                 }
4812         }
4813
4814         /*
4815          * Unlock everything, and return
4816          */
4817         vm_map_verify_done(map, &version);
4818         if (real_map != map)
4819                 vm_map_unlock(real_map);
4820
4821         if (m != VM_PAGE_NULL) {
4822                 PAGE_WAKEUP_DONE(m);
4823
4824                 vm_fault_cleanup(m->object, top_page);
4825         } else
4826                 vm_fault_cleanup(object, top_page);
4827
4828         vm_object_deallocate(object);
4829
4830 #undef  RELEASE_PAGE
4831
4832         kr = KERN_SUCCESS;
4833 done:
4834         thread_interrupt_level(interruptible_state);
4835
4836         /*
4837          * Only I/O throttle on faults which cause a pagein/swapin.
4838          */
4839         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
4840                 throttle_lowpri_io(1);
4841         } else {
4842                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
4843
4844                         if ((throttle_delay = vm_page_throttled(TRUE))) {
4845
4846                                 if (vm_debug_events) {
4847                                         if (type_of_fault == DBG_COMPRESSOR_FAULT)
4848                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4849                                         else if (type_of_fault == DBG_COW_FAULT)
4850                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4851                                         else
4852                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
4853                                 }
4854                                 delay(throttle_delay);
4855                         }
4856                 }
4857         }
4858         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
4859                               (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
4860                               ((uint64_t)vaddr >> 32),
4861                               vaddr,
4862                               kr,
4863                               type_of_fault,
4864                               0);
4865
4866         return (kr);
4867 }
4868
4869 /*
4870  *      vm_fault_wire:
4871  *
4872  *      Wire down a range of virtual addresses in a map.
      *
      *      Walks [entry->vme_start, entry->vme_end) in PAGE_SIZE steps.
      *      Each page is first tried with vm_fault_wire_fast(); if that
      *      does not return KERN_SUCCESS the page is retried through
      *      vm_fault_internal().
      *
      *      map:        map handed to the fault routines for each page.
      *      entry:      map entry describing the range; asserted to be
      *                  in_transition.
      *      pmap:       physical map passed to pmap_pageable() and to
      *                  the fault routines.
      *      pmap_addr:  address in "pmap" that corresponds to
      *                  entry->vme_start; per-page addresses are derived
      *                  from it by adding (va - entry->vme_start).
      *      physpage_p: forwarded unchanged to the fault routines for
      *                  every page (vm_fault_internal() stores the
      *                  page's physical page number through it when it
      *                  is non-NULL).
      *
      *      Returns KERN_SUCCESS once every page has been wired, or
      *      immediately for a non-submap entry whose object is
      *      phys_contiguous.  On the first page that cannot be wired,
      *      the pages wired so far are unwired again via
      *      vm_fault_unwire() and that page's error code is returned.
4873  */
4874 kern_return_t
4875 vm_fault_wire(
4876         vm_map_t        map,
4877         vm_map_entry_t  entry,
4878         pmap_t          pmap,
4879         vm_map_offset_t pmap_addr,
4880         ppnum_t         *physpage_p)
4881 {
4882
4883         register vm_map_offset_t        va;     /* address of the page currently being wired */
4884         register vm_map_offset_t        end_addr = entry->vme_end;
4885         register kern_return_t  rc;
4886
4887         assert(entry->in_transition);
4888
4889         if ((entry->object.vm_object != NULL) &&
4890             !entry->is_sub_map &&
4891             entry->object.vm_object->phys_contiguous) {
4892                 return KERN_SUCCESS;    /* nothing to do: vm_fault_unwire() notes such memory is wired by default */
4893         }
4894
4895         /*
4896          *      Inform the physical mapping system that the
4897          *      range of addresses may not fault, so that
4898          *      page tables and such can be locked down as well.
              *      The range passed starts at pmap_addr and is as
              *      long as the map entry.
4899          */
4900
4901         pmap_pageable(pmap, pmap_addr,
4902                 pmap_addr + (end_addr - entry->vme_start), FALSE);
4903
4904         /*
4905          *      We simulate a fault to get the page and enter it
4906          *      in the physical map.
              *      The fast path is attempted first; the full fault
              *      path is only taken when the fast path fails.
4907          */
4908
4909         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4910                 rc = vm_fault_wire_fast(map, va, entry, pmap,
4911                                         pmap_addr + (va - entry->vme_start),    /* this page's address in pmap */
4912                                         physpage_p);
4913                 if (rc != KERN_SUCCESS) {
4914                         rc = vm_fault_internal(map, va, VM_PROT_NONE, TRUE,     /* NOTE(review): VM_PROT_NONE/TRUE look like fault_type/change_wiring -- confirm against the prototype */
4915                                                ((pmap == kernel_pmap)
4916                                                 ? THREAD_UNINT         /* uninterruptible for the kernel pmap */
4917                                                 : THREAD_ABORTSAFE),   /* abort-safe for any other pmap */
4918                                                pmap,
4919                                                (pmap_addr +
4920                                                 (va - entry->vme_start)),
4921                                                physpage_p);
4922                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
4923                 }
4924
4925                 if (rc != KERN_SUCCESS) {       /* both the fast and the full fault path failed for this page */
4926                         struct vm_map_entry     tmp_entry = *entry;     /* local copy: the caller's entry is left untouched */
4927
4928                         /* unwire wired pages */
4929                         tmp_entry.vme_end = va; /* the page at va failed, so only [vme_start, va) was wired */
4930                         vm_fault_unwire(map,
4931                                 &tmp_entry, FALSE, pmap, pmap_addr);
4932
4933                         return rc;
4934                 }
4935         }
4936         return KERN_SUCCESS;
4937 }
4938
4939 /*
4940  *      vm_fault_unwire:
4941  *
4942  *      Unwire a range of virtual addresses in a map.
4943  */
4944 void
4945 vm_fault_unwire(
4946         vm_map_t        map,
4947         vm_map_entry_t  entry,
4948         boolean_t       deallocate,
4949         pmap_t          pmap,
4950         vm_map_offset_t pmap_addr)
4951 {
4952         register vm_map_offset_t        va;
4953         register vm_map_offset_t        end_addr = entry->vme_end;
4954         vm_object_t             object;
4955         struct vm_object_fault_info fault_info;
4956
4957         object = (entry->is_sub_map)
4958                         ? VM_OBJECT_NULL : entry->object.vm_object;
4959
4960         /*
4961          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
4962          * do anything since such memory is wired by default.  So we don't have
4963          * anything to undo here.
4964          */
4965
4966         if (object != VM_OBJECT_NULL && object->phys_contiguous)
4967                 return;
4968
4969         fault_info.interruptible = THREAD_UNINT;
4970         fault_info.behavior = entry->behavior;
4971         fault_info.user_tag = entry->alias;
4972         fault_info.pmap_options = 0;
4973         if (entry->iokit_acct ||
4974             (!entry->is_sub_map && !entry->use_pmap)) {
4975                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
4976         }
4977         fault_info.lo_offset = entry->offset;
4978         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + entry->offset;
4979         fault_info.no_cache = entry->no_cache;
4980         fault_info.stealth = TRUE;
4981         fault_info.io_sync = FALSE;
4982         fault_info.cs_bypass = FALSE;
4983         fault_info.mark_zf_absent = FALSE;
4984         fault_info.batch_pmap_op = FALSE;
4985
4986         /*
4987          *      Since the pages are wired down, we must be able to
4988          *      get their mappings from the physical map system.
4989          */
4990
4991         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
4992
4993                 if (object == VM_OBJECT_NULL) {
4994                         if (pmap) {
4995                                 pmap_change_wiring(pmap,
4996                                                    pmap_addr + (va - entry->vme_start), FALSE);
4997                         }
4998                         (void) vm_fault(map, va, VM_PROT_NONE,
4999                                         TRUE, THREAD_UNINT, pmap, pmap_addr);
5000                 } else {
5001                         vm_prot_t       prot;
5002                         vm_page_t       result_page;
5003                         vm_page_t       top_page;
5004                         vm_object_t     result_object;
5005                         vm_fault_return_t result;
5006
5007                         if (end_addr - va > (vm_size_t) -1) {
5008                                 /* 32-bit overflow */
5009                                 fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5010                         } else {
5011                                 fault_info.cluster_size = (vm_size_t) (end_addr - va);
5012                                 assert(fault_info.cluster_size == end_addr - va);
5013                         }
5014
5015                         do {
5016                                 prot = VM_PROT_NONE;
5017
5018                                 vm_object_lock(object);
5019                                 vm_object_paging_begin(object);
5020                                 XPR(XPR_VM_FAULT,
5021                                         "vm_fault_unwire -> vm_fault_page\n",
5022                                         0,0,0,0,0);
5023                                 result_page = VM_PAGE_NULL;
5024                                 result = vm_fault_page(
5025                                         object,
5026                                         entry->offset + (va - entry->vme_start),
5027                                         VM_PROT_NONE, TRUE,
5028                                         FALSE, /* page not looked up */
5029                                         &prot, &result_page, &top_page,
5030                                         (int *)0,
5031                                         NULL, map->no_zero_fill,
5032                                         FALSE, &fault_info);
5033                         } while (result == VM_FAULT_RETRY);
5034
5035                         /*
5036                          * If this was a mapping to a file on a device that has been forcibly
5037                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5038                          * move on to the next one in case the remaining pages are mapped from
5039                          * different objects.  During a forced unmount, the object is terminated
5040                          * so the alive flag will be false if this happens.  A forced unmount will
5041                          * will occur when an external disk is unplugged before the user does an
5042                          * eject, so we don't want to panic in that situation.
5043                          */
5044
5045                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5046                                 continue;
5047
5048                         if (result == VM_FAULT_MEMORY_ERROR &&
5049                             object == kernel_object) {
5050                                 /*
5051                                  * This must have been allocated with
5052                                  * KMA_KOBJECT and KMA_VAONLY and there's
5053                                  * no physical page at this offset.
5054                                  * We're done (no page to free).
5055                                  */
5056                                 assert(deallocate);
5057                                 continue;
5058                         }
5059
5060                         if (result != VM_FAULT_SUCCESS)
5061                                 panic("vm_fault_unwire: failure");
5062
5063                         result_object = result_page->object;
5064
5065                         if (deallocate) {
5066                                 assert(result_page->phys_page !=
5067                                        vm_page_fictitious_addr);
5068                                 pmap_disconnect(result_page->phys_page);
5069                                 VM_PAGE_FREE(result_page);
5070                         } else {
5071                                 if ((pmap) && (result_page->phys_page != vm_page_guard_addr))
5072                                         pmap_change_wiring(pmap,
5073                                             pmap_addr + (va - entry->vme_start), FALSE);
5074
5075
5076                                 if (VM_PAGE_WIRED(result_page)) {
5077                                         vm_page_lockspin_queues();
5078                                         vm_page_unwire(result_page, TRUE);
5079                                         vm_page_unlock_queues();
5080                                 }
5081                                 if(entry->zero_wired_pages) {
5082                                         pmap_zero_page(result_page->phys_page);
5083                                         entry->zero_wired_pages = FALSE;
5084                                 }
5085
5086                                 PAGE_WAKEUP_DONE(result_page);
5087                         }
5088                         vm_fault_cleanup(result_object, top_page);
5089                 }
5090         }
5091
5092         /*
5093          *      Inform the physical mapping system that the range
5094          *      of addresses may fault, so that page tables and
5095          *      such may be unwired themselves.
5096          */
5097
5098         pmap_pageable(pmap, pmap_addr,
5099                 pmap_addr + (end_addr - entry->vme_start), TRUE);
5100
5101 }
5102
5103 /*
5104  *      vm_fault_wire_fast:
5105  *
5106  *      Handle common case of a wire down page fault at the given address.
5107  *      If successful, the page is inserted into the associated physical map.
5108  *      The map entry is passed in to avoid the overhead of a map lookup.
5109  *
5110  *      NOTE: the given address should be truncated to the
5111  *      proper page address.
5112  *
5113  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
5114  *      a standard error specifying why the fault is fatal is returned.
5115  *
5116  *      The map in question must be referenced, and remains so.
5117  *      Caller has a read lock on the map.
5118  *
5119  *      This is a stripped version of vm_fault() for wiring pages.  Anything
5120  *      other than the common case will return KERN_FAILURE, and the caller
5121  *      is expected to call vm_fault().
5122  */
5123 kern_return_t
5124 vm_fault_wire_fast(
5125         __unused vm_map_t       map,
5126         vm_map_offset_t va,
5127         vm_map_entry_t  entry,
5128         pmap_t          pmap,
5129         vm_map_offset_t pmap_addr,
5130         ppnum_t         *physpage_p)
5131 {
5132         vm_object_t             object;
5133         vm_object_offset_t      offset;
5134         register vm_page_t      m;
5135         vm_prot_t               prot;
5136         thread_t                thread = current_thread();
5137         int                     type_of_fault;
5138         kern_return_t           kr;
5139
5140         VM_STAT_INCR(faults);
5141
5142         if (thread != THREAD_NULL && thread->task != TASK_NULL)
5143           thread->task->faults++;
5144
5145 /*
5146  *      Recovery actions
5147  */
5148
5149 #undef  RELEASE_PAGE
5150 #define RELEASE_PAGE(m) {                               \
5151         PAGE_WAKEUP_DONE(m);                            \
5152         vm_page_lockspin_queues();                      \
5153         vm_page_unwire(m, TRUE);                        \
5154         vm_page_unlock_queues();                        \
5155 }
5156
5157
5158 #undef  UNLOCK_THINGS
5159 #define UNLOCK_THINGS   {                               \
5160         vm_object_paging_end(object);                      \
5161         vm_object_unlock(object);                          \
5162 }
5163
5164 #undef  UNLOCK_AND_DEALLOCATE
5165 #define UNLOCK_AND_DEALLOCATE   {                       \
5166         UNLOCK_THINGS;                                  \
5167         vm_object_deallocate(object);                   \
5168 }
5169 /*
5170  *      Give up and have caller do things the hard way.
5171  */
5172
5173 #define GIVE_UP {                                       \
5174         UNLOCK_AND_DEALLOCATE;                          \
5175         return(KERN_FAILURE);                           \
5176 }
5177
5178
5179         /*
5180          *      If this entry is not directly to a vm_object, bail out.
5181          */
5182         if (entry->is_sub_map) {
5183                 assert(physpage_p == NULL);
5184                 return(KERN_FAILURE);
5185         }
5186
5187         /*
5188          *      Find the backing store object and offset into it.
5189          */
5190
5191         object = entry->object.vm_object;
5192         offset = (va - entry->vme_start) + entry->offset;
5193         prot = entry->protection;
5194
5195         /*
5196          *      Make a reference to this object to prevent its
5197          *      disposal while we are messing with it.
5198          */
5199
5200         vm_object_lock(object);
5201         vm_object_reference_locked(object);
5202         vm_object_paging_begin(object);
5203
5204         /*
5205          *      INVARIANTS (through entire routine):
5206          *
5207          *      1)      At all times, we must either have the object
5208          *              lock or a busy page in some object to prevent
5209          *              some other thread from trying to bring in
5210          *              the same page.
5211          *
5212          *      2)      Once we have a busy page, we must remove it from
5213          *              the pageout queues, so that the pageout daemon
5214          *              will not grab it away.
5215          *
5216          */
5217
5218         /*
5219          *      Look for page in top-level object.  If it's not there or
5220          *      there's something going on, give up.
5221          * ENCRYPTED SWAP: use the slow fault path, since we'll need to
5222          * decrypt the page before wiring it down.
5223          */
5224         m = vm_page_lookup(object, offset);
5225         if ((m == VM_PAGE_NULL) || (m->busy) || (m->encrypted) ||
5226             (m->unusual && ( m->error || m->restart || m->absent))) {
5227
5228                 GIVE_UP;
5229         }
5230         ASSERT_PAGE_DECRYPTED(m);
5231
5232         if (m->fictitious &&
5233             m->phys_page == vm_page_guard_addr) {
5234                 /*
5235                  * Guard pages are fictitious pages and are never
5236                  * entered into a pmap, so let's say it's been wired...
5237                  */
5238                 kr = KERN_SUCCESS;
5239                 goto done;
5240         }
5241
5242         /*
5243          *      Wire the page down now.  All bail outs beyond this
5244          *      point must unwire the page.
5245          */
5246
5247         vm_page_lockspin_queues();
5248         vm_page_wire(m);
5249         vm_page_unlock_queues();
5250
5251         /*
5252          *      Mark page busy for other threads.
5253          */
5254         assert(!m->busy);
5255         m->busy = TRUE;
5256         assert(!m->absent);
5257
5258         /*
5259          *      Give up if the page is being written and there's a copy object
5260          */
5261         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5262                 RELEASE_PAGE(m);
5263                 GIVE_UP;
5264         }
5265
5266         /*
5267          *      Put this page into the physical map.
5268          */
5269         type_of_fault = DBG_CACHE_HIT_FAULT;
5270         kr = vm_fault_enter(m,
5271                             pmap,
5272                             pmap_addr,
5273                             prot,
5274                             prot,
5275                             TRUE,
5276                             FALSE,
5277                             FALSE,
5278                             FALSE,
5279                             entry->alias,
5280                             ((entry->iokit_acct ||
5281                               (!entry->is_sub_map && !entry->use_pmap))
5282                              ? PMAP_OPTIONS_ALT_ACCT
5283                              : 0),
5284                             NULL,
5285                             &type_of_fault);
5286
5287 done:
5288         /*
5289          *      Unlock everything, and return
5290          */
5291
5292         if (physpage_p) {
5293                 /* for vm_map_wire_and_extract() */
5294                 if (kr == KERN_SUCCESS) {
5295                         *physpage_p = m->phys_page;
5296                         if (prot & VM_PROT_WRITE) {
5297                                 vm_object_lock_assert_exclusive(m->object);
5298                                 m->dirty = TRUE;
5299                         }
5300                 } else {
5301                         *physpage_p = 0;
5302                 }
5303         }
5304
5305         PAGE_WAKEUP_DONE(m);
5306         UNLOCK_AND_DEALLOCATE;
5307
5308         return kr;
5309
5310 }
5311
5312 /*
5313  *      Routine:        vm_fault_copy_cleanup
5314  *      Purpose:
5315  *              Release a page used by vm_fault_copy.
5316  */
5317
5318 void
5319 vm_fault_copy_cleanup(
5320         vm_page_t       page,
5321         vm_page_t       top_page)
5322 {
5323         vm_object_t     object = page->object;
5324
5325         vm_object_lock(object);
5326         PAGE_WAKEUP_DONE(page);
5327         if (!page->active && !page->inactive && !page->throttled) {
5328                 vm_page_lockspin_queues();
5329                 if (!page->active && !page->inactive && !page->throttled)
5330                         vm_page_activate(page);
5331                 vm_page_unlock_queues();
5332         }
5333         vm_fault_cleanup(object, top_page);
5334 }
5335
5336 void
5337 vm_fault_copy_dst_cleanup(
5338         vm_page_t       page)
5339 {
5340         vm_object_t     object;
5341
5342         if (page != VM_PAGE_NULL) {
5343                 object = page->object;
5344                 vm_object_lock(object);
5345                 vm_page_lockspin_queues();
5346                 vm_page_unwire(page, TRUE);
5347                 vm_page_unlock_queues();
5348                 vm_object_paging_end(object);
5349                 vm_object_unlock(object);
5350         }
5351 }
5352
5353 /*
5354  *      Routine:        vm_fault_copy
5355  *
5356  *      Purpose:
5357  *              Copy pages from one virtual memory object to another --
5358  *              neither the source nor destination pages need be resident.
5359  *
5360  *              Before actually copying a page, the version associated with
5361  *              the destination address map will be verified.
5362  *
5363  *      In/out conditions:
5364  *              The caller must hold a reference, but not a lock, to
5365  *              each of the source and destination objects and to the
5366  *              destination map.
5367  *
5368  *      Results:
5369  *              Returns KERN_SUCCESS if no errors were encountered in
5370  *              reading or writing the data.  Returns KERN_INTERRUPTED if
5371  *              the operation was interrupted (only possible if the
5372  *              "interruptible" argument is asserted).  Other return values
5373  *              indicate a permanent error in copying the data.
5374  *
5375  *              The actual amount of data copied will be returned in the
5376  *              "copy_size" argument.  In the event that the destination map
5377  *              verification failed, this amount may be less than the amount
5378  *              requested.
5379  */
5380 kern_return_t
5381 vm_fault_copy(
5382         vm_object_t             src_object,
5383         vm_object_offset_t      src_offset,
5384         vm_map_size_t           *copy_size,             /* INOUT */
5385         vm_object_t             dst_object,
5386         vm_object_offset_t      dst_offset,
5387         vm_map_t                dst_map,
5388         vm_map_version_t         *dst_version,
5389         int                     interruptible)
5390 {
5391         vm_page_t               result_page;
5392
5393         vm_page_t               src_page;
5394         vm_page_t               src_top_page;
5395         vm_prot_t               src_prot;
5396
5397         vm_page_t               dst_page;
5398         vm_page_t               dst_top_page;
5399         vm_prot_t               dst_prot;
5400
5401         vm_map_size_t           amount_left;
5402         vm_object_t             old_copy_object;
5403         kern_return_t           error = 0;
5404         vm_fault_return_t       result;
5405
5406         vm_map_size_t           part_size;
5407         struct vm_object_fault_info fault_info_src;
5408         struct vm_object_fault_info fault_info_dst;
5409
5410         /*
5411          * In order not to confuse the clustered pageins, align
5412          * the different offsets on a page boundary.
5413          */
5414
5415 #define RETURN(x)                                       \
5416         MACRO_BEGIN                                     \
5417         *copy_size -= amount_left;                      \
5418         MACRO_RETURN(x);                                \
5419         MACRO_END
5420
5421         amount_left = *copy_size;
5422
5423         fault_info_src.interruptible = interruptible;
5424         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5425         fault_info_src.user_tag  = 0;
5426         fault_info_src.pmap_options = 0;
5427         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5428         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5429         fault_info_src.no_cache   = FALSE;
5430         fault_info_src.stealth = TRUE;
5431         fault_info_src.io_sync = FALSE;
5432         fault_info_src.cs_bypass = FALSE;
5433         fault_info_src.mark_zf_absent = FALSE;
5434         fault_info_src.batch_pmap_op = FALSE;
5435
5436         fault_info_dst.interruptible = interruptible;
5437         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5438         fault_info_dst.user_tag  = 0;
5439         fault_info_dst.pmap_options = 0;
5440         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5441         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5442         fault_info_dst.no_cache   = FALSE;
5443         fault_info_dst.stealth = TRUE;
5444         fault_info_dst.io_sync = FALSE;
5445         fault_info_dst.cs_bypass = FALSE;
5446         fault_info_dst.mark_zf_absent = FALSE;
5447         fault_info_dst.batch_pmap_op = FALSE;
5448
5449         do { /* while (amount_left > 0) */
5450                 /*
5451                  * There may be a deadlock if both source and destination
5452                  * pages are the same. To avoid this deadlock, the copy must
5453                  * start by getting the destination page in order to apply
5454                  * COW semantics if any.
5455                  */
5456
5457         RetryDestinationFault: ;
5458
5459                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5460
5461                 vm_object_lock(dst_object);
5462                 vm_object_paging_begin(dst_object);
5463
5464                 if (amount_left > (vm_size_t) -1) {
5465                         /* 32-bit overflow */
5466                         fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5467                 } else {
5468                         fault_info_dst.cluster_size = (vm_size_t) amount_left;
5469                         assert(fault_info_dst.cluster_size == amount_left);
5470                 }
5471
5472                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5473                 dst_page = VM_PAGE_NULL;
5474                 result = vm_fault_page(dst_object,
5475                                        vm_object_trunc_page(dst_offset),
5476                                        VM_PROT_WRITE|VM_PROT_READ,
5477                                        FALSE,
5478                                        FALSE, /* page not looked up */
5479                                        &dst_prot, &dst_page, &dst_top_page,
5480                                        (int *)0,
5481                                        &error,
5482                                        dst_map->no_zero_fill,
5483                                        FALSE, &fault_info_dst);
5484                 switch (result) {
5485                 case VM_FAULT_SUCCESS:
5486                         break;
5487                 case VM_FAULT_RETRY:
5488                         goto RetryDestinationFault;
5489                 case VM_FAULT_MEMORY_SHORTAGE:
5490                         if (vm_page_wait(interruptible))
5491                                 goto RetryDestinationFault;
5492                         /* fall thru */
5493                 case VM_FAULT_INTERRUPTED:
5494                         RETURN(MACH_SEND_INTERRUPTED);
5495                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5496                         /* success but no VM page: fail the copy */
5497                         vm_object_paging_end(dst_object);
5498                         vm_object_unlock(dst_object);
5499                         /*FALLTHROUGH*/
5500                 case VM_FAULT_MEMORY_ERROR:
5501                         if (error)
5502                                 return (error);
5503                         else
5504                                 return(KERN_MEMORY_ERROR);
5505                 default:
5506                         panic("vm_fault_copy: unexpected error 0x%x from "
5507                               "vm_fault_page()\n", result);
5508                 }
5509                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5510
5511                 old_copy_object = dst_page->object->copy;
5512
5513                 /*
5514                  * There exists the possibility that the source and
5515                  * destination page are the same.  But we can't
5516                  * easily determine that now.  If they are the
5517                  * same, the call to vm_fault_page() for the
5518                  * destination page will deadlock.  To prevent this we
5519                  * wire the page so we can drop busy without having
5520                  * the page daemon steal the page.  We clean up the
5521                  * top page  but keep the paging reference on the object
5522                  * holding the dest page so it doesn't go away.
5523                  */
5524
5525                 vm_page_lockspin_queues();
5526                 vm_page_wire(dst_page);
5527                 vm_page_unlock_queues();
5528                 PAGE_WAKEUP_DONE(dst_page);
5529                 vm_object_unlock(dst_page->object);
5530
5531                 if (dst_top_page != VM_PAGE_NULL) {
5532                         vm_object_lock(dst_object);
5533                         VM_PAGE_FREE(dst_top_page);
5534                         vm_object_paging_end(dst_object);
5535                         vm_object_unlock(dst_object);
5536                 }
5537
5538         RetrySourceFault: ;
5539
5540                 if (src_object == VM_OBJECT_NULL) {
5541                         /*
5542                          *      No source object.  We will just
5543                          *      zero-fill the page in dst_object.
5544                          */
5545                         src_page = VM_PAGE_NULL;
5546                         result_page = VM_PAGE_NULL;
5547                 } else {
5548                         vm_object_lock(src_object);
5549                         src_page = vm_page_lookup(src_object,
5550                                                   vm_object_trunc_page(src_offset));
5551                         if (src_page == dst_page) {
5552                                 src_prot = dst_prot;
5553                                 result_page = VM_PAGE_NULL;
5554                         } else {
5555                                 src_prot = VM_PROT_READ;
5556                                 vm_object_paging_begin(src_object);
5557
5558                                 if (amount_left > (vm_size_t) -1) {
5559                                         /* 32-bit overflow */
5560                                         fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
5561                                 } else {
5562                                         fault_info_src.cluster_size = (vm_size_t) amount_left;
5563                                         assert(fault_info_src.cluster_size == amount_left);
5564                                 }
5565
5566                                 XPR(XPR_VM_FAULT,
5567                                         "vm_fault_copy(2) -> vm_fault_page\n",
5568                                         0,0,0,0,0);
5569                                 result_page = VM_PAGE_NULL;
5570                                 result = vm_fault_page(
5571                                         src_object,
5572                                         vm_object_trunc_page(src_offset),
5573                                         VM_PROT_READ, FALSE,
5574                                         FALSE, /* page not looked up */
5575                                         &src_prot,
5576                                         &result_page, &src_top_page,
5577                                         (int *)0, &error, FALSE,
5578                                         FALSE, &fault_info_src);
5579
5580                                 switch (result) {
5581                                 case VM_FAULT_SUCCESS:
5582                                         break;
5583                                 case VM_FAULT_RETRY:
5584                                         goto RetrySourceFault;
5585                                 case VM_FAULT_MEMORY_SHORTAGE:
5586                                         if (vm_page_wait(interruptible))
5587                                                 goto RetrySourceFault;
5588                                         /* fall thru */
5589                                 case VM_FAULT_INTERRUPTED:
5590                                         vm_fault_copy_dst_cleanup(dst_page);
5591                                         RETURN(MACH_SEND_INTERRUPTED);
5592                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5593                                         /* success but no VM page: fail */
5594                                         vm_object_paging_end(src_object);
5595                                         vm_object_unlock(src_object);
5596                                         /*FALLTHROUGH*/
5597                                 case VM_FAULT_MEMORY_ERROR:
5598                                         vm_fault_copy_dst_cleanup(dst_page);
5599                                         if (error)
5600                                                 return (error);
5601                                         else
5602                                                 return(KERN_MEMORY_ERROR);
5603                                 default:
5604                                         panic("vm_fault_copy(2): unexpected "
5605                                               "error 0x%x from "
5606                                               "vm_fault_page()\n", result);
5607                                 }
5608
5609
5610                                 assert((src_top_page == VM_PAGE_NULL) ==
5611                                        (result_page->object == src_object));
5612                         }
5613                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5614                         vm_object_unlock(result_page->object);
5615                 }
5616
5617                 if (!vm_map_verify(dst_map, dst_version)) {
5618                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5619                                 vm_fault_copy_cleanup(result_page, src_top_page);
5620                         vm_fault_copy_dst_cleanup(dst_page);
5621                         break;
5622                 }
5623
5624                 vm_object_lock(dst_page->object);
5625
5626                 if (dst_page->object->copy != old_copy_object) {
5627                         vm_object_unlock(dst_page->object);
5628                         vm_map_verify_done(dst_map, dst_version);
5629                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5630                                 vm_fault_copy_cleanup(result_page, src_top_page);
5631                         vm_fault_copy_dst_cleanup(dst_page);
5632                         break;
5633                 }
5634                 vm_object_unlock(dst_page->object);
5635
5636                 /*
5637                  *      Copy the page, and note that it is dirty
5638                  *      immediately.
5639                  */
5640
5641                 if (!page_aligned(src_offset) ||
5642                         !page_aligned(dst_offset) ||
5643                         !page_aligned(amount_left)) {
5644
5645                         vm_object_offset_t      src_po,
5646                                                 dst_po;
5647
5648                         src_po = src_offset - vm_object_trunc_page(src_offset);
5649                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5650
5651                         if (dst_po > src_po) {
5652                                 part_size = PAGE_SIZE - dst_po;
5653                         } else {
5654                                 part_size = PAGE_SIZE - src_po;
5655                         }
5656                         if (part_size > (amount_left)){
5657                                 part_size = amount_left;
5658                         }
5659
5660                         if (result_page == VM_PAGE_NULL) {
5661                                 assert((vm_offset_t) dst_po == dst_po);
5662                                 assert((vm_size_t) part_size == part_size);
5663                                 vm_page_part_zero_fill(dst_page,
5664                                                        (vm_offset_t) dst_po,
5665                                                        (vm_size_t) part_size);
5666                         } else {
5667                                 assert((vm_offset_t) src_po == src_po);
5668                                 assert((vm_offset_t) dst_po == dst_po);
5669                                 assert((vm_size_t) part_size == part_size);
5670                                 vm_page_part_copy(result_page,
5671                                                   (vm_offset_t) src_po,
5672                                                   dst_page,
5673                                                   (vm_offset_t) dst_po,
5674                                                   (vm_size_t)part_size);
5675                                 if(!dst_page->dirty){
5676                                         vm_object_lock(dst_object);
5677                                         SET_PAGE_DIRTY(dst_page, TRUE);
5678                                         vm_object_unlock(dst_page->object);
5679                                 }
5680
5681                         }
5682                 } else {
5683                         part_size = PAGE_SIZE;
5684
5685                         if (result_page == VM_PAGE_NULL)
5686                                 vm_page_zero_fill(dst_page);
5687                         else{
5688                                 vm_object_lock(result_page->object);
5689                                 vm_page_copy(result_page, dst_page);
5690                                 vm_object_unlock(result_page->object);
5691
5692                                 if(!dst_page->dirty){
5693                                         vm_object_lock(dst_object);
5694                                         SET_PAGE_DIRTY(dst_page, TRUE);
5695                                         vm_object_unlock(dst_page->object);
5696                                 }
5697                         }
5698
5699                 }
5700
5701                 /*
5702                  *      Unlock everything, and return
5703                  */
5704
5705                 vm_map_verify_done(dst_map, dst_version);
5706
5707                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5708                         vm_fault_copy_cleanup(result_page, src_top_page);
5709                 vm_fault_copy_dst_cleanup(dst_page);
5710
5711                 amount_left -= part_size;
5712                 src_offset += part_size;
5713                 dst_offset += part_size;
5714         } while (amount_left > 0);
5715
5716         RETURN(KERN_SUCCESS);
5717 #undef  RETURN
5718
5719         /*NOTREACHED*/
5720 }
5721
5722 #if     VM_FAULT_CLASSIFY
5723 /*
5724  *      Temporary statistics gathering support.
5725  */
5726
5727 /*
5728  *      Statistics arrays:
5729  */
5730 #define VM_FAULT_TYPES_MAX      5
5731 #define VM_FAULT_LEVEL_MAX      8
5732
5733 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5734
5735 #define VM_FAULT_TYPE_ZERO_FILL 0
5736 #define VM_FAULT_TYPE_MAP_IN    1
5737 #define VM_FAULT_TYPE_PAGER     2
5738 #define VM_FAULT_TYPE_COPY      3
5739 #define VM_FAULT_TYPE_OTHER     4
5740
5741
5742 void
5743 vm_fault_classify(vm_object_t           object,
5744                   vm_object_offset_t    offset,
5745                   vm_prot_t             fault_type)
5746 {
5747         int             type, level = 0;
5748         vm_page_t       m;
5749
5750         while (TRUE) {
5751                 m = vm_page_lookup(object, offset);
5752                 if (m != VM_PAGE_NULL) {
5753                         if (m->busy || m->error || m->restart || m->absent) {
5754                                 type = VM_FAULT_TYPE_OTHER;
5755                                 break;
5756                         }
5757                         if (((fault_type & VM_PROT_WRITE) == 0) ||
5758                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
5759                                 type = VM_FAULT_TYPE_MAP_IN;
5760                                 break;
5761                         }
5762                         type = VM_FAULT_TYPE_COPY;
5763                         break;
5764                 }
5765                 else {
5766                         if (object->pager_created) {
5767                                 type = VM_FAULT_TYPE_PAGER;
5768                                 break;
5769                         }
5770                         if (object->shadow == VM_OBJECT_NULL) {
5771                                 type = VM_FAULT_TYPE_ZERO_FILL;
5772                                 break;
5773                         }
5774
5775                         offset += object->vo_shadow_offset;
5776                         object = object->shadow;
5777                         level++;
5778                         continue;
5779                 }
5780         }
5781
5782         if (level > VM_FAULT_LEVEL_MAX)
5783                 level = VM_FAULT_LEVEL_MAX;
5784
5785         vm_fault_stats[type][level] += 1;
5786
5787         return;
5788 }
5789
5790 /* cleanup routine to call from debugger */
5791
5792 void
5793 vm_fault_classify_init(void)
5794 {
5795         int type, level;
5796
5797         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
5798                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
5799                         vm_fault_stats[type][level] = 0;
5800                 }
5801         }
5802
5803         return;
5804 }
5805 #endif  /* VM_FAULT_CLASSIFY */
5806
5807
5808 void
5809 vm_page_validate_cs_mapped(
5810         vm_page_t       page,
5811         const void      *kaddr)
5812 {
5813         vm_object_t             object;
5814         vm_object_offset_t      offset;
5815         kern_return_t           kr;
5816         memory_object_t         pager;
5817         void                    *blobs;
5818         boolean_t               validated;
5819         unsigned                        tainted;
5820
5821         assert(page->busy);
5822         vm_object_lock_assert_exclusive(page->object);
5823
5824         if (!cs_validation) {
5825                 return;
5826         }
5827
5828         if (page->wpmapped && !page->cs_tainted) {
5829                 /*
5830                  * This page was mapped for "write" access sometime in the
5831                  * past and could still be modifiable in the future.
5832                  * Consider it tainted.
5833                  * [ If the page was already found to be "tainted", no
5834                  * need to re-validate. ]
5835                  */
5836                 page->cs_validated = TRUE;
5837                 page->cs_tainted = TRUE;
5838                 if (cs_debug) {
5839                         printf("CODESIGNING: vm_page_validate_cs: "
5840                                "page %p obj %p off 0x%llx "
5841                                "was modified\n",
5842                                page, page->object, page->offset);
5843                 }
5844                 vm_cs_validated_dirtied++;
5845         }
5846
5847         if (page->cs_validated) {
5848                 return;
5849         }
5850
5851         vm_cs_validates++;
5852
5853         object = page->object;
5854         assert(object->code_signed);
5855         offset = page->offset;
5856
5857         if (!object->alive || object->terminating || object->pager == NULL) {
5858                 /*
5859                  * The object is terminating and we don't have its pager
5860                  * so we can't validate the data...
5861                  */
5862                 return;
5863         }
5864         /*
5865          * Since we get here to validate a page that was brought in by
5866          * the pager, we know that this pager is all setup and ready
5867          * by now.
5868          */
5869         assert(!object->internal);
5870         assert(object->pager != NULL);
5871         assert(object->pager_ready);
5872
5873         pager = object->pager;
5874         assert(object->paging_in_progress);
5875         kr = vnode_pager_get_object_cs_blobs(pager, &blobs);
5876         if (kr != KERN_SUCCESS) {
5877                 blobs = NULL;
5878         }
5879
5880         /* verify the SHA1 hash for this page */
5881         tainted = 0;
5882         validated = cs_validate_page(blobs,
5883                                      pager,
5884                                      offset + object->paging_offset,
5885                                      (const void *)kaddr,
5886                                      &tainted);
5887
5888         page->cs_validated = validated;
5889         if (validated) {
5890                 page->cs_tainted = !!(tainted & CS_VALIDATE_TAINTED);
5891                 page->cs_nx = !!(tainted & CS_VALIDATE_NX);
5892         }
5893 }
5894
5895 void
5896 vm_page_validate_cs(
5897         vm_page_t       page)
5898 {
5899         vm_object_t             object;
5900         vm_object_offset_t      offset;
5901         vm_map_offset_t         koffset;
5902         vm_map_size_t           ksize;
5903         vm_offset_t             kaddr;
5904         kern_return_t           kr;
5905         boolean_t               busy_page;
5906         boolean_t               need_unmap;
5907
5908         vm_object_lock_assert_held(page->object);
5909
5910         if (!cs_validation) {
5911                 return;
5912         }
5913
5914         if (page->wpmapped && !page->cs_tainted) {
5915                 vm_object_lock_assert_exclusive(page->object);
5916
5917                 /*
5918                  * This page was mapped for "write" access sometime in the
5919                  * past and could still be modifiable in the future.
5920                  * Consider it tainted.
5921                  * [ If the page was already found to be "tainted", no
5922                  * need to re-validate. ]
5923                  */
5924                 page->cs_validated = TRUE;
5925                 page->cs_tainted = TRUE;
5926                 if (cs_debug) {
5927                         printf("CODESIGNING: vm_page_validate_cs: "
5928                                "page %p obj %p off 0x%llx "
5929                                "was modified\n",
5930                                page, page->object, page->offset);
5931                 }
5932                 vm_cs_validated_dirtied++;
5933         }
5934
5935         if (page->cs_validated) {
5936                 return;
5937         }
5938
5939         if (page->slid) {
5940                 panic("vm_page_validate_cs(%p): page is slid\n", page);
5941         }
5942         assert(!page->slid);
5943
5944 #if CHECK_CS_VALIDATION_BITMAP
5945         if ( vnode_pager_cs_check_validation_bitmap( page->object->pager, trunc_page(page->offset + page->object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
5946                 page->cs_validated = TRUE;
5947                 page->cs_tainted = FALSE;
5948                 vm_cs_bitmap_validated++;
5949                 return;
5950         }
5951 #endif
5952         vm_object_lock_assert_exclusive(page->object);
5953
5954         object = page->object;
5955         assert(object->code_signed);
5956         offset = page->offset;
5957
5958         busy_page = page->busy;
5959         if (!busy_page) {
5960                 /* keep page busy while we map (and unlock) the VM object */
5961                 page->busy = TRUE;
5962         }
5963
5964         /*
5965          * Take a paging reference on the VM object
5966          * to protect it from collapse or bypass,
5967          * and keep it from disappearing too.
5968          */
5969         vm_object_paging_begin(object);
5970
5971         /* map the page in the kernel address space */
5972         ksize = PAGE_SIZE_64;
5973         koffset = 0;
5974         need_unmap = FALSE;
5975         kr = vm_paging_map_object(page,
5976                                   object,
5977                                   offset,
5978                                   VM_PROT_READ,
5979                                   FALSE, /* can't unlock object ! */
5980                                   &ksize,
5981                                   &koffset,
5982                                   &need_unmap);
5983         if (kr != KERN_SUCCESS) {
5984                 panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
5985         }
5986         kaddr = CAST_DOWN(vm_offset_t, koffset);
5987
5988         /* validate the mapped page */
5989         vm_page_validate_cs_mapped(page, (const void *) kaddr);
5990
5991 #if CHECK_CS_VALIDATION_BITMAP
5992         if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
5993                 vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
5994         }
5995 #endif
5996         assert(page->busy);
5997         assert(object == page->object);
5998         vm_object_lock_assert_exclusive(object);
5999
6000         if (!busy_page) {
6001                 PAGE_WAKEUP_DONE(page);
6002         }
6003         if (need_unmap) {
6004                 /* unmap the map from the kernel address space */
6005                 vm_paging_unmap_object(object, koffset, koffset + ksize);
6006                 koffset = 0;
6007                 ksize = 0;
6008                 kaddr = 0;
6009         }
6010         vm_object_paging_end(object);
6011 }