osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75                                 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90 #include <kern/policy_internal.h>
  91
  92 #include <vm/vm_compressor.h>
  93 #include <vm/vm_compressor_pager.h>
  94 #include <vm/vm_fault.h>
  95 #include <vm/vm_map.h>
  96 #include <vm/vm_object.h>
  97 #include <vm/vm_page.h>
  98 #include <vm/vm_kern.h>
  99 #include <vm/pmap.h>
 100 #include <vm/vm_pageout.h>
 101 #include <vm/vm_protos.h>
 102 #include <vm/vm_external.h>
 103 #include <vm/memory_object.h>
 104 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 105 #include <vm/vm_shared_region.h>
 106
 107 #include <sys/codesign.h>
 108 #include <sys/reason.h>
 109 #include <sys/signalvar.h>
 110
 111 #include <san/kasan.h>
 112
 113 #define VM_FAULT_CLASSIFY       0
 114
 115 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 116
 117 unsigned int    vm_object_pagein_throttle = 16;
 118
 119 /*
 120  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 121  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 122  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 123  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 124  * keep the UI active so that the user has a chance to kill the offending task before the system
 125  * completely hangs.
 126  *
 127  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 128  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 129  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 130  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 131  */
 132
 133 extern void throttle_lowpri_io(int);
 134
 135 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 136
 137 uint64_t vm_hard_throttle_threshold;
 138
 139
 140
 141 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
 142                                                  ((vm_page_free_count < vm_page_throttle_limit || \
 143                                                    HARD_THROTTLE_LIMIT_REACHED()) && \
 144                                                   proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
 145
 146
 147 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 148 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 149
 150 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 151 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 152
 153
 154 boolean_t current_thread_aborted(void);
 155
 156 /* Forward declarations of internal routines. */
 157 static kern_return_t vm_fault_wire_fast(
 158                                 vm_map_t        map,
 159                                 vm_map_offset_t va,
 160                                 vm_prot_t       prot,
 161                                 vm_tag_t        wire_tag,
 162                                 vm_map_entry_t  entry,
 163                                 pmap_t          pmap,
 164                                 vm_map_offset_t pmap_addr,
 165                                 ppnum_t         *physpage_p);
 166
 167 static kern_return_t vm_fault_internal(
 168                 vm_map_t        map,
 169                 vm_map_offset_t vaddr,
 170                 vm_prot_t       caller_prot,
 171                 boolean_t       change_wiring,
 172                 vm_tag_t        wire_tag,
 173                 int             interruptible,
 174                 pmap_t          pmap,
 175                 vm_map_offset_t pmap_addr,
 176                 ppnum_t         *physpage_p);
 177
 178 static void vm_fault_copy_cleanup(
 179                                 vm_page_t       page,
 180                                 vm_page_t       top_page);
 181
 182 static void vm_fault_copy_dst_cleanup(
 183                                 vm_page_t       page);
 184
 185 #if     VM_FAULT_CLASSIFY
 186 extern void vm_fault_classify(vm_object_t       object,
 187                           vm_object_offset_t    offset,
 188                           vm_prot_t             fault_type);
 189
 190 extern void vm_fault_classify_init(void);
 191 #endif
 192
 193 unsigned long vm_pmap_enter_blocked = 0;
 194 unsigned long vm_pmap_enter_retried = 0;
 195
 196 unsigned long vm_cs_validates = 0;
 197 unsigned long vm_cs_revalidates = 0;
 198 unsigned long vm_cs_query_modified = 0;
 199 unsigned long vm_cs_validated_dirtied = 0;
 200 unsigned long vm_cs_bitmap_validated = 0;
 201 #if PMAP_CS
 202 uint64_t vm_cs_defer_to_pmap_cs = 0;
 203 uint64_t vm_cs_defer_to_pmap_cs_not = 0;
 204 #endif /* PMAP_CS */
 205
 206 void vm_pre_fault(vm_map_offset_t);
 207
 208 extern char *kdp_compressor_decompressed_page;
 209 extern addr64_t kdp_compressor_decompressed_page_paddr;
 210 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
 211
 212 struct vmrtfr {
 213         int vmrtfr_maxi;
 214         int vmrtfr_curi;
 215         int64_t vmrtf_total;
 216         vm_rtfault_record_t *vm_rtf_records;
 217 } vmrtfrs;
 218 #define VMRTF_DEFAULT_BUFSIZE (4096)
 219 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
 220 int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT;
 221
 222 static void vm_rtfrecord_lock(void);
 223 static void vm_rtfrecord_unlock(void);
 224 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
 225
 226 lck_spin_t vm_rtfr_slock;
 227 extern lck_grp_t vm_page_lck_grp_bucket;
 228 extern lck_attr_t vm_page_lck_attr;
 229
 230 /*
 231  *      Routine:        vm_fault_init
 232  *      Purpose:
 233  *              Initialize our private data structures.
 234  */
 235 void
 236 vm_fault_init(void)
 237 {
 238         int i, vm_compressor_temp;
 239         boolean_t need_default_val = TRUE;
 240         /*
 241          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 242          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 243          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 244          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 245          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 246          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 247          */
 248
 249         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
 250
 251         /*
 252          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 253          */
 254
 255         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
 256                 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
 257                         if (vm_compressor_temp > 0 &&
 258                             ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
 259                                 need_default_val = FALSE;
 260                                 vm_compressor_mode = vm_compressor_temp;
 261                                 break;
 262                         }
 263                 }
 264                 if (need_default_val)
 265                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 266         }
 267         if (need_default_val) {
 268                 /* If no boot arg or incorrect boot arg, try device tree. */
 269                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 270         }
 271         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 272 }
 273
 274 void vm_rtfault_record_init(void) {
 275         PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records));
 276
 277         assert(vmrtf_num_records >= 1);
 278         vmrtf_num_records = MAX(vmrtf_num_records, 1);
 279         size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t);
 280         vmrtfrs.vm_rtf_records = kalloc(kallocsz);
 281         bzero(vmrtfrs.vm_rtf_records, kallocsz);
 282         vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
 283         lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 284 }
 285 /*
 286  *      Routine:        vm_fault_cleanup
 287  *      Purpose:
 288  *              Clean up the result of vm_fault_page.
 289  *      Results:
 290  *              The paging reference for "object" is released.
 291  *              "object" is unlocked.
 292  *              If "top_page" is not null,  "top_page" is
 293  *              freed and the paging reference for the object
 294  *              containing it is released.
 295  *
 296  *      In/out conditions:
 297  *              "object" must be locked.
 298  */
 299 void
 300 vm_fault_cleanup(
 301         vm_object_t     object,
 302         vm_page_t       top_page)
 303 {
 304         vm_object_paging_end(object);
 305         vm_object_unlock(object);
 306
 307         if (top_page != VM_PAGE_NULL) {
 308                 object = VM_PAGE_OBJECT(top_page);
 309
 310                 vm_object_lock(object);
 311                 VM_PAGE_FREE(top_page);
 312                 vm_object_paging_end(object);
 313                 vm_object_unlock(object);
 314         }
 315 }
 316
 317 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 318
 319
 320 boolean_t       vm_page_deactivate_behind = TRUE;
 321 /*
 322  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 323  */
 324 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 325 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 326                                                                 /* we use it to size an array on the stack */
 327
 328 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 329
 330 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 331
 332 /*
 333  * vm_page_is_sequential
 334  *
 335  * Determine if sequential access is in progress
 336  * in accordance with the behavior specified.
 337  * Update state to indicate current access pattern.
 338  *
 339  * object must have at least the shared lock held
 340  */
 341 static
 342 void
 343 vm_fault_is_sequential(
 344         vm_object_t             object,
 345         vm_object_offset_t      offset,
 346         vm_behavior_t           behavior)
 347 {
 348         vm_object_offset_t      last_alloc;
 349         int                     sequential;
 350         int                     orig_sequential;
 351
 352         last_alloc = object->last_alloc;
 353         sequential = object->sequential;
 354         orig_sequential = sequential;
 355
 356         switch (behavior) {
 357         case VM_BEHAVIOR_RANDOM:
 358                 /*
 359                  * reset indicator of sequential behavior
 360                  */
 361                 sequential = 0;
 362                 break;
 363
 364         case VM_BEHAVIOR_SEQUENTIAL:
 365                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 366                         /*
 367                          * advance indicator of sequential behavior
 368                          */
 369                         if (sequential < MAX_SEQUENTIAL_RUN)
 370                                 sequential += PAGE_SIZE;
 371                 } else {
 372                         /*
 373                          * reset indicator of sequential behavior
 374                          */
 375                         sequential = 0;
 376                 }
 377                 break;
 378
 379         case VM_BEHAVIOR_RSEQNTL:
 380                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 381                         /*
 382                          * advance indicator of sequential behavior
 383                          */
 384                         if (sequential > -MAX_SEQUENTIAL_RUN)
 385                                 sequential -= PAGE_SIZE;
 386                 } else {
 387                         /*
 388                          * reset indicator of sequential behavior
 389                          */
 390                         sequential = 0;
 391                 }
 392                 break;
 393
 394         case VM_BEHAVIOR_DEFAULT:
 395         default:
 396                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 397                         /*
 398                          * advance indicator of sequential behavior
 399                          */
 400                         if (sequential < 0)
 401                                 sequential = 0;
 402                         if (sequential < MAX_SEQUENTIAL_RUN)
 403                                 sequential += PAGE_SIZE;
 404
 405                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 406                         /*
 407                          * advance indicator of sequential behavior
 408                          */
 409                         if (sequential > 0)
 410                                 sequential = 0;
 411                         if (sequential > -MAX_SEQUENTIAL_RUN)
 412                                 sequential -= PAGE_SIZE;
 413                 } else {
 414                         /*
 415                          * reset indicator of sequential behavior
 416                          */
 417                         sequential = 0;
 418                 }
 419                 break;
 420         }
 421         if (sequential != orig_sequential) {
 422                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 423                         /*
 424                          * if someone else has already updated object->sequential
 425                          * don't bother trying to update it or object->last_alloc
 426                          */
 427                         return;
 428                 }
 429         }
 430         /*
 431          * I'd like to do this with a OSCompareAndSwap64, but that
 432          * doesn't exist for PPC...  however, it shouldn't matter
 433          * that much... last_alloc is maintained so that we can determine
 434          * if a sequential access pattern is taking place... if only
 435          * one thread is banging on this object, no problem with the unprotected
 436          * update... if 2 or more threads are banging away, we run the risk of
 437          * someone seeing a mangled update... however, in the face of multiple
 438          * accesses, no sequential access pattern can develop anyway, so we
 439          * haven't lost any real info.
 440          */
 441         object->last_alloc = offset;
 442 }
 443
 444
 445 int vm_page_deactivate_behind_count = 0;
 446
 447 /*
 448  * vm_page_deactivate_behind
 449  *
 450  * Determine if sequential access is in progress
 451  * in accordance with the behavior specified.  If
 452  * so, compute a potential page to deactivate and
 453  * deactivate it.
 454  *
 455  * object must be locked.
 456  *
 457  * return TRUE if we actually deactivate a page
 458  */
 459 static
 460 boolean_t
 461 vm_fault_deactivate_behind(
 462         vm_object_t             object,
 463         vm_object_offset_t      offset,
 464         vm_behavior_t           behavior)
 465 {
 466         int             n;
 467         int             pages_in_run = 0;
 468         int             max_pages_in_run = 0;
 469         int             sequential_run;
 470         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 471         vm_object_offset_t      run_offset = 0;
 472         vm_object_offset_t      pg_offset = 0;
 473         vm_page_t       m;
 474         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 475
 476         pages_in_run = 0;
 477 #if TRACEFAULTPAGE
 478         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 479 #endif
 480
 481         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 482                 /*
 483                  * Do not deactivate pages from the kernel object: they
 484                  * are not intended to become pageable.
 485                  * or we've disabled the deactivate behind mechanism
 486                  */
 487                 return FALSE;
 488         }
 489         if ((sequential_run = object->sequential)) {
 490                   if (sequential_run < 0) {
 491                           sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 492                           sequential_run = 0 - sequential_run;
 493                   } else {
 494                           sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 495                   }
 496         }
 497         switch (behavior) {
 498         case VM_BEHAVIOR_RANDOM:
 499                 break;
 500         case VM_BEHAVIOR_SEQUENTIAL:
 501                 if (sequential_run >= (int)PAGE_SIZE) {
 502                         run_offset = 0 - PAGE_SIZE_64;
 503                         max_pages_in_run = 1;
 504                 }
 505                 break;
 506         case VM_BEHAVIOR_RSEQNTL:
 507                 if (sequential_run >= (int)PAGE_SIZE) {
 508                         run_offset = PAGE_SIZE_64;
 509                         max_pages_in_run = 1;
 510                 }
 511                 break;
 512         case VM_BEHAVIOR_DEFAULT:
 513         default:
 514         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 515
 516                 /*
 517                  * determine if the run of sequential accesss has been
 518                  * long enough on an object with default access behavior
 519                  * to consider it for deactivation
 520                  */
 521                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 522                         /*
 523                          * the comparisons between offset and behind are done
 524                          * in this kind of odd fashion in order to prevent wrap around
 525                          * at the end points
 526                          */
 527                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 528                                 if (offset >= behind) {
 529                                         run_offset = 0 - behind;
 530                                         pg_offset = PAGE_SIZE_64;
 531                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 532                                 }
 533                         } else {
 534                                 if (offset < -behind) {
 535                                         run_offset = behind;
 536                                         pg_offset = 0 - PAGE_SIZE_64;
 537                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 538                                 }
 539                         }
 540                 }
 541                 break;
 542         }
 543         }
 544         for (n = 0; n < max_pages_in_run; n++) {
 545                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 546
 547                 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
 548                         page_run[pages_in_run++] = m;
 549
 550                         /*
 551                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 552                          *
 553                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 554                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 555                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 556                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 557                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 558                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 559                          * have happened before we did the deactivate_behind.
 560                          */
 561                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 562                 }
 563         }
 564         if (pages_in_run) {
 565                 vm_page_lockspin_queues();
 566
 567                 for (n = 0; n < pages_in_run; n++) {
 568
 569                         m = page_run[n];
 570
 571                         vm_page_deactivate_internal(m, FALSE);
 572
 573                         vm_page_deactivate_behind_count++;
 574 #if TRACEFAULTPAGE
 575                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 576 #endif
 577                 }
 578                 vm_page_unlock_queues();
 579
 580                 return TRUE;
 581         }
 582         return FALSE;
 583 }
 584
 585
 586 #if (DEVELOPMENT || DEBUG)
 587 uint32_t        vm_page_creation_throttled_hard = 0;
 588 uint32_t        vm_page_creation_throttled_soft = 0;
 589 uint64_t        vm_page_creation_throttle_avoided = 0;
 590 #endif /* DEVELOPMENT || DEBUG */
 591
 592 static int
 593 vm_page_throttled(boolean_t page_kept)
 594 {
 595         clock_sec_t     elapsed_sec;
 596         clock_sec_t     tv_sec;
 597         clock_usec_t    tv_usec;
 598
 599         thread_t thread = current_thread();
 600
 601         if (thread->options & TH_OPT_VMPRIV)
 602                 return (0);
 603
 604         if (thread->t_page_creation_throttled) {
 605                 thread->t_page_creation_throttled = 0;
 606
 607                 if (page_kept == FALSE)
 608                         goto no_throttle;
 609         }
 610         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 611 #if (DEVELOPMENT || DEBUG)
 612                 thread->t_page_creation_throttled_hard++;
 613                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 614 #endif /* DEVELOPMENT || DEBUG */
 615                 return (HARD_THROTTLE_DELAY);
 616         }
 617
 618         if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 619             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 620
 621                 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
 622 #if (DEVELOPMENT || DEBUG)
 623                         OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
 624 #endif
 625                         goto no_throttle;
 626                 }
 627                 clock_get_system_microtime(&tv_sec, &tv_usec);
 628
 629                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 630
 631                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 632                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 633
 634                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 635                                 /*
 636                                  * we'll reset our stats to give a well behaved app
 637                                  * that was unlucky enough to accumulate a bunch of pages
 638                                  * over a long period of time a chance to get out of
 639                                  * the throttled state... we reset the counter and timestamp
 640                                  * so that if it stays under the rate limit for the next second
 641                                  * it will be back in our good graces... if it exceeds it, it
 642                                  * will remain in the throttled state
 643                                  */
 644                                 thread->t_page_creation_time = tv_sec;
 645                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 646                         }
 647                         VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
 648
 649                         thread->t_page_creation_throttled = 1;
 650
 651                         if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
 652 #if (DEVELOPMENT || DEBUG)
 653                                 thread->t_page_creation_throttled_hard++;
 654                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 655 #endif /* DEVELOPMENT || DEBUG */
 656                                 return (HARD_THROTTLE_DELAY);
 657                         } else {
 658 #if (DEVELOPMENT || DEBUG)
 659                                 thread->t_page_creation_throttled_soft++;
 660                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 661 #endif /* DEVELOPMENT || DEBUG */
 662                                 return (SOFT_THROTTLE_DELAY);
 663                         }
 664                 }
 665                 thread->t_page_creation_time = tv_sec;
 666                 thread->t_page_creation_count = 0;
 667         }
 668 no_throttle:
 669         thread->t_page_creation_count++;
 670
 671         return (0);
 672 }
 673
 674
 675 /*
 676  * check for various conditions that would
 677  * prevent us from creating a ZF page...
 678  * cleanup is based on being called from vm_fault_page
 679  *
 680  * object must be locked
 681  * object == m->vmp_object
 682  */
 683 static vm_fault_return_t
 684 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
 685 {
 686         int throttle_delay;
 687
 688         if (object->shadow_severed ||
 689             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 690                 /*
 691                  * Either:
 692                  * 1. the shadow chain was severed,
 693                  * 2. the purgeable object is volatile or empty and is marked
 694                  *    to fault on access while volatile.
 695                  * Just have to return an error at this point
 696                  */
 697                 if (m != VM_PAGE_NULL)
 698                         VM_PAGE_FREE(m);
 699                 vm_fault_cleanup(object, first_m);
 700
 701                 thread_interrupt_level(interruptible_state);
 702
 703                 return (VM_FAULT_MEMORY_ERROR);
 704         }
 705         if (page_throttle == TRUE) {
 706                 if ((throttle_delay = vm_page_throttled(FALSE))) {
 707                         /*
 708                          * we're throttling zero-fills...
 709                          * treat this as if we couldn't grab a page
 710                          */
 711                         if (m != VM_PAGE_NULL)
 712                                 VM_PAGE_FREE(m);
 713                         vm_fault_cleanup(object, first_m);
 714
 715                         VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 716
 717                         delay(throttle_delay);
 718
 719                         if (current_thread_aborted()) {
 720                                 thread_interrupt_level(interruptible_state);
 721                                 return VM_FAULT_INTERRUPTED;
 722                         }
 723                         thread_interrupt_level(interruptible_state);
 724
 725                         return (VM_FAULT_MEMORY_SHORTAGE);
 726                 }
 727         }
 728         return (VM_FAULT_SUCCESS);
 729 }
 730
 731
 732 /*
 733  * do the work to zero fill a page and
 734  * inject it into the correct paging queue
 735  *
 736  * m->vmp_object must be locked
 737  * page queue lock must NOT be held
 738  */
 739 static int
 740 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 741 {
 742         int my_fault = DBG_ZERO_FILL_FAULT;
 743         vm_object_t     object;
 744
 745         object = VM_PAGE_OBJECT(m);
 746
 747         /*
 748          * This is is a zero-fill page fault...
 749          *
 750          * Checking the page lock is a waste of
 751          * time;  this page was absent, so
 752          * it can't be page locked by a pager.
 753          *
 754          * we also consider it undefined
 755          * with respect to instruction
 756          * execution.  i.e. it is the responsibility
 757          * of higher layers to call for an instruction
 758          * sync after changing the contents and before
 759          * sending a program into this area.  We
 760          * choose this approach for performance
 761          */
 762         m->vmp_pmapped = TRUE;
 763
 764         m->vmp_cs_validated = FALSE;
 765         m->vmp_cs_tainted = FALSE;
 766         m->vmp_cs_nx = FALSE;
 767
 768         if (no_zero_fill == TRUE) {
 769                 my_fault = DBG_NZF_PAGE_FAULT;
 770
 771                 if (m->vmp_absent && m->vmp_busy)
 772                         return (my_fault);
 773         } else {
 774                 vm_page_zero_fill(m);
 775
 776                 VM_STAT_INCR(zero_fill_count);
 777                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 778         }
 779         assert(!m->vmp_laundry);
 780         assert(object != kernel_object);
 781         //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
 782
 783         if (!VM_DYNAMIC_PAGING_ENABLED() &&
 784                 (object->purgable == VM_PURGABLE_DENY ||
 785                  object->purgable == VM_PURGABLE_NONVOLATILE ||
 786                  object->purgable == VM_PURGABLE_VOLATILE )) {
 787
 788                 vm_page_lockspin_queues();
 789
 790                 if (!VM_DYNAMIC_PAGING_ENABLED()) {
 791                         assert(!VM_PAGE_WIRED(m));
 792
 793                         /*
 794                          * can't be on the pageout queue since we don't
 795                          * have a pager to try and clean to
 796                          */
 797                         vm_page_queues_remove(m, TRUE);
 798                         vm_page_check_pageable_safe(m);
 799                         vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq);
 800                         m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
 801                         vm_page_throttled_count++;
 802                 }
 803                 vm_page_unlock_queues();
 804         }
 805         return (my_fault);
 806 }
 807
 808
 809 /*
 810  *      Routine:        vm_fault_page
 811  *      Purpose:
 812  *              Find the resident page for the virtual memory
 813  *              specified by the given virtual memory object
 814  *              and offset.
 815  *      Additional arguments:
 816  *              The required permissions for the page is given
 817  *              in "fault_type".  Desired permissions are included
 818  *              in "protection".
 819  *              fault_info is passed along to determine pagein cluster
 820  *              limits... it contains the expected reference pattern,
 821  *              cluster size if available, etc...
 822  *
 823  *              If the desired page is known to be resident (for
 824  *              example, because it was previously wired down), asserting
 825  *              the "unwiring" parameter will speed the search.
 826  *
 827  *              If the operation can be interrupted (by thread_abort
 828  *              or thread_terminate), then the "interruptible"
 829  *              parameter should be asserted.
 830  *
 831  *      Results:
 832  *              The page containing the proper data is returned
 833  *              in "result_page".
 834  *
 835  *      In/out conditions:
 836  *              The source object must be locked and referenced,
 837  *              and must donate one paging reference.  The reference
 838  *              is not affected.  The paging reference and lock are
 839  *              consumed.
 840  *
 841  *              If the call succeeds, the object in which "result_page"
 842  *              resides is left locked and holding a paging reference.
 843  *              If this is not the original object, a busy page in the
 844  *              original object is returned in "top_page", to prevent other
 845  *              callers from pursuing this same data, along with a paging
 846  *              reference for the original object.  The "top_page" should
 847  *              be destroyed when this guarantee is no longer required.
 848  *              The "result_page" is also left busy.  It is not removed
 849  *              from the pageout queues.
 850  *      Special Case:
 851  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 852  *              fault succeeded but there's no VM page (i.e. the VM object
 853  *              does not actually hold VM pages, but device memory or
 854  *              large pages).  The object is still locked and we still hold a
 855  *              paging_in_progress reference.
 856  */
 857 unsigned int vm_fault_page_blocked_access = 0;
 858 unsigned int vm_fault_page_forced_retry = 0;
 859
 860 vm_fault_return_t
 861 vm_fault_page(
 862         /* Arguments: */
 863         vm_object_t     first_object,   /* Object to begin search */
 864         vm_object_offset_t first_offset,        /* Offset into object */
 865         vm_prot_t       fault_type,     /* What access is requested */
 866         boolean_t       must_be_resident,/* Must page be resident? */
 867         boolean_t       caller_lookup,  /* caller looked up page */
 868         /* Modifies in place: */
 869         vm_prot_t       *protection,    /* Protection for mapping */
 870         vm_page_t       *result_page,   /* Page found, if successful */
 871         /* Returns: */
 872         vm_page_t       *top_page,      /* Page in top object, if
 873                                          * not result_page.  */
 874         int             *type_of_fault, /* if non-null, fill in with type of fault
 875                                          * COW, zero-fill, etc... returned in trace point */
 876         /* More arguments: */
 877         kern_return_t   *error_code,    /* code if page is in error */
 878         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 879         boolean_t       data_supply,    /* treat as data_supply if
 880                                          * it is a write fault and a full
 881                                          * page is provided */
 882         vm_object_fault_info_t fault_info)
 883 {
 884         vm_page_t               m;
 885         vm_object_t             object;
 886         vm_object_offset_t      offset;
 887         vm_page_t               first_m;
 888         vm_object_t             next_object;
 889         vm_object_t             copy_object;
 890         boolean_t               look_for_page;
 891         boolean_t               force_fault_retry = FALSE;
 892         vm_prot_t               access_required = fault_type;
 893         vm_prot_t               wants_copy_flag;
 894         kern_return_t           wait_result;
 895         wait_interrupt_t        interruptible_state;
 896         boolean_t               data_already_requested = FALSE;
 897         vm_behavior_t           orig_behavior;
 898         vm_size_t               orig_cluster_size;
 899         vm_fault_return_t       error;
 900         int                     my_fault;
 901         uint32_t                try_failed_count;
 902         int                     interruptible; /* how may fault be interrupted? */
 903         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 904         memory_object_t         pager;
 905         vm_fault_return_t       retval;
 906         int                     grab_options;
 907
 908 /*
 909  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 910  * marked as paged out in the compressor pager or the pager doesn't exist.
 911  * Note also that if the pager for an internal object
 912  * has not been created, the pager is not invoked regardless of the value
 913  * of MUST_ASK_PAGER().
 914  *
 915  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 916  * is marked as paged out in the compressor pager.
 917  * PAGED_OUT() is used to determine if a page has already been pushed
 918  * into a copy object in order to avoid a redundant page out operation.
 919  */
 920 #define MUST_ASK_PAGER(o, f, s)                                 \
 921         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 922
 923 #define PAGED_OUT(o, f) \
 924         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 925
 926 /*
 927  *      Recovery actions
 928  */
 929 #define RELEASE_PAGE(m)                                 \
 930         MACRO_BEGIN                                     \
 931         PAGE_WAKEUP_DONE(m);                            \
 932         if ( !VM_PAGE_PAGEABLE(m)) {                    \
 933                 vm_page_lockspin_queues();              \
 934                 if ( !VM_PAGE_PAGEABLE(m)) {            \
 935                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
 936                                 vm_page_deactivate(m);          \
 937                         else                                    \
 938                                 vm_page_activate(m);            \
 939                 }                                               \
 940                 vm_page_unlock_queues();                        \
 941         }                                                       \
 942         MACRO_END
 943
 944 #if TRACEFAULTPAGE
 945         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 946 #endif
 947
 948         interruptible = fault_info->interruptible;
 949         interruptible_state = thread_interrupt_level(interruptible);
 950
 951         /*
 952          *      INVARIANTS (through entire routine):
 953          *
 954          *      1)      At all times, we must either have the object
 955          *              lock or a busy page in some object to prevent
 956          *              some other thread from trying to bring in
 957          *              the same page.
 958          *
 959          *              Note that we cannot hold any locks during the
 960          *              pager access or when waiting for memory, so
 961          *              we use a busy page then.
 962          *
 963          *      2)      To prevent another thread from racing us down the
 964          *              shadow chain and entering a new page in the top
 965          *              object before we do, we must keep a busy page in
 966          *              the top object while following the shadow chain.
 967          *
 968          *      3)      We must increment paging_in_progress on any object
 969          *              for which we have a busy page before dropping
 970          *              the object lock
 971          *
 972          *      4)      We leave busy pages on the pageout queues.
 973          *              If the pageout daemon comes across a busy page,
 974          *              it will remove the page from the pageout queues.
 975          */
 976
 977         object = first_object;
 978         offset = first_offset;
 979         first_m = VM_PAGE_NULL;
 980         access_required = fault_type;
 981
 982
 983         XPR(XPR_VM_FAULT,
 984                 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 985                 object, offset, fault_type, *protection, 0);
 986
 987         /*
 988          * default type of fault
 989          */
 990         my_fault = DBG_CACHE_HIT_FAULT;
 991
 992         while (TRUE) {
 993 #if TRACEFAULTPAGE
 994                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
 995 #endif
 996
 997                 grab_options = 0;
 998 #if CONFIG_SECLUDED_MEMORY
 999                 if (object->can_grab_secluded) {
1000                         grab_options |= VM_PAGE_GRAB_SECLUDED;
1001                 }
1002 #endif /* CONFIG_SECLUDED_MEMORY */
1003
1004                 if (!object->alive) {
1005                         /*
1006                          * object is no longer valid
1007                          * clean up and return error
1008                          */
1009                         vm_fault_cleanup(object, first_m);
1010                         thread_interrupt_level(interruptible_state);
1011
1012                         return (VM_FAULT_MEMORY_ERROR);
1013                 }
1014
1015                 if (!object->pager_created && object->phys_contiguous) {
1016                         /*
1017                          * A physically-contiguous object without a pager:
1018                          * must be a "large page" object.  We do not deal
1019                          * with VM pages for this object.
1020                          */
1021                         caller_lookup = FALSE;
1022                         m = VM_PAGE_NULL;
1023                         goto phys_contig_object;
1024                 }
1025
1026                 if (object->blocked_access) {
1027                         /*
1028                          * Access to this VM object has been blocked.
1029                          * Replace our "paging_in_progress" reference with
1030                          * a "activity_in_progress" reference and wait for
1031                          * access to be unblocked.
1032                          */
1033                         caller_lookup = FALSE; /* no longer valid after sleep */
1034                         vm_object_activity_begin(object);
1035                         vm_object_paging_end(object);
1036                         while (object->blocked_access) {
1037                                 vm_object_sleep(object,
1038                                                 VM_OBJECT_EVENT_UNBLOCKED,
1039                                                 THREAD_UNINT);
1040                         }
1041                         vm_fault_page_blocked_access++;
1042                         vm_object_paging_begin(object);
1043                         vm_object_activity_end(object);
1044                 }
1045
1046                 /*
1047                  * See whether the page at 'offset' is resident
1048                  */
1049                 if (caller_lookup == TRUE) {
1050                         /*
1051                          * The caller has already looked up the page
1052                          * and gave us the result in "result_page".
1053                          * We can use this for the first lookup but
1054                          * it loses its validity as soon as we unlock
1055                          * the object.
1056                          */
1057                         m = *result_page;
1058                         caller_lookup = FALSE; /* no longer valid after that */
1059                 } else {
1060                         m = vm_page_lookup(object, offset);
1061                 }
1062 #if TRACEFAULTPAGE
1063                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1064 #endif
1065                 if (m != VM_PAGE_NULL) {
1066
1067                         if (m->vmp_busy) {
1068                                 /*
1069                                  * The page is being brought in,
1070                                  * wait for it and then retry.
1071                                  */
1072 #if TRACEFAULTPAGE
1073                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1074 #endif
1075                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1076
1077                                 XPR(XPR_VM_FAULT,
1078                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1079                                     object, offset,
1080                                     m, 0, 0);
1081                                 counter(c_vm_fault_page_block_busy_kernel++);
1082
1083                                 if (wait_result != THREAD_AWAKENED) {
1084                                         vm_fault_cleanup(object, first_m);
1085                                         thread_interrupt_level(interruptible_state);
1086
1087                                         if (wait_result == THREAD_RESTART)
1088                                                 return (VM_FAULT_RETRY);
1089                                         else
1090                                                 return (VM_FAULT_INTERRUPTED);
1091                                 }
1092                                 continue;
1093                         }
1094                         if (m->vmp_laundry) {
1095                                 m->vmp_free_when_done = FALSE;
1096
1097                                 if (!m->vmp_cleaning)
1098                                         vm_pageout_steal_laundry(m, FALSE);
1099                         }
1100                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1101                                 /*
1102                                  * Guard page: off limits !
1103                                  */
1104                                 if (fault_type == VM_PROT_NONE) {
1105                                         /*
1106                                          * The fault is not requesting any
1107                                          * access to the guard page, so it must
1108                                          * be just to wire or unwire it.
1109                                          * Let's pretend it succeeded...
1110                                          */
1111                                         m->vmp_busy = TRUE;
1112                                         *result_page = m;
1113                                         assert(first_m == VM_PAGE_NULL);
1114                                         *top_page = first_m;
1115                                         if (type_of_fault)
1116                                                 *type_of_fault = DBG_GUARD_FAULT;
1117                                         thread_interrupt_level(interruptible_state);
1118                                         return VM_FAULT_SUCCESS;
1119                                 } else {
1120                                         /*
1121                                          * The fault requests access to the
1122                                          * guard page: let's deny that !
1123                                          */
1124                                         vm_fault_cleanup(object, first_m);
1125                                         thread_interrupt_level(interruptible_state);
1126                                         return VM_FAULT_MEMORY_ERROR;
1127                                 }
1128                         }
1129
1130                         if (m->vmp_error) {
1131                                 /*
1132                                  * The page is in error, give up now.
1133                                  */
1134 #if TRACEFAULTPAGE
1135                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1136 #endif
1137                                 if (error_code)
1138                                         *error_code = KERN_MEMORY_ERROR;
1139                                 VM_PAGE_FREE(m);
1140
1141                                 vm_fault_cleanup(object, first_m);
1142                                 thread_interrupt_level(interruptible_state);
1143
1144                                 return (VM_FAULT_MEMORY_ERROR);
1145                         }
1146                         if (m->vmp_restart) {
1147                                 /*
1148                                  * The pager wants us to restart
1149                                  * at the top of the chain,
1150                                  * typically because it has moved the
1151                                  * page to another pager, then do so.
1152                                  */
1153 #if TRACEFAULTPAGE
1154                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1155 #endif
1156                                 VM_PAGE_FREE(m);
1157
1158                                 vm_fault_cleanup(object, first_m);
1159                                 thread_interrupt_level(interruptible_state);
1160
1161                                 return (VM_FAULT_RETRY);
1162                         }
1163                         if (m->vmp_absent) {
1164                                 /*
1165                                  * The page isn't busy, but is absent,
1166                                  * therefore it's deemed "unavailable".
1167                                  *
1168                                  * Remove the non-existent page (unless it's
1169                                  * in the top object) and move on down to the
1170                                  * next object (if there is one).
1171                                  */
1172 #if TRACEFAULTPAGE
1173                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1174 #endif
1175                                 next_object = object->shadow;
1176
1177                                 if (next_object == VM_OBJECT_NULL) {
1178                                         /*
1179                                          * Absent page at bottom of shadow
1180                                          * chain; zero fill the page we left
1181                                          * busy in the first object, and free
1182                                          * the absent page.
1183                                          */
1184                                         assert(!must_be_resident);
1185
1186                                         /*
1187                                          * check for any conditions that prevent
1188                                          * us from creating a new zero-fill page
1189                                          * vm_fault_check will do all of the
1190                                          * fault cleanup in the case of an error condition
1191                                          * including resetting the thread_interrupt_level
1192                                          */
1193                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1194
1195                                         if (error != VM_FAULT_SUCCESS)
1196                                                 return (error);
1197
1198                                         XPR(XPR_VM_FAULT,
1199                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1200                                                 object, offset,
1201                                                 m,
1202                                                 first_object, 0);
1203
1204                                         if (object != first_object) {
1205                                                 /*
1206                                                  * free the absent page we just found
1207                                                  */
1208                                                 VM_PAGE_FREE(m);
1209
1210                                                 /*
1211                                                  * drop reference and lock on current object
1212                                                  */
1213                                                 vm_object_paging_end(object);
1214                                                 vm_object_unlock(object);
1215
1216                                                 /*
1217                                                  * grab the original page we
1218                                                  * 'soldered' in place and
1219                                                  * retake lock on 'first_object'
1220                                                  */
1221                                                 m = first_m;
1222                                                 first_m = VM_PAGE_NULL;
1223
1224                                                 object = first_object;
1225                                                 offset = first_offset;
1226
1227                                                 vm_object_lock(object);
1228                                         } else {
1229                                                 /*
1230                                                  * we're going to use the absent page we just found
1231                                                  * so convert it to a 'busy' page
1232                                                  */
1233                                                 m->vmp_absent = FALSE;
1234                                                 m->vmp_busy = TRUE;
1235                                         }
1236                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1237                                                 m->vmp_absent = TRUE;
1238                                         /*
1239                                          * zero-fill the page and put it on
1240                                          * the correct paging queue
1241                                          */
1242                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1243
1244                                         break;
1245                                 } else {
1246                                         if (must_be_resident)
1247                                                 vm_object_paging_end(object);
1248                                         else if (object != first_object) {
1249                                                 vm_object_paging_end(object);
1250                                                 VM_PAGE_FREE(m);
1251                                         } else {
1252                                                 first_m = m;
1253                                                 m->vmp_absent = FALSE;
1254                                                 m->vmp_busy = TRUE;
1255
1256                                                 vm_page_lockspin_queues();
1257                                                 vm_page_queues_remove(m, FALSE);
1258                                                 vm_page_unlock_queues();
1259                                         }
1260                                         XPR(XPR_VM_FAULT,
1261                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1262                                                 object, offset,
1263                                                 next_object,
1264                                                 offset+object->vo_shadow_offset,0);
1265
1266                                         offset += object->vo_shadow_offset;
1267                                         fault_info->lo_offset += object->vo_shadow_offset;
1268                                         fault_info->hi_offset += object->vo_shadow_offset;
1269                                         access_required = VM_PROT_READ;
1270
1271                                         vm_object_lock(next_object);
1272                                         vm_object_unlock(object);
1273                                         object = next_object;
1274                                         vm_object_paging_begin(object);
1275
1276                                         /*
1277                                          * reset to default type of fault
1278                                          */
1279                                         my_fault = DBG_CACHE_HIT_FAULT;
1280
1281                                         continue;
1282                                 }
1283                         }
1284                         if ((m->vmp_cleaning)
1285                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1286                             && (fault_type & VM_PROT_WRITE)) {
1287                                 /*
1288                                  * This is a copy-on-write fault that will
1289                                  * cause us to revoke access to this page, but
1290                                  * this page is in the process of being cleaned
1291                                  * in a clustered pageout. We must wait until
1292                                  * the cleaning operation completes before
1293                                  * revoking access to the original page,
1294                                  * otherwise we might attempt to remove a
1295                                  * wired mapping.
1296                                  */
1297 #if TRACEFAULTPAGE
1298                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1299 #endif
1300                                 XPR(XPR_VM_FAULT,
1301                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1302                                         object, offset,
1303                                         m, 0, 0);
1304                                 /*
1305                                  * take an extra ref so that object won't die
1306                                  */
1307                                 vm_object_reference_locked(object);
1308
1309                                 vm_fault_cleanup(object, first_m);
1310
1311                                 counter(c_vm_fault_page_block_backoff_kernel++);
1312                                 vm_object_lock(object);
1313                                 assert(object->ref_count > 0);
1314
1315                                 m = vm_page_lookup(object, offset);
1316
1317                                 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1318                                         PAGE_ASSERT_WAIT(m, interruptible);
1319
1320                                         vm_object_unlock(object);
1321                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1322                                         vm_object_deallocate(object);
1323
1324                                         goto backoff;
1325                                 } else {
1326                                         vm_object_unlock(object);
1327
1328                                         vm_object_deallocate(object);
1329                                         thread_interrupt_level(interruptible_state);
1330
1331                                         return (VM_FAULT_RETRY);
1332                                 }
1333                         }
1334                         if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1335                             !(fault_info != NULL && fault_info->stealth)) {
1336                                 /*
1337                                  * If we were passed a non-NULL pointer for
1338                                  * "type_of_fault", than we came from
1339                                  * vm_fault... we'll let it deal with
1340                                  * this condition, since it
1341                                  * needs to see m->vmp_speculative to correctly
1342                                  * account the pageins, otherwise...
1343                                  * take it off the speculative queue, we'll
1344                                  * let the caller of vm_fault_page deal
1345                                  * with getting it onto the correct queue
1346                                  *
1347                                  * If the caller specified in fault_info that
1348                                  * it wants a "stealth" fault, we also leave
1349                                  * the page in the speculative queue.
1350                                  */
1351                                 vm_page_lockspin_queues();
1352                                 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q)
1353                                         vm_page_queues_remove(m, FALSE);
1354                                 vm_page_unlock_queues();
1355                         }
1356                         assert(object == VM_PAGE_OBJECT(m));
1357
1358                         if (object->code_signed) {
1359                                 /*
1360                                  * CODE SIGNING:
1361                                  * We just paged in a page from a signed
1362                                  * memory object but we don't need to
1363                                  * validate it now.  We'll validate it if
1364                                  * when it gets mapped into a user address
1365                                  * space for the first time or when the page
1366                                  * gets copied to another object as a result
1367                                  * of a copy-on-write.
1368                                  */
1369                         }
1370
1371                         /*
1372                          * We mark the page busy and leave it on
1373                          * the pageout queues.  If the pageout
1374                          * deamon comes across it, then it will
1375                          * remove the page from the queue, but not the object
1376                          */
1377 #if TRACEFAULTPAGE
1378                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1379 #endif
1380                         XPR(XPR_VM_FAULT,
1381                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1382                                 object, offset, m, 0, 0);
1383                         assert(!m->vmp_busy);
1384                         assert(!m->vmp_absent);
1385
1386                         m->vmp_busy = TRUE;
1387                         break;
1388                 }
1389
1390
1391                 /*
1392                  * we get here when there is no page present in the object at
1393                  * the offset we're interested in... we'll allocate a page
1394                  * at this point if the pager associated with
1395                  * this object can provide the data or we're the top object...
1396                  * object is locked;  m == NULL
1397                  */
1398
1399                 if (must_be_resident) {
1400                         if (fault_type == VM_PROT_NONE &&
1401                             object == kernel_object) {
1402                                 /*
1403                                  * We've been called from vm_fault_unwire()
1404                                  * while removing a map entry that was allocated
1405                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1406                                  * is not present and there's nothing more to
1407                                  * do here (nothing to unwire).
1408                                  */
1409                                 vm_fault_cleanup(object, first_m);
1410                                 thread_interrupt_level(interruptible_state);
1411
1412                                 return VM_FAULT_MEMORY_ERROR;
1413                         }
1414
1415                         goto dont_look_for_page;
1416                 }
1417
1418                 /* Don't expect to fault pages into the kernel object. */
1419                 assert(object != kernel_object);
1420
1421                 data_supply = FALSE;
1422
1423                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1424
1425 #if TRACEFAULTPAGE
1426                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1427 #endif
1428                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1429                         /*
1430                          * Allocate a new page for this object/offset pair as a placeholder
1431                          */
1432                         m = vm_page_grab_options(grab_options);
1433 #if TRACEFAULTPAGE
1434                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1435 #endif
1436                         if (m == VM_PAGE_NULL) {
1437
1438                                 vm_fault_cleanup(object, first_m);
1439                                 thread_interrupt_level(interruptible_state);
1440
1441                                 return (VM_FAULT_MEMORY_SHORTAGE);
1442                         }
1443
1444                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1445                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1446                         } else {
1447                                 vm_page_insert(m, object, offset);
1448                         }
1449                 }
1450                 if (look_for_page) {
1451                         kern_return_t   rc;
1452                         int             my_fault_type;
1453
1454                         /*
1455                          *      If the memory manager is not ready, we
1456                          *      cannot make requests.
1457                          */
1458                         if (!object->pager_ready) {
1459 #if TRACEFAULTPAGE
1460                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1461 #endif
1462                                 if (m != VM_PAGE_NULL)
1463                                         VM_PAGE_FREE(m);
1464
1465                                 XPR(XPR_VM_FAULT,
1466                                 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1467                                         object, offset, 0, 0, 0);
1468
1469                                 /*
1470                                  * take an extra ref so object won't die
1471                                  */
1472                                 vm_object_reference_locked(object);
1473                                 vm_fault_cleanup(object, first_m);
1474                                 counter(c_vm_fault_page_block_backoff_kernel++);
1475
1476                                 vm_object_lock(object);
1477                                 assert(object->ref_count > 0);
1478
1479                                 if (!object->pager_ready) {
1480                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1481
1482                                         vm_object_unlock(object);
1483                                         if (wait_result == THREAD_WAITING)
1484                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1485                                         vm_object_deallocate(object);
1486
1487                                         goto backoff;
1488                                 } else {
1489                                         vm_object_unlock(object);
1490                                         vm_object_deallocate(object);
1491                                         thread_interrupt_level(interruptible_state);
1492
1493                                         return (VM_FAULT_RETRY);
1494                                 }
1495                         }
1496                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1497                                 /*
1498                                  * If there are too many outstanding page
1499                                  * requests pending on this external object, we
1500                                  * wait for them to be resolved now.
1501                                  */
1502 #if TRACEFAULTPAGE
1503                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1504 #endif
1505                                 if (m != VM_PAGE_NULL)
1506                                         VM_PAGE_FREE(m);
1507                                 /*
1508                                  * take an extra ref so object won't die
1509                                  */
1510                                 vm_object_reference_locked(object);
1511
1512                                 vm_fault_cleanup(object, first_m);
1513
1514                                 counter(c_vm_fault_page_block_backoff_kernel++);
1515
1516                                 vm_object_lock(object);
1517                                 assert(object->ref_count > 0);
1518
1519                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1520                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1521
1522                                         vm_object_unlock(object);
1523                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1524                                         vm_object_deallocate(object);
1525
1526                                         goto backoff;
1527                                 } else {
1528                                         vm_object_unlock(object);
1529                                         vm_object_deallocate(object);
1530                                         thread_interrupt_level(interruptible_state);
1531
1532                                         return (VM_FAULT_RETRY);
1533                                 }
1534                         }
1535                         if (object->internal) {
1536                                 int compressed_count_delta;
1537
1538                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1539
1540                                 if (m == VM_PAGE_NULL) {
1541                                         /*
1542                                          * Allocate a new page for this object/offset pair as a placeholder
1543                                          */
1544                                         m = vm_page_grab_options(grab_options);
1545 #if TRACEFAULTPAGE
1546                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1547 #endif
1548                                         if (m == VM_PAGE_NULL) {
1549
1550                                                 vm_fault_cleanup(object, first_m);
1551                                                 thread_interrupt_level(interruptible_state);
1552
1553                                                 return (VM_FAULT_MEMORY_SHORTAGE);
1554                                         }
1555
1556                                         m->vmp_absent = TRUE;
1557                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1558                                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1559                                         } else {
1560                                                 vm_page_insert(m, object, offset);
1561                                         }
1562                                 }
1563                                 assert(m->vmp_busy);
1564
1565                                 m->vmp_absent = TRUE;
1566                                 pager = object->pager;
1567
1568                                 assert(object->paging_in_progress > 0);
1569                                 vm_object_unlock(object);
1570
1571                                 rc = vm_compressor_pager_get(
1572                                         pager,
1573                                         offset + object->paging_offset,
1574                                         VM_PAGE_GET_PHYS_PAGE(m),
1575                                         &my_fault_type,
1576                                         0,
1577                                         &compressed_count_delta);
1578
1579                                 if (type_of_fault == NULL) {
1580                                         int     throttle_delay;
1581
1582                                         /*
1583                                          * we weren't called from vm_fault, so we
1584                                          * need to apply page creation throttling
1585                                          * do it before we re-acquire any locks
1586                                          */
1587                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1588                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1589                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1590                                                         delay(throttle_delay);
1591                                                 }
1592                                         }
1593                                 }
1594                                 vm_object_lock(object);
1595                                 assert(object->paging_in_progress > 0);
1596
1597                                 vm_compressor_pager_count(
1598                                         pager,
1599                                         compressed_count_delta,
1600                                         FALSE, /* shared_lock */
1601                                         object);
1602
1603                                 switch (rc) {
1604                                 case KERN_SUCCESS:
1605                                         m->vmp_absent = FALSE;
1606                                         m->vmp_dirty = TRUE;
1607                                         if ((object->wimg_bits &
1608                                              VM_WIMG_MASK) !=
1609                                             VM_WIMG_USE_DEFAULT) {
1610                                                 /*
1611                                                  * If the page is not cacheable,
1612                                                  * we can't let its contents
1613                                                  * linger in the data cache
1614                                                  * after the decompression.
1615                                                  */
1616                                                 pmap_sync_page_attributes_phys(
1617                                                         VM_PAGE_GET_PHYS_PAGE(m));
1618                                         } else {
1619                                                 m->vmp_written_by_kernel = TRUE;
1620                                         }
1621
1622                                         /*
1623                                          * If the object is purgeable, its
1624                                          * owner's purgeable ledgers have been
1625                                          * updated in vm_page_insert() but the
1626                                          * page was also accounted for in a
1627                                          * "compressed purgeable" ledger, so
1628                                          * update that now.
1629                                          */
1630                                         if (((object->purgable !=
1631                                               VM_PURGABLE_DENY) ||
1632                                              object->vo_ledger_tag) &&
1633                                             (object->vo_owner !=
1634                                              NULL)) {
1635                                                 /*
1636                                                  * One less compressed
1637                                                  * purgeable/tagged page.
1638                                                  */
1639                                                 vm_object_owner_compressed_update(
1640                                                         object,
1641                                                         -1);
1642                                         }
1643
1644                                         break;
1645                                 case KERN_MEMORY_FAILURE:
1646                                         m->vmp_unusual = TRUE;
1647                                         m->vmp_error = TRUE;
1648                                         m->vmp_absent = FALSE;
1649                                         break;
1650                                 case KERN_MEMORY_ERROR:
1651                                         assert(m->vmp_absent);
1652                                         break;
1653                                 default:
1654                                         panic("vm_fault_page(): unexpected "
1655                                               "error %d from "
1656                                               "vm_compressor_pager_get()\n",
1657                                               rc);
1658                                 }
1659                                 PAGE_WAKEUP_DONE(m);
1660
1661                                 rc = KERN_SUCCESS;
1662                                 goto data_requested;
1663                         }
1664                         my_fault_type = DBG_PAGEIN_FAULT;
1665
1666                         if (m != VM_PAGE_NULL) {
1667                                 VM_PAGE_FREE(m);
1668                                 m = VM_PAGE_NULL;
1669                         }
1670
1671 #if TRACEFAULTPAGE
1672                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1673 #endif
1674
1675                         /*
1676                          * It's possible someone called vm_object_destroy while we weren't
1677                          * holding the object lock.  If that has happened, then bail out
1678                          * here.
1679                          */
1680
1681                         pager = object->pager;
1682
1683                         if (pager == MEMORY_OBJECT_NULL) {
1684                                 vm_fault_cleanup(object, first_m);
1685                                 thread_interrupt_level(interruptible_state);
1686                                 return VM_FAULT_MEMORY_ERROR;
1687                         }
1688
1689                         /*
1690                          * We have an absent page in place for the faulting offset,
1691                          * so we can release the object lock.
1692                          */
1693
1694                         if (object->object_is_shared_cache) {
1695                                 set_thread_rwlock_boost();
1696                         }
1697
1698                         vm_object_unlock(object);
1699
1700                         /*
1701                          * If this object uses a copy_call strategy,
1702                          * and we are interested in a copy of this object
1703                          * (having gotten here only by following a
1704                          * shadow chain), then tell the memory manager
1705                          * via a flag added to the desired_access
1706                          * parameter, so that it can detect a race
1707                          * between our walking down the shadow chain
1708                          * and its pushing pages up into a copy of
1709                          * the object that it manages.
1710                          */
1711                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1712                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1713                         else
1714                                 wants_copy_flag = VM_PROT_NONE;
1715
1716                         XPR(XPR_VM_FAULT,
1717                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1718                                 object, offset, m,
1719                                 access_required | wants_copy_flag, 0);
1720
1721                         if (object->copy == first_object) {
1722                                 /*
1723                                  * if we issue the memory_object_data_request in
1724                                  * this state, we are subject to a deadlock with
1725                                  * the underlying filesystem if it is trying to
1726                                  * shrink the file resulting in a push of pages
1727                                  * into the copy object...  that push will stall
1728                                  * on the placeholder page, and if the pushing thread
1729                                  * is holding a lock that is required on the pagein
1730                                  * path (such as a truncate lock), we'll deadlock...
1731                                  * to avoid this potential deadlock, we throw away
1732                                  * our placeholder page before calling memory_object_data_request
1733                                  * and force this thread to retry the vm_fault_page after
1734                                  * we have issued the I/O.  the second time through this path
1735                                  * we will find the page already in the cache (presumably still
1736                                  * busy waiting for the I/O to complete) and then complete
1737                                  * the fault w/o having to go through memory_object_data_request again
1738                                  */
1739                                 assert(first_m != VM_PAGE_NULL);
1740                                 assert(VM_PAGE_OBJECT(first_m) == first_object);
1741
1742                                 vm_object_lock(first_object);
1743                                 VM_PAGE_FREE(first_m);
1744                                 vm_object_paging_end(first_object);
1745                                 vm_object_unlock(first_object);
1746
1747                                 first_m = VM_PAGE_NULL;
1748                                 force_fault_retry = TRUE;
1749
1750                                 vm_fault_page_forced_retry++;
1751                         }
1752
1753                         if (data_already_requested == TRUE) {
1754                                 orig_behavior = fault_info->behavior;
1755                                 orig_cluster_size = fault_info->cluster_size;
1756
1757                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1758                                 fault_info->cluster_size = PAGE_SIZE;
1759                         }
1760                         /*
1761                          * Call the memory manager to retrieve the data.
1762                          */
1763                         rc = memory_object_data_request(
1764                                 pager,
1765                                 offset + object->paging_offset,
1766                                 PAGE_SIZE,
1767                                 access_required | wants_copy_flag,
1768                                 (memory_object_fault_info_t)fault_info);
1769
1770                         if (data_already_requested == TRUE) {
1771                                 fault_info->behavior = orig_behavior;
1772                                 fault_info->cluster_size = orig_cluster_size;
1773                         } else
1774                                 data_already_requested = TRUE;
1775
1776                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1777 #if TRACEFAULTPAGE
1778                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1779 #endif
1780                         vm_object_lock(object);
1781
1782                         if (object->object_is_shared_cache) {
1783                                 clear_thread_rwlock_boost();
1784                         }
1785
1786                 data_requested:
1787                         if (rc != KERN_SUCCESS) {
1788
1789                                 vm_fault_cleanup(object, first_m);
1790                                 thread_interrupt_level(interruptible_state);
1791
1792                                 return ((rc == MACH_SEND_INTERRUPTED) ?
1793                                         VM_FAULT_INTERRUPTED :
1794                                         VM_FAULT_MEMORY_ERROR);
1795                         } else {
1796                                 clock_sec_t     tv_sec;
1797                                 clock_usec_t    tv_usec;
1798
1799                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1800                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1801                                         current_thread()->t_page_creation_time = tv_sec;
1802                                         current_thread()->t_page_creation_count = 0;
1803                                 }
1804                         }
1805                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1806
1807                                 vm_fault_cleanup(object, first_m);
1808                                 thread_interrupt_level(interruptible_state);
1809
1810                                 return (VM_FAULT_INTERRUPTED);
1811                         }
1812                         if (force_fault_retry == TRUE) {
1813
1814                                 vm_fault_cleanup(object, first_m);
1815                                 thread_interrupt_level(interruptible_state);
1816
1817                                 return (VM_FAULT_RETRY);
1818                         }
1819                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1820                                 /*
1821                                  * No page here means that the object we
1822                                  * initially looked up was "physically
1823                                  * contiguous" (i.e. device memory).  However,
1824                                  * with Virtual VRAM, the object might not
1825                                  * be backed by that device memory anymore,
1826                                  * so we're done here only if the object is
1827                                  * still "phys_contiguous".
1828                                  * Otherwise, if the object is no longer
1829                                  * "phys_contiguous", we need to retry the
1830                                  * page fault against the object's new backing
1831                                  * store (different memory object).
1832                                  */
1833                         phys_contig_object:
1834                                 goto done;
1835                         }
1836                         /*
1837                          * potentially a pagein fault
1838                          * if we make it through the state checks
1839                          * above, than we'll count it as such
1840                          */
1841                         my_fault = my_fault_type;
1842
1843                         /*
1844                          * Retry with same object/offset, since new data may
1845                          * be in a different page (i.e., m is meaningless at
1846                          * this point).
1847                          */
1848                         continue;
1849                 }
1850 dont_look_for_page:
1851                 /*
1852                  * We get here if the object has no pager, or an existence map
1853                  * exists and indicates the page isn't present on the pager
1854                  * or we're unwiring a page.  If a pager exists, but there
1855                  * is no existence map, then the m->vmp_absent case above handles
1856                  * the ZF case when the pager can't provide the page
1857                  */
1858 #if TRACEFAULTPAGE
1859                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1860 #endif
1861                 if (object == first_object)
1862                         first_m = m;
1863                 else
1864                         assert(m == VM_PAGE_NULL);
1865
1866                 XPR(XPR_VM_FAULT,
1867                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1868                         object, offset, m,
1869                         object->shadow, 0);
1870
1871                 next_object = object->shadow;
1872
1873                 if (next_object == VM_OBJECT_NULL) {
1874                         /*
1875                          * we've hit the bottom of the shadown chain,
1876                          * fill the page in the top object with zeros.
1877                          */
1878                         assert(!must_be_resident);
1879
1880                         if (object != first_object) {
1881                                 vm_object_paging_end(object);
1882                                 vm_object_unlock(object);
1883
1884                                 object = first_object;
1885                                 offset = first_offset;
1886                                 vm_object_lock(object);
1887                         }
1888                         m = first_m;
1889                         assert(VM_PAGE_OBJECT(m) == object);
1890                         first_m = VM_PAGE_NULL;
1891
1892                         /*
1893                          * check for any conditions that prevent
1894                          * us from creating a new zero-fill page
1895                          * vm_fault_check will do all of the
1896                          * fault cleanup in the case of an error condition
1897                          * including resetting the thread_interrupt_level
1898                          */
1899                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1900
1901                         if (error != VM_FAULT_SUCCESS)
1902                                 return (error);
1903
1904                         if (m == VM_PAGE_NULL) {
1905                                 m = vm_page_grab_options(grab_options);
1906
1907                                 if (m == VM_PAGE_NULL) {
1908                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1909                                         thread_interrupt_level(interruptible_state);
1910
1911                                         return (VM_FAULT_MEMORY_SHORTAGE);
1912                                 }
1913                                 vm_page_insert(m, object, offset);
1914                         }
1915                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1916                                 m->vmp_absent = TRUE;
1917
1918                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1919
1920                         break;
1921
1922                 } else {
1923                         /*
1924                          * Move on to the next object.  Lock the next
1925                          * object before unlocking the current one.
1926                          */
1927                         if ((object != first_object) || must_be_resident)
1928                                 vm_object_paging_end(object);
1929
1930                         offset += object->vo_shadow_offset;
1931                         fault_info->lo_offset += object->vo_shadow_offset;
1932                         fault_info->hi_offset += object->vo_shadow_offset;
1933                         access_required = VM_PROT_READ;
1934
1935                         vm_object_lock(next_object);
1936                         vm_object_unlock(object);
1937
1938                         object = next_object;
1939                         vm_object_paging_begin(object);
1940                 }
1941         }
1942
1943         /*
1944          *      PAGE HAS BEEN FOUND.
1945          *
1946          *      This page (m) is:
1947          *              busy, so that we can play with it;
1948          *              not absent, so that nobody else will fill it;
1949          *              possibly eligible for pageout;
1950          *
1951          *      The top-level page (first_m) is:
1952          *              VM_PAGE_NULL if the page was found in the
1953          *               top-level object;
1954          *              busy, not absent, and ineligible for pageout.
1955          *
1956          *      The current object (object) is locked.  A paging
1957          *      reference is held for the current and top-level
1958          *      objects.
1959          */
1960
1961 #if TRACEFAULTPAGE
1962         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1963 #endif
1964 #if     EXTRA_ASSERTIONS
1965         assert(m->vmp_busy && !m->vmp_absent);
1966         assert((first_m == VM_PAGE_NULL) ||
1967                (first_m->vmp_busy && !first_m->vmp_absent &&
1968                 !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1969 #endif  /* EXTRA_ASSERTIONS */
1970
1971         XPR(XPR_VM_FAULT,
1972             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1973                 object, offset, m,
1974                 first_object, first_m);
1975
1976         /*
1977          * If the page is being written, but isn't
1978          * already owned by the top-level object,
1979          * we have to copy it into a new page owned
1980          * by the top-level object.
1981          */
1982         if (object != first_object) {
1983
1984 #if TRACEFAULTPAGE
1985                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1986 #endif
1987                 if (fault_type & VM_PROT_WRITE) {
1988                         vm_page_t copy_m;
1989
1990                         /*
1991                          * We only really need to copy if we
1992                          * want to write it.
1993                          */
1994                         assert(!must_be_resident);
1995
1996                         /*
1997                          * If we try to collapse first_object at this
1998                          * point, we may deadlock when we try to get
1999                          * the lock on an intermediate object (since we
2000                          * have the bottom object locked).  We can't
2001                          * unlock the bottom object, because the page
2002                          * we found may move (by collapse) if we do.
2003                          *
2004                          * Instead, we first copy the page.  Then, when
2005                          * we have no more use for the bottom object,
2006                          * we unlock it and try to collapse.
2007                          *
2008                          * Note that we copy the page even if we didn't
2009                          * need to... that's the breaks.
2010                          */
2011
2012                         /*
2013                          * Allocate a page for the copy
2014                          */
2015                         copy_m = vm_page_grab_options(grab_options);
2016
2017                         if (copy_m == VM_PAGE_NULL) {
2018                                 RELEASE_PAGE(m);
2019
2020                                 vm_fault_cleanup(object, first_m);
2021                                 thread_interrupt_level(interruptible_state);
2022
2023                                 return (VM_FAULT_MEMORY_SHORTAGE);
2024                         }
2025                         XPR(XPR_VM_FAULT,
2026                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2027                                 object, offset,
2028                                 m, copy_m, 0);
2029
2030                         vm_page_copy(m, copy_m);
2031
2032                         /*
2033                          * If another map is truly sharing this
2034                          * page with us, we have to flush all
2035                          * uses of the original page, since we
2036                          * can't distinguish those which want the
2037                          * original from those which need the
2038                          * new copy.
2039                          *
2040                          * XXXO If we know that only one map has
2041                          * access to this page, then we could
2042                          * avoid the pmap_disconnect() call.
2043                          */
2044                         if (m->vmp_pmapped)
2045                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2046
2047                         if (m->vmp_clustered) {
2048                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2049                                 VM_PAGE_CONSUME_CLUSTERED(m);
2050                         }
2051                         assert(!m->vmp_cleaning);
2052
2053                         /*
2054                          * We no longer need the old page or object.
2055                          */
2056                         RELEASE_PAGE(m);
2057
2058                         /*
2059                          * This check helps with marking the object as having a sequential pattern
2060                          * Normally we'll miss doing this below because this fault is about COW to
2061                          * the first_object i.e. bring page in from disk, push to object above but
2062                          * don't update the file object's sequential pattern.
2063                          */
2064                         if (object->internal == FALSE) {
2065                                 vm_fault_is_sequential(object, offset, fault_info->behavior);
2066                         }
2067
2068                         vm_object_paging_end(object);
2069                         vm_object_unlock(object);
2070
2071                         my_fault = DBG_COW_FAULT;
2072                         VM_STAT_INCR(cow_faults);
2073                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2074                         current_task()->cow_faults++;
2075
2076                         object = first_object;
2077                         offset = first_offset;
2078
2079                         vm_object_lock(object);
2080                         /*
2081                          * get rid of the place holder
2082                          * page that we soldered in earlier
2083                          */
2084                         VM_PAGE_FREE(first_m);
2085                         first_m = VM_PAGE_NULL;
2086
2087                         /*
2088                          * and replace it with the
2089                          * page we just copied into
2090                          */
2091                         assert(copy_m->vmp_busy);
2092                         vm_page_insert(copy_m, object, offset);
2093                         SET_PAGE_DIRTY(copy_m, TRUE);
2094
2095                         m = copy_m;
2096                         /*
2097                          * Now that we've gotten the copy out of the
2098                          * way, let's try to collapse the top object.
2099                          * But we have to play ugly games with
2100                          * paging_in_progress to do that...
2101                          */
2102                         vm_object_paging_end(object);
2103                         vm_object_collapse(object, offset, TRUE);
2104                         vm_object_paging_begin(object);
2105
2106                 } else
2107                         *protection &= (~VM_PROT_WRITE);
2108         }
2109         /*
2110          * Now check whether the page needs to be pushed into the
2111          * copy object.  The use of asymmetric copy on write for
2112          * shared temporary objects means that we may do two copies to
2113          * satisfy the fault; one above to get the page from a
2114          * shadowed object, and one here to push it into the copy.
2115          */
2116         try_failed_count = 0;
2117
2118         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2119                 vm_object_offset_t      copy_offset;
2120                 vm_page_t               copy_m;
2121
2122 #if TRACEFAULTPAGE
2123                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2124 #endif
2125                 /*
2126                  * If the page is being written, but hasn't been
2127                  * copied to the copy-object, we have to copy it there.
2128                  */
2129                 if ((fault_type & VM_PROT_WRITE) == 0) {
2130                         *protection &= ~VM_PROT_WRITE;
2131                         break;
2132                 }
2133
2134                 /*
2135                  * If the page was guaranteed to be resident,
2136                  * we must have already performed the copy.
2137                  */
2138                 if (must_be_resident)
2139                         break;
2140
2141                 /*
2142                  * Try to get the lock on the copy_object.
2143                  */
2144                 if (!vm_object_lock_try(copy_object)) {
2145
2146                         vm_object_unlock(object);
2147                         try_failed_count++;
2148
2149                         mutex_pause(try_failed_count);  /* wait a bit */
2150                         vm_object_lock(object);
2151
2152                         continue;
2153                 }
2154                 try_failed_count = 0;
2155
2156                 /*
2157                  * Make another reference to the copy-object,
2158                  * to keep it from disappearing during the
2159                  * copy.
2160                  */
2161                 vm_object_reference_locked(copy_object);
2162
2163                 /*
2164                  * Does the page exist in the copy?
2165                  */
2166                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2167
2168                 if (copy_object->vo_size <= copy_offset)
2169                         /*
2170                          * Copy object doesn't cover this page -- do nothing.
2171                          */
2172                         ;
2173                 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2174                         /*
2175                          * Page currently exists in the copy object
2176                          */
2177                         if (copy_m->vmp_busy) {
2178                                 /*
2179                                  * If the page is being brought
2180                                  * in, wait for it and then retry.
2181                                  */
2182                                 RELEASE_PAGE(m);
2183
2184                                 /*
2185                                  * take an extra ref so object won't die
2186                                  */
2187                                 vm_object_reference_locked(copy_object);
2188                                 vm_object_unlock(copy_object);
2189                                 vm_fault_cleanup(object, first_m);
2190                                 counter(c_vm_fault_page_block_backoff_kernel++);
2191
2192                                 vm_object_lock(copy_object);
2193                                 assert(copy_object->ref_count > 0);
2194                                 VM_OBJ_RES_DECR(copy_object);
2195                                 vm_object_lock_assert_exclusive(copy_object);
2196                                 copy_object->ref_count--;
2197                                 assert(copy_object->ref_count > 0);
2198                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2199
2200                                 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2201                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2202
2203                                         vm_object_unlock(copy_object);
2204                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2205                                         vm_object_deallocate(copy_object);
2206
2207                                         goto backoff;
2208                                 } else {
2209                                         vm_object_unlock(copy_object);
2210                                         vm_object_deallocate(copy_object);
2211                                         thread_interrupt_level(interruptible_state);
2212
2213                                         return (VM_FAULT_RETRY);
2214                                 }
2215                         }
2216                 }
2217                 else if (!PAGED_OUT(copy_object, copy_offset)) {
2218                         /*
2219                          * If PAGED_OUT is TRUE, then the page used to exist
2220                          * in the copy-object, and has already been paged out.
2221                          * We don't need to repeat this. If PAGED_OUT is
2222                          * FALSE, then either we don't know (!pager_created,
2223                          * for example) or it hasn't been paged out.
2224                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2225                          * We must copy the page to the copy object.
2226                          *
2227                          * Allocate a page for the copy
2228                          */
2229                         copy_m = vm_page_alloc(copy_object, copy_offset);
2230
2231                         if (copy_m == VM_PAGE_NULL) {
2232                                 RELEASE_PAGE(m);
2233
2234                                 VM_OBJ_RES_DECR(copy_object);
2235                                 vm_object_lock_assert_exclusive(copy_object);
2236                                 copy_object->ref_count--;
2237                                 assert(copy_object->ref_count > 0);
2238
2239                                 vm_object_unlock(copy_object);
2240                                 vm_fault_cleanup(object, first_m);
2241                                 thread_interrupt_level(interruptible_state);
2242
2243                                 return (VM_FAULT_MEMORY_SHORTAGE);
2244                         }
2245                         /*
2246                          * Must copy page into copy-object.
2247                          */
2248                         vm_page_copy(m, copy_m);
2249
2250                         /*
2251                          * If the old page was in use by any users
2252                          * of the copy-object, it must be removed
2253                          * from all pmaps.  (We can't know which
2254                          * pmaps use it.)
2255                          */
2256                         if (m->vmp_pmapped)
2257                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2258
2259                         if (m->vmp_clustered) {
2260                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2261                                 VM_PAGE_CONSUME_CLUSTERED(m);
2262                         }
2263                         /*
2264                          * If there's a pager, then immediately
2265                          * page out this page, using the "initialize"
2266                          * option.  Else, we use the copy.
2267                          */
2268                         if ((!copy_object->pager_ready)
2269                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2270                            ) {
2271
2272                                 vm_page_lockspin_queues();
2273                                 assert(!m->vmp_cleaning);
2274                                 vm_page_activate(copy_m);
2275                                 vm_page_unlock_queues();
2276
2277                                 SET_PAGE_DIRTY(copy_m, TRUE);
2278                                 PAGE_WAKEUP_DONE(copy_m);
2279
2280                         } else {
2281
2282                                 assert(copy_m->vmp_busy == TRUE);
2283                                 assert(!m->vmp_cleaning);
2284
2285                                 /*
2286                                  * dirty is protected by the object lock
2287                                  */
2288                                 SET_PAGE_DIRTY(copy_m, TRUE);
2289
2290                                 /*
2291                                  * The page is already ready for pageout:
2292                                  * not on pageout queues and busy.
2293                                  * Unlock everything except the
2294                                  * copy_object itself.
2295                                  */
2296                                 vm_object_unlock(object);
2297
2298                                 /*
2299                                  * Write the page to the copy-object,
2300                                  * flushing it from the kernel.
2301                                  */
2302                                 vm_pageout_initialize_page(copy_m);
2303
2304                                 /*
2305                                  * Since the pageout may have
2306                                  * temporarily dropped the
2307                                  * copy_object's lock, we
2308                                  * check whether we'll have
2309                                  * to deallocate the hard way.
2310                                  */
2311                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2312                                         vm_object_unlock(copy_object);
2313                                         vm_object_deallocate(copy_object);
2314                                         vm_object_lock(object);
2315
2316                                         continue;
2317                                 }
2318                                 /*
2319                                  * Pick back up the old object's
2320                                  * lock.  [It is safe to do so,
2321                                  * since it must be deeper in the
2322                                  * object tree.]
2323                                  */
2324                                 vm_object_lock(object);
2325                         }
2326
2327                         /*
2328                          * Because we're pushing a page upward
2329                          * in the object tree, we must restart
2330                          * any faults that are waiting here.
2331                          * [Note that this is an expansion of
2332                          * PAGE_WAKEUP that uses the THREAD_RESTART
2333                          * wait result].  Can't turn off the page's
2334                          * busy bit because we're not done with it.
2335                          */
2336                         if (m->vmp_wanted) {
2337                                 m->vmp_wanted = FALSE;
2338                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2339                         }
2340                 }
2341                 /*
2342                  * The reference count on copy_object must be
2343                  * at least 2: one for our extra reference,
2344                  * and at least one from the outside world
2345                  * (we checked that when we last locked
2346                  * copy_object).
2347                  */
2348                 vm_object_lock_assert_exclusive(copy_object);
2349                 copy_object->ref_count--;
2350                 assert(copy_object->ref_count > 0);
2351
2352                 VM_OBJ_RES_DECR(copy_object);
2353                 vm_object_unlock(copy_object);
2354
2355                 break;
2356         }
2357
2358 done:
2359         *result_page = m;
2360         *top_page = first_m;
2361
2362         XPR(XPR_VM_FAULT,
2363                 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2364                 object, offset, m, first_m, 0);
2365
2366         if (m != VM_PAGE_NULL) {
2367                 assert(VM_PAGE_OBJECT(m) == object);
2368
2369                 retval = VM_FAULT_SUCCESS;
2370
2371                 if (my_fault == DBG_PAGEIN_FAULT) {
2372
2373                         VM_PAGE_COUNT_AS_PAGEIN(m);
2374
2375                         if (object->internal)
2376                                 my_fault = DBG_PAGEIND_FAULT;
2377                         else
2378                                 my_fault = DBG_PAGEINV_FAULT;
2379
2380                         /*
2381                          * evaluate access pattern and update state
2382                          * vm_fault_deactivate_behind depends on the
2383                          * state being up to date
2384                          */
2385                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2386                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2387
2388                 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2389                         /*
2390                          * we weren't called from vm_fault, so handle the
2391                          * accounting here for hits in the cache
2392                          */
2393                         if (m->vmp_clustered) {
2394                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2395                                 VM_PAGE_CONSUME_CLUSTERED(m);
2396                         }
2397                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2398                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2399
2400                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2401
2402                         VM_STAT_INCR(decompressions);
2403                 }
2404                 if (type_of_fault)
2405                         *type_of_fault = my_fault;
2406         } else {
2407                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2408                 assert(first_m == VM_PAGE_NULL);
2409                 assert(object == first_object);
2410         }
2411
2412         thread_interrupt_level(interruptible_state);
2413
2414 #if TRACEFAULTPAGE
2415         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2416 #endif
2417         return retval;
2418
2419 backoff:
2420         thread_interrupt_level(interruptible_state);
2421
2422         if (wait_result == THREAD_INTERRUPTED)
2423                 return (VM_FAULT_INTERRUPTED);
2424         return (VM_FAULT_RETRY);
2425
2426 #undef  RELEASE_PAGE
2427 }
2428
2429
2430
2431 /*
2432  * CODE SIGNING:
2433  * When soft faulting a page, we have to validate the page if:
2434  * 1. the page is being mapped in user space
2435  * 2. the page hasn't already been found to be "tainted"
2436  * 3. the page belongs to a code-signed object
2437  * 4. the page has not been validated yet or has been mapped for write.
2438  */
2439 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj)               \
2440         ((pmap) != kernel_pmap /*1*/ &&                                 \
2441          !(page)->vmp_cs_tainted /*2*/ &&                                       \
2442          (page_obj)->code_signed /*3*/ &&                                       \
2443          (!(page)->vmp_cs_validated || (page)->vmp_wpmapped /*4*/))
2444
2445
2446 /*
2447  * page queue lock must NOT be held
2448  * m->vmp_object must be locked
2449  *
2450  * NOTE: m->vmp_object could be locked "shared" only if we are called
2451  * from vm_fault() as part of a soft fault.  If so, we must be
2452  * careful not to modify the VM object in any way that is not
2453  * legal under a shared lock...
2454  */
2455 extern int panic_on_cs_killed;
2456 extern int proc_selfpid(void);
2457 extern char *proc_name_address(void *p);
2458 unsigned long cs_enter_tainted_rejected = 0;
2459 unsigned long cs_enter_tainted_accepted = 0;
2460 kern_return_t
2461 vm_fault_enter(vm_page_t m,
2462                pmap_t pmap,
2463                vm_map_offset_t vaddr,
2464                vm_prot_t prot,
2465                vm_prot_t caller_prot,
2466                boolean_t wired,
2467                boolean_t change_wiring,
2468                vm_tag_t  wire_tag,
2469                vm_object_fault_info_t fault_info,
2470                boolean_t *need_retry,
2471                int *type_of_fault)
2472 {
2473         kern_return_t   kr, pe_result;
2474         boolean_t       previously_pmapped = m->vmp_pmapped;
2475         boolean_t       must_disconnect = 0;
2476         boolean_t       map_is_switched, map_is_switch_protected;
2477         boolean_t       cs_violation;
2478         int             cs_enforcement_enabled;
2479         vm_prot_t       fault_type;
2480         vm_object_t     object;
2481         boolean_t       no_cache = fault_info->no_cache;
2482         boolean_t       cs_bypass = fault_info->cs_bypass;
2483         int             pmap_options = fault_info->pmap_options;
2484
2485         fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
2486         object = VM_PAGE_OBJECT(m);
2487
2488         vm_object_lock_assert_held(object);
2489
2490 #if KASAN
2491         if (pmap == kernel_pmap) {
2492                 kasan_notify_address(vaddr, PAGE_SIZE);
2493         }
2494 #endif
2495
2496         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2497
2498         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2499                 assert(m->vmp_fictitious);
2500                 return KERN_SUCCESS;
2501         }
2502
2503         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2504
2505                 vm_object_lock_assert_exclusive(object);
2506
2507         } else if ((fault_type & VM_PROT_WRITE) == 0 &&
2508                    (!m->vmp_wpmapped
2509 #if VM_OBJECT_ACCESS_TRACKING
2510                     || object->access_tracking
2511 #endif /* VM_OBJECT_ACCESS_TRACKING */
2512                            )) {
2513                 /*
2514                  * This is not a "write" fault, so we
2515                  * might not have taken the object lock
2516                  * exclusively and we might not be able
2517                  * to update the "wpmapped" bit in
2518                  * vm_fault_enter().
2519                  * Let's just grant read access to
2520                  * the page for now and we'll
2521                  * soft-fault again if we need write
2522                  * access later...
2523                  */
2524
2525                 /* This had better not be a JIT page. */
2526                 if (!pmap_has_prot_policy(prot)) {
2527                         prot &= ~VM_PROT_WRITE;
2528                 } else {
2529                         assert(cs_bypass);
2530                 }
2531         }
2532         if (m->vmp_pmapped == FALSE) {
2533
2534                 if (m->vmp_clustered) {
2535                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2536                                 /*
2537                                  * found it in the cache, but this
2538                                  * is the first fault-in of the page (m->vmp_pmapped == FALSE)
2539                                  * so it must have come in as part of
2540                                  * a cluster... account 1 pagein against it
2541                                  */
2542                                 if (object->internal)
2543                                         *type_of_fault = DBG_PAGEIND_FAULT;
2544                                 else
2545                                         *type_of_fault = DBG_PAGEINV_FAULT;
2546
2547                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2548                         }
2549                         VM_PAGE_CONSUME_CLUSTERED(m);
2550                 }
2551         }
2552
2553         if (*type_of_fault != DBG_COW_FAULT) {
2554                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2555
2556                 if (pmap == kernel_pmap) {
2557                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2558                 }
2559         }
2560
2561         /* Validate code signature if necessary. */
2562         if (!cs_bypass &&
2563             VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
2564                 vm_object_lock_assert_exclusive(object);
2565
2566                 if (m->vmp_cs_validated) {
2567                         vm_cs_revalidates++;
2568                 }
2569
2570                 /* VM map is locked, so 1 ref will remain on VM object -
2571                  * so no harm if vm_page_validate_cs drops the object lock */
2572
2573 #if PMAP_CS
2574                 if (fault_info->pmap_cs_associated &&
2575                     pmap_cs_enforced(pmap) &&
2576                     !m->vmp_cs_validated &&
2577                     !m->vmp_cs_tainted &&
2578                     !m->vmp_cs_nx &&
2579                     (prot & VM_PROT_EXECUTE) &&
2580                     (caller_prot & VM_PROT_EXECUTE)) {
2581                         /*
2582                          * With pmap_cs, the pmap layer will validate the
2583                          * code signature for any executable pmap mapping.
2584                          * No need for us to validate this page too:
2585                          * in pmap_cs we trust...
2586                          */
2587                         vm_cs_defer_to_pmap_cs++;
2588                 } else {
2589                         vm_cs_defer_to_pmap_cs_not++;
2590                         vm_page_validate_cs(m);
2591                 }
2592 #else /* PMAP_CS */
2593                 vm_page_validate_cs(m);
2594 #endif /* PMAP_CS */
2595         }
2596
2597 #define page_immutable(m,prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2598 #define page_nx(m) ((m)->vmp_cs_nx)
2599
2600         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2601                            (pmap == vm_map_pmap(current_thread()->map)));
2602         map_is_switch_protected = current_thread()->map->switch_protect;
2603
2604         /* If the map is switched, and is switch-protected, we must protect
2605          * some pages from being write-faulted: immutable pages because by
2606          * definition they may not be written, and executable pages because that
2607          * would provide a way to inject unsigned code.
2608          * If the page is immutable, we can simply return. However, we can't
2609          * immediately determine whether a page is executable anywhere. But,
2610          * we can disconnect it everywhere and remove the executable protection
2611          * from the current map. We do that below right before we do the
2612          * PMAP_ENTER.
2613          */
2614         cs_enforcement_enabled = cs_process_enforcement(NULL);
2615
2616         if(cs_enforcement_enabled && map_is_switched &&
2617            map_is_switch_protected && page_immutable(m, prot) &&
2618            (prot & VM_PROT_WRITE))
2619         {
2620                 return KERN_CODESIGN_ERROR;
2621         }
2622
2623         if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2624                 if (cs_debug)
2625                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2626                 return KERN_CODESIGN_ERROR;
2627         }
2628
2629         /* A page could be tainted, or pose a risk of being tainted later.
2630          * Check whether the receiving process wants it, and make it feel
2631          * the consequences (that hapens in cs_invalid_page()).
2632          * For CS Enforcement, two other conditions will
2633          * cause that page to be tainted as well:
2634          * - pmapping an unsigned page executable - this means unsigned code;
2635          * - writeable mapping of a validated page - the content of that page
2636          *   can be changed without the kernel noticing, therefore unsigned
2637          *   code can be created
2638          */
2639         if (cs_bypass) {
2640                 /* code-signing is bypassed */
2641                 cs_violation = FALSE;
2642         } else if (m->vmp_cs_tainted) {
2643                 /* tainted page */
2644                 cs_violation = TRUE;
2645         } else if (!cs_enforcement_enabled) {
2646                 /* no further code-signing enforcement */
2647                 cs_violation = FALSE;
2648         } else if (page_immutable(m, prot) &&
2649                    ((prot & VM_PROT_WRITE) ||
2650                     m->vmp_wpmapped)) {
2651                 /*
2652                  * The page should be immutable, but is in danger of being
2653                  * modified.
2654                  * This is the case where we want policy from the code
2655                  * directory - is the page immutable or not? For now we have
2656                  * to assume that code pages will be immutable, data pages not.
2657                  * We'll assume a page is a code page if it has a code directory
2658                  * and we fault for execution.
2659                  * That is good enough since if we faulted the code page for
2660                  * writing in another map before, it is wpmapped; if we fault
2661                  * it for writing in this map later it will also be faulted for
2662                  * executing at the same time; and if we fault for writing in
2663                  * another map later, we will disconnect it from this pmap so
2664                  * we'll notice the change.
2665                  */
2666                 cs_violation = TRUE;
2667         } else if (!m->vmp_cs_validated &&
2668                    (prot & VM_PROT_EXECUTE)
2669 #if PMAP_CS
2670                    /*
2671                     * Executable pages will be validated by pmap_cs;
2672                     * in pmap_cs we trust...
2673                     * If pmap_cs is turned off, this is a code-signing
2674                     * violation.
2675                     */
2676                    && ! (pmap_cs_enforced(pmap))
2677 #endif /* PMAP_CS */
2678                 ) {
2679                 cs_violation = TRUE;
2680         } else {
2681                 cs_violation = FALSE;
2682         }
2683
2684         if (cs_violation) {
2685                 /* We will have a tainted page. Have to handle the special case
2686                  * of a switched map now. If the map is not switched, standard
2687                  * procedure applies - call cs_invalid_page().
2688                  * If the map is switched, the real owner is invalid already.
2689                  * There is no point in invalidating the switching process since
2690                  * it will not be executing from the map. So we don't call
2691                  * cs_invalid_page() in that case. */
2692                 boolean_t reject_page, cs_killed;
2693                 if(map_is_switched) {
2694                         assert(pmap==vm_map_pmap(current_thread()->map));
2695                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2696                         reject_page = FALSE;
2697                 } else {
2698                         if (cs_debug > 5)
2699                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2700                                        object->code_signed ? "yes" : "no",
2701                                        m->vmp_cs_validated ? "yes" : "no",
2702                                        m->vmp_cs_tainted ? "yes" : "no",
2703                                        m->vmp_wpmapped ? "yes" : "no",
2704                                        (int)prot);
2705                         reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2706                 }
2707
2708                 if (reject_page) {
2709                         /* reject the invalid page: abort the page fault */
2710                         int                     pid;
2711                         const char              *procname;
2712                         task_t                  task;
2713                         vm_object_t             file_object, shadow;
2714                         vm_object_offset_t      file_offset;
2715                         char                    *pathname, *filename;
2716                         vm_size_t               pathname_len, filename_len;
2717                         boolean_t               truncated_path;
2718 #define __PATH_MAX 1024
2719                         struct timespec         mtime, cs_mtime;
2720                         int                     shadow_depth;
2721                         os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2722
2723                         kr = KERN_CODESIGN_ERROR;
2724                         cs_enter_tainted_rejected++;
2725
2726                         /* get process name and pid */
2727                         procname = "?";
2728                         task = current_task();
2729                         pid = proc_selfpid();
2730                         if (task->bsd_info != NULL)
2731                                 procname = proc_name_address(task->bsd_info);
2732
2733                         /* get file's VM object */
2734                         file_object = object;
2735                         file_offset = m->vmp_offset;
2736                         for (shadow = file_object->shadow,
2737                                      shadow_depth = 0;
2738                              shadow != VM_OBJECT_NULL;
2739                              shadow = file_object->shadow,
2740                                 shadow_depth++) {
2741                                 vm_object_lock_shared(shadow);
2742                                 if (file_object != object) {
2743                                         vm_object_unlock(file_object);
2744                                 }
2745                                 file_offset += file_object->vo_shadow_offset;
2746                                 file_object = shadow;
2747                         }
2748
2749                         mtime.tv_sec = 0;
2750                         mtime.tv_nsec = 0;
2751                         cs_mtime.tv_sec = 0;
2752                         cs_mtime.tv_nsec = 0;
2753
2754                         /* get file's pathname and/or filename */
2755                         pathname = NULL;
2756                         filename = NULL;
2757                         pathname_len = 0;
2758                         filename_len = 0;
2759                         truncated_path = FALSE;
2760                         /* no pager -> no file -> no pathname, use "<nil>" in that case */
2761                         if (file_object->pager != NULL) {
2762                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2763                                 if (pathname) {
2764                                         pathname[0] = '\0';
2765                                         pathname_len = __PATH_MAX;
2766                                         filename = pathname + pathname_len;
2767                                         filename_len = __PATH_MAX;
2768                                 }
2769                                 vnode_pager_get_object_name(file_object->pager,
2770                                                             pathname,
2771                                                             pathname_len,
2772                                                             filename,
2773                                                             filename_len,
2774                                                             &truncated_path);
2775                                 if (pathname) {
2776                                         /* safety first... */
2777                                         pathname[__PATH_MAX-1] = '\0';
2778                                         filename[__PATH_MAX-1] = '\0';
2779                                 }
2780                                 vnode_pager_get_object_mtime(file_object->pager,
2781                                                              &mtime,
2782                                                              &cs_mtime);
2783                         }
2784                         printf("CODE SIGNING: process %d[%s]: "
2785                                "rejecting invalid page at address 0x%llx "
2786                                "from offset 0x%llx in file \"%s%s%s\" "
2787                                "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2788                                "(signed:%d validated:%d tainted:%d nx:%d "
2789                                "wpmapped:%d dirty:%d depth:%d)\n",
2790                                pid, procname, (addr64_t) vaddr,
2791                                file_offset,
2792                                (pathname ? pathname : "<nil>"),
2793                                (truncated_path ? "/.../" : ""),
2794                                (truncated_path ? filename : ""),
2795                                cs_mtime.tv_sec, cs_mtime.tv_nsec,
2796                                ((cs_mtime.tv_sec == mtime.tv_sec &&
2797                                  cs_mtime.tv_nsec == mtime.tv_nsec)
2798                                 ? "=="
2799                                 : "!="),
2800                                mtime.tv_sec, mtime.tv_nsec,
2801                                object->code_signed,
2802                                m->vmp_cs_validated,
2803                                m->vmp_cs_tainted,
2804                                m->vmp_cs_nx,
2805                                m->vmp_wpmapped,
2806                                m->vmp_dirty,
2807                                shadow_depth);
2808
2809                         /*
2810                          * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2811                          * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2812                          * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2813                          * will deal with the segmentation fault.
2814                          */
2815                         if (cs_killed) {
2816                                 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2817                                                                 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2818
2819                                 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2820                                 if (codesigning_exit_reason == NULL) {
2821                                         printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2822                                 } else {
2823                                         mach_vm_address_t data_addr = 0;
2824                                         struct codesigning_exit_reason_info *ceri = NULL;
2825                                         uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2826
2827                                         if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2828                                                 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2829                                         } else {
2830                                                 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2831                                                                 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2832                                                         ceri = (struct codesigning_exit_reason_info *)data_addr;
2833                                                         static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2834
2835                                                         ceri->ceri_virt_addr = vaddr;
2836                                                         ceri->ceri_file_offset = file_offset;
2837                                                         if (pathname)
2838                                                                 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2839                                                         else
2840                                                                 ceri->ceri_pathname[0] = '\0';
2841                                                         if (filename)
2842                                                                 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2843                                                         else
2844                                                                 ceri->ceri_filename[0] = '\0';
2845                                                         ceri->ceri_path_truncated = (truncated_path);
2846                                                         ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2847                                                         ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2848                                                         ceri->ceri_page_modtime_secs = mtime.tv_sec;
2849                                                         ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2850                                                         ceri->ceri_object_codesigned = (object->code_signed);
2851                                                         ceri->ceri_page_codesig_validated = (m->vmp_cs_validated);
2852                                                         ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted);
2853                                                         ceri->ceri_page_codesig_nx = (m->vmp_cs_nx);
2854                                                         ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2855                                                         ceri->ceri_page_slid = 0;
2856                                                         ceri->ceri_page_dirty = (m->vmp_dirty);
2857                                                         ceri->ceri_page_shadow_depth = shadow_depth;
2858                                                 } else {
2859 #if DEBUG || DEVELOPMENT
2860                                                         panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2861 #else
2862                                                         printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2863 #endif /* DEBUG || DEVELOPMENT */
2864                                                         /* Free the buffer */
2865                                                         os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2866                                                 }
2867                                         }
2868                                 }
2869
2870                                 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2871                         }
2872                         if (panic_on_cs_killed &&
2873                             object->object_is_shared_cache) {
2874                                 panic("CODE SIGNING: process %d[%s]: "
2875                                       "rejecting invalid page at address 0x%llx "
2876                                       "from offset 0x%llx in file \"%s%s%s\" "
2877                                       "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2878                                       "(signed:%d validated:%d tainted:%d nx:%d"
2879                                       "wpmapped:%d dirty:%d depth:%d)\n",
2880                                       pid, procname, (addr64_t) vaddr,
2881                                       file_offset,
2882                                       (pathname ? pathname : "<nil>"),
2883                                       (truncated_path ? "/.../" : ""),
2884                                       (truncated_path ? filename : ""),
2885                                       cs_mtime.tv_sec, cs_mtime.tv_nsec,
2886                                       ((cs_mtime.tv_sec == mtime.tv_sec &&
2887                                         cs_mtime.tv_nsec == mtime.tv_nsec)
2888                                        ? "=="
2889                                        : "!="),
2890                                       mtime.tv_sec, mtime.tv_nsec,
2891                                       object->code_signed,
2892                                       m->vmp_cs_validated,
2893                                       m->vmp_cs_tainted,
2894                                       m->vmp_cs_nx,
2895                                       m->vmp_wpmapped,
2896                                       m->vmp_dirty,
2897                                       shadow_depth);
2898                         }
2899
2900                         if (file_object != object) {
2901                                 vm_object_unlock(file_object);
2902                         }
2903                         if (pathname_len != 0) {
2904                                 kfree(pathname, __PATH_MAX * 2);
2905                                 pathname = NULL;
2906                                 filename = NULL;
2907                         }
2908                 } else {
2909                         /* proceed with the invalid page */
2910                         kr = KERN_SUCCESS;
2911                         if (!m->vmp_cs_validated &&
2912                             !object->code_signed) {
2913                                 /*
2914                                  * This page has not been (fully) validated but
2915                                  * does not belong to a code-signed object
2916                                  * so it should not be forcefully considered
2917                                  * as tainted.
2918                                  * We're just concerned about it here because
2919                                  * we've been asked to "execute" it but that
2920                                  * does not mean that it should cause other
2921                                  * accesses to fail.
2922                                  * This happens when a debugger sets a
2923                                  * breakpoint and we then execute code in
2924                                  * that page.  Marking the page as "tainted"
2925                                  * would cause any inspection tool ("leaks",
2926                                  * "vmmap", "CrashReporter", ...) to get killed
2927                                  * due to code-signing violation on that page,
2928                                  * even though they're just reading it and not
2929                                  * executing from it.
2930                                  */
2931                         } else {
2932                                 /*
2933                                  * Page might have been tainted before or not;
2934                                  * now it definitively is. If the page wasn't
2935                                  * tainted, we must disconnect it from all
2936                                  * pmaps later, to force existing mappings
2937                                  * through that code path for re-consideration
2938                                  * of the validity of that page.
2939                                  */
2940                                 must_disconnect = !m->vmp_cs_tainted;
2941                                 m->vmp_cs_tainted = TRUE;
2942                         }
2943                         cs_enter_tainted_accepted++;
2944                 }
2945                 if (kr != KERN_SUCCESS) {
2946                         if (cs_debug) {
2947                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2948                                        "*** INVALID PAGE ***\n",
2949                                        (long long)vaddr);
2950                         }
2951 #if !SECURE_KERNEL
2952                         if (cs_enforcement_panic) {
2953                                 panic("CODESIGNING: panicking on invalid page\n");
2954                         }
2955 #endif
2956                 }
2957
2958         } else {
2959                 /* proceed with the valid page */
2960                 kr = KERN_SUCCESS;
2961         }
2962
2963         boolean_t       page_queues_locked = FALSE;
2964 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
2965 MACRO_BEGIN                                     \
2966         if (! page_queues_locked) {             \
2967                 page_queues_locked = TRUE;      \
2968                 vm_page_lockspin_queues();      \
2969         }                                       \
2970 MACRO_END
2971 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
2972 MACRO_BEGIN                                     \
2973         if (page_queues_locked) {               \
2974                 page_queues_locked = FALSE;     \
2975                 vm_page_unlock_queues();        \
2976         }                                       \
2977 MACRO_END
2978
2979         /*
2980          * Hold queues lock to manipulate
2981          * the page queues.  Change wiring
2982          * case is obvious.
2983          */
2984         assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
2985
2986 #if CONFIG_BACKGROUND_QUEUE
2987         vm_page_update_background_state(m);
2988 #endif
2989         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2990                 /*
2991                  * Compressor pages are neither wired
2992                  * nor pageable and should never change.
2993                  */
2994                 assert(object == compressor_object);
2995         } else if (change_wiring) {
2996                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2997
2998                 if (wired) {
2999                         if (kr == KERN_SUCCESS) {
3000                                 vm_page_wire(m, wire_tag, TRUE);
3001                         }
3002                 } else {
3003                         vm_page_unwire(m, TRUE);
3004                 }
3005                 /* we keep the page queues lock, if we need it later */
3006
3007         } else {
3008                 if (object->internal == TRUE) {
3009                         /*
3010                          * don't allow anonymous pages on
3011                          * the speculative queues
3012                          */
3013                         no_cache = FALSE;
3014                 }
3015                 if (kr != KERN_SUCCESS) {
3016                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3017                         vm_page_deactivate(m);
3018                         /* we keep the page queues lock, if we need it later */
3019                 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3020                             (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3021                             (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3022                             ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3023                            !VM_PAGE_WIRED(m)) {
3024
3025                         if (vm_page_local_q &&
3026                             (*type_of_fault == DBG_COW_FAULT ||
3027                              *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
3028                                 struct vpl      *lq;
3029                                 uint32_t        lid;
3030
3031                                 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3032
3033                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3034                                 vm_object_lock_assert_exclusive(object);
3035
3036                                 /*
3037                                  * we got a local queue to stuff this
3038                                  * new page on...
3039                                  * its safe to manipulate local and
3040                                  * local_id at this point since we're
3041                                  * behind an exclusive object lock and
3042                                  * the page is not on any global queue.
3043                                  *
3044                                  * we'll use the current cpu number to
3045                                  * select the queue note that we don't
3046                                  * need to disable preemption... we're
3047                                  * going to be behind the local queue's
3048                                  * lock to do the real work
3049                                  */
3050                                 lid = cpu_number();
3051
3052                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
3053
3054                                 VPL_LOCK(&lq->vpl_lock);
3055
3056                                 vm_page_check_pageable_safe(m);
3057                                 vm_page_queue_enter(&lq->vpl_queue, m,
3058                                                     vm_page_t, vmp_pageq);
3059                                 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3060                                 m->vmp_local_id = lid;
3061                                 lq->vpl_count++;
3062
3063                                 if (object->internal)
3064                                         lq->vpl_internal_count++;
3065                                 else
3066                                         lq->vpl_external_count++;
3067
3068                                 VPL_UNLOCK(&lq->vpl_lock);
3069
3070                                 if (lq->vpl_count > vm_page_local_q_soft_limit)
3071                                 {
3072                                         /*
3073                                          * we're beyond the soft limit
3074                                          * for the local queue
3075                                          * vm_page_reactivate_local will
3076                                          * 'try' to take the global page
3077                                          * queue lock... if it can't
3078                                          * that's ok... we'll let the
3079                                          * queue continue to grow up
3080                                          * to the hard limit... at that
3081                                          * point we'll wait for the
3082                                          * lock... once we've got the
3083                                          * lock, we'll transfer all of
3084                                          * the pages from the local
3085                                          * queue to the global active
3086                                          * queue
3087                                          */
3088                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3089                                 }
3090                         } else {
3091
3092                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3093
3094                                 /*
3095                                  * test again now that we hold the
3096                                  * page queue lock
3097                                  */
3098                                 if (!VM_PAGE_WIRED(m)) {
3099                                         if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3100                                                 vm_page_queues_remove(m, FALSE);
3101
3102                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3103                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3104                                         }
3105
3106                                         if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3107                                              no_cache) {
3108                                                 /*
3109                                                  * If this is a no_cache mapping
3110                                                  * and the page has never been
3111                                                  * mapped before or was
3112                                                  * previously a no_cache page,
3113                                                  * then we want to leave pages
3114                                                  * in the speculative state so
3115                                                  * that they can be readily
3116                                                  * recycled if free memory runs
3117                                                  * low.  Otherwise the page is
3118                                                  * activated as normal.
3119                                                  */
3120
3121                                                 if (no_cache &&
3122                                                     (!previously_pmapped ||
3123                                                      m->vmp_no_cache)) {
3124                                                         m->vmp_no_cache = TRUE;
3125
3126                                                         if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)
3127                                                                 vm_page_speculate(m, FALSE);
3128
3129                                                 } else if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3130                                                         vm_page_activate(m);
3131                                                 }
3132                                         }
3133                                 }
3134                                 /* we keep the page queues lock, if we need it later */
3135                         }
3136                 }
3137         }
3138         /* we're done with the page queues lock, if we ever took it */
3139         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3140
3141
3142         /* If we have a KERN_SUCCESS from the previous checks, we either have
3143          * a good page, or a tainted page that has been accepted by the process.
3144          * In both cases the page will be entered into the pmap.
3145          * If the page is writeable, we need to disconnect it from other pmaps
3146          * now so those processes can take note.
3147          */
3148         if (kr == KERN_SUCCESS) {
3149                 /*
3150                  * NOTE: we may only hold the vm_object lock SHARED
3151                  * at this point, so we need the phys_page lock to
3152                  * properly serialize updating the pmapped and
3153                  * xpmapped bits
3154                  */
3155                 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3156                         ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3157
3158                         pmap_lock_phys_page(phys_page);
3159                         /*
3160                          * go ahead and take the opportunity
3161                          * to set 'pmapped' here so that we don't
3162                          * need to grab this lock a 2nd time
3163                          * just below
3164                          */
3165                         m->vmp_pmapped = TRUE;
3166
3167                         if (!m->vmp_xpmapped) {
3168
3169                                 m->vmp_xpmapped = TRUE;
3170
3171                                 pmap_unlock_phys_page(phys_page);
3172
3173                                 if (!object->internal)
3174                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3175
3176 #if defined(__arm__) || defined(__arm64__)
3177                                 pmap_sync_page_data_phys(phys_page);
3178 #else
3179                                 if (object->internal &&
3180                                     object->pager != NULL) {
3181                                         /*
3182                                          * This page could have been
3183                                          * uncompressed by the
3184                                          * compressor pager and its
3185                                          * contents might be only in
3186                                          * the data cache.
3187                                          * Since it's being mapped for
3188                                          * "execute" for the fist time,
3189                                          * make sure the icache is in
3190                                          * sync.
3191                                          */
3192                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3193                                         pmap_sync_page_data_phys(phys_page);
3194                                 }
3195 #endif
3196                         } else
3197                                 pmap_unlock_phys_page(phys_page);
3198                 } else {
3199                         if (m->vmp_pmapped == FALSE) {
3200                                 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3201
3202                                 pmap_lock_phys_page(phys_page);
3203                                 m->vmp_pmapped = TRUE;
3204                                 pmap_unlock_phys_page(phys_page);
3205                         }
3206                 }
3207
3208                 if (fault_type & VM_PROT_WRITE) {
3209
3210                         if (m->vmp_wpmapped == FALSE) {
3211                                 vm_object_lock_assert_exclusive(object);
3212                                 if (!object->internal && object->pager) {
3213                                         task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3214                                 }
3215                                 m->vmp_wpmapped = TRUE;
3216                         }
3217                         if (must_disconnect) {
3218                                 /*
3219                                  * We can only get here
3220                                  * because of the CSE logic
3221                                  */
3222                                 assert(cs_enforcement_enabled);
3223                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3224                                 /*
3225                                  * If we are faulting for a write, we can clear
3226                                  * the execute bit - that will ensure the page is
3227                                  * checked again before being executable, which
3228                                  * protects against a map switch.
3229                                  * This only happens the first time the page
3230                                  * gets tainted, so we won't get stuck here
3231                                  * to make an already writeable page executable.
3232                                  */
3233                                 if (!cs_bypass){
3234                                         assert(!pmap_has_prot_policy(prot));
3235                                         prot &= ~VM_PROT_EXECUTE;
3236                                 }
3237                         }
3238                 }
3239                 assert(VM_PAGE_OBJECT(m) == object);
3240
3241 #if VM_OBJECT_ACCESS_TRACKING
3242                 if (object->access_tracking) {
3243                         DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3244                         if (fault_type & VM_PROT_WRITE) {
3245                                 object->access_tracking_writes++;
3246                                 vm_object_access_tracking_writes++;
3247                         } else {
3248                                 object->access_tracking_reads++;
3249                                 vm_object_access_tracking_reads++;
3250                         }
3251                 }
3252 #endif /* VM_OBJECT_ACCESS_TRACKING */
3253
3254 #if PMAP_CS
3255                 /*
3256                  * If CS enforcement is on, we don't ask for an executable page if the
3257                  * fault does not call for execution, because that can fail in
3258                  * situations where the caller only actually wanted read access.
3259                  * However, it may be better to instead retry without execute on
3260                  * failure, or pass a flag into pmap_enter to do the right thing.
3261                  */
3262                 // TODO: <rdar://problem/30997388> maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults
3263                 if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) {
3264                         prot &= ~VM_PROT_EXECUTE;
3265                 }
3266 #endif
3267
3268                 /* Prevent a deadlock by not
3269                  * holding the object lock if we need to wait for a page in
3270                  * pmap_enter() - <rdar://problem/7138958> */
3271                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3272                                    wired,
3273                                    pmap_options | PMAP_OPTIONS_NOWAIT,
3274                                    pe_result);
3275 #if __x86_64__
3276                 if (pe_result == KERN_INVALID_ARGUMENT &&
3277                     pmap == PMAP_NULL &&
3278                     wired) {
3279                         /*
3280                          * Wiring a page in a pmap-less VM map:
3281                          * VMware's "vmmon" kernel extension does this
3282                          * to grab pages.
3283                          * Let it proceed even though the PMAP_ENTER() failed.
3284                          */
3285                         pe_result = KERN_SUCCESS;
3286                 }
3287 #endif /* __x86_64__ */
3288
3289                 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3290
3291                         if (need_retry) {
3292                                 /*
3293                                  * this will be non-null in the case where we hold the lock
3294                                  * on the top-object in this chain... we can't just drop
3295                                  * the lock on the object we're inserting the page into
3296                                  * and recall the PMAP_ENTER since we can still cause
3297                                  * a deadlock if one of the critical paths tries to
3298                                  * acquire the lock on the top-object and we're blocked
3299                                  * in PMAP_ENTER waiting for memory... our only recourse
3300                                  * is to deal with it at a higher level where we can
3301                                  * drop both locks.
3302                                  */
3303                                 *need_retry = TRUE;
3304                                 vm_pmap_enter_retried++;
3305                                 goto after_the_pmap_enter;
3306                         }
3307                         /* The nonblocking version of pmap_enter did not succeed.
3308                          * and we don't need to drop other locks and retry
3309                          * at the level above us, so
3310                          * use the blocking version instead. Requires marking
3311                          * the page busy and unlocking the object */
3312                         boolean_t was_busy = m->vmp_busy;
3313
3314                         vm_object_lock_assert_exclusive(object);
3315
3316                         m->vmp_busy = TRUE;
3317                         vm_object_unlock(object);
3318
3319                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3320                                            0, wired,
3321                                            pmap_options, pe_result);
3322
3323                         assert(VM_PAGE_OBJECT(m) == object);
3324
3325                         /* Take the object lock again. */
3326                         vm_object_lock(object);
3327
3328                         /* If the page was busy, someone else will wake it up.
3329                          * Otherwise, we have to do it now. */
3330                         assert(m->vmp_busy);
3331                         if(!was_busy) {
3332                                 PAGE_WAKEUP_DONE(m);
3333                         }
3334                         vm_pmap_enter_blocked++;
3335                 }
3336
3337                 kr = pe_result;
3338         }
3339
3340 after_the_pmap_enter:
3341         return kr;
3342 }
3343
3344 void
3345 vm_pre_fault(vm_map_offset_t vaddr)
3346 {
3347         if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3348
3349                 vm_fault(current_map(),      /* map */
3350                         vaddr,               /* vaddr */
3351                         VM_PROT_READ,        /* fault_type */
3352                         FALSE,               /* change_wiring */
3353                         VM_KERN_MEMORY_NONE, /* tag - not wiring */
3354                         THREAD_UNINT,        /* interruptible */
3355                         NULL,                /* caller_pmap */
3356                         0                    /* caller_pmap_addr */);
3357         }
3358 }
3359
3360
3361 /*
3362  *      Routine:        vm_fault
3363  *      Purpose:
3364  *              Handle page faults, including pseudo-faults
3365  *              used to change the wiring status of pages.
3366  *      Returns:
3367  *              Explicit continuations have been removed.
3368  *      Implementation:
3369  *              vm_fault and vm_fault_page save mucho state
3370  *              in the moral equivalent of a closure.  The state
3371  *              structure is allocated when first entering vm_fault
3372  *              and deallocated when leaving vm_fault.
3373  */
3374
3375 extern int _map_enter_debug;
3376 extern uint64_t get_current_unique_pid(void);
3377
3378 unsigned long vm_fault_collapse_total = 0;
3379 unsigned long vm_fault_collapse_skipped = 0;
3380
3381
3382 kern_return_t
3383 vm_fault_external(
3384         vm_map_t        map,
3385         vm_map_offset_t vaddr,
3386         vm_prot_t       fault_type,
3387         boolean_t       change_wiring,
3388         int             interruptible,
3389         pmap_t          caller_pmap,
3390         vm_map_offset_t caller_pmap_addr)
3391 {
3392         return vm_fault_internal(map, vaddr, fault_type, change_wiring, vm_tag_bt(),
3393                                  interruptible, caller_pmap, caller_pmap_addr,
3394                                  NULL);
3395 }
3396
3397 kern_return_t
3398 vm_fault(
3399         vm_map_t        map,
3400         vm_map_offset_t vaddr,
3401         vm_prot_t       fault_type,
3402         boolean_t       change_wiring,
3403         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3404         int             interruptible,
3405         pmap_t          caller_pmap,
3406         vm_map_offset_t caller_pmap_addr)
3407 {
3408         return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3409                                  interruptible, caller_pmap, caller_pmap_addr,
3410                                  NULL);
3411 }
3412
3413 kern_return_t
3414 vm_fault_internal(
3415         vm_map_t        map,
3416         vm_map_offset_t vaddr,
3417         vm_prot_t       caller_prot,
3418         boolean_t       change_wiring,
3419         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3420         int             interruptible,
3421         pmap_t          caller_pmap,
3422         vm_map_offset_t caller_pmap_addr,
3423         ppnum_t         *physpage_p)
3424 {
3425         vm_map_version_t        version;        /* Map version for verificiation */
3426         boolean_t               wired;          /* Should mapping be wired down? */
3427         vm_object_t             object;         /* Top-level object */
3428         vm_object_offset_t      offset;         /* Top-level offset */
3429         vm_prot_t               prot;           /* Protection for mapping */
3430         vm_object_t             old_copy_object; /* Saved copy object */
3431         vm_page_t               result_page;    /* Result of vm_fault_page */
3432         vm_page_t               top_page;       /* Placeholder page */
3433         kern_return_t           kr;
3434
3435         vm_page_t               m;      /* Fast access to result_page */
3436         kern_return_t           error_code;
3437         vm_object_t             cur_object;
3438         vm_object_t             m_object = NULL;
3439         vm_object_offset_t      cur_offset;
3440         vm_page_t               cur_m;
3441         vm_object_t             new_object;
3442         int                     type_of_fault;
3443         pmap_t                  pmap;
3444         wait_interrupt_t        interruptible_state;
3445         vm_map_t                real_map = map;
3446         vm_map_t                original_map = map;
3447         boolean_t               object_locks_dropped = FALSE;
3448         vm_prot_t               fault_type;
3449         vm_prot_t               original_fault_type;
3450         struct vm_object_fault_info fault_info = {};
3451         boolean_t               need_collapse = FALSE;
3452         boolean_t               need_retry = FALSE;
3453         boolean_t               *need_retry_ptr = NULL;
3454         int                     object_lock_type = 0;
3455         int                     cur_object_lock_type;
3456         vm_object_t             top_object = VM_OBJECT_NULL;
3457         vm_object_t             written_on_object = VM_OBJECT_NULL;
3458         memory_object_t         written_on_pager = NULL;
3459         vm_object_offset_t      written_on_offset = 0;
3460         int                     throttle_delay;
3461         int                     compressed_count_delta;
3462         int                     grab_options;
3463         vm_map_offset_t         trace_vaddr;
3464         vm_map_offset_t         trace_real_vaddr;
3465 #if DEVELOPMENT || DEBUG
3466         vm_map_offset_t         real_vaddr;
3467
3468         real_vaddr = vaddr;
3469 #endif /* DEVELOPMENT || DEBUG */
3470         trace_real_vaddr = vaddr;
3471         vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3472
3473         if (map == kernel_map) {
3474                 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3475                 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3476         } else {
3477                 trace_vaddr = vaddr;
3478         }
3479
3480         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3481                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3482                               ((uint64_t)trace_vaddr >> 32),
3483                               trace_vaddr,
3484                               (map == kernel_map),
3485                               0,
3486                               0);
3487
3488         if (get_preemption_level() != 0) {
3489                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3490                                       (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3491                                       ((uint64_t)trace_vaddr >> 32),
3492                                       trace_vaddr,
3493                                       KERN_FAILURE,
3494                                       0,
3495                                       0);
3496
3497                 return (KERN_FAILURE);
3498         }
3499
3500         thread_t cthread = current_thread();
3501         boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
3502         uint64_t fstart = 0;
3503
3504         if (rtfault) {
3505                 fstart = mach_continuous_time();
3506         }
3507
3508         interruptible_state = thread_interrupt_level(interruptible);
3509
3510         fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3511
3512         VM_STAT_INCR(faults);
3513         current_task()->faults++;
3514         original_fault_type = fault_type;
3515
3516         if (fault_type & VM_PROT_WRITE)
3517                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3518         else
3519                 object_lock_type = OBJECT_LOCK_SHARED;
3520
3521         cur_object_lock_type = OBJECT_LOCK_SHARED;
3522
3523         if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
3524                 if (compressor_map) {
3525                         if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
3526                                 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
3527
3528                         }
3529                 }
3530         }
3531 RetryFault:
3532         assert(written_on_object == VM_OBJECT_NULL);
3533
3534         /*
3535          * assume we will hit a page in the cache
3536          * otherwise, explicitly override with
3537          * the real fault type once we determine it
3538          */
3539         type_of_fault = DBG_CACHE_HIT_FAULT;
3540
3541         /*
3542          *      Find the backing store object and offset into
3543          *      it to begin the search.
3544          */
3545         fault_type = original_fault_type;
3546         map = original_map;
3547         vm_map_lock_read(map);
3548
3549         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3550                                   object_lock_type, &version,
3551                                   &object, &offset, &prot, &wired,
3552                                   &fault_info,
3553                                   &real_map);
3554
3555         if (kr != KERN_SUCCESS) {
3556                 vm_map_unlock_read(map);
3557                 goto done;
3558         }
3559         pmap = real_map->pmap;
3560         fault_info.interruptible = interruptible;
3561         fault_info.stealth = FALSE;
3562         fault_info.io_sync = FALSE;
3563         fault_info.mark_zf_absent = FALSE;
3564         fault_info.batch_pmap_op = FALSE;
3565
3566         /*
3567          * If the page is wired, we must fault for the current protection
3568          * value, to avoid further faults.
3569          */
3570         if (wired) {
3571                 fault_type = prot | VM_PROT_WRITE;
3572                 /*
3573                  * since we're treating this fault as a 'write'
3574                  * we must hold the top object lock exclusively
3575                  */
3576                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3577
3578                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3579
3580                         if (vm_object_lock_upgrade(object) == FALSE) {
3581                                 /*
3582                                  * couldn't upgrade, so explictly
3583                                  * take the lock exclusively
3584                                  */
3585                                 vm_object_lock(object);
3586                         }
3587                 }
3588         }
3589
3590 #if     VM_FAULT_CLASSIFY
3591         /*
3592          *      Temporary data gathering code
3593          */
3594         vm_fault_classify(object, offset, fault_type);
3595 #endif
3596         /*
3597          *      Fast fault code.  The basic idea is to do as much as
3598          *      possible while holding the map lock and object locks.
3599          *      Busy pages are not used until the object lock has to
3600          *      be dropped to do something (copy, zero fill, pmap enter).
3601          *      Similarly, paging references aren't acquired until that
3602          *      point, and object references aren't used.
3603          *
3604          *      If we can figure out what to do
3605          *      (zero fill, copy on write, pmap enter) while holding
3606          *      the locks, then it gets done.  Otherwise, we give up,
3607          *      and use the original fault path (which doesn't hold
3608          *      the map lock, and relies on busy pages).
3609          *      The give up cases include:
3610          *              - Have to talk to pager.
3611          *              - Page is busy, absent or in error.
3612          *              - Pager has locked out desired access.
3613          *              - Fault needs to be restarted.
3614          *              - Have to push page into copy object.
3615          *
3616          *      The code is an infinite loop that moves one level down
3617          *      the shadow chain each time.  cur_object and cur_offset
3618          *      refer to the current object being examined. object and offset
3619          *      are the original object from the map.  The loop is at the
3620          *      top level if and only if object and cur_object are the same.
3621          *
3622          *      Invariants:  Map lock is held throughout.  Lock is held on
3623          *              original object and cur_object (if different) when
3624          *              continuing or exiting loop.
3625          *
3626          */
3627
3628 #if defined(__arm64__)
3629         /*
3630          * Fail if reading an execute-only page in a
3631          * pmap that enforces execute-only protection.
3632          */
3633         if (fault_type == VM_PROT_READ &&
3634                 (prot & VM_PROT_EXECUTE) &&
3635                 !(prot & VM_PROT_READ) &&
3636                 pmap_enforces_execute_only(pmap)) {
3637                         vm_object_unlock(object);
3638                         vm_map_unlock_read(map);
3639                         if (real_map != map) {
3640                                 vm_map_unlock(real_map);
3641                         }
3642                         kr = KERN_PROTECTION_FAILURE;
3643                         goto done;
3644         }
3645 #endif
3646
3647         /*
3648          * If this page is to be inserted in a copy delay object
3649          * for writing, and if the object has a copy, then the
3650          * copy delay strategy is implemented in the slow fault page.
3651          */
3652         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3653             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3654                 goto handle_copy_delay;
3655
3656         cur_object = object;
3657         cur_offset = offset;
3658
3659         grab_options = 0;
3660 #if CONFIG_SECLUDED_MEMORY
3661         if (object->can_grab_secluded) {
3662                 grab_options |= VM_PAGE_GRAB_SECLUDED;
3663         }
3664 #endif /* CONFIG_SECLUDED_MEMORY */
3665
3666         while (TRUE) {
3667                 if (!cur_object->pager_created &&
3668                     cur_object->phys_contiguous) /* superpage */
3669                         break;
3670
3671                 if (cur_object->blocked_access) {
3672                         /*
3673                          * Access to this VM object has been blocked.
3674                          * Let the slow path handle it.
3675                          */
3676                         break;
3677                 }
3678
3679                 m = vm_page_lookup(cur_object, cur_offset);
3680                 m_object = NULL;
3681
3682                 if (m != VM_PAGE_NULL) {
3683                         m_object = cur_object;
3684
3685                         if (m->vmp_busy) {
3686                                 wait_result_t   result;
3687
3688                                 /*
3689                                  * in order to do the PAGE_ASSERT_WAIT, we must
3690                                  * have object that 'm' belongs to locked exclusively
3691                                  */
3692                                 if (object != cur_object) {
3693
3694                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3695
3696                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3697
3698                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3699                                                         /*
3700                                                          * couldn't upgrade so go do a full retry
3701                                                          * immediately since we can no longer be
3702                                                          * certain about cur_object (since we
3703                                                          * don't hold a reference on it)...
3704                                                          * first drop the top object lock
3705                                                          */
3706                                                         vm_object_unlock(object);
3707
3708                                                         vm_map_unlock_read(map);
3709                                                         if (real_map != map)
3710                                                                 vm_map_unlock(real_map);
3711
3712                                                         goto RetryFault;
3713                                                 }
3714                                         }
3715                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3716
3717                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3718
3719                                         if (vm_object_lock_upgrade(object) == FALSE) {
3720                                                 /*
3721                                                  * couldn't upgrade, so explictly take the lock
3722                                                  * exclusively and go relookup the page since we
3723                                                  * will have dropped the object lock and
3724                                                  * a different thread could have inserted
3725                                                  * a page at this offset
3726                                                  * no need for a full retry since we're
3727                                                  * at the top level of the object chain
3728                                                  */
3729                                                 vm_object_lock(object);
3730
3731                                                 continue;
3732                                         }
3733                                 }
3734                                 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
3735                                         /*
3736                                          * m->vmp_busy == TRUE and the object is locked exclusively
3737                                          * if m->pageout_queue == TRUE after we acquire the
3738                                          * queues lock, we are guaranteed that it is stable on
3739                                          * the pageout queue and therefore reclaimable
3740                                          *
3741                                          * NOTE: this is only true for the internal pageout queue
3742                                          * in the compressor world
3743                                          */
3744                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3745
3746                                         vm_page_lock_queues();
3747
3748                                         if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
3749                                                 vm_pageout_throttle_up(m);
3750                                                 vm_page_unlock_queues();
3751
3752                                                 PAGE_WAKEUP_DONE(m);
3753                                                 goto reclaimed_from_pageout;
3754                                         }
3755                                         vm_page_unlock_queues();
3756                                 }
3757                                 if (object != cur_object)
3758                                         vm_object_unlock(object);
3759
3760                                 vm_map_unlock_read(map);
3761                                 if (real_map != map)
3762                                         vm_map_unlock(real_map);
3763
3764                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3765
3766                                 vm_object_unlock(cur_object);
3767
3768                                 if (result == THREAD_WAITING) {
3769                                         result = thread_block(THREAD_CONTINUE_NULL);
3770
3771                                         counter(c_vm_fault_page_block_busy_kernel++);
3772                                 }
3773                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3774                                         goto RetryFault;
3775
3776                                 kr = KERN_ABORTED;
3777                                 goto done;
3778                         }
3779 reclaimed_from_pageout:
3780                         if (m->vmp_laundry) {
3781                                 if (object != cur_object) {
3782                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3783                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3784
3785                                                 vm_object_unlock(object);
3786                                                 vm_object_unlock(cur_object);
3787
3788                                                 vm_map_unlock_read(map);
3789                                                 if (real_map != map)
3790                                                         vm_map_unlock(real_map);
3791
3792                                                 goto RetryFault;
3793                                         }
3794
3795                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3796
3797                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3798
3799                                         if (vm_object_lock_upgrade(object) == FALSE) {
3800                                                 /*
3801                                                  * couldn't upgrade, so explictly take the lock
3802                                                  * exclusively and go relookup the page since we
3803                                                  * will have dropped the object lock and
3804                                                  * a different thread could have inserted
3805                                                  * a page at this offset
3806                                                  * no need for a full retry since we're
3807                                                  * at the top level of the object chain
3808                                                  */
3809                                                 vm_object_lock(object);
3810
3811                                                 continue;
3812                                         }
3813                                 }
3814                                 vm_pageout_steal_laundry(m, FALSE);
3815                         }
3816
3817                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3818                                 /*
3819                                  * Guard page: let the slow path deal with it
3820                                  */
3821                                 break;
3822                         }
3823                         if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
3824                                 /*
3825                                  * Unusual case... let the slow path deal with it
3826                                  */
3827                                 break;
3828                         }
3829                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
3830                                 if (object != cur_object)
3831                                         vm_object_unlock(object);
3832                                 vm_map_unlock_read(map);
3833                                 if (real_map != map)
3834                                         vm_map_unlock(real_map);
3835                                 vm_object_unlock(cur_object);
3836                                 kr = KERN_MEMORY_ERROR;
3837                                 goto done;
3838                         }
3839                         assert(m_object == VM_PAGE_OBJECT(m));
3840
3841                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
3842                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3843 upgrade_for_validation:
3844                                 /*
3845                                  * We might need to validate this page
3846                                  * against its code signature, so we
3847                                  * want to hold the VM object exclusively.
3848                                  */
3849                                 if (object != cur_object) {
3850                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3851                                                 vm_object_unlock(object);
3852                                                 vm_object_unlock(cur_object);
3853
3854                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3855
3856                                                 vm_map_unlock_read(map);
3857                                                 if (real_map != map)
3858                                                         vm_map_unlock(real_map);
3859
3860                                                 goto RetryFault;
3861                                         }
3862
3863                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3864
3865                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3866
3867                                         if (vm_object_lock_upgrade(object) == FALSE) {
3868                                                 /*
3869                                                  * couldn't upgrade, so explictly take the lock
3870                                                  * exclusively and go relookup the page since we
3871                                                  * will have dropped the object lock and
3872                                                  * a different thread could have inserted
3873                                                  * a page at this offset
3874                                                  * no need for a full retry since we're
3875                                                  * at the top level of the object chain
3876                                                  */
3877                                                 vm_object_lock(object);
3878
3879                                                 continue;
3880                                         }
3881                                 }
3882                         }
3883                         /*
3884                          *      Two cases of map in faults:
3885                          *          - At top level w/o copy object.
3886                          *          - Read fault anywhere.
3887                          *              --> must disallow write.
3888                          */
3889
3890                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3891
3892                                 goto FastPmapEnter;
3893                         }
3894
3895                         if ((fault_type & VM_PROT_WRITE) == 0) {
3896                                 if (!pmap_has_prot_policy(prot)) {
3897                                         prot &= ~VM_PROT_WRITE;
3898                                 } else {
3899                                         /*
3900                                          * For a protection that the pmap cares
3901                                          * about, we must hand over the full
3902                                          * set of protections (so that the pmap
3903                                          * layer can apply any desired policy).
3904                                          * This means that cs_bypass must be
3905                                          * set, as this can force us to pass
3906                                          * RWX.
3907                                          */
3908                                         assert(fault_info.cs_bypass);
3909                                 }
3910
3911                                 if (object != cur_object) {
3912                                         /*
3913                                          * We still need to hold the top object
3914                                          * lock here to prevent a race between
3915                                          * a read fault (taking only "shared"
3916                                          * locks) and a write fault (taking
3917                                          * an "exclusive" lock on the top
3918                                          * object.
3919                                          * Otherwise, as soon as we release the
3920                                          * top lock, the write fault could
3921                                          * proceed and actually complete before
3922                                          * the read fault, and the copied page's
3923                                          * translation could then be overwritten
3924                                          * by the read fault's translation for
3925                                          * the original page.
3926                                          *
3927                                          * Let's just record what the top object
3928                                          * is and we'll release it later.
3929                                          */
3930                                         top_object = object;
3931
3932                                         /*
3933                                          * switch to the object that has the new page
3934                                          */
3935                                         object = cur_object;
3936                                         object_lock_type = cur_object_lock_type;
3937                                 }
3938 FastPmapEnter:
3939                                 assert(m_object == VM_PAGE_OBJECT(m));
3940
3941                                 /*
3942                                  * prepare for the pmap_enter...
3943                                  * object and map are both locked
3944                                  * m contains valid data
3945                                  * object == m->vmp_object
3946                                  * cur_object == NULL or it's been unlocked
3947                                  * no paging references on either object or cur_object
3948                                  */
3949                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3950                                         need_retry_ptr = &need_retry;
3951                                 else
3952                                         need_retry_ptr = NULL;
3953
3954                                 if (caller_pmap) {
3955                                         kr = vm_fault_enter(m,
3956                                                             caller_pmap,
3957                                                             caller_pmap_addr,
3958                                                             prot,
3959                                                             caller_prot,
3960                                                             wired,
3961                                                             change_wiring,
3962                                                             wire_tag,
3963                                                             &fault_info,
3964                                                             need_retry_ptr,
3965                                                             &type_of_fault);
3966                                 } else {
3967                                         kr = vm_fault_enter(m,
3968                                                             pmap,
3969                                                             vaddr,
3970                                                             prot,
3971                                                             caller_prot,
3972                                                             wired,
3973                                                             change_wiring,
3974                                                             wire_tag,
3975                                                             &fault_info,
3976                                                             need_retry_ptr,
3977                                                             &type_of_fault);
3978                                 }
3979 #if DEVELOPMENT || DEBUG
3980                                 {
3981                                 int     event_code = 0;
3982
3983                                 if (m_object->internal)
3984                                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3985                                 else if (m_object->object_is_shared_cache)
3986                                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3987                                 else
3988                                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3989
3990                                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
3991
3992                                 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
3993                                 }
3994 #endif
3995                                 if (kr == KERN_SUCCESS &&
3996                                     physpage_p != NULL) {
3997                                         /* for vm_map_wire_and_extract() */
3998                                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3999                                         if (prot & VM_PROT_WRITE) {
4000                                                 vm_object_lock_assert_exclusive(m_object);
4001                                                 m->vmp_dirty = TRUE;
4002                                         }
4003                                 }
4004
4005                                 if (top_object != VM_OBJECT_NULL) {
4006                                         /*
4007                                          * It's safe to drop the top object
4008                                          * now that we've done our
4009                                          * vm_fault_enter().  Any other fault
4010                                          * in progress for that virtual
4011                                          * address will either find our page
4012                                          * and translation or put in a new page
4013                                          * and translation.
4014                                          */
4015                                         vm_object_unlock(top_object);
4016                                         top_object = VM_OBJECT_NULL;
4017                                 }
4018
4019                                 if (need_collapse == TRUE)
4020                                         vm_object_collapse(object, offset, TRUE);
4021
4022                                 if (need_retry == FALSE &&
4023                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4024                                         /*
4025                                          * evaluate access pattern and update state
4026                                          * vm_fault_deactivate_behind depends on the
4027                                          * state being up to date
4028                                          */
4029                                         vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior);
4030
4031                                         vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior);
4032                                 }
4033                                 /*
4034                                  * That's it, clean up and return.
4035                                  */
4036                                 if (m->vmp_busy)
4037                                         PAGE_WAKEUP_DONE(m);
4038
4039                                 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4040
4041                                         vm_object_paging_begin(m_object);
4042
4043                                         assert(written_on_object == VM_OBJECT_NULL);
4044                                         written_on_object = m_object;
4045                                         written_on_pager = m_object->pager;
4046                                         written_on_offset = m_object->paging_offset + m->vmp_offset;
4047                                 }
4048                                 vm_object_unlock(object);
4049
4050                                 vm_map_unlock_read(map);
4051                                 if (real_map != map)
4052                                         vm_map_unlock(real_map);
4053
4054                                 if (need_retry == TRUE) {
4055                                         /*
4056                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
4057                                          * at this point we don't hold any locks so it's safe
4058                                          * to ask the pmap layer to expand the page table to
4059                                          * accommodate this mapping... once expanded, we'll
4060                                          * re-drive the fault which should result in vm_fault_enter
4061                                          * being able to successfully enter the mapping this time around
4062                                          */
4063                                         (void)pmap_enter_options(
4064                                                 pmap, vaddr, 0, 0, 0, 0, 0,
4065                                                 PMAP_OPTIONS_NOENTER, NULL);
4066
4067                                         need_retry = FALSE;
4068                                         goto RetryFault;
4069                                 }
4070                                 goto done;
4071                         }
4072                         /*
4073                          * COPY ON WRITE FAULT
4074                          */
4075                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4076
4077                         /*
4078                          * If objects match, then
4079                          * object->copy must not be NULL (else control
4080                          * would be in previous code block), and we
4081                          * have a potential push into the copy object
4082                          * with which we can't cope with here.
4083                          */
4084                         if (cur_object == object) {
4085                                 /*
4086                                  * must take the slow path to
4087                                  * deal with the copy push
4088                                  */
4089                                 break;
4090                         }
4091
4092                         /*
4093                          * This is now a shadow based copy on write
4094                          * fault -- it requires a copy up the shadow
4095                          * chain.
4096                          */
4097                         assert(m_object == VM_PAGE_OBJECT(m));
4098
4099                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4100                             VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
4101                                 goto upgrade_for_validation;
4102                         }
4103
4104                         /*
4105                          * Allocate a page in the original top level
4106                          * object. Give up if allocate fails.  Also
4107                          * need to remember current page, as it's the
4108                          * source of the copy.
4109                          *
4110                          * at this point we hold locks on both
4111                          * object and cur_object... no need to take
4112                          * paging refs or mark pages BUSY since
4113                          * we don't drop either object lock until
4114                          * the page has been copied and inserted
4115                          */
4116                         cur_m = m;
4117                         m = vm_page_grab_options(grab_options);
4118                         m_object = NULL;
4119
4120                         if (m == VM_PAGE_NULL) {
4121                                 /*
4122                                  * no free page currently available...
4123                                  * must take the slow path
4124                                  */
4125                                 break;
4126                         }
4127                         /*
4128                          * Now do the copy.  Mark the source page busy...
4129                          *
4130                          *      NOTE: This code holds the map lock across
4131                          *      the page copy.
4132                          */
4133                         vm_page_copy(cur_m, m);
4134                         vm_page_insert(m, object, offset);
4135                         m_object = object;
4136                         SET_PAGE_DIRTY(m, FALSE);
4137
4138                         /*
4139                          * Now cope with the source page and object
4140                          */
4141                         if (object->ref_count > 1 && cur_m->vmp_pmapped)
4142                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4143
4144                         if (cur_m->vmp_clustered) {
4145                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4146                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4147                                 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4148                         }
4149                         need_collapse = TRUE;
4150
4151                         if (!cur_object->internal &&
4152                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4153                                 /*
4154                                  * The object from which we've just
4155                                  * copied a page is most probably backed
4156                                  * by a vnode.  We don't want to waste too
4157                                  * much time trying to collapse the VM objects
4158                                  * and create a bottleneck when several tasks
4159                                  * map the same file.
4160                                  */
4161                                 if (cur_object->copy == object) {
4162                                         /*
4163                                          * Shared mapping or no COW yet.
4164                                          * We can never collapse a copy
4165                                          * object into its backing object.
4166                                          */
4167                                         need_collapse = FALSE;
4168                                 } else if (cur_object->copy == object->shadow &&
4169                                            object->shadow->resident_page_count == 0) {
4170                                         /*
4171                                          * Shared mapping after a COW occurred.
4172                                          */
4173                                         need_collapse = FALSE;
4174                                 }
4175                         }
4176                         vm_object_unlock(cur_object);
4177
4178                         if (need_collapse == FALSE)
4179                                 vm_fault_collapse_skipped++;
4180                         vm_fault_collapse_total++;
4181
4182                         type_of_fault = DBG_COW_FAULT;
4183                         VM_STAT_INCR(cow_faults);
4184                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4185                         current_task()->cow_faults++;
4186
4187                         goto FastPmapEnter;
4188
4189                 } else {
4190                         /*
4191                          * No page at cur_object, cur_offset... m == NULL
4192                          */
4193                         if (cur_object->pager_created) {
4194                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4195
4196                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4197                                         int             my_fault_type;
4198                                         int             c_flags = C_DONT_BLOCK;
4199                                         boolean_t       insert_cur_object = FALSE;
4200
4201                                         /*
4202                                          * May have to talk to a pager...
4203                                          * if so, take the slow path by
4204                                          * doing a 'break' from the while (TRUE) loop
4205                                          *
4206                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4207                                          * if the compressor is active and the page exists there
4208                                          */
4209                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4210                                                 break;
4211
4212                                         if (map == kernel_map || real_map == kernel_map) {
4213                                                 /*
4214                                                  * can't call into the compressor with the kernel_map
4215                                                  * lock held, since the compressor may try to operate
4216                                                  * on the kernel map in order to return an empty c_segment
4217                                                  */
4218                                                 break;
4219                                         }
4220                                         if (object != cur_object) {
4221                                                 if (fault_type & VM_PROT_WRITE)
4222                                                         c_flags |= C_KEEP;
4223                                                 else
4224                                                         insert_cur_object = TRUE;
4225                                         }
4226                                         if (insert_cur_object == TRUE) {
4227
4228                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4229
4230                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4231
4232                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4233                                                                 /*
4234                                                                  * couldn't upgrade so go do a full retry
4235                                                                  * immediately since we can no longer be
4236                                                                  * certain about cur_object (since we
4237                                                                  * don't hold a reference on it)...
4238                                                                  * first drop the top object lock
4239                                                                  */
4240                                                                 vm_object_unlock(object);
4241
4242                                                                 vm_map_unlock_read(map);
4243                                                                 if (real_map != map)
4244                                                                         vm_map_unlock(real_map);
4245
4246                                                                 goto RetryFault;
4247                                                         }
4248                                                 }
4249                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4250
4251                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4252
4253                                                 if (object != cur_object) {
4254                                                         /*
4255                                                          * we can't go for the upgrade on the top
4256                                                          * lock since the upgrade may block waiting
4257                                                          * for readers to drain... since we hold
4258                                                          * cur_object locked at this point, waiting
4259                                                          * for the readers to drain would represent
4260                                                          * a lock order inversion since the lock order
4261                                                          * for objects is the reference order in the
4262                                                          * shadown chain
4263                                                          */
4264                                                         vm_object_unlock(object);
4265                                                         vm_object_unlock(cur_object);
4266
4267                                                         vm_map_unlock_read(map);
4268                                                         if (real_map != map)
4269                                                                 vm_map_unlock(real_map);
4270
4271                                                         goto RetryFault;
4272                                                 }
4273                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4274                                                         /*
4275                                                          * couldn't upgrade, so explictly take the lock
4276                                                          * exclusively and go relookup the page since we
4277                                                          * will have dropped the object lock and
4278                                                          * a different thread could have inserted
4279                                                          * a page at this offset
4280                                                          * no need for a full retry since we're
4281                                                          * at the top level of the object chain
4282                                                          */
4283                                                         vm_object_lock(object);
4284
4285                                                         continue;
4286                                                 }
4287                                         }
4288                                         m = vm_page_grab_options(grab_options);
4289                                         m_object = NULL;
4290
4291                                         if (m == VM_PAGE_NULL) {
4292                                                 /*
4293                                                  * no free page currently available...
4294                                                  * must take the slow path
4295                                                  */
4296                                                 break;
4297                                         }
4298
4299                                         /*
4300                                          * The object is and remains locked
4301                                          * so no need to take a
4302                                          * "paging_in_progress" reference.
4303                                          */
4304                                         boolean_t shared_lock;
4305                                         if ((object == cur_object &&
4306                                              object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4307                                             (object != cur_object &&
4308                                              cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4309                                                 shared_lock = FALSE;
4310                                         } else {
4311                                                 shared_lock = TRUE;
4312                                         }
4313
4314                                         kr = vm_compressor_pager_get(
4315                                                 cur_object->pager,
4316                                                 (cur_offset +
4317                                                  cur_object->paging_offset),
4318                                                 VM_PAGE_GET_PHYS_PAGE(m),
4319                                                 &my_fault_type,
4320                                                 c_flags,
4321                                                 &compressed_count_delta);
4322
4323                                         vm_compressor_pager_count(
4324                                                 cur_object->pager,
4325                                                 compressed_count_delta,
4326                                                 shared_lock,
4327                                                 cur_object);
4328
4329                                         if (kr != KERN_SUCCESS) {
4330                                                 vm_page_release(m, FALSE);
4331                                                 m = VM_PAGE_NULL;
4332                                                 break;
4333                                         }
4334                                         m->vmp_dirty = TRUE;
4335
4336                                         /*
4337                                          * If the object is purgeable, its
4338                                          * owner's purgeable ledgers will be
4339                                          * updated in vm_page_insert() but the
4340                                          * page was also accounted for in a
4341                                          * "compressed purgeable" ledger, so
4342                                          * update that now.
4343                                          */
4344                                         if (object != cur_object &&
4345                                             !insert_cur_object) {
4346                                                 /*
4347                                                  * We're not going to insert
4348                                                  * the decompressed page into
4349                                                  * the object it came from.
4350                                                  *
4351                                                  * We're dealing with a
4352                                                  * copy-on-write fault on
4353                                                  * "object".
4354                                                  * We're going to decompress
4355                                                  * the page directly into the
4356                                                  * target "object" while
4357                                                  * keepin the compressed
4358                                                  * page for "cur_object", so
4359                                                  * no ledger update in that
4360                                                  * case.
4361                                                  */
4362                                         } else if (((cur_object->purgable ==
4363                                                      VM_PURGABLE_DENY) &&
4364                                                     (!cur_object->vo_ledger_tag)) ||
4365                                                    (cur_object->vo_owner ==
4366                                                     NULL)) {
4367                                                 /*
4368                                                  * "cur_object" is not purgeable
4369                                                  * and is not ledger-taged, or
4370                                                  * there's no owner for it,
4371                                                  * so no owner's ledgers to
4372                                                  * update.
4373                                                  */
4374                                         } else {
4375                                                 /*
4376                                                  * One less compressed
4377                                                  * purgeable/tagged page for
4378                                                  * cur_object's owner.
4379                                                  */
4380                                                 vm_object_owner_compressed_update(
4381                                                         cur_object,
4382                                                         -1);
4383                                         }
4384
4385                                         if (insert_cur_object) {
4386                                                 vm_page_insert(m, cur_object, cur_offset);
4387                                                 m_object = cur_object;
4388                                         } else {
4389                                                 vm_page_insert(m, object, offset);
4390                                                 m_object = object;
4391                                         }
4392
4393                                         if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4394                                                 /*
4395                                                  * If the page is not cacheable,
4396                                                  * we can't let its contents
4397                                                  * linger in the data cache
4398                                                  * after the decompression.
4399                                                  */
4400                                                 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
4401                                         }
4402
4403                                         type_of_fault = my_fault_type;
4404
4405                                         VM_STAT_INCR(decompressions);
4406
4407                                         if (cur_object != object) {
4408                                                 if (insert_cur_object) {
4409                                                         top_object = object;
4410                                                         /*
4411                                                          * switch to the object that has the new page
4412                                                          */
4413                                                         object = cur_object;
4414                                                         object_lock_type = cur_object_lock_type;
4415                                                 } else {
4416                                                         vm_object_unlock(cur_object);
4417                                                         cur_object = object;
4418                                                 }
4419                                         }
4420                                         goto FastPmapEnter;
4421                                 }
4422                                 /*
4423                                  * existence map present and indicates
4424                                  * that the pager doesn't have this page
4425                                  */
4426                         }
4427                         if (cur_object->shadow == VM_OBJECT_NULL) {
4428                                 /*
4429                                  * Zero fill fault.  Page gets
4430                                  * inserted into the original object.
4431                                  */
4432                                 if (cur_object->shadow_severed ||
4433                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
4434                                     cur_object == compressor_object ||
4435                                     cur_object == kernel_object ||
4436                                     cur_object == vm_submap_object) {
4437                                         if (object != cur_object)
4438                                                 vm_object_unlock(cur_object);
4439                                         vm_object_unlock(object);
4440
4441                                         vm_map_unlock_read(map);
4442                                         if (real_map != map)
4443                                                 vm_map_unlock(real_map);
4444
4445                                         kr = KERN_MEMORY_ERROR;
4446                                         goto done;
4447                                 }
4448                                 if (cur_object != object) {
4449                                         vm_object_unlock(cur_object);
4450
4451                                         cur_object = object;
4452                                 }
4453                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4454
4455                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4456
4457                                         if (vm_object_lock_upgrade(object) == FALSE) {
4458                                                 /*
4459                                                  * couldn't upgrade so do a full retry on the fault
4460                                                  * since we dropped the object lock which
4461                                                  * could allow another thread to insert
4462                                                  * a page at this offset
4463                                                  */
4464                                                 vm_map_unlock_read(map);
4465                                                 if (real_map != map)
4466                                                         vm_map_unlock(real_map);
4467
4468                                                 goto RetryFault;
4469                                         }
4470                                 }
4471                                 m = vm_page_alloc(object, offset);
4472                                 m_object = NULL;
4473
4474                                 if (m == VM_PAGE_NULL) {
4475                                         /*
4476                                          * no free page currently available...
4477                                          * must take the slow path
4478                                          */
4479                                         break;
4480                                 }
4481                                 m_object = object;
4482
4483                                 /*
4484                                  * Now zero fill page...
4485                                  * the page is probably going to
4486                                  * be written soon, so don't bother
4487                                  * to clear the modified bit
4488                                  *
4489                                  *   NOTE: This code holds the map
4490                                  *   lock across the zero fill.
4491                                  */
4492                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4493
4494                                 goto FastPmapEnter;
4495                         }
4496                         /*
4497                          * On to the next level in the shadow chain
4498                          */
4499                         cur_offset += cur_object->vo_shadow_offset;
4500                         new_object = cur_object->shadow;
4501
4502                         /*
4503                          * take the new_object's lock with the indicated state
4504                          */
4505                         if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4506                                 vm_object_lock_shared(new_object);
4507                         else
4508                                 vm_object_lock(new_object);
4509
4510                         if (cur_object != object)
4511                                 vm_object_unlock(cur_object);
4512
4513                         cur_object = new_object;
4514
4515                         continue;
4516                 }
4517         }
4518         /*
4519          * Cleanup from fast fault failure.  Drop any object
4520          * lock other than original and drop map lock.
4521          */
4522         if (object != cur_object)
4523                 vm_object_unlock(cur_object);
4524
4525         /*
4526          * must own the object lock exclusively at this point
4527          */
4528         if (object_lock_type == OBJECT_LOCK_SHARED) {
4529                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4530
4531                 if (vm_object_lock_upgrade(object) == FALSE) {
4532                         /*
4533                          * couldn't upgrade, so explictly
4534                          * take the lock exclusively
4535                          * no need to retry the fault at this
4536                          * point since "vm_fault_page" will
4537                          * completely re-evaluate the state
4538                          */
4539                         vm_object_lock(object);
4540                 }
4541         }
4542
4543 handle_copy_delay:
4544         vm_map_unlock_read(map);
4545         if (real_map != map)
4546                 vm_map_unlock(real_map);
4547
4548         if (__improbable(object == compressor_object ||
4549                 object == kernel_object ||
4550                 object == vm_submap_object)) {
4551                 /*
4552                  * These objects are explicitly managed and populated by the
4553                  * kernel.  The virtual ranges backed by these objects should
4554                  * either have wired pages or "holes" that are not supposed to
4555                  * be accessed at all until they get explicitly populated.
4556                  * We should never have to resolve a fault on a mapping backed
4557                  * by one of these VM objects and providing a zero-filled page
4558                  * would be wrong here, so let's fail the fault and let the
4559                  * caller crash or recover.
4560                  */
4561                 vm_object_unlock(object);
4562                 kr = KERN_MEMORY_ERROR;
4563                 goto done;
4564         }
4565
4566         assert(object != compressor_object);
4567         assert(object != kernel_object);
4568         assert(object != vm_submap_object);
4569
4570         /*
4571          * Make a reference to this object to
4572          * prevent its disposal while we are messing with
4573          * it.  Once we have the reference, the map is free
4574          * to be diddled.  Since objects reference their
4575          * shadows (and copies), they will stay around as well.
4576          */
4577         vm_object_reference_locked(object);
4578         vm_object_paging_begin(object);
4579
4580         XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4581
4582         error_code = 0;
4583
4584         result_page = VM_PAGE_NULL;
4585         kr = vm_fault_page(object, offset, fault_type,
4586                            (change_wiring && !wired),
4587                            FALSE, /* page not looked up */
4588                            &prot, &result_page, &top_page,
4589                            &type_of_fault,
4590                            &error_code, map->no_zero_fill,
4591                            FALSE, &fault_info);
4592
4593         /*
4594          * if kr != VM_FAULT_SUCCESS, then the paging reference
4595          * has been dropped and the object unlocked... the ref_count
4596          * is still held
4597          *
4598          * if kr == VM_FAULT_SUCCESS, then the paging reference
4599          * is still held along with the ref_count on the original object
4600          *
4601          *      the object is returned locked with a paging reference
4602          *
4603          *      if top_page != NULL, then it's BUSY and the
4604          *      object it belongs to has a paging reference
4605          *      but is returned unlocked
4606          */
4607         if (kr != VM_FAULT_SUCCESS &&
4608             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4609                 /*
4610                  * we didn't succeed, lose the object reference immediately.
4611                  */
4612                 vm_object_deallocate(object);
4613
4614                 /*
4615                  * See why we failed, and take corrective action.
4616                  */
4617                 switch (kr) {
4618                 case VM_FAULT_MEMORY_SHORTAGE:
4619                         if (vm_page_wait((change_wiring) ?
4620                                          THREAD_UNINT :
4621                                          THREAD_ABORTSAFE))
4622                                 goto RetryFault;
4623                         /*
4624                          * fall thru
4625                          */
4626                 case VM_FAULT_INTERRUPTED:
4627                         kr = KERN_ABORTED;
4628                         goto done;
4629                 case VM_FAULT_RETRY:
4630                         goto RetryFault;
4631                 case VM_FAULT_MEMORY_ERROR:
4632                         if (error_code)
4633                                 kr = error_code;
4634                         else
4635                                 kr = KERN_MEMORY_ERROR;
4636                         goto done;
4637                 default:
4638                         panic("vm_fault: unexpected error 0x%x from "
4639                               "vm_fault_page()\n", kr);
4640                 }
4641         }
4642         m = result_page;
4643         m_object = NULL;
4644
4645         if (m != VM_PAGE_NULL) {
4646                 m_object = VM_PAGE_OBJECT(m);
4647                 assert((change_wiring && !wired) ?
4648                        (top_page == VM_PAGE_NULL) :
4649                        ((top_page == VM_PAGE_NULL) == (m_object == object)));
4650         }
4651
4652         /*
4653          * What to do with the resulting page from vm_fault_page
4654          * if it doesn't get entered into the physical map:
4655          */
4656 #define RELEASE_PAGE(m)                                 \
4657         MACRO_BEGIN                                     \
4658         PAGE_WAKEUP_DONE(m);                            \
4659         if ( !VM_PAGE_PAGEABLE(m)) {                    \
4660                 vm_page_lockspin_queues();              \
4661                 if ( !VM_PAGE_PAGEABLE(m))              \
4662                         vm_page_activate(m);            \
4663                 vm_page_unlock_queues();                \
4664         }                                               \
4665         MACRO_END
4666
4667
4668         object_locks_dropped = FALSE;
4669         /*
4670          * We must verify that the maps have not changed
4671          * since our last lookup. vm_map_verify() needs the
4672          * map lock (shared) but we are holding object locks.
4673          * So we do a try_lock() first and, if that fails, we
4674          * drop the object locks and go in for the map lock again.
4675          */
4676         if (!vm_map_try_lock_read(original_map)) {
4677
4678                 if (m != VM_PAGE_NULL) {
4679                         old_copy_object = m_object->copy;
4680                         vm_object_unlock(m_object);
4681                 } else {
4682                         old_copy_object = VM_OBJECT_NULL;
4683                         vm_object_unlock(object);
4684                 }
4685
4686                 object_locks_dropped = TRUE;
4687
4688                 vm_map_lock_read(original_map);
4689         }
4690
4691         if ((map != original_map) || !vm_map_verify(map, &version)) {
4692
4693                 if (object_locks_dropped == FALSE) {
4694                         if (m != VM_PAGE_NULL) {
4695                                 old_copy_object = m_object->copy;
4696                                 vm_object_unlock(m_object);
4697                         } else {
4698                                 old_copy_object = VM_OBJECT_NULL;
4699                                 vm_object_unlock(object);
4700                         }
4701
4702                         object_locks_dropped = TRUE;
4703                 }
4704
4705                 /*
4706                  * no object locks are held at this point
4707                  */
4708                 vm_object_t             retry_object;
4709                 vm_object_offset_t      retry_offset;
4710                 vm_prot_t               retry_prot;
4711
4712                 /*
4713                  * To avoid trying to write_lock the map while another
4714                  * thread has it read_locked (in vm_map_pageable), we
4715                  * do not try for write permission.  If the page is
4716                  * still writable, we will get write permission.  If it
4717                  * is not, or has been marked needs_copy, we enter the
4718                  * mapping without write permission, and will merely
4719                  * take another fault.
4720                  */
4721                 map = original_map;
4722
4723                 kr = vm_map_lookup_locked(&map, vaddr,
4724                                           fault_type & ~VM_PROT_WRITE,
4725                                           OBJECT_LOCK_EXCLUSIVE, &version,
4726                                           &retry_object, &retry_offset, &retry_prot,
4727                                           &wired,
4728                                           &fault_info,
4729                                           &real_map);
4730                 pmap = real_map->pmap;
4731
4732                 if (kr != KERN_SUCCESS) {
4733                         vm_map_unlock_read(map);
4734
4735                         if (m != VM_PAGE_NULL) {
4736                                 assert(VM_PAGE_OBJECT(m) == m_object);
4737
4738                                 /*
4739                                  * retake the lock so that
4740                                  * we can drop the paging reference
4741                                  * in vm_fault_cleanup and do the
4742                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4743                                  */
4744                                 vm_object_lock(m_object);
4745
4746                                 RELEASE_PAGE(m);
4747
4748                                 vm_fault_cleanup(m_object, top_page);
4749                         } else {
4750                                 /*
4751                                  * retake the lock so that
4752                                  * we can drop the paging reference
4753                                  * in vm_fault_cleanup
4754                                  */
4755                                 vm_object_lock(object);
4756
4757                                 vm_fault_cleanup(object, top_page);
4758                         }
4759                         vm_object_deallocate(object);
4760
4761                         goto done;
4762                 }
4763                 vm_object_unlock(retry_object);
4764
4765                 if ((retry_object != object) || (retry_offset != offset)) {
4766
4767                         vm_map_unlock_read(map);
4768                         if (real_map != map)
4769                                 vm_map_unlock(real_map);
4770
4771                         if (m != VM_PAGE_NULL) {
4772                                 assert(VM_PAGE_OBJECT(m) == m_object);
4773
4774                                 /*
4775                                  * retake the lock so that
4776                                  * we can drop the paging reference
4777                                  * in vm_fault_cleanup and do the
4778                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4779                                  */
4780                                 vm_object_lock(m_object);
4781
4782                                 RELEASE_PAGE(m);
4783
4784                                 vm_fault_cleanup(m_object, top_page);
4785                         } else {
4786                                 /*
4787                                  * retake the lock so that
4788                                  * we can drop the paging reference
4789                                  * in vm_fault_cleanup
4790                                  */
4791                                 vm_object_lock(object);
4792
4793                                 vm_fault_cleanup(object, top_page);
4794                         }
4795                         vm_object_deallocate(object);
4796
4797                         goto RetryFault;
4798                 }
4799                 /*
4800                  * Check whether the protection has changed or the object
4801                  * has been copied while we left the map unlocked.
4802                  */
4803                 if (pmap_has_prot_policy(retry_prot)) {
4804                         /* If the pmap layer cares, pass the full set. */
4805                         prot = retry_prot;
4806                 } else {
4807                         prot &= retry_prot;
4808                 }
4809         }
4810
4811         if (object_locks_dropped == TRUE) {
4812                 if (m != VM_PAGE_NULL) {
4813                         vm_object_lock(m_object);
4814
4815                         if (m_object->copy != old_copy_object) {
4816                                 /*
4817                                  * The copy object changed while the top-level object
4818                                  * was unlocked, so take away write permission.
4819                                  */
4820                                 assert(!pmap_has_prot_policy(prot));
4821                                 prot &= ~VM_PROT_WRITE;
4822                         }
4823                 } else
4824                         vm_object_lock(object);
4825
4826                 object_locks_dropped = FALSE;
4827         }
4828
4829         /*
4830          * If we want to wire down this page, but no longer have
4831          * adequate permissions, we must start all over.
4832          */
4833         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4834
4835                 vm_map_unlock_read(map);
4836                 if (real_map != map)
4837                         vm_map_unlock(real_map);
4838
4839                 if (m != VM_PAGE_NULL) {
4840                         assert(VM_PAGE_OBJECT(m) == m_object);
4841
4842                         RELEASE_PAGE(m);
4843
4844                         vm_fault_cleanup(m_object, top_page);
4845                 } else
4846                         vm_fault_cleanup(object, top_page);
4847
4848                 vm_object_deallocate(object);
4849
4850                 goto RetryFault;
4851         }
4852         if (m != VM_PAGE_NULL) {
4853                 /*
4854                  * Put this page into the physical map.
4855                  * We had to do the unlock above because pmap_enter
4856                  * may cause other faults.  The page may be on
4857                  * the pageout queues.  If the pageout daemon comes
4858                  * across the page, it will remove it from the queues.
4859                  */
4860                 if (caller_pmap) {
4861                         kr = vm_fault_enter(m,
4862                                             caller_pmap,
4863                                             caller_pmap_addr,
4864                                             prot,
4865                                             caller_prot,
4866                                             wired,
4867                                             change_wiring,
4868                                             wire_tag,
4869                                             &fault_info,
4870                                             NULL,
4871                                             &type_of_fault);
4872                 } else {
4873                         kr = vm_fault_enter(m,
4874                                             pmap,
4875                                             vaddr,
4876                                             prot,
4877                                             caller_prot,
4878                                             wired,
4879                                             change_wiring,
4880                                             wire_tag,
4881                                             &fault_info,
4882                                             NULL,
4883                                             &type_of_fault);
4884                 }
4885                 assert(VM_PAGE_OBJECT(m) == m_object);
4886
4887 #if DEVELOPMENT || DEBUG
4888         {
4889                 int     event_code = 0;
4890
4891                 if (m_object->internal)
4892                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4893                 else if (m_object->object_is_shared_cache)
4894                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4895                 else
4896                         event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4897
4898                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
4899
4900                 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4901         }
4902 #endif
4903                 if (kr != KERN_SUCCESS) {
4904                         /* abort this page fault */
4905                         vm_map_unlock_read(map);
4906                         if (real_map != map)
4907                                 vm_map_unlock(real_map);
4908                         PAGE_WAKEUP_DONE(m);
4909                         vm_fault_cleanup(m_object, top_page);
4910                         vm_object_deallocate(object);
4911                         goto done;
4912                 }
4913                 if (physpage_p != NULL) {
4914                         /* for vm_map_wire_and_extract() */
4915                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4916                         if (prot & VM_PROT_WRITE) {
4917                                 vm_object_lock_assert_exclusive(m_object);
4918                                 m->vmp_dirty = TRUE;
4919                         }
4920                 }
4921         } else {
4922
4923                 vm_map_entry_t          entry;
4924                 vm_map_offset_t         laddr;
4925                 vm_map_offset_t         ldelta, hdelta;
4926
4927                 /*
4928                  * do a pmap block mapping from the physical address
4929                  * in the object
4930                  */
4931
4932                 if (real_map != map)
4933                         vm_map_unlock(real_map);
4934
4935                 if (original_map != map) {
4936                         vm_map_unlock_read(map);
4937                         vm_map_lock_read(original_map);
4938                         map = original_map;
4939                 }
4940                 real_map = map;
4941
4942                 laddr = vaddr;
4943                 hdelta = 0xFFFFF000;
4944                 ldelta = 0xFFFFF000;
4945
4946                 while (vm_map_lookup_entry(map, laddr, &entry)) {
4947                         if (ldelta > (laddr - entry->vme_start))
4948                                 ldelta = laddr - entry->vme_start;
4949                         if (hdelta > (entry->vme_end - laddr))
4950                                 hdelta = entry->vme_end - laddr;
4951                         if (entry->is_sub_map) {
4952
4953                                 laddr = ((laddr - entry->vme_start)
4954                                          + VME_OFFSET(entry));
4955                                 vm_map_lock_read(VME_SUBMAP(entry));
4956
4957                                 if (map != real_map)
4958                                         vm_map_unlock_read(map);
4959                                 if (entry->use_pmap) {
4960                                         vm_map_unlock_read(real_map);
4961                                         real_map = VME_SUBMAP(entry);
4962                                 }
4963                                 map = VME_SUBMAP(entry);
4964
4965                         } else {
4966                                 break;
4967                         }
4968                 }
4969
4970                 if (vm_map_lookup_entry(map, laddr, &entry) &&
4971                     (VME_OBJECT(entry) != NULL) &&
4972                     (VME_OBJECT(entry) == object)) {
4973                         int superpage;
4974
4975                         if (!object->pager_created &&
4976                             object->phys_contiguous &&
4977                             VME_OFFSET(entry) == 0 &&
4978                             (entry->vme_end - entry->vme_start == object->vo_size) &&
4979                             VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size-1))) {
4980                                 superpage = VM_MEM_SUPERPAGE;
4981                         } else {
4982                                 superpage = 0;
4983                         }
4984
4985                         if (superpage && physpage_p) {
4986                                 /* for vm_map_wire_and_extract() */
4987                                 *physpage_p = (ppnum_t)
4988                                         ((((vm_map_offset_t)
4989                                            object->vo_shadow_offset)
4990                                           + VME_OFFSET(entry)
4991                                           + (laddr - entry->vme_start))
4992                                          >> PAGE_SHIFT);
4993                         }
4994
4995                         if (caller_pmap) {
4996                                 /*
4997                                  * Set up a block mapped area
4998                                  */
4999                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5000                                 kr = pmap_map_block(caller_pmap,
5001                                                     (addr64_t)(caller_pmap_addr - ldelta),
5002                                                     (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5003                                                                VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5004                                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5005                                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5006
5007                                 if (kr != KERN_SUCCESS) {
5008                                         goto cleanup;
5009                                 }
5010                         } else {
5011                                 /*
5012                                  * Set up a block mapped area
5013                                  */
5014                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5015                                 kr = pmap_map_block(real_map->pmap,
5016                                                     (addr64_t)(vaddr - ldelta),
5017                                                     (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5018                                                                VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5019                                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5020                                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5021
5022                                 if (kr != KERN_SUCCESS) {
5023                                         goto cleanup;
5024                                 }
5025                         }
5026                 }
5027         }
5028
5029         /*
5030          * Success
5031          */
5032         kr = KERN_SUCCESS;
5033
5034         /*
5035          * TODO: could most of the done cases just use cleanup?
5036          */
5037 cleanup:
5038         /*
5039          * Unlock everything, and return
5040          */
5041         vm_map_unlock_read(map);
5042         if (real_map != map)
5043                 vm_map_unlock(real_map);
5044
5045         if (m != VM_PAGE_NULL) {
5046                 assert(VM_PAGE_OBJECT(m) == m_object);
5047
5048                 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5049
5050                         vm_object_paging_begin(m_object);
5051
5052                         assert(written_on_object == VM_OBJECT_NULL);
5053                         written_on_object = m_object;
5054                         written_on_pager = m_object->pager;
5055                         written_on_offset = m_object->paging_offset + m->vmp_offset;
5056                 }
5057                 PAGE_WAKEUP_DONE(m);
5058
5059                 vm_fault_cleanup(m_object, top_page);
5060         } else
5061                 vm_fault_cleanup(object, top_page);
5062
5063         vm_object_deallocate(object);
5064
5065 #undef  RELEASE_PAGE
5066
5067 done:
5068         thread_interrupt_level(interruptible_state);
5069
5070         /*
5071          * Only I/O throttle on faults which cause a pagein/swapin.
5072          */
5073         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5074                 throttle_lowpri_io(1);
5075         } else {
5076                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5077
5078                         if ((throttle_delay = vm_page_throttled(TRUE))) {
5079
5080                                 if (vm_debug_events) {
5081                                         if (type_of_fault == DBG_COMPRESSOR_FAULT)
5082                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5083                                         else if (type_of_fault == DBG_COW_FAULT)
5084                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5085                                         else
5086                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5087                                 }
5088                                 delay(throttle_delay);
5089                         }
5090                 }
5091         }
5092
5093         if (written_on_object) {
5094
5095                 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
5096
5097                 vm_object_lock(written_on_object);
5098                 vm_object_paging_end(written_on_object);
5099                 vm_object_unlock(written_on_object);
5100
5101                 written_on_object = VM_OBJECT_NULL;
5102         }
5103
5104         if (rtfault) {
5105                 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
5106         }
5107
5108         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5109                               (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5110                               ((uint64_t)trace_vaddr >> 32),
5111                               trace_vaddr,
5112                               kr,
5113                               type_of_fault,
5114                               0);
5115
5116         return (kr);
5117 }
5118
5119 /*
5120  *      vm_fault_wire:
5121  *
5122  *      Wire down a range of virtual addresses in a map.
5123  */
5124 kern_return_t
5125 vm_fault_wire(
5126         vm_map_t        map,
5127         vm_map_entry_t  entry,
5128         vm_prot_t       prot,
5129         vm_tag_t        wire_tag,
5130         pmap_t          pmap,
5131         vm_map_offset_t pmap_addr,
5132         ppnum_t         *physpage_p)
5133 {
5134         vm_map_offset_t va;
5135         vm_map_offset_t end_addr = entry->vme_end;
5136         kern_return_t   rc;
5137
5138         assert(entry->in_transition);
5139
5140         if ((VME_OBJECT(entry) != NULL) &&
5141             !entry->is_sub_map &&
5142             VME_OBJECT(entry)->phys_contiguous) {
5143                 return KERN_SUCCESS;
5144         }
5145
5146         /*
5147          *      Inform the physical mapping system that the
5148          *      range of addresses may not fault, so that
5149          *      page tables and such can be locked down as well.
5150          */
5151
5152         pmap_pageable(pmap, pmap_addr,
5153                 pmap_addr + (end_addr - entry->vme_start), FALSE);
5154
5155         /*
5156          *      We simulate a fault to get the page and enter it
5157          *      in the physical map.
5158          */
5159
5160         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5161                 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
5162                                         pmap_addr + (va - entry->vme_start),
5163                                         physpage_p);
5164                 if (rc != KERN_SUCCESS) {
5165                         rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
5166                                                ((pmap == kernel_pmap)
5167                                                 ? THREAD_UNINT
5168                                                 : THREAD_ABORTSAFE),
5169                                                pmap,
5170                                                (pmap_addr +
5171                                                 (va - entry->vme_start)),
5172                                                physpage_p);
5173                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
5174                 }
5175
5176                 if (rc != KERN_SUCCESS) {
5177                         struct vm_map_entry     tmp_entry = *entry;
5178
5179                         /* unwire wired pages */
5180                         tmp_entry.vme_end = va;
5181                         vm_fault_unwire(map,
5182                                 &tmp_entry, FALSE, pmap, pmap_addr);
5183
5184                         return rc;
5185                 }
5186         }
5187         return KERN_SUCCESS;
5188 }
5189
5190 /*
5191  *      vm_fault_unwire:
5192  *
5193  *      Unwire a range of virtual addresses in a map.
5194  */
5195 void
5196 vm_fault_unwire(
5197         vm_map_t        map,
5198         vm_map_entry_t  entry,
5199         boolean_t       deallocate,
5200         pmap_t          pmap,
5201         vm_map_offset_t pmap_addr)
5202 {
5203         vm_map_offset_t va;
5204         vm_map_offset_t end_addr = entry->vme_end;
5205         vm_object_t             object;
5206         struct vm_object_fault_info fault_info = {};
5207         unsigned int    unwired_pages;
5208
5209         object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
5210
5211         /*
5212          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
5213          * do anything since such memory is wired by default.  So we don't have
5214          * anything to undo here.
5215          */
5216
5217         if (object != VM_OBJECT_NULL && object->phys_contiguous)
5218                 return;
5219
5220         fault_info.interruptible = THREAD_UNINT;
5221         fault_info.behavior = entry->behavior;
5222         fault_info.user_tag = VME_ALIAS(entry);
5223         if (entry->iokit_acct ||
5224             (!entry->is_sub_map && !entry->use_pmap)) {
5225                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5226         }
5227         fault_info.lo_offset = VME_OFFSET(entry);
5228         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
5229         fault_info.no_cache = entry->no_cache;
5230         fault_info.stealth = TRUE;
5231
5232         unwired_pages = 0;
5233
5234         /*
5235          *      Since the pages are wired down, we must be able to
5236          *      get their mappings from the physical map system.
5237          */
5238
5239         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5240
5241                 if (object == VM_OBJECT_NULL) {
5242                         if (pmap) {
5243                                 pmap_change_wiring(pmap,
5244                                                    pmap_addr + (va - entry->vme_start), FALSE);
5245                         }
5246                         (void) vm_fault(map, va, VM_PROT_NONE,
5247                                         TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
5248                 } else {
5249                         vm_prot_t       prot;
5250                         vm_page_t       result_page;
5251                         vm_page_t       top_page;
5252                         vm_object_t     result_object;
5253                         vm_fault_return_t result;
5254
5255                         /* cap cluster size at maximum UPL size */
5256                         upl_size_t cluster_size;
5257                         if (os_sub_overflow(end_addr, va, &cluster_size)) {
5258                                 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5259                         }
5260                         fault_info.cluster_size = cluster_size;
5261
5262                         do {
5263                                 prot = VM_PROT_NONE;
5264
5265                                 vm_object_lock(object);
5266                                 vm_object_paging_begin(object);
5267                                 XPR(XPR_VM_FAULT,
5268                                         "vm_fault_unwire -> vm_fault_page\n",
5269                                         0,0,0,0,0);
5270                                 result_page = VM_PAGE_NULL;
5271                                 result = vm_fault_page(
5272                                         object,
5273                                         (VME_OFFSET(entry) +
5274                                          (va - entry->vme_start)),
5275                                         VM_PROT_NONE, TRUE,
5276                                         FALSE, /* page not looked up */
5277                                         &prot, &result_page, &top_page,
5278                                         (int *)0,
5279                                         NULL, map->no_zero_fill,
5280                                         FALSE, &fault_info);
5281                         } while (result == VM_FAULT_RETRY);
5282
5283                         /*
5284                          * If this was a mapping to a file on a device that has been forcibly
5285                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5286                          * move on to the next one in case the remaining pages are mapped from
5287                          * different objects.  During a forced unmount, the object is terminated
5288                          * so the alive flag will be false if this happens.  A forced unmount will
5289                          * will occur when an external disk is unplugged before the user does an
5290                          * eject, so we don't want to panic in that situation.
5291                          */
5292
5293                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5294                                 continue;
5295
5296                         if (result == VM_FAULT_MEMORY_ERROR &&
5297                             object == kernel_object) {
5298                                 /*
5299                                  * This must have been allocated with
5300                                  * KMA_KOBJECT and KMA_VAONLY and there's
5301                                  * no physical page at this offset.
5302                                  * We're done (no page to free).
5303                                  */
5304                                 assert(deallocate);
5305                                 continue;
5306                         }
5307
5308                         if (result != VM_FAULT_SUCCESS)
5309                                 panic("vm_fault_unwire: failure");
5310
5311                         result_object = VM_PAGE_OBJECT(result_page);
5312
5313                         if (deallocate) {
5314                                 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
5315                                        vm_page_fictitious_addr);
5316                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5317                                 if (VM_PAGE_WIRED(result_page)) {
5318                                         unwired_pages++;
5319                                 }
5320                                 VM_PAGE_FREE(result_page);
5321                         } else {
5322                                 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr))
5323                                         pmap_change_wiring(pmap,
5324                                             pmap_addr + (va - entry->vme_start), FALSE);
5325
5326
5327                                 if (VM_PAGE_WIRED(result_page)) {
5328                                         vm_page_lockspin_queues();
5329                                         vm_page_unwire(result_page, TRUE);
5330                                         vm_page_unlock_queues();
5331                                         unwired_pages++;
5332                                 }
5333                                 if(entry->zero_wired_pages) {
5334                                         pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
5335                                         entry->zero_wired_pages = FALSE;
5336                                 }
5337
5338                                 PAGE_WAKEUP_DONE(result_page);
5339                         }
5340                         vm_fault_cleanup(result_object, top_page);
5341                 }
5342         }
5343
5344         /*
5345          *      Inform the physical mapping system that the range
5346          *      of addresses may fault, so that page tables and
5347          *      such may be unwired themselves.
5348          */
5349
5350         pmap_pageable(pmap, pmap_addr,
5351                 pmap_addr + (end_addr - entry->vme_start), TRUE);
5352
5353         if (kernel_object == object) {
5354             vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages));
5355         }
5356 }
5357
5358 /*
5359  *      vm_fault_wire_fast:
5360  *
5361  *      Handle common case of a wire down page fault at the given address.
5362  *      If successful, the page is inserted into the associated physical map.
5363  *      The map entry is passed in to avoid the overhead of a map lookup.
5364  *
5365  *      NOTE: the given address should be truncated to the
5366  *      proper page address.
5367  *
5368  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
5369  *      a standard error specifying why the fault is fatal is returned.
5370  *
5371  *      The map in question must be referenced, and remains so.
5372  *      Caller has a read lock on the map.
5373  *
5374  *      This is a stripped version of vm_fault() for wiring pages.  Anything
5375  *      other than the common case will return KERN_FAILURE, and the caller
5376  *      is expected to call vm_fault().
5377  */
5378 static kern_return_t
5379 vm_fault_wire_fast(
5380         __unused vm_map_t       map,
5381         vm_map_offset_t va,
5382         __unused vm_prot_t       caller_prot,
5383         vm_tag_t        wire_tag,
5384         vm_map_entry_t  entry,
5385         pmap_t          pmap,
5386         vm_map_offset_t pmap_addr,
5387         ppnum_t         *physpage_p)
5388 {
5389         vm_object_t             object;
5390         vm_object_offset_t      offset;
5391         vm_page_t               m;
5392         vm_prot_t               prot;
5393         thread_t                thread = current_thread();
5394         int                     type_of_fault;
5395         kern_return_t           kr;
5396         struct vm_object_fault_info fault_info = {};
5397
5398         VM_STAT_INCR(faults);
5399
5400         if (thread != THREAD_NULL && thread->task != TASK_NULL)
5401           thread->task->faults++;
5402
5403 /*
5404  *      Recovery actions
5405  */
5406
5407 #undef  RELEASE_PAGE
5408 #define RELEASE_PAGE(m) {                               \
5409         PAGE_WAKEUP_DONE(m);                            \
5410         vm_page_lockspin_queues();                      \
5411         vm_page_unwire(m, TRUE);                        \
5412         vm_page_unlock_queues();                        \
5413 }
5414
5415
5416 #undef  UNLOCK_THINGS
5417 #define UNLOCK_THINGS   {                               \
5418         vm_object_paging_end(object);                      \
5419         vm_object_unlock(object);                          \
5420 }
5421
5422 #undef  UNLOCK_AND_DEALLOCATE
5423 #define UNLOCK_AND_DEALLOCATE   {                       \
5424         UNLOCK_THINGS;                                  \
5425         vm_object_deallocate(object);                   \
5426 }
5427 /*
5428  *      Give up and have caller do things the hard way.
5429  */
5430
5431 #define GIVE_UP {                                       \
5432         UNLOCK_AND_DEALLOCATE;                          \
5433         return(KERN_FAILURE);                           \
5434 }
5435
5436
5437         /*
5438          *      If this entry is not directly to a vm_object, bail out.
5439          */
5440         if (entry->is_sub_map) {
5441                 assert(physpage_p == NULL);
5442                 return(KERN_FAILURE);
5443         }
5444
5445         /*
5446          *      Find the backing store object and offset into it.
5447          */
5448
5449         object = VME_OBJECT(entry);
5450         offset = (va - entry->vme_start) + VME_OFFSET(entry);
5451         prot = entry->protection;
5452
5453         /*
5454          *      Make a reference to this object to prevent its
5455          *      disposal while we are messing with it.
5456          */
5457
5458         vm_object_lock(object);
5459         vm_object_reference_locked(object);
5460         vm_object_paging_begin(object);
5461
5462         /*
5463          *      INVARIANTS (through entire routine):
5464          *
5465          *      1)      At all times, we must either have the object
5466          *              lock or a busy page in some object to prevent
5467          *              some other thread from trying to bring in
5468          *              the same page.
5469          *
5470          *      2)      Once we have a busy page, we must remove it from
5471          *              the pageout queues, so that the pageout daemon
5472          *              will not grab it away.
5473          *
5474          */
5475
5476         /*
5477          *      Look for page in top-level object.  If it's not there or
5478          *      there's something going on, give up.
5479          */
5480         m = vm_page_lookup(object, offset);
5481         if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
5482             (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) {
5483
5484                 GIVE_UP;
5485         }
5486         if (m->vmp_fictitious &&
5487             VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
5488                 /*
5489                  * Guard pages are fictitious pages and are never
5490                  * entered into a pmap, so let's say it's been wired...
5491                  */
5492                 kr = KERN_SUCCESS;
5493                 goto done;
5494         }
5495
5496         /*
5497          *      Wire the page down now.  All bail outs beyond this
5498          *      point must unwire the page.
5499          */
5500
5501         vm_page_lockspin_queues();
5502         vm_page_wire(m, wire_tag, TRUE);
5503         vm_page_unlock_queues();
5504
5505         /*
5506          *      Mark page busy for other threads.
5507          */
5508         assert(!m->vmp_busy);
5509         m->vmp_busy = TRUE;
5510         assert(!m->vmp_absent);
5511
5512         /*
5513          *      Give up if the page is being written and there's a copy object
5514          */
5515         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5516                 RELEASE_PAGE(m);
5517                 GIVE_UP;
5518         }
5519
5520         fault_info.user_tag = VME_ALIAS(entry);
5521         fault_info.pmap_options = 0;
5522         if (entry->iokit_acct ||
5523             (!entry->is_sub_map && !entry->use_pmap)) {
5524                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5525         }
5526
5527         /*
5528          *      Put this page into the physical map.
5529          */
5530         type_of_fault = DBG_CACHE_HIT_FAULT;
5531         kr = vm_fault_enter(m,
5532                             pmap,
5533                             pmap_addr,
5534                             prot,
5535                             prot,
5536                             TRUE,  /* wired */
5537                             FALSE, /* change_wiring */
5538                             wire_tag,
5539                             &fault_info,
5540                             NULL,
5541                             &type_of_fault);
5542         if (kr != KERN_SUCCESS) {
5543                 RELEASE_PAGE(m);
5544                 GIVE_UP;
5545         }
5546
5547 done:
5548         /*
5549          *      Unlock everything, and return
5550          */
5551
5552         if (physpage_p) {
5553                 /* for vm_map_wire_and_extract() */
5554                 if (kr == KERN_SUCCESS) {
5555                         assert(object == VM_PAGE_OBJECT(m));
5556                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5557                         if (prot & VM_PROT_WRITE) {
5558                                 vm_object_lock_assert_exclusive(object);
5559                                 m->vmp_dirty = TRUE;
5560                         }
5561                 } else {
5562                         *physpage_p = 0;
5563                 }
5564         }
5565
5566         PAGE_WAKEUP_DONE(m);
5567         UNLOCK_AND_DEALLOCATE;
5568
5569         return kr;
5570
5571 }
5572
5573 /*
5574  *      Routine:        vm_fault_copy_cleanup
5575  *      Purpose:
5576  *              Release a page used by vm_fault_copy.
5577  */
5578
5579 static void
5580 vm_fault_copy_cleanup(
5581         vm_page_t       page,
5582         vm_page_t       top_page)
5583 {
5584         vm_object_t     object = VM_PAGE_OBJECT(page);
5585
5586         vm_object_lock(object);
5587         PAGE_WAKEUP_DONE(page);
5588         if ( !VM_PAGE_PAGEABLE(page)) {
5589                 vm_page_lockspin_queues();
5590                 if ( !VM_PAGE_PAGEABLE(page)) {
5591                         vm_page_activate(page);
5592                 }
5593                 vm_page_unlock_queues();
5594         }
5595         vm_fault_cleanup(object, top_page);
5596 }
5597
5598 static void
5599 vm_fault_copy_dst_cleanup(
5600         vm_page_t       page)
5601 {
5602         vm_object_t     object;
5603
5604         if (page != VM_PAGE_NULL) {
5605                 object = VM_PAGE_OBJECT(page);
5606                 vm_object_lock(object);
5607                 vm_page_lockspin_queues();
5608                 vm_page_unwire(page, TRUE);
5609                 vm_page_unlock_queues();
5610                 vm_object_paging_end(object);
5611                 vm_object_unlock(object);
5612         }
5613 }
5614
5615 /*
5616  *      Routine:        vm_fault_copy
5617  *
5618  *      Purpose:
5619  *              Copy pages from one virtual memory object to another --
5620  *              neither the source nor destination pages need be resident.
5621  *
5622  *              Before actually copying a page, the version associated with
5623  *              the destination address map wil be verified.
5624  *
5625  *      In/out conditions:
5626  *              The caller must hold a reference, but not a lock, to
5627  *              each of the source and destination objects and to the
5628  *              destination map.
5629  *
5630  *      Results:
5631  *              Returns KERN_SUCCESS if no errors were encountered in
5632  *              reading or writing the data.  Returns KERN_INTERRUPTED if
5633  *              the operation was interrupted (only possible if the
5634  *              "interruptible" argument is asserted).  Other return values
5635  *              indicate a permanent error in copying the data.
5636  *
5637  *              The actual amount of data copied will be returned in the
5638  *              "copy_size" argument.  In the event that the destination map
5639  *              verification failed, this amount may be less than the amount
5640  *              requested.
5641  */
5642 kern_return_t
5643 vm_fault_copy(
5644         vm_object_t             src_object,
5645         vm_object_offset_t      src_offset,
5646         vm_map_size_t           *copy_size,             /* INOUT */
5647         vm_object_t             dst_object,
5648         vm_object_offset_t      dst_offset,
5649         vm_map_t                dst_map,
5650         vm_map_version_t         *dst_version,
5651         int                     interruptible)
5652 {
5653         vm_page_t               result_page;
5654
5655         vm_page_t               src_page;
5656         vm_page_t               src_top_page;
5657         vm_prot_t               src_prot;
5658
5659         vm_page_t               dst_page;
5660         vm_page_t               dst_top_page;
5661         vm_prot_t               dst_prot;
5662
5663         vm_map_size_t           amount_left;
5664         vm_object_t             old_copy_object;
5665         vm_object_t             result_page_object = NULL;
5666         kern_return_t           error = 0;
5667         vm_fault_return_t       result;
5668
5669         vm_map_size_t           part_size;
5670         struct vm_object_fault_info fault_info_src = {};
5671         struct vm_object_fault_info fault_info_dst = {};
5672
5673         /*
5674          * In order not to confuse the clustered pageins, align
5675          * the different offsets on a page boundary.
5676          */
5677
5678 #define RETURN(x)                                       \
5679         MACRO_BEGIN                                     \
5680         *copy_size -= amount_left;                      \
5681         MACRO_RETURN(x);                                \
5682         MACRO_END
5683
5684         amount_left = *copy_size;
5685
5686         fault_info_src.interruptible = interruptible;
5687         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5688         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5689         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5690         fault_info_src.stealth = TRUE;
5691
5692         fault_info_dst.interruptible = interruptible;
5693         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5694         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5695         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5696         fault_info_dst.stealth = TRUE;
5697
5698         do { /* while (amount_left > 0) */
5699                 /*
5700                  * There may be a deadlock if both source and destination
5701                  * pages are the same. To avoid this deadlock, the copy must
5702                  * start by getting the destination page in order to apply
5703                  * COW semantics if any.
5704                  */
5705
5706         RetryDestinationFault: ;
5707
5708                 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5709
5710                 vm_object_lock(dst_object);
5711                 vm_object_paging_begin(dst_object);
5712
5713                 /* cap cluster size at maximum UPL size */
5714                 upl_size_t cluster_size;
5715                 if (os_convert_overflow(amount_left, &cluster_size)) {
5716                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5717                 }
5718                 fault_info_dst.cluster_size = cluster_size;
5719
5720                 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5721                 dst_page = VM_PAGE_NULL;
5722                 result = vm_fault_page(dst_object,
5723                                        vm_object_trunc_page(dst_offset),
5724                                        VM_PROT_WRITE|VM_PROT_READ,
5725                                        FALSE,
5726                                        FALSE, /* page not looked up */
5727                                        &dst_prot, &dst_page, &dst_top_page,
5728                                        (int *)0,
5729                                        &error,
5730                                        dst_map->no_zero_fill,
5731                                        FALSE, &fault_info_dst);
5732                 switch (result) {
5733                 case VM_FAULT_SUCCESS:
5734                         break;
5735                 case VM_FAULT_RETRY:
5736                         goto RetryDestinationFault;
5737                 case VM_FAULT_MEMORY_SHORTAGE:
5738                         if (vm_page_wait(interruptible))
5739                                 goto RetryDestinationFault;
5740                         /* fall thru */
5741                 case VM_FAULT_INTERRUPTED:
5742                         RETURN(MACH_SEND_INTERRUPTED);
5743                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5744                         /* success but no VM page: fail the copy */
5745                         vm_object_paging_end(dst_object);
5746                         vm_object_unlock(dst_object);
5747                         /*FALLTHROUGH*/
5748                 case VM_FAULT_MEMORY_ERROR:
5749                         if (error)
5750                                 return (error);
5751                         else
5752                                 return(KERN_MEMORY_ERROR);
5753                 default:
5754                         panic("vm_fault_copy: unexpected error 0x%x from "
5755                               "vm_fault_page()\n", result);
5756                 }
5757                 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5758
5759                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5760                 old_copy_object = dst_object->copy;
5761
5762                 /*
5763                  * There exists the possiblity that the source and
5764                  * destination page are the same.  But we can't
5765                  * easily determine that now.  If they are the
5766                  * same, the call to vm_fault_page() for the
5767                  * destination page will deadlock.  To prevent this we
5768                  * wire the page so we can drop busy without having
5769                  * the page daemon steal the page.  We clean up the
5770                  * top page  but keep the paging reference on the object
5771                  * holding the dest page so it doesn't go away.
5772                  */
5773
5774                 vm_page_lockspin_queues();
5775                 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
5776                 vm_page_unlock_queues();
5777                 PAGE_WAKEUP_DONE(dst_page);
5778                 vm_object_unlock(dst_object);
5779
5780                 if (dst_top_page != VM_PAGE_NULL) {
5781                         vm_object_lock(dst_object);
5782                         VM_PAGE_FREE(dst_top_page);
5783                         vm_object_paging_end(dst_object);
5784                         vm_object_unlock(dst_object);
5785                 }
5786
5787         RetrySourceFault: ;
5788
5789                 if (src_object == VM_OBJECT_NULL) {
5790                         /*
5791                          *      No source object.  We will just
5792                          *      zero-fill the page in dst_object.
5793                          */
5794                         src_page = VM_PAGE_NULL;
5795                         result_page = VM_PAGE_NULL;
5796                 } else {
5797                         vm_object_lock(src_object);
5798                         src_page = vm_page_lookup(src_object,
5799                                                   vm_object_trunc_page(src_offset));
5800                         if (src_page == dst_page) {
5801                                 src_prot = dst_prot;
5802                                 result_page = VM_PAGE_NULL;
5803                         } else {
5804                                 src_prot = VM_PROT_READ;
5805                                 vm_object_paging_begin(src_object);
5806
5807                                 /* cap cluster size at maximum UPL size */
5808                                 if (os_convert_overflow(amount_left, &cluster_size)) {
5809                                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5810                                 }
5811                                 fault_info_src.cluster_size = cluster_size;
5812
5813                                 XPR(XPR_VM_FAULT,
5814                                         "vm_fault_copy(2) -> vm_fault_page\n",
5815                                         0,0,0,0,0);
5816                                 result_page = VM_PAGE_NULL;
5817                                 result = vm_fault_page(
5818                                         src_object,
5819                                         vm_object_trunc_page(src_offset),
5820                                         VM_PROT_READ, FALSE,
5821                                         FALSE, /* page not looked up */
5822                                         &src_prot,
5823                                         &result_page, &src_top_page,
5824                                         (int *)0, &error, FALSE,
5825                                         FALSE, &fault_info_src);
5826
5827                                 switch (result) {
5828                                 case VM_FAULT_SUCCESS:
5829                                         break;
5830                                 case VM_FAULT_RETRY:
5831                                         goto RetrySourceFault;
5832                                 case VM_FAULT_MEMORY_SHORTAGE:
5833                                         if (vm_page_wait(interruptible))
5834                                                 goto RetrySourceFault;
5835                                         /* fall thru */
5836                                 case VM_FAULT_INTERRUPTED:
5837                                         vm_fault_copy_dst_cleanup(dst_page);
5838                                         RETURN(MACH_SEND_INTERRUPTED);
5839                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5840                                         /* success but no VM page: fail */
5841                                         vm_object_paging_end(src_object);
5842                                         vm_object_unlock(src_object);
5843                                         /*FALLTHROUGH*/
5844                                 case VM_FAULT_MEMORY_ERROR:
5845                                         vm_fault_copy_dst_cleanup(dst_page);
5846                                         if (error)
5847                                                 return (error);
5848                                         else
5849                                                 return(KERN_MEMORY_ERROR);
5850                                 default:
5851                                         panic("vm_fault_copy(2): unexpected "
5852                                               "error 0x%x from "
5853                                               "vm_fault_page()\n", result);
5854                                 }
5855
5856                                 result_page_object = VM_PAGE_OBJECT(result_page);
5857                                 assert((src_top_page == VM_PAGE_NULL) ==
5858                                        (result_page_object == src_object));
5859                         }
5860                         assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5861                         vm_object_unlock(result_page_object);
5862                 }
5863
5864                 vm_map_lock_read(dst_map);
5865
5866                 if (!vm_map_verify(dst_map, dst_version)) {
5867                         vm_map_unlock_read(dst_map);
5868                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5869                                 vm_fault_copy_cleanup(result_page, src_top_page);
5870                         vm_fault_copy_dst_cleanup(dst_page);
5871                         break;
5872                 }
5873                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5874
5875                 vm_object_lock(dst_object);
5876
5877                 if (dst_object->copy != old_copy_object) {
5878                         vm_object_unlock(dst_object);
5879                         vm_map_unlock_read(dst_map);
5880                         if (result_page != VM_PAGE_NULL && src_page != dst_page)
5881                                 vm_fault_copy_cleanup(result_page, src_top_page);
5882                         vm_fault_copy_dst_cleanup(dst_page);
5883                         break;
5884                 }
5885                 vm_object_unlock(dst_object);
5886
5887                 /*
5888                  *      Copy the page, and note that it is dirty
5889                  *      immediately.
5890                  */
5891
5892                 if (!page_aligned(src_offset) ||
5893                         !page_aligned(dst_offset) ||
5894                         !page_aligned(amount_left)) {
5895
5896                         vm_object_offset_t      src_po,
5897                                                 dst_po;
5898
5899                         src_po = src_offset - vm_object_trunc_page(src_offset);
5900                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5901
5902                         if (dst_po > src_po) {
5903                                 part_size = PAGE_SIZE - dst_po;
5904                         } else {
5905                                 part_size = PAGE_SIZE - src_po;
5906                         }
5907                         if (part_size > (amount_left)){
5908                                 part_size = amount_left;
5909                         }
5910
5911                         if (result_page == VM_PAGE_NULL) {
5912                                 assert((vm_offset_t) dst_po == dst_po);
5913                                 assert((vm_size_t) part_size == part_size);
5914                                 vm_page_part_zero_fill(dst_page,
5915                                                        (vm_offset_t) dst_po,
5916                                                        (vm_size_t) part_size);
5917                         } else {
5918                                 assert((vm_offset_t) src_po == src_po);
5919                                 assert((vm_offset_t) dst_po == dst_po);
5920                                 assert((vm_size_t) part_size == part_size);
5921                                 vm_page_part_copy(result_page,
5922                                                   (vm_offset_t) src_po,
5923                                                   dst_page,
5924                                                   (vm_offset_t) dst_po,
5925                                                   (vm_size_t)part_size);
5926                                 if(!dst_page->vmp_dirty){
5927                                         vm_object_lock(dst_object);
5928                                         SET_PAGE_DIRTY(dst_page, TRUE);
5929                                         vm_object_unlock(dst_object);
5930                                 }
5931
5932                         }
5933                 } else {
5934                         part_size = PAGE_SIZE;
5935
5936                         if (result_page == VM_PAGE_NULL)
5937                                 vm_page_zero_fill(dst_page);
5938                         else{
5939                                 vm_object_lock(result_page_object);
5940                                 vm_page_copy(result_page, dst_page);
5941                                 vm_object_unlock(result_page_object);
5942
5943                                 if(!dst_page->vmp_dirty){
5944                                         vm_object_lock(dst_object);
5945                                         SET_PAGE_DIRTY(dst_page, TRUE);
5946                                         vm_object_unlock(dst_object);
5947                                 }
5948                         }
5949
5950                 }
5951
5952                 /*
5953                  *      Unlock everything, and return
5954                  */
5955
5956                 vm_map_unlock_read(dst_map);
5957
5958                 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5959                         vm_fault_copy_cleanup(result_page, src_top_page);
5960                 vm_fault_copy_dst_cleanup(dst_page);
5961
5962                 amount_left -= part_size;
5963                 src_offset += part_size;
5964                 dst_offset += part_size;
5965         } while (amount_left > 0);
5966
5967         RETURN(KERN_SUCCESS);
5968 #undef  RETURN
5969
5970         /*NOTREACHED*/
5971 }
5972
5973 #if     VM_FAULT_CLASSIFY
5974 /*
5975  *      Temporary statistics gathering support.
5976  */
5977
5978 /*
5979  *      Statistics arrays:
5980  */
5981 #define VM_FAULT_TYPES_MAX      5
5982 #define VM_FAULT_LEVEL_MAX      8
5983
5984 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5985
5986 #define VM_FAULT_TYPE_ZERO_FILL 0
5987 #define VM_FAULT_TYPE_MAP_IN    1
5988 #define VM_FAULT_TYPE_PAGER     2
5989 #define VM_FAULT_TYPE_COPY      3
5990 #define VM_FAULT_TYPE_OTHER     4
5991
5992
5993 void
5994 vm_fault_classify(vm_object_t           object,
5995                   vm_object_offset_t    offset,
5996                   vm_prot_t             fault_type)
5997 {
5998         int             type, level = 0;
5999         vm_page_t       m;
6000
6001         while (TRUE) {
6002                 m = vm_page_lookup(object, offset);
6003                 if (m != VM_PAGE_NULL) {
6004                         if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6005                                 type = VM_FAULT_TYPE_OTHER;
6006                                 break;
6007                         }
6008                         if (((fault_type & VM_PROT_WRITE) == 0) ||
6009                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6010                                 type = VM_FAULT_TYPE_MAP_IN;
6011                                 break;
6012                         }
6013                         type = VM_FAULT_TYPE_COPY;
6014                         break;
6015                 }
6016                 else {
6017                         if (object->pager_created) {
6018                                 type = VM_FAULT_TYPE_PAGER;
6019                                 break;
6020                         }
6021                         if (object->shadow == VM_OBJECT_NULL) {
6022                                 type = VM_FAULT_TYPE_ZERO_FILL;
6023                                 break;
6024                         }
6025
6026                         offset += object->vo_shadow_offset;
6027                         object = object->shadow;
6028                         level++;
6029                         continue;
6030                 }
6031         }
6032
6033         if (level > VM_FAULT_LEVEL_MAX)
6034                 level = VM_FAULT_LEVEL_MAX;
6035
6036         vm_fault_stats[type][level] += 1;
6037
6038         return;
6039 }
6040
6041 /* cleanup routine to call from debugger */
6042
6043 void
6044 vm_fault_classify_init(void)
6045 {
6046         int type, level;
6047
6048         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
6049                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
6050                         vm_fault_stats[type][level] = 0;
6051                 }
6052         }
6053
6054         return;
6055 }
6056 #endif  /* VM_FAULT_CLASSIFY */
6057
6058 vm_offset_t
6059 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6060 {
6061         vm_map_entry_t  entry;
6062         vm_object_t     object;
6063         vm_offset_t     object_offset;
6064         vm_page_t       m;
6065         int             compressor_external_state, compressed_count_delta;
6066         int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6067         int             my_fault_type = VM_PROT_READ;
6068         kern_return_t   kr;
6069
6070         if (not_in_kdp) {
6071                 panic("kdp_lightweight_fault called from outside of debugger context");
6072         }
6073
6074         assert(map != VM_MAP_NULL);
6075
6076         assert((cur_target_addr & PAGE_MASK) == 0);
6077         if ((cur_target_addr & PAGE_MASK) != 0) {
6078                 return 0;
6079         }
6080
6081         if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
6082                 return 0;
6083         }
6084
6085         if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
6086                 return 0;
6087         }
6088
6089         if (entry->is_sub_map) {
6090                 return 0;
6091         }
6092
6093         object = VME_OBJECT(entry);
6094         if (object == VM_OBJECT_NULL) {
6095                 return 0;
6096         }
6097
6098         object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
6099
6100         while (TRUE) {
6101                 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
6102                         return 0;
6103                 }
6104
6105                 if (object->pager_created && (object->paging_in_progress ||
6106                         object->activity_in_progress)) {
6107                         return 0;
6108                 }
6109
6110                 m = kdp_vm_page_lookup(object, object_offset);
6111
6112                 if (m != VM_PAGE_NULL) {
6113
6114                         if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
6115                                 return 0;
6116                         }
6117
6118                         if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
6119                                 m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
6120                                 return 0;
6121                         }
6122
6123                         assert(!m->vmp_private);
6124                         if (m->vmp_private) {
6125                                 return 0;
6126                         }
6127
6128                         assert(!m->vmp_fictitious);
6129                         if (m->vmp_fictitious) {
6130                                 return 0;
6131                         }
6132
6133                         assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6134                         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6135                                 return 0;
6136                         }
6137
6138                         return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
6139                 }
6140
6141                 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
6142
6143                 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
6144                         if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
6145                                 kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset),
6146                                                                 kdp_compressor_decompressed_page_ppnum, &my_fault_type,
6147                                                                 compressor_flags, &compressed_count_delta);
6148                                 if (kr == KERN_SUCCESS) {
6149                                         return kdp_compressor_decompressed_page_paddr;
6150                                 } else {
6151                                         return 0;
6152                                 }
6153                         }
6154                 }
6155
6156                 if (object->shadow == VM_OBJECT_NULL) {
6157                         return 0;
6158                 }
6159
6160                 object_offset += object->vo_shadow_offset;
6161                 object = object->shadow;
6162         }
6163
6164 }
6165
6166 /*
6167  * vm_page_validate_cs_fast():
6168  * Performs a few quick checks to determine if the page's code signature
6169  * really needs to be fully validated.  It could:
6170  *      1. have been modified (i.e. automatically tainted),
6171  *      2. have already been validated,
6172  *      3. have already been found to be tainted,
6173  *      4. no longer have a backing store.
6174  * Returns FALSE if the page needs to be fully validated.
6175  */
6176 static boolean_t
6177 vm_page_validate_cs_fast(
6178         vm_page_t       page)
6179 {
6180         vm_object_t     object;
6181
6182         object = VM_PAGE_OBJECT(page);
6183         vm_object_lock_assert_held(object);
6184
6185         if (page->vmp_wpmapped && !page->vmp_cs_tainted) {
6186                 /*
6187                  * This page was mapped for "write" access sometime in the
6188                  * past and could still be modifiable in the future.
6189                  * Consider it tainted.
6190                  * [ If the page was already found to be "tainted", no
6191                  * need to re-validate. ]
6192                  */
6193                 vm_object_lock_assert_exclusive(object);
6194                 page->vmp_cs_validated = TRUE;
6195                 page->vmp_cs_tainted = TRUE;
6196                 if (cs_debug) {
6197                         printf("CODESIGNING: %s: "
6198                                "page %p obj %p off 0x%llx "
6199                                "was modified\n",
6200                                __FUNCTION__,
6201                                page, object, page->vmp_offset);
6202                 }
6203                 vm_cs_validated_dirtied++;
6204         }
6205
6206         if (page->vmp_cs_validated || page->vmp_cs_tainted) {
6207                 return TRUE;
6208         }
6209         vm_object_lock_assert_exclusive(object);
6210
6211 #if CHECK_CS_VALIDATION_BITMAP
6212         kern_return_t kr;
6213
6214         kr = vnode_pager_cs_check_validation_bitmap(
6215                 object->pager,
6216                 page->vmp_offset + object->paging_offset,
6217                 CS_BITMAP_CHECK);
6218         if (kr == KERN_SUCCESS) {
6219                 page->vmp_cs_validated = TRUE;
6220                 page->vmp_cs_tainted = FALSE;
6221                 vm_cs_bitmap_validated++;
6222                 return TRUE;
6223         }
6224 #endif /* CHECK_CS_VALIDATION_BITMAP */
6225
6226         if (!object->alive || object->terminating || object->pager == NULL) {
6227                 /*
6228                  * The object is terminating and we don't have its pager
6229                  * so we can't validate the data...
6230                  */
6231                 return TRUE;
6232         }
6233
6234         /* we need to really validate this page */
6235         vm_object_lock_assert_exclusive(object);
6236         return FALSE;
6237 }
6238
6239 void
6240 vm_page_validate_cs_mapped_slow(
6241         vm_page_t       page,
6242         const void      *kaddr)
6243 {
6244         vm_object_t             object;
6245         memory_object_offset_t  mo_offset;
6246         memory_object_t         pager;
6247         struct vnode            *vnode;
6248         boolean_t               validated;
6249         unsigned                tainted;
6250
6251         assert(page->vmp_busy);
6252         object = VM_PAGE_OBJECT(page);
6253         vm_object_lock_assert_exclusive(object);
6254
6255         vm_cs_validates++;
6256
6257         /*
6258          * Since we get here to validate a page that was brought in by
6259          * the pager, we know that this pager is all setup and ready
6260          * by now.
6261          */
6262         assert(object->code_signed);
6263         assert(!object->internal);
6264         assert(object->pager != NULL);
6265         assert(object->pager_ready);
6266
6267         pager = object->pager;
6268         assert(object->paging_in_progress);
6269         vnode = vnode_pager_lookup_vnode(pager);
6270         mo_offset = page->vmp_offset + object->paging_offset;
6271
6272         /* verify the SHA1 hash for this page */
6273         tainted = 0;
6274         validated = cs_validate_range(vnode,
6275                                       pager,
6276                                       mo_offset,
6277                                       (const void *)((const char *)kaddr),
6278                                       PAGE_SIZE_64,
6279                                       &tainted);
6280
6281         if (tainted & CS_VALIDATE_TAINTED) {
6282                 page->vmp_cs_tainted = TRUE;
6283         }
6284         if (tainted & CS_VALIDATE_NX) {
6285                 page->vmp_cs_nx = TRUE;
6286         }
6287         if (validated) {
6288                 page->vmp_cs_validated = TRUE;
6289         }
6290
6291 #if CHECK_CS_VALIDATION_BITMAP
6292         if (page->vmp_cs_validated && !page->vmp_cs_tainted) {
6293                 vnode_pager_cs_check_validation_bitmap(object->pager,
6294                                                        mo_offset,
6295                                                        CS_BITMAP_SET);
6296         }
6297 #endif /* CHECK_CS_VALIDATION_BITMAP */
6298 }
6299
6300 void
6301 vm_page_validate_cs_mapped(
6302         vm_page_t       page,
6303         const void      *kaddr)
6304 {
6305         if (!vm_page_validate_cs_fast(page)) {
6306                 vm_page_validate_cs_mapped_slow(page, kaddr);
6307         }
6308 }
6309
6310 void
6311 vm_page_validate_cs(
6312         vm_page_t       page)
6313 {
6314         vm_object_t             object;
6315         vm_object_offset_t      offset;
6316         vm_map_offset_t         koffset;
6317         vm_map_size_t           ksize;
6318         vm_offset_t             kaddr;
6319         kern_return_t           kr;
6320         boolean_t               busy_page;
6321         boolean_t               need_unmap;
6322
6323         object = VM_PAGE_OBJECT(page);
6324         vm_object_lock_assert_held(object);
6325
6326         if (vm_page_validate_cs_fast(page)) {
6327                 return;
6328         }
6329         vm_object_lock_assert_exclusive(object);
6330
6331         assert(object->code_signed);
6332         offset = page->vmp_offset;
6333
6334         busy_page = page->vmp_busy;
6335         if (!busy_page) {
6336                 /* keep page busy while we map (and unlock) the VM object */
6337                 page->vmp_busy = TRUE;
6338         }
6339
6340         /*
6341          * Take a paging reference on the VM object
6342          * to protect it from collapse or bypass,
6343          * and keep it from disappearing too.
6344          */
6345         vm_object_paging_begin(object);
6346
6347         /* map the page in the kernel address space */
6348         ksize = PAGE_SIZE_64;
6349         koffset = 0;
6350         need_unmap = FALSE;
6351         kr = vm_paging_map_object(page,
6352                                   object,
6353                                   offset,
6354                                   VM_PROT_READ,
6355                                   FALSE, /* can't unlock object ! */
6356                                   &ksize,
6357                                   &koffset,
6358                                   &need_unmap);
6359         if (kr != KERN_SUCCESS) {
6360                 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
6361         }
6362         kaddr = CAST_DOWN(vm_offset_t, koffset);
6363
6364         /* validate the mapped page */
6365         vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
6366
6367         assert(page->vmp_busy);
6368         assert(object == VM_PAGE_OBJECT(page));
6369         vm_object_lock_assert_exclusive(object);
6370
6371         if (!busy_page) {
6372                 PAGE_WAKEUP_DONE(page);
6373         }
6374         if (need_unmap) {
6375                 /* unmap the map from the kernel address space */
6376                 vm_paging_unmap_object(object, koffset, koffset + ksize);
6377                 koffset = 0;
6378                 ksize = 0;
6379                 kaddr = 0;
6380         }
6381         vm_object_paging_end(object);
6382 }
6383
6384 void
6385 vm_page_validate_cs_mapped_chunk(
6386         vm_page_t       page,
6387         const void      *kaddr,
6388         vm_offset_t     chunk_offset,
6389         vm_size_t       chunk_size,
6390         boolean_t       *validated_p,
6391         unsigned        *tainted_p)
6392 {
6393         vm_object_t             object;
6394         vm_object_offset_t      offset, offset_in_page;
6395         memory_object_t         pager;
6396         struct vnode            *vnode;
6397         boolean_t               validated;
6398         unsigned                tainted;
6399
6400         *validated_p = FALSE;
6401         *tainted_p = 0;
6402
6403         assert(page->vmp_busy);
6404         object = VM_PAGE_OBJECT(page);
6405         vm_object_lock_assert_exclusive(object);
6406
6407         assert(object->code_signed);
6408         offset = page->vmp_offset;
6409
6410         if (!object->alive || object->terminating || object->pager == NULL) {
6411                 /*
6412                  * The object is terminating and we don't have its pager
6413                  * so we can't validate the data...
6414                  */
6415                 return;
6416         }
6417         /*
6418          * Since we get here to validate a page that was brought in by
6419          * the pager, we know that this pager is all setup and ready
6420          * by now.
6421          */
6422         assert(!object->internal);
6423         assert(object->pager != NULL);
6424         assert(object->pager_ready);
6425
6426         pager = object->pager;
6427         assert(object->paging_in_progress);
6428         vnode = vnode_pager_lookup_vnode(pager);
6429
6430         /* verify the signature for this chunk */
6431         offset_in_page = chunk_offset;
6432         assert(offset_in_page < PAGE_SIZE);
6433
6434         tainted = 0;
6435         validated = cs_validate_range(vnode,
6436                                       pager,
6437                                       (object->paging_offset +
6438                                        offset +
6439                                        offset_in_page),
6440                                       (const void *)((const char *)kaddr
6441                                                     + offset_in_page),
6442                                       chunk_size,
6443                                       &tainted);
6444         if (validated) {
6445                 *validated_p = TRUE;
6446         }
6447         if (tainted) {
6448                 *tainted_p = tainted;
6449         }
6450 }
6451
6452 static void vm_rtfrecord_lock(void) {
6453         lck_spin_lock(&vm_rtfr_slock);
6454 }
6455
6456 static void vm_rtfrecord_unlock(void) {
6457         lck_spin_unlock(&vm_rtfr_slock);
6458 }
6459
6460 unsigned int vmrtfaultinfo_bufsz(void) {
6461         return (vmrtf_num_records * sizeof(vm_rtfault_record_t));
6462 }
6463
6464 #include <kern/backtrace.h>
6465
6466 static void vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault) {
6467         uint64_t fend = mach_continuous_time();
6468
6469         uint64_t cfpc = 0;
6470         uint64_t ctid = cthread->thread_id;
6471         uint64_t cupid = get_current_unique_pid();
6472
6473         uintptr_t bpc = 0;
6474         uint32_t bfrs = 0;
6475         bool u64 = false;
6476
6477         /* Capture a single-frame backtrace; this extracts just the program
6478          * counter at the point of the fault into "bpc", and should perform no
6479          * further user stack traversals, thus avoiding copyin()s and further
6480          * faults.
6481          */
6482         int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64);
6483
6484         if ((btr == 0) && (bfrs > 0)) {
6485                 cfpc = bpc;
6486         }
6487
6488         assert((fstart != 0) && fend >= fstart);
6489         vm_rtfrecord_lock();
6490         assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
6491
6492         vmrtfrs.vmrtf_total++;
6493         vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
6494
6495         cvmr->rtfabstime = fstart;
6496         cvmr->rtfduration = fend - fstart;
6497         cvmr->rtfaddr = fault_vaddr;
6498         cvmr->rtfpc = cfpc;
6499         cvmr->rtftype = type_of_fault;
6500         cvmr->rtfupid = cupid;
6501         cvmr->rtftid = ctid;
6502
6503         if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
6504                 vmrtfrs.vmrtfr_curi = 0;
6505         }
6506
6507         vm_rtfrecord_unlock();
6508 }
6509
6510 int vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv) {
6511         vm_rtfault_record_t *cvmrd = vrecords;
6512         size_t residue = vrecordsz;
6513         int numextracted = 0;
6514         boolean_t early_exit = FALSE;
6515
6516         vm_rtfrecord_lock();
6517
6518         for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
6519
6520                 if (residue < sizeof(vm_rtfault_record_t)) {
6521                         early_exit = TRUE;
6522                         break;
6523                 }
6524
6525                 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
6526 #if     DEVELOPMENT || DEBUG
6527                         if (isroot == FALSE) {
6528                                 continue;
6529                         }
6530 #else
6531                         continue;
6532 #endif /* DEVDEBUG */
6533                 }
6534
6535                 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
6536                 cvmrd++;
6537                 residue -= sizeof(vm_rtfault_record_t);
6538                 numextracted++;
6539         }
6540
6541         vm_rtfrecord_unlock();
6542
6543         *vmrtfrv = numextracted;
6544         return (early_exit);
6545 }