osfmk/vm/vm_fault.c

   1 /*
   2  * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * @OSF_COPYRIGHT@
  30  */
  31 /*
  32  * Mach Operating System
  33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34  * All Rights Reserved.
  35  *
  36  * Permission to use, copy, modify and distribute this software and its
  37  * documentation is hereby granted, provided that both the copyright
  38  * notice and this permission notice appear in all copies of the
  39  * software, derivative works or modified versions, and any portions
  40  * thereof, and that both notices appear in supporting documentation.
  41  *
  42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45  *
  46  * Carnegie Mellon requests users of this software to return to
  47  *
  48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49  *  School of Computer Science
  50  *  Carnegie Mellon University
  51  *  Pittsburgh PA 15213-3890
  52  *
  53  * any improvements or extensions that they make and grant Carnegie Mellon
  54  * the rights to redistribute these changes.
  55  */
  56 /*
  57  */
  58 /*
  59  *      File:   vm_fault.c
  60  *      Author: Avadis Tevanian, Jr., Michael Wayne Young
  61  *
  62  *      Page fault handling module.
  63  */
  64
  65 #include <mach_cluster_stats.h>
  66 #include <mach_pagemap.h>
  67 #include <libkern/OSAtomic.h>
  68
  69 #include <mach/mach_types.h>
  70 #include <mach/kern_return.h>
  71 #include <mach/message.h>       /* for error codes */
  72 #include <mach/vm_param.h>
  73 #include <mach/vm_behavior.h>
  74 #include <mach/memory_object.h>
  75 /* For memory_object_data_{request,unlock} */
  76 #include <mach/sdt.h>
  77
  78 #include <kern/kern_types.h>
  79 #include <kern/host_statistics.h>
  80 #include <kern/counters.h>
  81 #include <kern/task.h>
  82 #include <kern/thread.h>
  83 #include <kern/sched_prim.h>
  84 #include <kern/host.h>
  85 #include <kern/xpr.h>
  86 #include <kern/mach_param.h>
  87 #include <kern/macro_help.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/misc_protos.h>
  90 #include <kern/policy_internal.h>
  91
  92 #include <vm/vm_compressor.h>
  93 #include <vm/vm_compressor_pager.h>
  94 #include <vm/vm_fault.h>
  95 #include <vm/vm_map.h>
  96 #include <vm/vm_object.h>
  97 #include <vm/vm_page.h>
  98 #include <vm/vm_kern.h>
  99 #include <vm/pmap.h>
 100 #include <vm/vm_pageout.h>
 101 #include <vm/vm_protos.h>
 102 #include <vm/vm_external.h>
 103 #include <vm/memory_object.h>
 104 #include <vm/vm_purgeable_internal.h>   /* Needed by some vm_page.h macros */
 105 #include <vm/vm_shared_region.h>
 106
 107 #include <sys/codesign.h>
 108 #include <sys/reason.h>
 109 #include <sys/signalvar.h>
 110
 111 #include <san/kasan.h>
 112
 113 #define VM_FAULT_CLASSIFY       0
 114
 115 #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
 116
 117 unsigned int    vm_object_pagein_throttle = 16;
 118
 119 /*
 120  * We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
 121  * kicks in when swap space runs out.  64-bit programs have massive address spaces and can leak enormous amounts
 122  * of memory if they're buggy and can run the system completely out of swap space.  If this happens, we
 123  * impose a hard throttle on them to prevent them from taking the last bit of memory left.  This helps
 124  * keep the UI active so that the user has a chance to kill the offending task before the system
 125  * completely hangs.
 126  *
 127  * The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
 128  * to tasks that appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold
 129  * will be throttled.  The throttling is done by giving the thread that's trying to demand zero a page a
 130  * delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
 131  */
 132
 133 extern void throttle_lowpri_io(int);
 134
 135 extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
 136
 137 uint64_t vm_hard_throttle_threshold;
 138
 139
 140
 141 #define NEED_TO_HARD_THROTTLE_THIS_TASK()       (vm_wants_task_throttled(current_task()) ||     \
 142                                                  ((vm_page_free_count < vm_page_throttle_limit || \
 143                                                    HARD_THROTTLE_LIMIT_REACHED()) && \
 144                                                   proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
 145
 146
 147 #define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
 148 #define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */
 149
 150 #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
 151 #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000
 152
 153
 154 boolean_t current_thread_aborted(void);
 155
 156 /* Forward declarations of internal routines. */
 157 static kern_return_t vm_fault_wire_fast(
 158         vm_map_t        map,
 159         vm_map_offset_t va,
 160         vm_prot_t       prot,
 161         vm_tag_t        wire_tag,
 162         vm_map_entry_t  entry,
 163         pmap_t          pmap,
 164         vm_map_offset_t pmap_addr,
 165         ppnum_t         *physpage_p);
 166
 167 static kern_return_t vm_fault_internal(
 168         vm_map_t        map,
 169         vm_map_offset_t vaddr,
 170         vm_prot_t       caller_prot,
 171         boolean_t       change_wiring,
 172         vm_tag_t        wire_tag,
 173         int             interruptible,
 174         pmap_t          pmap,
 175         vm_map_offset_t pmap_addr,
 176         ppnum_t         *physpage_p);
 177
 178 static void vm_fault_copy_cleanup(
 179         vm_page_t       page,
 180         vm_page_t       top_page);
 181
 182 static void vm_fault_copy_dst_cleanup(
 183         vm_page_t       page);
 184
 185 #if     VM_FAULT_CLASSIFY
 186 extern void vm_fault_classify(vm_object_t       object,
 187     vm_object_offset_t    offset,
 188     vm_prot_t             fault_type);
 189
 190 extern void vm_fault_classify_init(void);
 191 #endif
 192
 193 unsigned long vm_pmap_enter_blocked = 0;
 194 unsigned long vm_pmap_enter_retried = 0;
 195
 196 unsigned long vm_cs_validates = 0;
 197 unsigned long vm_cs_revalidates = 0;
 198 unsigned long vm_cs_query_modified = 0;
 199 unsigned long vm_cs_validated_dirtied = 0;
 200 unsigned long vm_cs_bitmap_validated = 0;
 201 #if PMAP_CS
 202 uint64_t vm_cs_defer_to_pmap_cs = 0;
 203 uint64_t vm_cs_defer_to_pmap_cs_not = 0;
 204 #endif /* PMAP_CS */
 205
 206 void vm_pre_fault(vm_map_offset_t);
 207
 208 extern char *kdp_compressor_decompressed_page;
 209 extern addr64_t kdp_compressor_decompressed_page_paddr;
 210 extern ppnum_t  kdp_compressor_decompressed_page_ppnum;
 211
 212 struct vmrtfr {
 213         int vmrtfr_maxi;
 214         int vmrtfr_curi;
 215         int64_t vmrtf_total;
 216         vm_rtfault_record_t *vm_rtf_records;
 217 } vmrtfrs;
 218 #define VMRTF_DEFAULT_BUFSIZE (4096)
 219 #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
 220 int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT;
 221
 222 static void vm_rtfrecord_lock(void);
 223 static void vm_rtfrecord_unlock(void);
 224 static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
 225
 226 lck_spin_t vm_rtfr_slock;
 227 extern lck_grp_t vm_page_lck_grp_bucket;
 228 extern lck_attr_t vm_page_lck_attr;
 229
 230 /*
 231  *      Routine:        vm_fault_init
 232  *      Purpose:
 233  *              Initialize our private data structures.
 234  */
 235 void
 236 vm_fault_init(void)
 237 {
 238         int i, vm_compressor_temp;
 239         boolean_t need_default_val = TRUE;
 240         /*
 241          * Choose a value for the hard throttle threshold based on the amount of ram.  The threshold is
 242          * computed as a percentage of available memory, and the percentage used is scaled inversely with
 243          * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
 244          * and reduce the value down to 10% for very large memory configurations.  This helps give us a
 245          * definition of a memory hog that makes more sense relative to the amount of ram in the machine.
 246          * The formula here simply uses the number of gigabytes of ram to adjust the percentage.
 247          */
 248
 249         vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024 * 1024 * 1024)), 25)) / 100;
 250
 251         /*
 252          * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
 253          */
 254
 255         if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof(vm_compressor_temp))) {
 256                 for (i = 0; i < VM_PAGER_MAX_MODES; i++) {
 257                         if (vm_compressor_temp > 0 &&
 258                             ((vm_compressor_temp & (1 << i)) == vm_compressor_temp)) {
 259                                 need_default_val = FALSE;
 260                                 vm_compressor_mode = vm_compressor_temp;
 261                                 break;
 262                         }
 263                 }
 264                 if (need_default_val) {
 265                         printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
 266                 }
 267         }
 268         if (need_default_val) {
 269                 /* If no boot arg or incorrect boot arg, try device tree. */
 270                 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
 271         }
 272         printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 273 }
 274
 275 void
 276 vm_rtfault_record_init(void)
 277 {
 278         PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records));
 279
 280         assert(vmrtf_num_records >= 1);
 281         vmrtf_num_records = MAX(vmrtf_num_records, 1);
 282         size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t);
 283         vmrtfrs.vm_rtf_records = kalloc(kallocsz);
 284         bzero(vmrtfrs.vm_rtf_records, kallocsz);
 285         vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
 286         lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
 287 }
 288 /*
 289  *      Routine:        vm_fault_cleanup
 290  *      Purpose:
 291  *              Clean up the result of vm_fault_page.
 292  *      Results:
 293  *              The paging reference for "object" is released.
 294  *              "object" is unlocked.
 295  *              If "top_page" is not null,  "top_page" is
 296  *              freed and the paging reference for the object
 297  *              containing it is released.
 298  *
 299  *      In/out conditions:
 300  *              "object" must be locked.
 301  */
 302 void
 303 vm_fault_cleanup(
 304         vm_object_t     object,
 305         vm_page_t       top_page)
 306 {
 307         vm_object_paging_end(object);
 308         vm_object_unlock(object);
 309
 310         if (top_page != VM_PAGE_NULL) {
 311                 object = VM_PAGE_OBJECT(top_page);
 312
 313                 vm_object_lock(object);
 314                 VM_PAGE_FREE(top_page);
 315                 vm_object_paging_end(object);
 316                 vm_object_unlock(object);
 317         }
 318 }
 319
 320 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 321
 322
 323 boolean_t       vm_page_deactivate_behind = TRUE;
 324 /*
 325  * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
 326  */
 327 #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW     128
 328 #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER    16              /* don't make this too big... */
 329                                                                 /* we use it to size an array on the stack */
 330
 331 int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
 332
 333 #define MAX_SEQUENTIAL_RUN      (1024 * 1024 * 1024)
 334
 335 /*
 336  * vm_page_is_sequential
 337  *
 338  * Determine if sequential access is in progress
 339  * in accordance with the behavior specified.
 340  * Update state to indicate current access pattern.
 341  *
 342  * object must have at least the shared lock held
 343  */
 344 static
 345 void
 346 vm_fault_is_sequential(
 347         vm_object_t             object,
 348         vm_object_offset_t      offset,
 349         vm_behavior_t           behavior)
 350 {
 351         vm_object_offset_t      last_alloc;
 352         int                     sequential;
 353         int                     orig_sequential;
 354
 355         last_alloc = object->last_alloc;
 356         sequential = object->sequential;
 357         orig_sequential = sequential;
 358
 359         switch (behavior) {
 360         case VM_BEHAVIOR_RANDOM:
 361                 /*
 362                  * reset indicator of sequential behavior
 363                  */
 364                 sequential = 0;
 365                 break;
 366
 367         case VM_BEHAVIOR_SEQUENTIAL:
 368                 if (offset && last_alloc == offset - PAGE_SIZE_64) {
 369                         /*
 370                          * advance indicator of sequential behavior
 371                          */
 372                         if (sequential < MAX_SEQUENTIAL_RUN) {
 373                                 sequential += PAGE_SIZE;
 374                         }
 375                 } else {
 376                         /*
 377                          * reset indicator of sequential behavior
 378                          */
 379                         sequential = 0;
 380                 }
 381                 break;
 382
 383         case VM_BEHAVIOR_RSEQNTL:
 384                 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
 385                         /*
 386                          * advance indicator of sequential behavior
 387                          */
 388                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 389                                 sequential -= PAGE_SIZE;
 390                         }
 391                 } else {
 392                         /*
 393                          * reset indicator of sequential behavior
 394                          */
 395                         sequential = 0;
 396                 }
 397                 break;
 398
 399         case VM_BEHAVIOR_DEFAULT:
 400         default:
 401                 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
 402                         /*
 403                          * advance indicator of sequential behavior
 404                          */
 405                         if (sequential < 0) {
 406                                 sequential = 0;
 407                         }
 408                         if (sequential < MAX_SEQUENTIAL_RUN) {
 409                                 sequential += PAGE_SIZE;
 410                         }
 411                 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
 412                         /*
 413                          * advance indicator of sequential behavior
 414                          */
 415                         if (sequential > 0) {
 416                                 sequential = 0;
 417                         }
 418                         if (sequential > -MAX_SEQUENTIAL_RUN) {
 419                                 sequential -= PAGE_SIZE;
 420                         }
 421                 } else {
 422                         /*
 423                          * reset indicator of sequential behavior
 424                          */
 425                         sequential = 0;
 426                 }
 427                 break;
 428         }
 429         if (sequential != orig_sequential) {
 430                 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
 431                         /*
 432                          * if someone else has already updated object->sequential
 433                          * don't bother trying to update it or object->last_alloc
 434                          */
 435                         return;
 436                 }
 437         }
 438         /*
 439          * I'd like to do this with a OSCompareAndSwap64, but that
 440          * doesn't exist for PPC...  however, it shouldn't matter
 441          * that much... last_alloc is maintained so that we can determine
 442          * if a sequential access pattern is taking place... if only
 443          * one thread is banging on this object, no problem with the unprotected
 444          * update... if 2 or more threads are banging away, we run the risk of
 445          * someone seeing a mangled update... however, in the face of multiple
 446          * accesses, no sequential access pattern can develop anyway, so we
 447          * haven't lost any real info.
 448          */
 449         object->last_alloc = offset;
 450 }
 451
 452
 453 int vm_page_deactivate_behind_count = 0;
 454
 455 /*
 456  * vm_page_deactivate_behind
 457  *
 458  * Determine if sequential access is in progress
 459  * in accordance with the behavior specified.  If
 460  * so, compute a potential page to deactivate and
 461  * deactivate it.
 462  *
 463  * object must be locked.
 464  *
 465  * return TRUE if we actually deactivate a page
 466  */
 467 static
 468 boolean_t
 469 vm_fault_deactivate_behind(
 470         vm_object_t             object,
 471         vm_object_offset_t      offset,
 472         vm_behavior_t           behavior)
 473 {
 474         int             n;
 475         int             pages_in_run = 0;
 476         int             max_pages_in_run = 0;
 477         int             sequential_run;
 478         int             sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 479         vm_object_offset_t      run_offset = 0;
 480         vm_object_offset_t      pg_offset = 0;
 481         vm_page_t       m;
 482         vm_page_t       page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
 483
 484         pages_in_run = 0;
 485 #if TRACEFAULTPAGE
 486         dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
 487 #endif
 488
 489         if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
 490                 /*
 491                  * Do not deactivate pages from the kernel object: they
 492                  * are not intended to become pageable.
 493                  * or we've disabled the deactivate behind mechanism
 494                  */
 495                 return FALSE;
 496         }
 497         if ((sequential_run = object->sequential)) {
 498                 if (sequential_run < 0) {
 499                         sequential_behavior = VM_BEHAVIOR_RSEQNTL;
 500                         sequential_run = 0 - sequential_run;
 501                 } else {
 502                         sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
 503                 }
 504         }
 505         switch (behavior) {
 506         case VM_BEHAVIOR_RANDOM:
 507                 break;
 508         case VM_BEHAVIOR_SEQUENTIAL:
 509                 if (sequential_run >= (int)PAGE_SIZE) {
 510                         run_offset = 0 - PAGE_SIZE_64;
 511                         max_pages_in_run = 1;
 512                 }
 513                 break;
 514         case VM_BEHAVIOR_RSEQNTL:
 515                 if (sequential_run >= (int)PAGE_SIZE) {
 516                         run_offset = PAGE_SIZE_64;
 517                         max_pages_in_run = 1;
 518                 }
 519                 break;
 520         case VM_BEHAVIOR_DEFAULT:
 521         default:
 522         {       vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
 523
 524                 /*
 525                  * determine if the run of sequential accesss has been
 526                  * long enough on an object with default access behavior
 527                  * to consider it for deactivation
 528                  */
 529                 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
 530                         /*
 531                          * the comparisons between offset and behind are done
 532                          * in this kind of odd fashion in order to prevent wrap around
 533                          * at the end points
 534                          */
 535                         if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
 536                                 if (offset >= behind) {
 537                                         run_offset = 0 - behind;
 538                                         pg_offset = PAGE_SIZE_64;
 539                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 540                                 }
 541                         } else {
 542                                 if (offset < -behind) {
 543                                         run_offset = behind;
 544                                         pg_offset = 0 - PAGE_SIZE_64;
 545                                         max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
 546                                 }
 547                         }
 548                 }
 549                 break;}
 550         }
 551         for (n = 0; n < max_pages_in_run; n++) {
 552                 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 553
 554                 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
 555                         page_run[pages_in_run++] = m;
 556
 557                         /*
 558                          * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
 559                          *
 560                          * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
 561                          * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 562                          * new reference happens. If no futher references happen on the page after that remote TLB flushes
 563                          * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
 564                          * by pageout_scan, which is just fine since the last reference would have happened quite far
 565                          * in the past (TLB caches don't hang around for very long), and of course could just as easily
 566                          * have happened before we did the deactivate_behind.
 567                          */
 568                         pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 569                 }
 570         }
 571         if (pages_in_run) {
 572                 vm_page_lockspin_queues();
 573
 574                 for (n = 0; n < pages_in_run; n++) {
 575                         m = page_run[n];
 576
 577                         vm_page_deactivate_internal(m, FALSE);
 578
 579                         vm_page_deactivate_behind_count++;
 580 #if TRACEFAULTPAGE
 581                         dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 582 #endif
 583                 }
 584                 vm_page_unlock_queues();
 585
 586                 return TRUE;
 587         }
 588         return FALSE;
 589 }
 590
 591
 592 #if (DEVELOPMENT || DEBUG)
 593 uint32_t        vm_page_creation_throttled_hard = 0;
 594 uint32_t        vm_page_creation_throttled_soft = 0;
 595 uint64_t        vm_page_creation_throttle_avoided = 0;
 596 #endif /* DEVELOPMENT || DEBUG */
 597
 598 static int
 599 vm_page_throttled(boolean_t page_kept)
 600 {
 601         clock_sec_t     elapsed_sec;
 602         clock_sec_t     tv_sec;
 603         clock_usec_t    tv_usec;
 604
 605         thread_t thread = current_thread();
 606
 607         if (thread->options & TH_OPT_VMPRIV) {
 608                 return 0;
 609         }
 610
 611         if (thread->t_page_creation_throttled) {
 612                 thread->t_page_creation_throttled = 0;
 613
 614                 if (page_kept == FALSE) {
 615                         goto no_throttle;
 616                 }
 617         }
 618         if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
 619 #if (DEVELOPMENT || DEBUG)
 620                 thread->t_page_creation_throttled_hard++;
 621                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 622 #endif /* DEVELOPMENT || DEBUG */
 623                 return HARD_THROTTLE_DELAY;
 624         }
 625
 626         if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
 627             thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
 628                 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
 629 #if (DEVELOPMENT || DEBUG)
 630                         OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
 631 #endif
 632                         goto no_throttle;
 633                 }
 634                 clock_get_system_microtime(&tv_sec, &tv_usec);
 635
 636                 elapsed_sec = tv_sec - thread->t_page_creation_time;
 637
 638                 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
 639                     (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
 640                         if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
 641                                 /*
 642                                  * we'll reset our stats to give a well behaved app
 643                                  * that was unlucky enough to accumulate a bunch of pages
 644                                  * over a long period of time a chance to get out of
 645                                  * the throttled state... we reset the counter and timestamp
 646                                  * so that if it stays under the rate limit for the next second
 647                                  * it will be back in our good graces... if it exceeds it, it
 648                                  * will remain in the throttled state
 649                                  */
 650                                 thread->t_page_creation_time = tv_sec;
 651                                 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
 652                         }
 653                         VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
 654
 655                         thread->t_page_creation_throttled = 1;
 656
 657                         if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
 658 #if (DEVELOPMENT || DEBUG)
 659                                 thread->t_page_creation_throttled_hard++;
 660                                 OSAddAtomic(1, &vm_page_creation_throttled_hard);
 661 #endif /* DEVELOPMENT || DEBUG */
 662                                 return HARD_THROTTLE_DELAY;
 663                         } else {
 664 #if (DEVELOPMENT || DEBUG)
 665                                 thread->t_page_creation_throttled_soft++;
 666                                 OSAddAtomic(1, &vm_page_creation_throttled_soft);
 667 #endif /* DEVELOPMENT || DEBUG */
 668                                 return SOFT_THROTTLE_DELAY;
 669                         }
 670                 }
 671                 thread->t_page_creation_time = tv_sec;
 672                 thread->t_page_creation_count = 0;
 673         }
 674 no_throttle:
 675         thread->t_page_creation_count++;
 676
 677         return 0;
 678 }
 679
 680
 681 /*
 682  * check for various conditions that would
 683  * prevent us from creating a ZF page...
 684  * cleanup is based on being called from vm_fault_page
 685  *
 686  * object must be locked
 687  * object == m->vmp_object
 688  */
 689 static vm_fault_return_t
 690 vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
 691 {
 692         int throttle_delay;
 693
 694         if (object->shadow_severed ||
 695             VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
 696                 /*
 697                  * Either:
 698                  * 1. the shadow chain was severed,
 699                  * 2. the purgeable object is volatile or empty and is marked
 700                  *    to fault on access while volatile.
 701                  * Just have to return an error at this point
 702                  */
 703                 if (m != VM_PAGE_NULL) {
 704                         VM_PAGE_FREE(m);
 705                 }
 706                 vm_fault_cleanup(object, first_m);
 707
 708                 thread_interrupt_level(interruptible_state);
 709
 710                 return VM_FAULT_MEMORY_ERROR;
 711         }
 712         if (page_throttle == TRUE) {
 713                 if ((throttle_delay = vm_page_throttled(FALSE))) {
 714                         /*
 715                          * we're throttling zero-fills...
 716                          * treat this as if we couldn't grab a page
 717                          */
 718                         if (m != VM_PAGE_NULL) {
 719                                 VM_PAGE_FREE(m);
 720                         }
 721                         vm_fault_cleanup(object, first_m);
 722
 723                         VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
 724
 725                         delay(throttle_delay);
 726
 727                         if (current_thread_aborted()) {
 728                                 thread_interrupt_level(interruptible_state);
 729                                 return VM_FAULT_INTERRUPTED;
 730                         }
 731                         thread_interrupt_level(interruptible_state);
 732
 733                         return VM_FAULT_MEMORY_SHORTAGE;
 734                 }
 735         }
 736         return VM_FAULT_SUCCESS;
 737 }
 738
 739
 740 /*
 741  * Do the work to zero-fill a page and
 742  * inject it into the correct paging queue.
 743  *
 744  * m->vmp_object must be locked
 745  * page queue lock must NOT be held
      *
      * Parameters:
      *      m               page being set up for the fault; the owning
      *                      object is looked up with VM_PAGE_OBJECT(m)
      *      no_zero_fill    when == TRUE the page is not zeroed and the
      *                      fault is reported as DBG_NZF_PAGE_FAULT
      *
      * Returns:
      *      DBG_ZERO_FILL_FAULT if the page was zeroed here,
      *      DBG_NZF_PAGE_FAULT if no_zero_fill == TRUE.
 746  */
 747 static int
 748 vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
 749 {
 750         int my_fault = DBG_ZERO_FILL_FAULT;
 751         vm_object_t     object;
 752
 753         object = VM_PAGE_OBJECT(m);
 754
 755         /*
 756          * This is a zero-fill page fault...
 757          *
 758          * Checking the page lock is a waste of
 759          * time;  this page was absent, so
 760          * it can't be page locked by a pager.
 761          *
 762          * we also consider it undefined
 763          * with respect to instruction
 764          * execution.  i.e. it is the responsibility
 765          * of higher layers to call for an instruction
 766          * sync after changing the contents and before
 767          * sending a program into this area.  We
 768          * choose this approach for performance
 769          */
 770         m->vmp_pmapped = TRUE;
 771
             /*
              * reset the page's vmp_cs_* bits (presumably its
              * code-signing state -- TODO confirm)
              */
 772         m->vmp_cs_validated = FALSE;
 773         m->vmp_cs_tainted = FALSE;
 774         m->vmp_cs_nx = FALSE;
 775
 776         if (no_zero_fill == TRUE) {
 777                 my_fault = DBG_NZF_PAGE_FAULT;
 778
                     /*
                      * a page that is still marked absent and busy is
                      * handed back as is: no zeroing and no
                      * paging-queue placement (vm_fault_page() sets
                      * vmp_absent before calling here when
                      * fault_info->mark_zf_absent is set)
                      */
 779                 if (m->vmp_absent && m->vmp_busy) {
 780                         return my_fault;
 781                 }
 782         } else {
 783                 vm_page_zero_fill(m);
 784
 785                 VM_STAT_INCR(zero_fill_count);
 786                 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
 787         }
 788         assert(!m->vmp_laundry);
 789         assert(object != kernel_object);
 790         //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
 791
             /*
              * with dynamic paging disabled, pages of objects whose
              * purgable state is DENY, NONVOLATILE or VOLATILE go on
              * the throttled queue
              */
 792         if (!VM_DYNAMIC_PAGING_ENABLED() &&
 793             (object->purgable == VM_PURGABLE_DENY ||
 794             object->purgable == VM_PURGABLE_NONVOLATILE ||
 795             object->purgable == VM_PURGABLE_VOLATILE)) {
 796                 vm_page_lockspin_queues();
 797
                     /* re-test now that the page queues lock is held */
 798                 if (!VM_DYNAMIC_PAGING_ENABLED()) {
 799                         assert(!VM_PAGE_WIRED(m));
 800
 801                         /*
 802                          * can't be on the pageout queue since we don't
 803                          * have a pager to try and clean to
 804                          */
 805                         vm_page_queues_remove(m, TRUE);
 806                         vm_page_check_pageable_safe(m);
 807                         vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
 808                         m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
 809                         vm_page_throttled_count++;
 810                 }
 811                 vm_page_unlock_queues();
 812         }
 813         return my_fault;
 814 }
 815
 816
 817 /*
 818  *      Routine:        vm_fault_page
 819  *      Purpose:
 820  *              Find the resident page for the virtual memory
 821  *              specified by the given virtual memory object
 822  *              and offset.
 823  *      Additional arguments:
 824  *              The required permissions for the page are given
 825  *              in "fault_type".  Desired permissions are included
 826  *              in "protection".
 827  *              fault_info is passed along to determine pagein cluster
 828  *              limits... it contains the expected reference pattern,
 829  *              cluster size if available, etc...
 830  *
 831  *              If the desired page is known to be resident (for
 832  *              example, because it was previously wired down), asserting
 833  *              the "must_be_resident" parameter will speed the search.
 834  *
 835  *              If the operation can be interrupted (by thread_abort
 836  *              or thread_terminate), then the "interruptible" field
 837  *              of "fault_info" should be set accordingly.
 838  *
 839  *      Results:
 840  *              The page containing the proper data is returned
 841  *              in "result_page".
 842  *
 843  *      In/out conditions:
 844  *              The source object must be locked and referenced,
 845  *              and must donate one paging reference.  The reference
 846  *              is not affected.  The paging reference and lock are
 847  *              consumed.
 848  *
 849  *              If the call succeeds, the object in which "result_page"
 850  *              resides is left locked and holding a paging reference.
 851  *              If this is not the original object, a busy page in the
 852  *              original object is returned in "top_page", to prevent other
 853  *              callers from pursuing this same data, along with a paging
 854  *              reference for the original object.  The "top_page" should
 855  *              be destroyed when this guarantee is no longer required.
 856  *              The "result_page" is also left busy.  It is not removed
 857  *              from the pageout queues.
 858  *      Special Case:
 859  *              A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
 860  *              fault succeeded but there's no VM page (i.e. the VM object
 861  *              does not actually hold VM pages, but device memory or
 862  *              large pages).  The object is still locked and we still hold a
 863  *              paging_in_progress reference.
 864  */
 865 unsigned int vm_fault_page_blocked_access = 0;     /* times vm_fault_page() had to wait for object->blocked_access to clear */
 866 unsigned int vm_fault_page_forced_retry = 0;       /* NOTE(review): no update visible in this part of the file; presumably counts forced fault retries -- confirm */
 867
 868 vm_fault_return_t
 869 vm_fault_page(
 870         /* Arguments: */
 871         vm_object_t     first_object,   /* Object to begin search */
 872         vm_object_offset_t first_offset,        /* Offset into object */
 873         vm_prot_t       fault_type,     /* What access is requested */
 874         boolean_t       must_be_resident,/* Must page be resident? */
 875         boolean_t       caller_lookup,  /* caller looked up page */
 876         /* Modifies in place: */
 877         vm_prot_t       *protection,    /* Protection for mapping */
 878         vm_page_t       *result_page,   /* Page found, if successful */
 879         /* Returns: */
 880         vm_page_t       *top_page,      /* Page in top object, if
 881                                          * not result_page.  */
 882         int             *type_of_fault, /* if non-null, fill in with type of fault
 883                                          * COW, zero-fill, etc... returned in trace point */
 884         /* More arguments: */
 885         kern_return_t   *error_code,    /* code if page is in error */
 886         boolean_t       no_zero_fill,   /* don't zero fill absent pages */
 887         boolean_t       data_supply,    /* treat as data_supply if
 888                                          * it is a write fault and a full
 889                                          * page is provided */
 890         vm_object_fault_info_t fault_info)
 891 {
 892         vm_page_t               m;
 893         vm_object_t             object;
 894         vm_object_offset_t      offset;
 895         vm_page_t               first_m;
 896         vm_object_t             next_object;
 897         vm_object_t             copy_object;
 898         boolean_t               look_for_page;
 899         boolean_t               force_fault_retry = FALSE;
 900         vm_prot_t               access_required = fault_type;
 901         vm_prot_t               wants_copy_flag;
 902         kern_return_t           wait_result;
 903         wait_interrupt_t        interruptible_state;
 904         boolean_t               data_already_requested = FALSE;
 905         vm_behavior_t           orig_behavior;
 906         vm_size_t               orig_cluster_size;
 907         vm_fault_return_t       error;
 908         int                     my_fault;
 909         uint32_t                try_failed_count;
 910         int                     interruptible; /* how may fault be interrupted? */
 911         int                     external_state = VM_EXTERNAL_STATE_UNKNOWN;
 912         memory_object_t         pager;
 913         vm_fault_return_t       retval;
 914         int                     grab_options;
 915
 916 /*
 917  * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
 918  * marked as paged out in the compressor pager or the pager doesn't exist.
 919  * Note also that if the pager for an internal object
 920  * has not been created, the pager is not invoked regardless of the value
 921  * of MUST_ASK_PAGER().
 922  *
 923  * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
 924  * is marked as paged out in the compressor pager.
 925  * PAGED_OUT() is used to determine if a page has already been pushed
 926  * into a copy object in order to avoid a redundant page out operation.
 927  */
 928 #define MUST_ASK_PAGER(o, f, s)                                 \
 929         ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
 930
 931 #define PAGED_OUT(o, f) \
 932         (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
 933
 934 /*
 935  *      Recovery actions
 936  */
 937 #define RELEASE_PAGE(m)                                 \
 938         MACRO_BEGIN                                     \
 939         PAGE_WAKEUP_DONE(m);                            \
 940         if ( !VM_PAGE_PAGEABLE(m)) {                    \
 941                 vm_page_lockspin_queues();              \
 942                 if ( !VM_PAGE_PAGEABLE(m)) {            \
 943                         if (VM_CONFIG_COMPRESSOR_IS_ACTIVE)     \
 944                                 vm_page_deactivate(m);          \
 945                         else                                    \
 946                                 vm_page_activate(m);            \
 947                 }                                               \
 948                 vm_page_unlock_queues();                        \
 949         }                                                       \
 950         MACRO_END
 951
 952 #if TRACEFAULTPAGE
 953         dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
 954 #endif
 955
 956         interruptible = fault_info->interruptible;
 957         interruptible_state = thread_interrupt_level(interruptible);
 958
 959         /*
 960          *      INVARIANTS (through entire routine):
 961          *
 962          *      1)      At all times, we must either have the object
 963          *              lock or a busy page in some object to prevent
 964          *              some other thread from trying to bring in
 965          *              the same page.
 966          *
 967          *              Note that we cannot hold any locks during the
 968          *              pager access or when waiting for memory, so
 969          *              we use a busy page then.
 970          *
 971          *      2)      To prevent another thread from racing us down the
 972          *              shadow chain and entering a new page in the top
 973          *              object before we do, we must keep a busy page in
 974          *              the top object while following the shadow chain.
 975          *
 976          *      3)      We must increment paging_in_progress on any object
 977          *              for which we have a busy page before dropping
 978          *              the object lock
 979          *
 980          *      4)      We leave busy pages on the pageout queues.
 981          *              If the pageout daemon comes across a busy page,
 982          *              it will remove the page from the pageout queues.
 983          */
 984
 985         object = first_object;
 986         offset = first_offset;
 987         first_m = VM_PAGE_NULL;
 988         access_required = fault_type;
 989
 990
 991         XPR(XPR_VM_FAULT,
 992             "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
 993             object, offset, fault_type, *protection, 0);
 994
 995         /*
 996          * default type of fault
 997          */
 998         my_fault = DBG_CACHE_HIT_FAULT;
 999
1000         while (TRUE) {
1001 #if TRACEFAULTPAGE
1002                 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1003 #endif
1004
1005                 grab_options = 0;
1006 #if CONFIG_SECLUDED_MEMORY
1007                 if (object->can_grab_secluded) {
1008                         grab_options |= VM_PAGE_GRAB_SECLUDED;
1009                 }
1010 #endif /* CONFIG_SECLUDED_MEMORY */
1011
1012                 if (!object->alive) {
1013                         /*
1014                          * object is no longer valid
1015                          * clean up and return error
1016                          */
1017                         vm_fault_cleanup(object, first_m);
1018                         thread_interrupt_level(interruptible_state);
1019
1020                         return VM_FAULT_MEMORY_ERROR;
1021                 }
1022
1023                 if (!object->pager_created && object->phys_contiguous) {
1024                         /*
1025                          * A physically-contiguous object without a pager:
1026                          * must be a "large page" object.  We do not deal
1027                          * with VM pages for this object.
1028                          */
1029                         caller_lookup = FALSE;
1030                         m = VM_PAGE_NULL;
1031                         goto phys_contig_object;
1032                 }
1033
1034                 if (object->blocked_access) {
1035                         /*
1036                          * Access to this VM object has been blocked.
1037                          * Replace our "paging_in_progress" reference with
1038                          * an "activity_in_progress" reference and wait for
1039                          * access to be unblocked.
1040                          */
1041                         caller_lookup = FALSE; /* no longer valid after sleep */
1042                         vm_object_activity_begin(object);
1043                         vm_object_paging_end(object);
1044                         while (object->blocked_access) {
1045                                 vm_object_sleep(object,
1046                                     VM_OBJECT_EVENT_UNBLOCKED,
1047                                     THREAD_UNINT);
1048                         }
1049                         vm_fault_page_blocked_access++;
1050                         vm_object_paging_begin(object);
1051                         vm_object_activity_end(object);
1052                 }
1053
1054                 /*
1055                  * See whether the page at 'offset' is resident
1056                  */
1057                 if (caller_lookup == TRUE) {
1058                         /*
1059                          * The caller has already looked up the page
1060                          * and gave us the result in "result_page".
1061                          * We can use this for the first lookup but
1062                          * it loses its validity as soon as we unlock
1063                          * the object.
1064                          */
1065                         m = *result_page;
1066                         caller_lookup = FALSE; /* no longer valid after that */
1067                 } else {
1068                         m = vm_page_lookup(object, offset);
1069                 }
1070 #if TRACEFAULTPAGE
1071                 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1072 #endif
1073                 if (m != VM_PAGE_NULL) {
1074                         if (m->vmp_busy) {
1075                                 /*
1076                                  * The page is being brought in,
1077                                  * wait for it and then retry.
1078                                  */
1079 #if TRACEFAULTPAGE
1080                                 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1081 #endif
1082                                 wait_result = PAGE_SLEEP(object, m, interruptible);
1083
1084                                 XPR(XPR_VM_FAULT,
1085                                     "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1086                                     object, offset,
1087                                     m, 0, 0);
1088                                 counter(c_vm_fault_page_block_busy_kernel++);
1089
1090                                 if (wait_result != THREAD_AWAKENED) {
1091                                         vm_fault_cleanup(object, first_m);
1092                                         thread_interrupt_level(interruptible_state);
1093
1094                                         if (wait_result == THREAD_RESTART) {
1095                                                 return VM_FAULT_RETRY;
1096                                         } else {
1097                                                 return VM_FAULT_INTERRUPTED;
1098                                         }
1099                                 }
1100                                 continue;
1101                         }
1102                         if (m->vmp_laundry) {
1103                                 m->vmp_free_when_done = FALSE;
1104
1105                                 if (!m->vmp_cleaning) {
1106                                         vm_pageout_steal_laundry(m, FALSE);
1107                                 }
1108                         }
1109                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1110                                 /*
1111                                  * Guard page: off limits !
1112                                  */
1113                                 if (fault_type == VM_PROT_NONE) {
1114                                         /*
1115                                          * The fault is not requesting any
1116                                          * access to the guard page, so it must
1117                                          * be just to wire or unwire it.
1118                                          * Let's pretend it succeeded...
1119                                          */
1120                                         m->vmp_busy = TRUE;
1121                                         *result_page = m;
1122                                         assert(first_m == VM_PAGE_NULL);
1123                                         *top_page = first_m;
1124                                         if (type_of_fault) {
1125                                                 *type_of_fault = DBG_GUARD_FAULT;
1126                                         }
1127                                         thread_interrupt_level(interruptible_state);
1128                                         return VM_FAULT_SUCCESS;
1129                                 } else {
1130                                         /*
1131                                          * The fault requests access to the
1132                                          * guard page: let's deny that !
1133                                          */
1134                                         vm_fault_cleanup(object, first_m);
1135                                         thread_interrupt_level(interruptible_state);
1136                                         return VM_FAULT_MEMORY_ERROR;
1137                                 }
1138                         }
1139
1140                         if (m->vmp_error) {
1141                                 /*
1142                                  * The page is in error, give up now.
1143                                  */
1144 #if TRACEFAULTPAGE
1145                                 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code);      /* (TEST/DEBUG) */
1146 #endif
1147                                 if (error_code) {
1148                                         *error_code = KERN_MEMORY_ERROR;
1149                                 }
1150                                 VM_PAGE_FREE(m);
1151
1152                                 vm_fault_cleanup(object, first_m);
1153                                 thread_interrupt_level(interruptible_state);
1154
1155                                 return VM_FAULT_MEMORY_ERROR;
1156                         }
1157                         if (m->vmp_restart) {
1158                                 /*
1159                                  * The pager wants us to restart
1160                                  * at the top of the chain,
1161                                  * typically because it has moved the
1162                                  * page to another pager, then do so.
1163                                  */
1164 #if TRACEFAULTPAGE
1165                                 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1166 #endif
1167                                 VM_PAGE_FREE(m);
1168
1169                                 vm_fault_cleanup(object, first_m);
1170                                 thread_interrupt_level(interruptible_state);
1171
1172                                 return VM_FAULT_RETRY;
1173                         }
1174                         if (m->vmp_absent) {
1175                                 /*
1176                                  * The page isn't busy, but is absent,
1177                                  * therefore it's deemed "unavailable".
1178                                  *
1179                                  * Remove the non-existent page (unless it's
1180                                  * in the top object) and move on down to the
1181                                  * next object (if there is one).
1182                                  */
1183 #if TRACEFAULTPAGE
1184                                 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow);  /* (TEST/DEBUG) */
1185 #endif
1186                                 next_object = object->shadow;
1187
1188                                 if (next_object == VM_OBJECT_NULL) {
1189                                         /*
1190                                          * Absent page at bottom of shadow
1191                                          * chain; zero fill the page we left
1192                                          * busy in the first object, and free
1193                                          * the absent page.
1194                                          */
1195                                         assert(!must_be_resident);
1196
1197                                         /*
1198                                          * check for any conditions that prevent
1199                                          * us from creating a new zero-fill page
1200                                          * vm_fault_check will do all of the
1201                                          * fault cleanup in the case of an error condition
1202                                          * including resetting the thread_interrupt_level
1203                                          */
1204                                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1205
1206                                         if (error != VM_FAULT_SUCCESS) {
1207                                                 return error;
1208                                         }
1209
1210                                         XPR(XPR_VM_FAULT,
1211                                             "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1212                                             object, offset,
1213                                             m,
1214                                             first_object, 0);
1215
1216                                         if (object != first_object) {
1217                                                 /*
1218                                                  * free the absent page we just found
1219                                                  */
1220                                                 VM_PAGE_FREE(m);
1221
1222                                                 /*
1223                                                  * drop reference and lock on current object
1224                                                  */
1225                                                 vm_object_paging_end(object);
1226                                                 vm_object_unlock(object);
1227
1228                                                 /*
1229                                                  * grab the original page we
1230                                                  * 'soldered' in place and
1231                                                  * retake lock on 'first_object'
1232                                                  */
1233                                                 m = first_m;
1234                                                 first_m = VM_PAGE_NULL;
1235
1236                                                 object = first_object;
1237                                                 offset = first_offset;
1238
1239                                                 vm_object_lock(object);
1240                                         } else {
1241                                                 /*
1242                                                  * we're going to use the absent page we just found
1243                                                  * so convert it to a 'busy' page
1244                                                  */
1245                                                 m->vmp_absent = FALSE;
1246                                                 m->vmp_busy = TRUE;
1247                                         }
1248                                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1249                                                 m->vmp_absent = TRUE;
1250                                         }
1251                                         /*
1252                                          * zero-fill the page and put it on
1253                                          * the correct paging queue
1254                                          */
1255                                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1256
1257                                         break;
1258                                 } else {
1259                                         if (must_be_resident) {
1260                                                 vm_object_paging_end(object);
1261                                         } else if (object != first_object) {
1262                                                 vm_object_paging_end(object);
1263                                                 VM_PAGE_FREE(m);
1264                                         } else {
1265                                                 first_m = m;
1266                                                 m->vmp_absent = FALSE;
1267                                                 m->vmp_busy = TRUE;
1268
1269                                                 vm_page_lockspin_queues();
1270                                                 vm_page_queues_remove(m, FALSE);
1271                                                 vm_page_unlock_queues();
1272                                         }
1273                                         XPR(XPR_VM_FAULT,
1274                                             "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1275                                             object, offset,
1276                                             next_object,
1277                                             offset + object->vo_shadow_offset, 0);
1278
1279                                         offset += object->vo_shadow_offset;
1280                                         fault_info->lo_offset += object->vo_shadow_offset;
1281                                         fault_info->hi_offset += object->vo_shadow_offset;
1282                                         access_required = VM_PROT_READ;
1283
1284                                         vm_object_lock(next_object);
1285                                         vm_object_unlock(object);
1286                                         object = next_object;
1287                                         vm_object_paging_begin(object);
1288
1289                                         /*
1290                                          * reset to default type of fault
1291                                          */
1292                                         my_fault = DBG_CACHE_HIT_FAULT;
1293
1294                                         continue;
1295                                 }
1296                         }
1297                         if ((m->vmp_cleaning)
1298                             && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1299                             && (fault_type & VM_PROT_WRITE)) {
1300                                 /*
1301                                  * This is a copy-on-write fault that will
1302                                  * cause us to revoke access to this page, but
1303                                  * this page is in the process of being cleaned
1304                                  * in a clustered pageout. We must wait until
1305                                  * the cleaning operation completes before
1306                                  * revoking access to the original page,
1307                                  * otherwise we might attempt to remove a
1308                                  * wired mapping.
1309                                  */
1310 #if TRACEFAULTPAGE
1311                                 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset);  /* (TEST/DEBUG) */
1312 #endif
1313                                 XPR(XPR_VM_FAULT,
1314                                     "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1315                                     object, offset,
1316                                     m, 0, 0);
1317                                 /*
1318                                  * take an extra ref so that object won't die
1319                                  */
1320                                 vm_object_reference_locked(object);
1321
1322                                 vm_fault_cleanup(object, first_m);
1323
1324                                 counter(c_vm_fault_page_block_backoff_kernel++);
1325                                 vm_object_lock(object);
1326                                 assert(object->ref_count > 0);
1327
1328                                 m = vm_page_lookup(object, offset);
1329
1330                                 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1331                                         PAGE_ASSERT_WAIT(m, interruptible);
1332
1333                                         vm_object_unlock(object);
1334                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1335                                         vm_object_deallocate(object);
1336
1337                                         goto backoff;
1338                                 } else {
1339                                         vm_object_unlock(object);
1340
1341                                         vm_object_deallocate(object);
1342                                         thread_interrupt_level(interruptible_state);
1343
1344                                         return VM_FAULT_RETRY;
1345                                 }
1346                         }
1347                         if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1348                             !(fault_info != NULL && fault_info->stealth)) {
1349                                 /*
1350                                  * If we were passed a non-NULL pointer for
1351                                  * "type_of_fault", then we came from
1352                                  * vm_fault... we'll let it deal with
1353                                  * this condition, since it
1354                                  * needs to see m->vmp_speculative to correctly
1355                                  * account the pageins, otherwise...
1356                                  * take it off the speculative queue, we'll
1357                                  * let the caller of vm_fault_page deal
1358                                  * with getting it onto the correct queue
1359                                  *
1360                                  * If the caller specified in fault_info that
1361                                  * it wants a "stealth" fault, we also leave
1362                                  * the page in the speculative queue.
1363                                  */
1364                                 vm_page_lockspin_queues();
1365                                 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1366                                         vm_page_queues_remove(m, FALSE);
1367                                 }
1368                                 vm_page_unlock_queues();
1369                         }
1370                         assert(object == VM_PAGE_OBJECT(m));
1371
1372                         if (object->code_signed) {
1373                                 /*
1374                                  * CODE SIGNING:
1375                                  * We just paged in a page from a signed
1376                                  * memory object but we don't need to
1377                                  * validate it now.  We'll validate it if
1378                                  * and when it gets mapped into a user address
1379                                  * space for the first time or when the page
1380                                  * gets copied to another object as a result
1381                                  * of a copy-on-write.
1382                                  */
1383                         }
1384
1385                         /*
1386                          * We mark the page busy and leave it on
1387                          * the pageout queues.  If the pageout
1388                          * daemon comes across it, then it will
1389                          * remove the page from the queue, but not the object
1390                          */
1391 #if TRACEFAULTPAGE
1392                         dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1393 #endif
1394                         XPR(XPR_VM_FAULT,
1395                             "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1396                             object, offset, m, 0, 0);
1397                         assert(!m->vmp_busy);
1398                         assert(!m->vmp_absent);
1399
1400                         m->vmp_busy = TRUE;
1401                         break;
1402                 }
1403
1404
1405                 /*
1406                  * we get here when there is no page present in the object at
1407                  * the offset we're interested in... we'll allocate a page
1408                  * at this point if the pager associated with
1409                  * this object can provide the data or we're the top object...
1410                  * object is locked;  m == NULL
1411                  */
1412
1413                 if (must_be_resident) {
1414                         if (fault_type == VM_PROT_NONE &&
1415                             object == kernel_object) {
1416                                 /*
1417                                  * We've been called from vm_fault_unwire()
1418                                  * while removing a map entry that was allocated
1419                                  * with KMA_KOBJECT and KMA_VAONLY.  This page
1420                                  * is not present and there's nothing more to
1421                                  * do here (nothing to unwire).
1422                                  */
1423                                 vm_fault_cleanup(object, first_m);
1424                                 thread_interrupt_level(interruptible_state);
1425
1426                                 return VM_FAULT_MEMORY_ERROR;
1427                         }
1428
1429                         goto dont_look_for_page;
1430                 }
1431
1432                 /* Don't expect to fault pages into the kernel object. */
1433                 assert(object != kernel_object);
1434
1435                 data_supply = FALSE;
1436
1437                 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1438
1439 #if TRACEFAULTPAGE
1440                 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object);      /* (TEST/DEBUG) */
1441 #endif
1442                 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1443                         /*
1444                          * Allocate a new page for this object/offset pair as a placeholder
1445                          */
1446                         m = vm_page_grab_options(grab_options);
1447 #if TRACEFAULTPAGE
1448                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1449 #endif
1450                         if (m == VM_PAGE_NULL) {
1451                                 vm_fault_cleanup(object, first_m);
1452                                 thread_interrupt_level(interruptible_state);
1453
1454                                 return VM_FAULT_MEMORY_SHORTAGE;
1455                         }
1456
1457                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1458                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1459                         } else {
1460                                 vm_page_insert(m, object, offset);
1461                         }
1462                 }
1463                 if (look_for_page) {
1464                         kern_return_t   rc;
1465                         int             my_fault_type;
1466
1467                         /*
1468                          *      If the memory manager is not ready, we
1469                          *      cannot make requests.
1470                          */
1471                         if (!object->pager_ready) {
1472 #if TRACEFAULTPAGE
1473                                 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0);       /* (TEST/DEBUG) */
1474 #endif
1475                                 if (m != VM_PAGE_NULL) {
1476                                         VM_PAGE_FREE(m);
1477                                 }
1478
1479                                 XPR(XPR_VM_FAULT,
1480                                     "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1481                                     object, offset, 0, 0, 0);
1482
1483                                 /*
1484                                  * take an extra ref so object won't die
1485                                  */
1486                                 vm_object_reference_locked(object);
1487                                 vm_fault_cleanup(object, first_m);
1488                                 counter(c_vm_fault_page_block_backoff_kernel++);
1489
1490                                 vm_object_lock(object);
1491                                 assert(object->ref_count > 0);
1492
1493                                 if (!object->pager_ready) {
1494                                         wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1495
1496                                         vm_object_unlock(object);
1497                                         if (wait_result == THREAD_WAITING) {
1498                                                 wait_result = thread_block(THREAD_CONTINUE_NULL);
1499                                         }
1500                                         vm_object_deallocate(object);
1501
1502                                         goto backoff;
1503                                 } else {
1504                                         vm_object_unlock(object);
1505                                         vm_object_deallocate(object);
1506                                         thread_interrupt_level(interruptible_state);
1507
1508                                         return VM_FAULT_RETRY;
1509                                 }
1510                         }
1511                         if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1512                                 /*
1513                                  * If there are too many outstanding page
1514                                  * requests pending on this external object, we
1515                                  * wait for them to be resolved now.
1516                                  */
1517 #if TRACEFAULTPAGE
1518                                 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0);       /* (TEST/DEBUG) */
1519 #endif
1520                                 if (m != VM_PAGE_NULL) {
1521                                         VM_PAGE_FREE(m);
1522                                 }
1523                                 /*
1524                                  * take an extra ref so object won't die
1525                                  */
1526                                 vm_object_reference_locked(object);
1527
1528                                 vm_fault_cleanup(object, first_m);
1529
1530                                 counter(c_vm_fault_page_block_backoff_kernel++);
1531
1532                                 vm_object_lock(object);
1533                                 assert(object->ref_count > 0);
1534
1535                                 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1536                                         vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1537
1538                                         vm_object_unlock(object);
1539                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
1540                                         vm_object_deallocate(object);
1541
1542                                         goto backoff;
1543                                 } else {
1544                                         vm_object_unlock(object);
1545                                         vm_object_deallocate(object);
1546                                         thread_interrupt_level(interruptible_state);
1547
1548                                         return VM_FAULT_RETRY;
1549                                 }
1550                         }
1551                         if (object->internal) {
1552                                 int compressed_count_delta;
1553
1554                                 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1555
1556                                 if (m == VM_PAGE_NULL) {
1557                                         /*
1558                                          * Allocate a new page for this object/offset pair as a placeholder
1559                                          */
1560                                         m = vm_page_grab_options(grab_options);
1561 #if TRACEFAULTPAGE
1562                                         dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object);  /* (TEST/DEBUG) */
1563 #endif
1564                                         if (m == VM_PAGE_NULL) {
1565                                                 vm_fault_cleanup(object, first_m);
1566                                                 thread_interrupt_level(interruptible_state);
1567
1568                                                 return VM_FAULT_MEMORY_SHORTAGE;
1569                                         }
1570
1571                                         m->vmp_absent = TRUE;
1572                                         if (fault_info && fault_info->batch_pmap_op == TRUE) {
1573                                                 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1574                                         } else {
1575                                                 vm_page_insert(m, object, offset);
1576                                         }
1577                                 }
1578                                 assert(m->vmp_busy);
1579
1580                                 m->vmp_absent = TRUE;
1581                                 pager = object->pager;
1582
1583                                 assert(object->paging_in_progress > 0);
1584                                 vm_object_unlock(object);
1585
1586                                 rc = vm_compressor_pager_get(
1587                                         pager,
1588                                         offset + object->paging_offset,
1589                                         VM_PAGE_GET_PHYS_PAGE(m),
1590                                         &my_fault_type,
1591                                         0,
1592                                         &compressed_count_delta);
1593
1594                                 if (type_of_fault == NULL) {
1595                                         int     throttle_delay;
1596
1597                                         /*
1598                                          * we weren't called from vm_fault, so we
1599                                          * need to apply page creation throttling
1600                                          * do it before we re-acquire any locks
1601                                          */
1602                                         if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1603                                                 if ((throttle_delay = vm_page_throttled(TRUE))) {
1604                                                         VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1605                                                         delay(throttle_delay);
1606                                                 }
1607                                         }
1608                                 }
1609                                 vm_object_lock(object);
1610                                 assert(object->paging_in_progress > 0);
1611
1612                                 vm_compressor_pager_count(
1613                                         pager,
1614                                         compressed_count_delta,
1615                                         FALSE, /* shared_lock */
1616                                         object);
1617
1618                                 switch (rc) {
1619                                 case KERN_SUCCESS:
1620                                         m->vmp_absent = FALSE;
1621                                         m->vmp_dirty = TRUE;
1622                                         if ((object->wimg_bits &
1623                                             VM_WIMG_MASK) !=
1624                                             VM_WIMG_USE_DEFAULT) {
1625                                                 /*
1626                                                  * If the page is not cacheable,
1627                                                  * we can't let its contents
1628                                                  * linger in the data cache
1629                                                  * after the decompression.
1630                                                  */
1631                                                 pmap_sync_page_attributes_phys(
1632                                                         VM_PAGE_GET_PHYS_PAGE(m));
1633                                         } else {
1634                                                 m->vmp_written_by_kernel = TRUE;
1635                                         }
1636
1637                                         /*
1638                                          * If the object is purgeable, its
1639                                          * owner's purgeable ledgers have been
1640                                          * updated in vm_page_insert() but the
1641                                          * page was also accounted for in a
1642                                          * "compressed purgeable" ledger, so
1643                                          * update that now.
1644                                          */
1645                                         if (((object->purgable !=
1646                                             VM_PURGABLE_DENY) ||
1647                                             object->vo_ledger_tag) &&
1648                                             (object->vo_owner !=
1649                                             NULL)) {
1650                                                 /*
1651                                                  * One less compressed
1652                                                  * purgeable/tagged page.
1653                                                  */
1654                                                 vm_object_owner_compressed_update(
1655                                                         object,
1656                                                         -1);
1657                                         }
1658
1659                                         break;
1660                                 case KERN_MEMORY_FAILURE:
1661                                         m->vmp_unusual = TRUE;
1662                                         m->vmp_error = TRUE;
1663                                         m->vmp_absent = FALSE;
1664                                         break;
1665                                 case KERN_MEMORY_ERROR:
1666                                         assert(m->vmp_absent);
1667                                         break;
1668                                 default:
1669                                         panic("vm_fault_page(): unexpected "
1670                                             "error %d from "
1671                                             "vm_compressor_pager_get()\n",
1672                                             rc);
1673                                 }
1674                                 PAGE_WAKEUP_DONE(m);
1675
1676                                 rc = KERN_SUCCESS;
1677                                 goto data_requested;
1678                         }
1679                         my_fault_type = DBG_PAGEIN_FAULT;
1680
1681                         if (m != VM_PAGE_NULL) {
1682                                 VM_PAGE_FREE(m);
1683                                 m = VM_PAGE_NULL;
1684                         }
1685
1686 #if TRACEFAULTPAGE
1687                         dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0);  /* (TEST/DEBUG) */
1688 #endif
1689
1690                         /*
1691                          * It's possible someone called vm_object_destroy while we weren't
1692                          * holding the object lock.  If that has happened, then bail out
1693                          * here.
1694                          */
1695
1696                         pager = object->pager;
1697
1698                         if (pager == MEMORY_OBJECT_NULL) {
1699                                 vm_fault_cleanup(object, first_m);
1700                                 thread_interrupt_level(interruptible_state);
1701                                 return VM_FAULT_MEMORY_ERROR;
1702                         }
1703
1704                         /*
1705                          * We have an absent page in place for the faulting offset,
1706                          * so we can release the object lock.
1707                          */
1708
1709                         if (object->object_is_shared_cache) {
1710                                 set_thread_rwlock_boost();
1711                         }
1712
1713                         vm_object_unlock(object);
1714
1715                         /*
1716                          * If this object uses a copy_call strategy,
1717                          * and we are interested in a copy of this object
1718                          * (having gotten here only by following a
1719                          * shadow chain), then tell the memory manager
1720                          * via a flag added to the desired_access
1721                          * parameter, so that it can detect a race
1722                          * between our walking down the shadow chain
1723                          * and its pushing pages up into a copy of
1724                          * the object that it manages.
1725                          */
1726                         if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1727                                 wants_copy_flag = VM_PROT_WANTS_COPY;
1728                         } else {
1729                                 wants_copy_flag = VM_PROT_NONE;
1730                         }
1731
1732                         XPR(XPR_VM_FAULT,
1733                             "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1734                             object, offset, m,
1735                             access_required | wants_copy_flag, 0);
1736
1737                         if (object->copy == first_object) {
1738                                 /*
1739                                  * if we issue the memory_object_data_request in
1740                                  * this state, we are subject to a deadlock with
1741                                  * the underlying filesystem if it is trying to
1742                                  * shrink the file resulting in a push of pages
1743                                  * into the copy object...  that push will stall
1744                                  * on the placeholder page, and if the pushing thread
1745                                  * is holding a lock that is required on the pagein
1746                                  * path (such as a truncate lock), we'll deadlock...
1747                                  * to avoid this potential deadlock, we throw away
1748                                  * our placeholder page before calling memory_object_data_request
1749                                  * and force this thread to retry the vm_fault_page after
1750                                  * we have issued the I/O.  the second time through this path
1751                                  * we will find the page already in the cache (presumably still
1752                                  * busy waiting for the I/O to complete) and then complete
1753                                  * the fault w/o having to go through memory_object_data_request again
1754                                  */
1755                                 assert(first_m != VM_PAGE_NULL);
1756                                 assert(VM_PAGE_OBJECT(first_m) == first_object);
1757
1758                                 vm_object_lock(first_object);
1759                                 VM_PAGE_FREE(first_m);
1760                                 vm_object_paging_end(first_object);
1761                                 vm_object_unlock(first_object);
1762
1763                                 first_m = VM_PAGE_NULL;
1764                                 force_fault_retry = TRUE;
1765
1766                                 vm_fault_page_forced_retry++;
1767                         }
1768
1769                         if (data_already_requested == TRUE) {
1770                                 orig_behavior = fault_info->behavior;
1771                                 orig_cluster_size = fault_info->cluster_size;
1772
1773                                 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1774                                 fault_info->cluster_size = PAGE_SIZE;
1775                         }
1776                         /*
1777                          * Call the memory manager to retrieve the data.
1778                          */
1779                         rc = memory_object_data_request(
1780                                 pager,
1781                                 offset + object->paging_offset,
1782                                 PAGE_SIZE,
1783                                 access_required | wants_copy_flag,
1784                                 (memory_object_fault_info_t)fault_info);
1785
1786                         if (data_already_requested == TRUE) {
1787                                 fault_info->behavior = orig_behavior;
1788                                 fault_info->cluster_size = orig_cluster_size;
1789                         } else {
1790                                 data_already_requested = TRUE;
1791                         }
1792
1793                         DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1794 #if TRACEFAULTPAGE
1795                         dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1796 #endif
1797                         vm_object_lock(object);
1798
1799                         if (object->object_is_shared_cache) {
1800                                 clear_thread_rwlock_boost();
1801                         }
1802
1803 data_requested:
1804                         if (rc != KERN_SUCCESS) {
1805                                 vm_fault_cleanup(object, first_m);
1806                                 thread_interrupt_level(interruptible_state);
1807
1808                                 return (rc == MACH_SEND_INTERRUPTED) ?
1809                                        VM_FAULT_INTERRUPTED :
1810                                        VM_FAULT_MEMORY_ERROR;
1811                         } else {
1812                                 clock_sec_t     tv_sec;
1813                                 clock_usec_t    tv_usec;
1814
1815                                 if (my_fault_type == DBG_PAGEIN_FAULT) {
1816                                         clock_get_system_microtime(&tv_sec, &tv_usec);
1817                                         current_thread()->t_page_creation_time = tv_sec;
1818                                         current_thread()->t_page_creation_count = 0;
1819                                 }
1820                         }
1821                         if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1822                                 vm_fault_cleanup(object, first_m);
1823                                 thread_interrupt_level(interruptible_state);
1824
1825                                 return VM_FAULT_INTERRUPTED;
1826                         }
1827                         if (force_fault_retry == TRUE) {
1828                                 vm_fault_cleanup(object, first_m);
1829                                 thread_interrupt_level(interruptible_state);
1830
1831                                 return VM_FAULT_RETRY;
1832                         }
1833                         if (m == VM_PAGE_NULL && object->phys_contiguous) {
1834                                 /*
1835                                  * No page here means that the object we
1836                                  * initially looked up was "physically
1837                                  * contiguous" (i.e. device memory).  However,
1838                                  * with Virtual VRAM, the object might not
1839                                  * be backed by that device memory anymore,
1840                                  * so we're done here only if the object is
1841                                  * still "phys_contiguous".
1842                                  * Otherwise, if the object is no longer
1843                                  * "phys_contiguous", we need to retry the
1844                                  * page fault against the object's new backing
1845                                  * store (different memory object).
1846                                  */
1847 phys_contig_object:
1848                                 goto done;
1849                         }
1850                         /*
1851                          * potentially a pagein fault
1852                          * if we make it through the state checks
1853                          * above, then we'll count it as such
1854                          */
1855                         my_fault = my_fault_type;
1856
1857                         /*
1858                          * Retry with same object/offset, since new data may
1859                          * be in a different page (i.e., m is meaningless at
1860                          * this point).
1861                          */
1862                         continue;
1863                 }
1864 dont_look_for_page:
1865                 /*
1866                  * We get here if the object has no pager, or an existence map
1867                  * exists and indicates the page isn't present on the pager
1868                  * or we're unwiring a page.  If a pager exists, but there
1869                  * is no existence map, then the m->vmp_absent case above handles
1870                  * the ZF case when the pager can't provide the page
1871                  */
1872 #if TRACEFAULTPAGE
1873                 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1874 #endif
1875                 if (object == first_object) {
1876                         first_m = m;
1877                 } else {
1878                         assert(m == VM_PAGE_NULL);
1879                 }
1880
1881                 XPR(XPR_VM_FAULT,
1882                     "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1883                     object, offset, m,
1884                     object->shadow, 0);
1885
1886                 next_object = object->shadow;
1887
1888                 if (next_object == VM_OBJECT_NULL) {
1889                         /*
1890                          * we've hit the bottom of the shadow chain,
1891                          * fill the page in the top object with zeros.
1892                          */
1893                         assert(!must_be_resident);
1894
1895                         if (object != first_object) {
1896                                 vm_object_paging_end(object);
1897                                 vm_object_unlock(object);
1898
1899                                 object = first_object;
1900                                 offset = first_offset;
1901                                 vm_object_lock(object);
1902                         }
1903                         m = first_m;
1904                         assert(VM_PAGE_OBJECT(m) == object);
1905                         first_m = VM_PAGE_NULL;
1906
1907                         /*
1908                          * check for any conditions that prevent
1909                          * us from creating a new zero-fill page
1910                          * vm_fault_check will do all of the
1911                          * fault cleanup in the case of an error condition
1912                          * including resetting the thread_interrupt_level
1913                          */
1914                         error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1915
1916                         if (error != VM_FAULT_SUCCESS) {
1917                                 return error;
1918                         }
1919
1920                         if (m == VM_PAGE_NULL) {
1921                                 m = vm_page_grab_options(grab_options);
1922
1923                                 if (m == VM_PAGE_NULL) {
1924                                         vm_fault_cleanup(object, VM_PAGE_NULL);
1925                                         thread_interrupt_level(interruptible_state);
1926
1927                                         return VM_FAULT_MEMORY_SHORTAGE;
1928                                 }
1929                                 vm_page_insert(m, object, offset);
1930                         }
1931                         if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1932                                 m->vmp_absent = TRUE;
1933                         }
1934
1935                         my_fault = vm_fault_zero_page(m, no_zero_fill);
1936
1937                         break;
1938                 } else {
1939                         /*
1940                          * Move on to the next object.  Lock the next
1941                          * object before unlocking the current one.
1942                          */
1943                         if ((object != first_object) || must_be_resident) {
1944                                 vm_object_paging_end(object);
1945                         }
1946
1947                         offset += object->vo_shadow_offset;
1948                         fault_info->lo_offset += object->vo_shadow_offset;
1949                         fault_info->hi_offset += object->vo_shadow_offset;
1950                         access_required = VM_PROT_READ;
1951
1952                         vm_object_lock(next_object);
1953                         vm_object_unlock(object);
1954
1955                         object = next_object;
1956                         vm_object_paging_begin(object);
1957                 }
1958         }
1959
1960         /*
1961          *      PAGE HAS BEEN FOUND.
1962          *
1963          *      This page (m) is:
1964          *              busy, so that we can play with it;
1965          *              not absent, so that nobody else will fill it;
1966          *              possibly eligible for pageout;
1967          *
1968          *      The top-level page (first_m) is:
1969          *              VM_PAGE_NULL if the page was found in the
1970          *               top-level object;
1971          *              busy, not absent, and ineligible for pageout.
1972          *
1973          *      The current object (object) is locked.  A paging
1974          *      reference is held for the current and top-level
1975          *      objects.
1976          */
1977
1978 #if TRACEFAULTPAGE
1979         dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
1980 #endif
1981 #if     EXTRA_ASSERTIONS
1982         assert(m->vmp_busy && !m->vmp_absent);
1983         assert((first_m == VM_PAGE_NULL) ||
1984             (first_m->vmp_busy && !first_m->vmp_absent &&
1985             !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1986 #endif  /* EXTRA_ASSERTIONS */
1987
1988         XPR(XPR_VM_FAULT,
1989             "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1990             object, offset, m,
1991             first_object, first_m);
1992
1993         /*
1994          * If the page is being written, but isn't
1995          * already owned by the top-level object,
1996          * we have to copy it into a new page owned
1997          * by the top-level object.
1998          */
1999         if (object != first_object) {
2000 #if TRACEFAULTPAGE
2001                 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2002 #endif
2003                 if (fault_type & VM_PROT_WRITE) {
2004                         vm_page_t copy_m;
2005
2006                         /*
2007                          * We only really need to copy if we
2008                          * want to write it.
2009                          */
2010                         assert(!must_be_resident);
2011
2012                         /*
2013                          * If we try to collapse first_object at this
2014                          * point, we may deadlock when we try to get
2015                          * the lock on an intermediate object (since we
2016                          * have the bottom object locked).  We can't
2017                          * unlock the bottom object, because the page
2018                          * we found may move (by collapse) if we do.
2019                          *
2020                          * Instead, we first copy the page.  Then, when
2021                          * we have no more use for the bottom object,
2022                          * we unlock it and try to collapse.
2023                          *
2024                          * Note that we copy the page even if we didn't
2025                          * need to... that's the breaks.
2026                          */
2027
2028                         /*
2029                          * Allocate a page for the copy
2030                          */
2031                         copy_m = vm_page_grab_options(grab_options);
2032
2033                         if (copy_m == VM_PAGE_NULL) {
2034                                 RELEASE_PAGE(m);
2035
2036                                 vm_fault_cleanup(object, first_m);
2037                                 thread_interrupt_level(interruptible_state);
2038
2039                                 return VM_FAULT_MEMORY_SHORTAGE;
2040                         }
2041                         XPR(XPR_VM_FAULT,
2042                             "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2043                             object, offset,
2044                             m, copy_m, 0);
2045
2046                         vm_page_copy(m, copy_m);
2047
2048                         /*
2049                          * If another map is truly sharing this
2050                          * page with us, we have to flush all
2051                          * uses of the original page, since we
2052                          * can't distinguish those which want the
2053                          * original from those which need the
2054                          * new copy.
2055                          *
2056                          * XXXO If we know that only one map has
2057                          * access to this page, then we could
2058                          * avoid the pmap_disconnect() call.
2059                          */
2060                         if (m->vmp_pmapped) {
2061                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2062                         }
2063
2064                         if (m->vmp_clustered) {
2065                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2066                                 VM_PAGE_CONSUME_CLUSTERED(m);
2067                         }
2068                         assert(!m->vmp_cleaning);
2069
2070                         /*
2071                          * We no longer need the old page or object.
2072                          */
2073                         RELEASE_PAGE(m);
2074
2075                         /*
2076                          * This check helps with marking the object as having a sequential pattern
2077                          * Normally we'll miss doing this below because this fault is about COW to
2078                          * the first_object i.e. bring page in from disk, push to object above but
2079                          * don't update the file object's sequential pattern.
2080                          */
2081                         if (object->internal == FALSE) {
2082                                 vm_fault_is_sequential(object, offset, fault_info->behavior);
2083                         }
2084
2085                         vm_object_paging_end(object);
2086                         vm_object_unlock(object);
2087
2088                         my_fault = DBG_COW_FAULT;
2089                         VM_STAT_INCR(cow_faults);
2090                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2091                         current_task()->cow_faults++;
2092
2093                         object = first_object;
2094                         offset = first_offset;
2095
2096                         vm_object_lock(object);
2097                         /*
2098                          * get rid of the place holder
2099                          * page that we soldered in earlier
2100                          */
2101                         VM_PAGE_FREE(first_m);
2102                         first_m = VM_PAGE_NULL;
2103
2104                         /*
2105                          * and replace it with the
2106                          * page we just copied into
2107                          */
2108                         assert(copy_m->vmp_busy);
2109                         vm_page_insert(copy_m, object, offset);
2110                         SET_PAGE_DIRTY(copy_m, TRUE);
2111
2112                         m = copy_m;
2113                         /*
2114                          * Now that we've gotten the copy out of the
2115                          * way, let's try to collapse the top object.
2116                          * But we have to play ugly games with
2117                          * paging_in_progress to do that...
2118                          */
2119                         vm_object_paging_end(object);
2120                         vm_object_collapse(object, offset, TRUE);
2121                         vm_object_paging_begin(object);
2122                 } else {
2123                         *protection &= (~VM_PROT_WRITE);
2124                 }
2125         }
2126         /*
2127          * Now check whether the page needs to be pushed into the
2128          * copy object.  The use of asymmetric copy on write for
2129          * shared temporary objects means that we may do two copies to
2130          * satisfy the fault; one above to get the page from a
2131          * shadowed object, and one here to push it into the copy.
2132          */
2133         try_failed_count = 0;
2134
2135         while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2136                 vm_object_offset_t      copy_offset;
2137                 vm_page_t               copy_m;
2138
2139 #if TRACEFAULTPAGE
2140                 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type);    /* (TEST/DEBUG) */
2141 #endif
2142                 /*
2143                  * If the page is being written, but hasn't been
2144                  * copied to the copy-object, we have to copy it there.
2145                  */
2146                 if ((fault_type & VM_PROT_WRITE) == 0) {
2147                         *protection &= ~VM_PROT_WRITE;
2148                         break;
2149                 }
2150
2151                 /*
2152                  * If the page was guaranteed to be resident,
2153                  * we must have already performed the copy.
2154                  */
2155                 if (must_be_resident) {
2156                         break;
2157                 }
2158
2159                 /*
2160                  * Try to get the lock on the copy_object.
2161                  */
2162                 if (!vm_object_lock_try(copy_object)) {
2163                         vm_object_unlock(object);
2164                         try_failed_count++;
2165
2166                         mutex_pause(try_failed_count);  /* wait a bit */
2167                         vm_object_lock(object);
2168
2169                         continue;
2170                 }
2171                 try_failed_count = 0;
2172
2173                 /*
2174                  * Make another reference to the copy-object,
2175                  * to keep it from disappearing during the
2176                  * copy.
2177                  */
2178                 vm_object_reference_locked(copy_object);
2179
2180                 /*
2181                  * Does the page exist in the copy?
2182                  */
2183                 copy_offset = first_offset - copy_object->vo_shadow_offset;
2184
2185                 if (copy_object->vo_size <= copy_offset) {
2186                         /*
2187                          * Copy object doesn't cover this page -- do nothing.
2188                          */
2189                         ;
2190                 } else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2191                         /*
2192                          * Page currently exists in the copy object
2193                          */
2194                         if (copy_m->vmp_busy) {
2195                                 /*
2196                                  * If the page is being brought
2197                                  * in, wait for it and then retry.
2198                                  */
2199                                 RELEASE_PAGE(m);
2200
2201                                 /*
2202                                  * take an extra ref so object won't die
2203                                  */
2204                                 vm_object_reference_locked(copy_object);
2205                                 vm_object_unlock(copy_object);
2206                                 vm_fault_cleanup(object, first_m);
2207                                 counter(c_vm_fault_page_block_backoff_kernel++);
2208
2209                                 vm_object_lock(copy_object);
2210                                 assert(copy_object->ref_count > 0);
2211                                 VM_OBJ_RES_DECR(copy_object);
2212                                 vm_object_lock_assert_exclusive(copy_object);
2213                                 copy_object->ref_count--;
2214                                 assert(copy_object->ref_count > 0);
2215                                 copy_m = vm_page_lookup(copy_object, copy_offset);
2216
2217                                 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2218                                         PAGE_ASSERT_WAIT(copy_m, interruptible);
2219
2220                                         vm_object_unlock(copy_object);
2221                                         wait_result = thread_block(THREAD_CONTINUE_NULL);
2222                                         vm_object_deallocate(copy_object);
2223
2224                                         goto backoff;
2225                                 } else {
2226                                         vm_object_unlock(copy_object);
2227                                         vm_object_deallocate(copy_object);
2228                                         thread_interrupt_level(interruptible_state);
2229
2230                                         return VM_FAULT_RETRY;
2231                                 }
2232                         }
2233                 } else if (!PAGED_OUT(copy_object, copy_offset)) {
2234                         /*
2235                          * If PAGED_OUT is TRUE, then the page used to exist
2236                          * in the copy-object, and has already been paged out.
2237                          * We don't need to repeat this. If PAGED_OUT is
2238                          * FALSE, then either we don't know (!pager_created,
2239                          * for example) or it hasn't been paged out.
2240                          * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2241                          * We must copy the page to the copy object.
2242                          *
2243                          * Allocate a page for the copy
2244                          */
2245                         copy_m = vm_page_alloc(copy_object, copy_offset);
2246
2247                         if (copy_m == VM_PAGE_NULL) {
2248                                 RELEASE_PAGE(m);
2249
2250                                 VM_OBJ_RES_DECR(copy_object);
2251                                 vm_object_lock_assert_exclusive(copy_object);
2252                                 copy_object->ref_count--;
2253                                 assert(copy_object->ref_count > 0);
2254
2255                                 vm_object_unlock(copy_object);
2256                                 vm_fault_cleanup(object, first_m);
2257                                 thread_interrupt_level(interruptible_state);
2258
2259                                 return VM_FAULT_MEMORY_SHORTAGE;
2260                         }
2261                         /*
2262                          * Must copy page into copy-object.
2263                          */
2264                         vm_page_copy(m, copy_m);
2265
2266                         /*
2267                          * If the old page was in use by any users
2268                          * of the copy-object, it must be removed
2269                          * from all pmaps.  (We can't know which
2270                          * pmaps use it.)
2271                          */
2272                         if (m->vmp_pmapped) {
2273                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2274                         }
2275
2276                         if (m->vmp_clustered) {
2277                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2278                                 VM_PAGE_CONSUME_CLUSTERED(m);
2279                         }
2280                         /*
2281                          * If there's a pager, then immediately
2282                          * page out this page, using the "initialize"
2283                          * option.  Else, we use the copy.
2284                          */
2285                         if ((!copy_object->pager_ready)
2286                             || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2287                             ) {
2288                                 vm_page_lockspin_queues();
2289                                 assert(!m->vmp_cleaning);
2290                                 vm_page_activate(copy_m);
2291                                 vm_page_unlock_queues();
2292
2293                                 SET_PAGE_DIRTY(copy_m, TRUE);
2294                                 PAGE_WAKEUP_DONE(copy_m);
2295                         } else {
2296                                 assert(copy_m->vmp_busy == TRUE);
2297                                 assert(!m->vmp_cleaning);
2298
2299                                 /*
2300                                  * dirty is protected by the object lock
2301                                  */
2302                                 SET_PAGE_DIRTY(copy_m, TRUE);
2303
2304                                 /*
2305                                  * The page is already ready for pageout:
2306                                  * not on pageout queues and busy.
2307                                  * Unlock everything except the
2308                                  * copy_object itself.
2309                                  */
2310                                 vm_object_unlock(object);
2311
2312                                 /*
2313                                  * Write the page to the copy-object,
2314                                  * flushing it from the kernel.
2315                                  */
2316                                 vm_pageout_initialize_page(copy_m);
2317
2318                                 /*
2319                                  * Since the pageout may have
2320                                  * temporarily dropped the
2321                                  * copy_object's lock, we
2322                                  * check whether we'll have
2323                                  * to deallocate the hard way.
2324                                  */
2325                                 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2326                                         vm_object_unlock(copy_object);
2327                                         vm_object_deallocate(copy_object);
2328                                         vm_object_lock(object);
2329
2330                                         continue;
2331                                 }
2332                                 /*
2333                                  * Pick back up the old object's
2334                                  * lock.  [It is safe to do so,
2335                                  * since it must be deeper in the
2336                                  * object tree.]
2337                                  */
2338                                 vm_object_lock(object);
2339                         }
2340
2341                         /*
2342                          * Because we're pushing a page upward
2343                          * in the object tree, we must restart
2344                          * any faults that are waiting here.
2345                          * [Note that this is an expansion of
2346                          * PAGE_WAKEUP that uses the THREAD_RESTART
2347                          * wait result].  Can't turn off the page's
2348                          * busy bit because we're not done with it.
2349                          */
2350                         if (m->vmp_wanted) {
2351                                 m->vmp_wanted = FALSE;
2352                                 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2353                         }
2354                 }
2355                 /*
2356                  * The reference count on copy_object must be
2357                  * at least 2: one for our extra reference,
2358                  * and at least one from the outside world
2359                  * (we checked that when we last locked
2360                  * copy_object).
2361                  */
2362                 vm_object_lock_assert_exclusive(copy_object);
2363                 copy_object->ref_count--;
2364                 assert(copy_object->ref_count > 0);
2365
2366                 VM_OBJ_RES_DECR(copy_object);
2367                 vm_object_unlock(copy_object);
2368
2369                 break;
2370         }
2371
2372 done:
2373         *result_page = m;
2374         *top_page = first_m;
2375
2376         XPR(XPR_VM_FAULT,
2377             "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2378             object, offset, m, first_m, 0);
2379
2380         if (m != VM_PAGE_NULL) {
2381                 assert(VM_PAGE_OBJECT(m) == object);
2382
2383                 retval = VM_FAULT_SUCCESS;
2384
2385                 if (my_fault == DBG_PAGEIN_FAULT) {
2386                         VM_PAGE_COUNT_AS_PAGEIN(m);
2387
2388                         if (object->internal) {
2389                                 my_fault = DBG_PAGEIND_FAULT;
2390                         } else {
2391                                 my_fault = DBG_PAGEINV_FAULT;
2392                         }
2393
2394                         /*
2395                          * evaluate access pattern and update state
2396                          * vm_fault_deactivate_behind depends on the
2397                          * state being up to date
2398                          */
2399                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2400                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2401                 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2402                         /*
2403                          * we weren't called from vm_fault, so handle the
2404                          * accounting here for hits in the cache
2405                          */
2406                         if (m->vmp_clustered) {
2407                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2408                                 VM_PAGE_CONSUME_CLUSTERED(m);
2409                         }
2410                         vm_fault_is_sequential(object, offset, fault_info->behavior);
2411                         vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2412                 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2413                         VM_STAT_INCR(decompressions);
2414                 }
2415                 if (type_of_fault) {
2416                         *type_of_fault = my_fault;
2417                 }
2418         } else {
2419                 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2420                 assert(first_m == VM_PAGE_NULL);
2421                 assert(object == first_object);
2422         }
2423
2424         thread_interrupt_level(interruptible_state);
2425
2426 #if TRACEFAULTPAGE
2427         dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0);       /* (TEST/DEBUG) */
2428 #endif
2429         return retval;
2430
2431 backoff:
2432         thread_interrupt_level(interruptible_state);
2433
2434         if (wait_result == THREAD_INTERRUPTED) {
2435                 return VM_FAULT_INTERRUPTED;
2436         }
2437         return VM_FAULT_RETRY;
2438
2439 #undef  RELEASE_PAGE
2440 }
2441
2442
2443
2444 /*
2445  * CODE SIGNING:
2446  * A soft-faulted page needs its code signature validated only when:
2447  * 1. the pmap it is entered into is not kernel_pmap (a user mapping)
2448  * 2. the page is not already marked "tainted"
2449  * 3. the object that owns the page is code-signed
2450  * 4. the page is still unvalidated, or it has been mapped for write.
2451  */
2452 #define VM_FAULT_NEED_CS_VALIDATION(fault_pmap, pg, pg_obj)             \
2453         (kernel_pmap != (fault_pmap) /*1*/ &&                           \
2454          !(pg)->vmp_cs_tainted /*2*/ &&                                 \
2455          (pg_obj)->code_signed /*3*/ &&                                 \
2456          !((pg)->vmp_cs_validated && !(pg)->vmp_wpmapped) /*4*/)
2457
2458
2459 /*
2460  * page queue lock must NOT be held
2461  * m->vmp_object must be locked
2462  *
2463  * NOTE: m->vmp_object could be locked "shared" only if we are called
2464  * from vm_fault() as part of a soft fault.  If so, we must be
2465  * careful not to modify the VM object in any way that is not
2466  * legal under a shared lock...
2467  */
2468 extern int panic_on_cs_killed;               /* defined in another translation unit; only declared here */
2469 extern int proc_selfpid(void);               /* external function, no arguments, returns int; presumably the calling process's pid -- TODO confirm */
2470 extern char *proc_name_address(void *p);     /* external function returning char * for p; presumably a process name -- TODO confirm */
2471 unsigned long cs_enter_tainted_rejected = 0; /* externally visible global, starts at 0; presumably a tally of tainted pages refused -- TODO confirm */
2472 unsigned long cs_enter_tainted_accepted = 0; /* externally visible global, starts at 0; presumably a tally of tainted pages let through -- TODO confirm */
2473 kern_return_t
2474 vm_fault_enter(vm_page_t m,
2475     pmap_t pmap,
2476     vm_map_offset_t vaddr,
2477     vm_prot_t prot,
2478     vm_prot_t caller_prot,
2479     boolean_t wired,
2480     boolean_t change_wiring,
2481     vm_tag_t  wire_tag,
2482     vm_object_fault_info_t fault_info,
2483     boolean_t *need_retry,
2484     int *type_of_fault)
2485 {
2486         kern_return_t   kr, pe_result;
2487         boolean_t       previously_pmapped = m->vmp_pmapped;
2488         boolean_t       must_disconnect = 0;
2489         boolean_t       map_is_switched, map_is_switch_protected;
2490         boolean_t       cs_violation;
2491         int             cs_enforcement_enabled;
2492         vm_prot_t       fault_type;
2493         vm_object_t     object;
2494         boolean_t       no_cache = fault_info->no_cache;
2495         boolean_t       cs_bypass = fault_info->cs_bypass;
2496         int             pmap_options = fault_info->pmap_options;
2497
2498         fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
2499         object = VM_PAGE_OBJECT(m);
2500
2501         vm_object_lock_assert_held(object);
2502
2503 #if KASAN
2504         if (pmap == kernel_pmap) {
2505                 kasan_notify_address(vaddr, PAGE_SIZE);
2506         }
2507 #endif
2508
2509         LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2510
2511         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2512                 assert(m->vmp_fictitious);
2513                 return KERN_SUCCESS;
2514         }
2515
2516         if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2517                 vm_object_lock_assert_exclusive(object);
2518         } else if ((fault_type & VM_PROT_WRITE) == 0 &&
2519             (!m->vmp_wpmapped
2520 #if VM_OBJECT_ACCESS_TRACKING
2521             || object->access_tracking
2522 #endif /* VM_OBJECT_ACCESS_TRACKING */
2523             )) {
2524                 /*
2525                  * This is not a "write" fault, so we
2526                  * might not have taken the object lock
2527                  * exclusively and we might not be able
2528                  * to update the "wpmapped" bit in
2529                  * vm_fault_enter().
2530                  * Let's just grant read access to
2531                  * the page for now and we'll
2532                  * soft-fault again if we need write
2533                  * access later...
2534                  */
2535
2536                 /* This had better not be a JIT page. */
2537                 if (!pmap_has_prot_policy(prot)) {
2538                         prot &= ~VM_PROT_WRITE;
2539                 } else {
2540                         assert(cs_bypass);
2541                 }
2542         }
2543         if (m->vmp_pmapped == FALSE) {
2544                 if (m->vmp_clustered) {
2545                         if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2546                                 /*
2547                                  * found it in the cache, but this
2548                                  * is the first fault-in of the page (m->vmp_pmapped == FALSE)
2549                                  * so it must have come in as part of
2550                                  * a cluster... account 1 pagein against it
2551                                  */
2552                                 if (object->internal) {
2553                                         *type_of_fault = DBG_PAGEIND_FAULT;
2554                                 } else {
2555                                         *type_of_fault = DBG_PAGEINV_FAULT;
2556                                 }
2557
2558                                 VM_PAGE_COUNT_AS_PAGEIN(m);
2559                         }
2560                         VM_PAGE_CONSUME_CLUSTERED(m);
2561                 }
2562         }
2563
2564         if (*type_of_fault != DBG_COW_FAULT) {
2565                 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2566
2567                 if (pmap == kernel_pmap) {
2568                         DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2569                 }
2570         }
2571
2572         /* Validate code signature if necessary. */
2573         if (!cs_bypass &&
2574             VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
2575                 vm_object_lock_assert_exclusive(object);
2576
2577                 if (m->vmp_cs_validated) {
2578                         vm_cs_revalidates++;
2579                 }
2580
2581                 /* VM map is locked, so 1 ref will remain on VM object -
2582                  * so no harm if vm_page_validate_cs drops the object lock */
2583
2584 #if PMAP_CS
2585                 if (fault_info->pmap_cs_associated &&
2586                     pmap_cs_enforced(pmap) &&
2587                     !m->vmp_cs_validated &&
2588                     !m->vmp_cs_tainted &&
2589                     !m->vmp_cs_nx &&
2590                     (prot & VM_PROT_EXECUTE) &&
2591                     (caller_prot & VM_PROT_EXECUTE)) {
2592                         /*
2593                          * With pmap_cs, the pmap layer will validate the
2594                          * code signature for any executable pmap mapping.
2595                          * No need for us to validate this page too:
2596                          * in pmap_cs we trust...
2597                          */
2598                         vm_cs_defer_to_pmap_cs++;
2599                 } else {
2600                         vm_cs_defer_to_pmap_cs_not++;
2601                         vm_page_validate_cs(m);
2602                 }
2603 #else /* PMAP_CS */
2604                 vm_page_validate_cs(m);
2605 #endif /* PMAP_CS */
2606         }
2607
2608 #define page_immutable(m, prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/ )
2609 #define page_nx(m) ((m)->vmp_cs_nx)
2610
2611         map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2612             (pmap == vm_map_pmap(current_thread()->map)));
2613         map_is_switch_protected = current_thread()->map->switch_protect;
2614
2615         /* If the map is switched, and is switch-protected, we must protect
2616          * some pages from being write-faulted: immutable pages because by
2617          * definition they may not be written, and executable pages because that
2618          * would provide a way to inject unsigned code.
2619          * If the page is immutable, we can simply return. However, we can't
2620          * immediately determine whether a page is executable anywhere. But,
2621          * we can disconnect it everywhere and remove the executable protection
2622          * from the current map. We do that below right before we do the
2623          * PMAP_ENTER.
2624          */
2625         cs_enforcement_enabled = cs_process_enforcement(NULL);
2626
2627         if (cs_enforcement_enabled && map_is_switched &&
2628             map_is_switch_protected && page_immutable(m, prot) &&
2629             (prot & VM_PROT_WRITE)) {
2630                 return KERN_CODESIGN_ERROR;
2631         }
2632
2633         if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2634                 if (cs_debug) {
2635                         printf("page marked to be NX, not letting it be mapped EXEC\n");
2636                 }
2637                 return KERN_CODESIGN_ERROR;
2638         }
2639
2640         /* A page could be tainted, or pose a risk of being tainted later.
2641          * Check whether the receiving process wants it, and make it feel
2642          * the consequences (that happens in cs_invalid_page()).
2643          * For CS Enforcement, two other conditions will
2644          * cause that page to be tainted as well:
2645          * - pmapping an unsigned page executable - this means unsigned code;
2646          * - writeable mapping of a validated page - the content of that page
2647          *   can be changed without the kernel noticing, therefore unsigned
2648          *   code can be created
2649          */
2650         if (cs_bypass) {
2651                 /* code-signing is bypassed */
2652                 cs_violation = FALSE;
2653         } else if (m->vmp_cs_tainted) {
2654                 /* tainted page */
2655                 cs_violation = TRUE;
2656         } else if (!cs_enforcement_enabled) {
2657                 /* no further code-signing enforcement */
2658                 cs_violation = FALSE;
2659         } else if (page_immutable(m, prot) &&
2660             ((prot & VM_PROT_WRITE) ||
2661             m->vmp_wpmapped)) {
2662                 /*
2663                  * The page should be immutable, but is in danger of being
2664                  * modified.
2665                  * This is the case where we want policy from the code
2666                  * directory - is the page immutable or not? For now we have
2667                  * to assume that code pages will be immutable, data pages not.
2668                  * We'll assume a page is a code page if it has a code directory
2669                  * and we fault for execution.
2670                  * That is good enough since if we faulted the code page for
2671                  * writing in another map before, it is wpmapped; if we fault
2672                  * it for writing in this map later it will also be faulted for
2673                  * executing at the same time; and if we fault for writing in
2674                  * another map later, we will disconnect it from this pmap so
2675                  * we'll notice the change.
2676                  */
2677                 cs_violation = TRUE;
2678         } else if (!m->vmp_cs_validated &&
2679             (prot & VM_PROT_EXECUTE)
2680 #if PMAP_CS
2681             /*
2682              * Executable pages will be validated by pmap_cs;
2683              * in pmap_cs we trust...
2684              * If pmap_cs is turned off, this is a code-signing
2685              * violation.
2686              */
2687             && !(pmap_cs_enforced(pmap))
2688 #endif /* PMAP_CS */
2689             ) {
2690                 cs_violation = TRUE;
2691         } else {
2692                 cs_violation = FALSE;
2693         }
2694
2695         if (cs_violation) {
2696                 /* We will have a tainted page. Have to handle the special case
2697                  * of a switched map now. If the map is not switched, standard
2698                  * procedure applies - call cs_invalid_page().
2699                  * If the map is switched, the real owner is invalid already.
2700                  * There is no point in invalidating the switching process since
2701                  * it will not be executing from the map. So we don't call
2702                  * cs_invalid_page() in that case. */
2703                 boolean_t reject_page, cs_killed;
2704                 if (map_is_switched) {
2705                         assert(pmap == vm_map_pmap(current_thread()->map));
2706                         assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2707                         reject_page = FALSE;
2708                 } else {
2709                         if (cs_debug > 5) {
2710                                 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2711                                     object->code_signed ? "yes" : "no",
2712                                     m->vmp_cs_validated ? "yes" : "no",
2713                                     m->vmp_cs_tainted ? "yes" : "no",
2714                                     m->vmp_wpmapped ? "yes" : "no",
2715                                     (int)prot);
2716                         }
2717                         reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2718                 }
2719
2720                 if (reject_page) {
2721                         /* reject the invalid page: abort the page fault */
2722                         int                     pid;
2723                         const char              *procname;
2724                         task_t                  task;
2725                         vm_object_t             file_object, shadow;
2726                         vm_object_offset_t      file_offset;
2727                         char                    *pathname, *filename;
2728                         vm_size_t               pathname_len, filename_len;
2729                         boolean_t               truncated_path;
2730 #define __PATH_MAX 1024
2731                         struct timespec         mtime, cs_mtime;
2732                         int                     shadow_depth;
2733                         os_reason_t             codesigning_exit_reason = OS_REASON_NULL;
2734
2735                         kr = KERN_CODESIGN_ERROR;
2736                         cs_enter_tainted_rejected++;
2737
2738                         /* get process name and pid */
2739                         procname = "?";
2740                         task = current_task();
2741                         pid = proc_selfpid();
2742                         if (task->bsd_info != NULL) {
2743                                 procname = proc_name_address(task->bsd_info);
2744                         }
2745
2746                         /* get file's VM object */
2747                         file_object = object;
2748                         file_offset = m->vmp_offset;
2749                         for (shadow = file_object->shadow,
2750                             shadow_depth = 0;
2751                             shadow != VM_OBJECT_NULL;
2752                             shadow = file_object->shadow,
2753                             shadow_depth++) {
2754                                 vm_object_lock_shared(shadow);
2755                                 if (file_object != object) {
2756                                         vm_object_unlock(file_object);
2757                                 }
2758                                 file_offset += file_object->vo_shadow_offset;
2759                                 file_object = shadow;
2760                         }
2761
2762                         mtime.tv_sec = 0;
2763                         mtime.tv_nsec = 0;
2764                         cs_mtime.tv_sec = 0;
2765                         cs_mtime.tv_nsec = 0;
2766
2767                         /* get file's pathname and/or filename */
2768                         pathname = NULL;
2769                         filename = NULL;
2770                         pathname_len = 0;
2771                         filename_len = 0;
2772                         truncated_path = FALSE;
2773                         /* no pager -> no file -> no pathname, use "<nil>" in that case */
2774                         if (file_object->pager != NULL) {
2775                                 pathname = (char *)kalloc(__PATH_MAX * 2);
2776                                 if (pathname) {
2777                                         pathname[0] = '\0';
2778                                         pathname_len = __PATH_MAX;
2779                                         filename = pathname + pathname_len;
2780                                         filename_len = __PATH_MAX;
2781                                 }
2782                                 vnode_pager_get_object_name(file_object->pager,
2783                                     pathname,
2784                                     pathname_len,
2785                                     filename,
2786                                     filename_len,
2787                                     &truncated_path);
2788                                 if (pathname) {
2789                                         /* safety first... */
2790                                         pathname[__PATH_MAX - 1] = '\0';
2791                                         filename[__PATH_MAX - 1] = '\0';
2792                                 }
2793                                 vnode_pager_get_object_mtime(file_object->pager,
2794                                     &mtime,
2795                                     &cs_mtime);
2796                         }
2797                         printf("CODE SIGNING: process %d[%s]: "
2798                             "rejecting invalid page at address 0x%llx "
2799                             "from offset 0x%llx in file \"%s%s%s\" "
2800                             "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2801                             "(signed:%d validated:%d tainted:%d nx:%d "
2802                             "wpmapped:%d dirty:%d depth:%d)\n",
2803                             pid, procname, (addr64_t) vaddr,
2804                             file_offset,
2805                             (pathname ? pathname : "<nil>"),
2806                             (truncated_path ? "/.../" : ""),
2807                             (truncated_path ? filename : ""),
2808                             cs_mtime.tv_sec, cs_mtime.tv_nsec,
2809                             ((cs_mtime.tv_sec == mtime.tv_sec &&
2810                             cs_mtime.tv_nsec == mtime.tv_nsec)
2811                             ? "=="
2812                             : "!="),
2813                             mtime.tv_sec, mtime.tv_nsec,
2814                             object->code_signed,
2815                             m->vmp_cs_validated,
2816                             m->vmp_cs_tainted,
2817                             m->vmp_cs_nx,
2818                             m->vmp_wpmapped,
2819                             m->vmp_dirty,
2820                             shadow_depth);
2821
2822                         /*
2823                          * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2824                          * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2825                          * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2826                          * will deal with the segmentation fault.
2827                          */
2828                         if (cs_killed) {
2829                                 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2830                                     pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2831
2832                                 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2833                                 if (codesigning_exit_reason == NULL) {
2834                                         printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2835                                 } else {
2836                                         mach_vm_address_t data_addr = 0;
2837                                         struct codesigning_exit_reason_info *ceri = NULL;
2838                                         uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2839
2840                                         if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2841                                                 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2842                                         } else {
2843                                                 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2844                                                     EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2845                                                         ceri = (struct codesigning_exit_reason_info *)data_addr;
2846                                                         static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2847
2848                                                         ceri->ceri_virt_addr = vaddr;
2849                                                         ceri->ceri_file_offset = file_offset;
2850                                                         if (pathname) {
2851                                                                 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2852                                                         } else {
2853                                                                 ceri->ceri_pathname[0] = '\0';
2854                                                         }
2855                                                         if (filename) {
2856                                                                 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2857                                                         } else {
2858                                                                 ceri->ceri_filename[0] = '\0';
2859                                                         }
2860                                                         ceri->ceri_path_truncated = (truncated_path);
2861                                                         ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2862                                                         ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2863                                                         ceri->ceri_page_modtime_secs = mtime.tv_sec;
2864                                                         ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2865                                                         ceri->ceri_object_codesigned = (object->code_signed);
2866                                                         ceri->ceri_page_codesig_validated = (m->vmp_cs_validated);
2867                                                         ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted);
2868                                                         ceri->ceri_page_codesig_nx = (m->vmp_cs_nx);
2869                                                         ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2870                                                         ceri->ceri_page_slid = 0;
2871                                                         ceri->ceri_page_dirty = (m->vmp_dirty);
2872                                                         ceri->ceri_page_shadow_depth = shadow_depth;
2873                                                 } else {
2874 #if DEBUG || DEVELOPMENT
2875                                                         panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2876 #else
2877                                                         printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2878 #endif /* DEBUG || DEVELOPMENT */
2879                                                         /* Free the buffer */
2880                                                         os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2881                                                 }
2882                                         }
2883                                 }
2884
2885                                 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2886                         }
2887                         if (panic_on_cs_killed &&
2888                             object->object_is_shared_cache) {
2889                                 panic("CODE SIGNING: process %d[%s]: "
2890                                     "rejecting invalid page at address 0x%llx "
2891                                     "from offset 0x%llx in file \"%s%s%s\" "
2892                                     "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2893                                     "(signed:%d validated:%d tainted:%d nx:%d"
2894                                     "wpmapped:%d dirty:%d depth:%d)\n",
2895                                     pid, procname, (addr64_t) vaddr,
2896                                     file_offset,
2897                                     (pathname ? pathname : "<nil>"),
2898                                     (truncated_path ? "/.../" : ""),
2899                                     (truncated_path ? filename : ""),
2900                                     cs_mtime.tv_sec, cs_mtime.tv_nsec,
2901                                     ((cs_mtime.tv_sec == mtime.tv_sec &&
2902                                     cs_mtime.tv_nsec == mtime.tv_nsec)
2903                                     ? "=="
2904                                     : "!="),
2905                                     mtime.tv_sec, mtime.tv_nsec,
2906                                     object->code_signed,
2907                                     m->vmp_cs_validated,
2908                                     m->vmp_cs_tainted,
2909                                     m->vmp_cs_nx,
2910                                     m->vmp_wpmapped,
2911                                     m->vmp_dirty,
2912                                     shadow_depth);
2913                         }
2914
2915                         if (file_object != object) {
2916                                 vm_object_unlock(file_object);
2917                         }
2918                         if (pathname_len != 0) {
2919                                 kfree(pathname, __PATH_MAX * 2);
2920                                 pathname = NULL;
2921                                 filename = NULL;
2922                         }
2923                 } else {
2924                         /* proceed with the invalid page */
2925                         kr = KERN_SUCCESS;
2926                         if (!m->vmp_cs_validated &&
2927                             !object->code_signed) {
2928                                 /*
2929                                  * This page has not been (fully) validated but
2930                                  * does not belong to a code-signed object
2931                                  * so it should not be forcefully considered
2932                                  * as tainted.
2933                                  * We're just concerned about it here because
2934                                  * we've been asked to "execute" it but that
2935                                  * does not mean that it should cause other
2936                                  * accesses to fail.
2937                                  * This happens when a debugger sets a
2938                                  * breakpoint and we then execute code in
2939                                  * that page.  Marking the page as "tainted"
2940                                  * would cause any inspection tool ("leaks",
2941                                  * "vmmap", "CrashReporter", ...) to get killed
2942                                  * due to code-signing violation on that page,
2943                                  * even though they're just reading it and not
2944                                  * executing from it.
2945                                  */
2946                         } else {
2947                                 /*
2948                                  * Page might have been tainted before or not;
2949                                  * now it definitively is. If the page wasn't
2950                                  * tainted, we must disconnect it from all
2951                                  * pmaps later, to force existing mappings
2952                                  * through that code path for re-consideration
2953                                  * of the validity of that page.
2954                                  */
2955                                 must_disconnect = !m->vmp_cs_tainted;
2956                                 m->vmp_cs_tainted = TRUE;
2957                         }
2958                         cs_enter_tainted_accepted++;
2959                 }
2960                 if (kr != KERN_SUCCESS) {
2961                         if (cs_debug) {
2962                                 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2963                                     "*** INVALID PAGE ***\n",
2964                                     (long long)vaddr);
2965                         }
2966 #if !SECURE_KERNEL
2967                         if (cs_enforcement_panic) {
2968                                 panic("CODESIGNING: panicking on invalid page\n");
2969                         }
2970 #endif
2971                 }
2972         } else {
2973                 /* proceed with the valid page */
2974                 kr = KERN_SUCCESS;
2975         }
2976
2977         boolean_t       page_queues_locked = FALSE;
2978 #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED()   \
2979 MACRO_BEGIN                                     \
2980         if (! page_queues_locked) {             \
2981                 page_queues_locked = TRUE;      \
2982                 vm_page_lockspin_queues();      \
2983         }                                       \
2984 MACRO_END
2985 #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED()     \
2986 MACRO_BEGIN                                     \
2987         if (page_queues_locked) {               \
2988                 page_queues_locked = FALSE;     \
2989                 vm_page_unlock_queues();        \
2990         }                                       \
2991 MACRO_END
2992
2993         /*
2994          * Hold queues lock to manipulate
2995          * the page queues.  Change wiring
2996          * case is obvious.
2997          */
2998         assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
2999
3000 #if CONFIG_BACKGROUND_QUEUE
3001         vm_page_update_background_state(m);
3002 #endif
3003         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3004                 /*
3005                  * Compressor pages are neither wired
3006                  * nor pageable and should never change.
3007                  */
3008                 assert(object == compressor_object);
3009         } else if (change_wiring) {
3010                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3011
3012                 if (wired) {
3013                         if (kr == KERN_SUCCESS) {
3014                                 vm_page_wire(m, wire_tag, TRUE);
3015                         }
3016                 } else {
3017                         vm_page_unwire(m, TRUE);
3018                 }
3019                 /* we keep the page queues lock, if we need it later */
3020         } else {
3021                 if (object->internal == TRUE) {
3022                         /*
3023                          * don't allow anonymous pages on
3024                          * the speculative queues
3025                          */
3026                         no_cache = FALSE;
3027                 }
3028                 if (kr != KERN_SUCCESS) {
3029                         __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3030                         vm_page_deactivate(m);
3031                         /* we keep the page queues lock, if we need it later */
3032                 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3033                     (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3034                     (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3035                     ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3036                     !VM_PAGE_WIRED(m)) {
3037                         if (vm_page_local_q &&
3038                             (*type_of_fault == DBG_COW_FAULT ||
3039                             *type_of_fault == DBG_ZERO_FILL_FAULT)) {
3040                                 struct vpl      *lq;
3041                                 uint32_t        lid;
3042
3043                                 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3044
3045                                 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3046                                 vm_object_lock_assert_exclusive(object);
3047
3048                                 /*
3049                                  * we got a local queue to stuff this
3050                                  * new page on...
3051                                  * it's safe to manipulate local and
3052                                  * local_id at this point since we're
3053                                  * behind an exclusive object lock and
3054                                  * the page is not on any global queue.
3055                                  *
3056                                  * we'll use the current cpu number to
3057                                  * select the queue... note that we don't
3058                                  * need to disable preemption... we're
3059                                  * going to be behind the local queue's
3060                                  * lock to do the real work
3061                                  */
3062                                 lid = cpu_number();
3063
3064                                 lq = &vm_page_local_q[lid].vpl_un.vpl;
3065
3066                                 VPL_LOCK(&lq->vpl_lock);
3067
3068                                 vm_page_check_pageable_safe(m);
3069                                 vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3070                                 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3071                                 m->vmp_local_id = lid;
3072                                 lq->vpl_count++;
3073
3074                                 if (object->internal) {
3075                                         lq->vpl_internal_count++;
3076                                 } else {
3077                                         lq->vpl_external_count++;
3078                                 }
3079
3080                                 VPL_UNLOCK(&lq->vpl_lock);
3081
3082                                 if (lq->vpl_count > vm_page_local_q_soft_limit) {
3083                                         /*
3084                                          * we're beyond the soft limit
3085                                          * for the local queue
3086                                          * vm_page_reactivate_local will
3087                                          * 'try' to take the global page
3088                                          * queue lock... if it can't
3089                                          * that's ok... we'll let the
3090                                          * queue continue to grow up
3091                                          * to the hard limit... at that
3092                                          * point we'll wait for the
3093                                          * lock... once we've got the
3094                                          * lock, we'll transfer all of
3095                                          * the pages from the local
3096                                          * queue to the global active
3097                                          * queue
3098                                          */
3099                                         vm_page_reactivate_local(lid, FALSE, FALSE);
3100                                 }
3101                         } else {
3102                                 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3103
3104                                 /*
3105                                  * test again now that we hold the
3106                                  * page queue lock
3107                                  */
3108                                 if (!VM_PAGE_WIRED(m)) {
3109                                         if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3110                                                 vm_page_queues_remove(m, FALSE);
3111
3112                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3113                                                 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3114                                         }
3115
3116                                         if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3117                                             no_cache) {
3118                                                 /*
3119                                                  * If this is a no_cache mapping
3120                                                  * and the page has never been
3121                                                  * mapped before or was
3122                                                  * previously a no_cache page,
3123                                                  * then we want to leave pages
3124                                                  * in the speculative state so
3125                                                  * that they can be readily
3126                                                  * recycled if free memory runs
3127                                                  * low.  Otherwise the page is
3128                                                  * activated as normal.
3129                                                  */
3130
3131                                                 if (no_cache &&
3132                                                     (!previously_pmapped ||
3133                                                     m->vmp_no_cache)) {
3134                                                         m->vmp_no_cache = TRUE;
3135
3136                                                         if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3137                                                                 vm_page_speculate(m, FALSE);
3138                                                         }
3139                                                 } else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3140                                                         vm_page_activate(m);
3141                                                 }
3142                                         }
3143                                 }
3144                                 /* we keep the page queues lock, if we need it later */
3145                         }
3146                 }
3147         }
3148         /* we're done with the page queues lock, if we ever took it */
3149         __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3150
3151
3152         /* If we have a KERN_SUCCESS from the previous checks, we either have
3153          * a good page, or a tainted page that has been accepted by the process.
3154          * In both cases the page will be entered into the pmap.
3155          * If the page is writeable, we need to disconnect it from other pmaps
3156          * now so those processes can take note.
3157          */
3158         if (kr == KERN_SUCCESS) {
3159                 /*
3160                  * NOTE: we may only hold the vm_object lock SHARED
3161                  * at this point, so we need the phys_page lock to
3162                  * properly serialize updating the pmapped and
3163                  * xpmapped bits
3164                  */
3165                 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3166                         ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3167
3168                         pmap_lock_phys_page(phys_page);
3169                         /*
3170                          * go ahead and take the opportunity
3171                          * to set 'pmapped' here so that we don't
3172                          * need to grab this lock a 2nd time
3173                          * just below
3174                          */
3175                         m->vmp_pmapped = TRUE;
3176
3177                         if (!m->vmp_xpmapped) {
3178                                 m->vmp_xpmapped = TRUE;
3179
3180                                 pmap_unlock_phys_page(phys_page);
3181
3182                                 if (!object->internal) {
3183                                         OSAddAtomic(1, &vm_page_xpmapped_external_count);
3184                                 }
3185
3186 #if defined(__arm__) || defined(__arm64__)
3187                                 pmap_sync_page_data_phys(phys_page);
3188 #else
3189                                 if (object->internal &&
3190                                     object->pager != NULL) {
3191                                         /*
3192                                          * This page could have been
3193                                          * uncompressed by the
3194                                          * compressor pager and its
3195                                          * contents might be only in
3196                                          * the data cache.
3197                                          * Since it's being mapped for
3198                                          * "execute" for the first time,
3199                                          * make sure the icache is in
3200                                          * sync.
3201                                          */
3202                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3203                                         pmap_sync_page_data_phys(phys_page);
3204                                 }
3205 #endif
3206                         } else {
3207                                 pmap_unlock_phys_page(phys_page);
3208                         }
3209                 } else {
3210                         if (m->vmp_pmapped == FALSE) {
3211                                 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3212
3213                                 pmap_lock_phys_page(phys_page);
3214                                 m->vmp_pmapped = TRUE;
3215                                 pmap_unlock_phys_page(phys_page);
3216                         }
3217                 }
3218
3219                 if (fault_type & VM_PROT_WRITE) {
3220                         if (m->vmp_wpmapped == FALSE) {
3221                                 vm_object_lock_assert_exclusive(object);
3222                                 if (!object->internal && object->pager) {
3223                                         task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3224                                 }
3225                                 m->vmp_wpmapped = TRUE;
3226                         }
3227                         if (must_disconnect) {
3228                                 /*
3229                                  * We can only get here
3230                                  * because of the CSE logic
3231                                  */
3232                                 assert(cs_enforcement_enabled);
3233                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3234                                 /*
3235                                  * If we are faulting for a write, we can clear
3236                                  * the execute bit - that will ensure the page is
3237                                  * checked again before being executable, which
3238                                  * protects against a map switch.
3239                                  * This only happens the first time the page
3240                                  * gets tainted, so we won't get stuck here
3241                                  * to make an already writeable page executable.
3242                                  */
3243                                 if (!cs_bypass) {
3244                                         assert(!pmap_has_prot_policy(prot));
3245                                         prot &= ~VM_PROT_EXECUTE;
3246                                 }
3247                         }
3248                 }
3249                 assert(VM_PAGE_OBJECT(m) == object);
3250
3251 #if VM_OBJECT_ACCESS_TRACKING
3252                 if (object->access_tracking) {
3253                         DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3254                         if (fault_type & VM_PROT_WRITE) {
3255                                 object->access_tracking_writes++;
3256                                 vm_object_access_tracking_writes++;
3257                         } else {
3258                                 object->access_tracking_reads++;
3259                                 vm_object_access_tracking_reads++;
3260                         }
3261                 }
3262 #endif /* VM_OBJECT_ACCESS_TRACKING */
3263
3264 #if PMAP_CS
3265                 /*
3266                  * If CS enforcement is on, we don't ask for an executable page if the
3267                  * fault does not call for execution, because that can fail in
3268                  * situations where the caller only actually wanted read access.
3269                  * However, it may be better to instead retry without execute on
3270                  * failure, or pass a flag into pmap_enter to do the right thing.
3271                  */
3272                 // TODO: <rdar://problem/30997388> maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults
3273                 if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) {
3274                         prot &= ~VM_PROT_EXECUTE;
3275                 }
3276 #endif
3277
3278                 /* Prevent a deadlock by not
3279                  * holding the object lock if we need to wait for a page in
3280                  * pmap_enter() - <rdar://problem/7138958> */
3281                 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3282                     wired,
3283                     pmap_options | PMAP_OPTIONS_NOWAIT,
3284                     pe_result);
3285 #if __x86_64__
3286                 if (pe_result == KERN_INVALID_ARGUMENT &&
3287                     pmap == PMAP_NULL &&
3288                     wired) {
3289                         /*
3290                          * Wiring a page in a pmap-less VM map:
3291                          * VMware's "vmmon" kernel extension does this
3292                          * to grab pages.
3293                          * Let it proceed even though the PMAP_ENTER() failed.
3294                          */
3295                         pe_result = KERN_SUCCESS;
3296                 }
3297 #endif /* __x86_64__ */
3298
3299                 if (pe_result == KERN_RESOURCE_SHORTAGE) {
3300                         if (need_retry) {
3301                                 /*
3302                                  * this will be non-null in the case where we hold the lock
3303                                  * on the top-object in this chain... we can't just drop
3304                                  * the lock on the object we're inserting the page into
3305                                  * and recall the PMAP_ENTER since we can still cause
3306                                  * a deadlock if one of the critical paths tries to
3307                                  * acquire the lock on the top-object and we're blocked
3308                                  * in PMAP_ENTER waiting for memory... our only recourse
3309                                  * is to deal with it at a higher level where we can
3310                                  * drop both locks.
3311                                  */
3312                                 *need_retry = TRUE;
3313                                 vm_pmap_enter_retried++;
3314                                 goto after_the_pmap_enter;
3315                         }
3316                         /* The nonblocking version of pmap_enter did not succeed.
3317                          * and we don't need to drop other locks and retry
3318                          * at the level above us, so
3319                          * use the blocking version instead. Requires marking
3320                          * the page busy and unlocking the object */
3321                         boolean_t was_busy = m->vmp_busy;
3322
3323                         vm_object_lock_assert_exclusive(object);
3324
3325                         m->vmp_busy = TRUE;
3326                         vm_object_unlock(object);
3327
3328                         PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3329                             0, wired,
3330                             pmap_options, pe_result);
3331
3332                         assert(VM_PAGE_OBJECT(m) == object);
3333
3334                         /* Take the object lock again. */
3335                         vm_object_lock(object);
3336
3337                         /* If the page was busy, someone else will wake it up.
3338                          * Otherwise, we have to do it now. */
3339                         assert(m->vmp_busy);
3340                         if (!was_busy) {
3341                                 PAGE_WAKEUP_DONE(m);
3342                         }
3343                         vm_pmap_enter_blocked++;
3344                 }
3345
3346                 kr = pe_result;
3347         }
3348
3349 after_the_pmap_enter:
3350         return kr;
3351 }
3352
/*
 *      Routine:        vm_pre_fault
 *      Purpose:
 *              Fault in "vaddr" for reading in the current map if it
 *              does not appear to be mapped yet.  pmap_find_phys() is
 *              consulted first, and the fault is only taken when it
 *              returns 0 for "vaddr" in current_map()->pmap (treated
 *              here as "nothing mapped at this address").
 *      Implementation:
 *              Calls vm_fault() as a plain read fault (VM_PROT_READ)
 *              with THREAD_UNINT, no wiring change (change_wiring
 *              FALSE, tag VM_KERN_MEMORY_NONE) and no caller-supplied
 *              pmap (NULL / 0).
 *      Returns:
 *              Nothing.  The kern_return_t produced by vm_fault() is
 *              not examined, so a failed fault is not reported to the
 *              caller.
 */
3353 void
3354 vm_pre_fault(vm_map_offset_t vaddr)
3355 {
3356         if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3357                 vm_fault(current_map(),      /* map */
3358                     vaddr,                   /* vaddr */
3359                     VM_PROT_READ,            /* fault_type */
3360                     FALSE,                   /* change_wiring */
3361                     VM_KERN_MEMORY_NONE,     /* tag - not wiring */
3362                     THREAD_UNINT,            /* interruptible */
3363                     NULL,                    /* caller_pmap */
3364                     0 /* caller_pmap_addr */);
3365         }
3366 }
3367
3368
3369 /*
3370  *      Routine:        vm_fault
3371  *      Purpose:
3372  *              Handle page faults, including pseudo-faults
3373  *              used to change the wiring status of pages.
3374  *      Returns:
3375  *              Explicit continuations have been removed.
3376  *      Implementation:
3377  *              vm_fault and vm_fault_page save mucho state
3378  *              in the moral equivalent of a closure.  The state
3379  *              structure is allocated when first entering vm_fault
3380  *              and deallocated when leaving vm_fault.
3381  */
3382
/*
 * Symbols declared "extern" here; their definitions are not in the
 * visible part of this file.
 */
3383 extern int _map_enter_debug;
3384 extern uint64_t get_current_unique_pid(void);
3385
/*
 * Global counters, both starting at zero.
 * NOTE(review): the names suggest they count object-collapse attempts
 * made / skipped on the fault path, but the code that updates them is
 * not visible here -- confirm against the users of these variables.
 */
3386 unsigned long vm_fault_collapse_total = 0;
3387 unsigned long vm_fault_collapse_skipped = 0;
3388
3389
/*
 *      Routine:        vm_fault_external
 *      Purpose:
 *              Entry point with the same arguments as vm_fault() except
 *              that the caller does not supply a wire tag.  Every
 *              argument is forwarded unchanged to vm_fault_internal();
 *              the wire tag is taken from vm_tag_bt() and physpage_p is
 *              passed as NULL.
 *              NOTE(review): vm_tag_bt() presumably derives the tag
 *              from the caller's backtrace -- confirm at its definition.
 *      Returns:
 *              The kern_return_t produced by vm_fault_internal().
 */
3390 kern_return_t
3391 vm_fault_external(
3392         vm_map_t        map,
3393         vm_map_offset_t vaddr,
3394         vm_prot_t       fault_type,
3395         boolean_t       change_wiring,
3396         int             interruptible,
3397         pmap_t          caller_pmap,
3398         vm_map_offset_t caller_pmap_addr)
3399 {
3400         return vm_fault_internal(map, vaddr, fault_type, change_wiring, vm_tag_bt(),
3401                    interruptible, caller_pmap, caller_pmap_addr,
3402                    NULL);
3403 }
3404
/*
 *      Routine:        vm_fault
 *      Purpose:
 *              Thin wrapper around vm_fault_internal(): all arguments,
 *              including the caller's wire_tag, are forwarded unchanged
 *              and physpage_p is passed as NULL.
 *      Arguments (as consumed by vm_fault_internal()):
 *              vaddr           truncated to a page boundary before use.
 *              fault_type      received as "caller_prot"; when
 *                              change_wiring is set the fault type used
 *                              internally is VM_PROT_NONE instead.
 *              interruptible   handed to thread_interrupt_level().
 *      Returns:
 *              The kern_return_t produced by vm_fault_internal(), which
 *              is KERN_FAILURE when called with a non-zero preemption
 *              level.
 */
3405 kern_return_t
3406 vm_fault(
3407         vm_map_t        map,
3408         vm_map_offset_t vaddr,
3409         vm_prot_t       fault_type,
3410         boolean_t       change_wiring,
3411         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3412         int             interruptible,
3413         pmap_t          caller_pmap,
3414         vm_map_offset_t caller_pmap_addr)
3415 {
3416         return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3417                    interruptible, caller_pmap, caller_pmap_addr,
3418                    NULL);
3419 }
3420
3421 kern_return_t
3422 vm_fault_internal(
3423         vm_map_t        map,
3424         vm_map_offset_t vaddr,
3425         vm_prot_t       caller_prot,
3426         boolean_t       change_wiring,
3427         vm_tag_t        wire_tag,               /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3428         int             interruptible,
3429         pmap_t          caller_pmap,
3430         vm_map_offset_t caller_pmap_addr,
3431         ppnum_t         *physpage_p)
3432 {
3433         vm_map_version_t        version;        /* Map version for verificiation */
3434         boolean_t               wired;          /* Should mapping be wired down? */
3435         vm_object_t             object;         /* Top-level object */
3436         vm_object_offset_t      offset;         /* Top-level offset */
3437         vm_prot_t               prot;           /* Protection for mapping */
3438         vm_object_t             old_copy_object; /* Saved copy object */
3439         vm_page_t               result_page;    /* Result of vm_fault_page */
3440         vm_page_t               top_page;       /* Placeholder page */
3441         kern_return_t           kr;
3442
3443         vm_page_t               m;      /* Fast access to result_page */
3444         kern_return_t           error_code;
3445         vm_object_t             cur_object;
3446         vm_object_t             m_object = NULL;
3447         vm_object_offset_t      cur_offset;
3448         vm_page_t               cur_m;
3449         vm_object_t             new_object;
3450         int                     type_of_fault;
3451         pmap_t                  pmap;
3452         wait_interrupt_t        interruptible_state;
3453         vm_map_t                real_map = map;
3454         vm_map_t                original_map = map;
3455         boolean_t               object_locks_dropped = FALSE;
3456         vm_prot_t               fault_type;
3457         vm_prot_t               original_fault_type;
3458         struct vm_object_fault_info fault_info = {};
3459         boolean_t               need_collapse = FALSE;
3460         boolean_t               need_retry = FALSE;
3461         boolean_t               *need_retry_ptr = NULL;
3462         int                     object_lock_type = 0;
3463         int                     cur_object_lock_type;
3464         vm_object_t             top_object = VM_OBJECT_NULL;
3465         vm_object_t             written_on_object = VM_OBJECT_NULL;
3466         memory_object_t         written_on_pager = NULL;
3467         vm_object_offset_t      written_on_offset = 0;
3468         int                     throttle_delay;
3469         int                     compressed_count_delta;
3470         int                     grab_options;
3471         vm_map_offset_t         trace_vaddr;
3472         vm_map_offset_t         trace_real_vaddr;
3473 #if DEVELOPMENT || DEBUG
3474         vm_map_offset_t         real_vaddr;
3475
3476         real_vaddr = vaddr;
3477 #endif /* DEVELOPMENT || DEBUG */
3478         trace_real_vaddr = vaddr;
3479         vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3480
3481         if (map == kernel_map) {
3482                 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3483                 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3484         } else {
3485                 trace_vaddr = vaddr;
3486         }
3487
3488         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3489             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3490             ((uint64_t)trace_vaddr >> 32),
3491             trace_vaddr,
3492             (map == kernel_map),
3493             0,
3494             0);
3495
3496         if (get_preemption_level() != 0) {
3497                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3498                     (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3499                     ((uint64_t)trace_vaddr >> 32),
3500                     trace_vaddr,
3501                     KERN_FAILURE,
3502                     0,
3503                     0);
3504
3505                 return KERN_FAILURE;
3506         }
3507
3508         thread_t cthread = current_thread();
3509         boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
3510         uint64_t fstart = 0;
3511
3512         if (rtfault) {
3513                 fstart = mach_continuous_time();
3514         }
3515
3516         interruptible_state = thread_interrupt_level(interruptible);
3517
3518         fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3519
3520         VM_STAT_INCR(faults);
3521         current_task()->faults++;
3522         original_fault_type = fault_type;
3523
3524         if (fault_type & VM_PROT_WRITE) {
3525                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3526         } else {
3527                 object_lock_type = OBJECT_LOCK_SHARED;
3528         }
3529
3530         cur_object_lock_type = OBJECT_LOCK_SHARED;
3531
3532         if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
3533                 if (compressor_map) {
3534                         if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
3535                                 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
3536                         }
3537                 }
3538         }
3539 RetryFault:
3540         assert(written_on_object == VM_OBJECT_NULL);
3541
3542         /*
3543          * assume we will hit a page in the cache
3544          * otherwise, explicitly override with
3545          * the real fault type once we determine it
3546          */
3547         type_of_fault = DBG_CACHE_HIT_FAULT;
3548
3549         /*
3550          *      Find the backing store object and offset into
3551          *      it to begin the search.
3552          */
3553         fault_type = original_fault_type;
3554         map = original_map;
3555         vm_map_lock_read(map);
3556
3557         kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3558             object_lock_type, &version,
3559             &object, &offset, &prot, &wired,
3560             &fault_info,
3561             &real_map);
3562
3563         if (kr != KERN_SUCCESS) {
3564                 vm_map_unlock_read(map);
3565                 goto done;
3566         }
3567         pmap = real_map->pmap;
3568         fault_info.interruptible = interruptible;
3569         fault_info.stealth = FALSE;
3570         fault_info.io_sync = FALSE;
3571         fault_info.mark_zf_absent = FALSE;
3572         fault_info.batch_pmap_op = FALSE;
3573
3574         /*
3575          * If the page is wired, we must fault for the current protection
3576          * value, to avoid further faults.
3577          */
3578         if (wired) {
3579                 fault_type = prot | VM_PROT_WRITE;
3580                 /*
3581                  * since we're treating this fault as a 'write'
3582                  * we must hold the top object lock exclusively
3583                  */
3584                 if (object_lock_type == OBJECT_LOCK_SHARED) {
3585                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3586
3587                         if (vm_object_lock_upgrade(object) == FALSE) {
3588                                 /*
3589                                  * couldn't upgrade, so explicitly
3590                                  * take the lock exclusively
3591                                  */
3592                                 vm_object_lock(object);
3593                         }
3594                 }
3595         }
3596
3597 #if     VM_FAULT_CLASSIFY
3598         /*
3599          *      Temporary data gathering code
3600          */
3601         vm_fault_classify(object, offset, fault_type);
3602 #endif
3603         /*
3604          *      Fast fault code.  The basic idea is to do as much as
3605          *      possible while holding the map lock and object locks.
3606          *      Busy pages are not used until the object lock has to
3607          *      be dropped to do something (copy, zero fill, pmap enter).
3608          *      Similarly, paging references aren't acquired until that
3609          *      point, and object references aren't used.
3610          *
3611          *      If we can figure out what to do
3612          *      (zero fill, copy on write, pmap enter) while holding
3613          *      the locks, then it gets done.  Otherwise, we give up,
3614          *      and use the original fault path (which doesn't hold
3615          *      the map lock, and relies on busy pages).
3616          *      The give up cases include:
3617          *              - Have to talk to pager.
3618          *              - Page is busy, absent or in error.
3619          *              - Pager has locked out desired access.
3620          *              - Fault needs to be restarted.
3621          *              - Have to push page into copy object.
3622          *
3623          *      The code is an infinite loop that moves one level down
3624          *      the shadow chain each time.  cur_object and cur_offset
3625          *      refer to the current object being examined. object and offset
3626          *      are the original object from the map.  The loop is at the
3627          *      top level if and only if object and cur_object are the same.
3628          *
3629          *      Invariants:  Map lock is held throughout.  Lock is held on
3630          *              original object and cur_object (if different) when
3631          *              continuing or exiting loop.
3632          *
3633          */
3634
3635 #if defined(__arm64__)
3636         /*
3637          * Fail if reading an execute-only page in a
3638          * pmap that enforces execute-only protection.
3639          */
3640         if (fault_type == VM_PROT_READ &&
3641             (prot & VM_PROT_EXECUTE) &&
3642             !(prot & VM_PROT_READ) &&
3643             pmap_enforces_execute_only(pmap)) {
3644                 vm_object_unlock(object);
3645                 vm_map_unlock_read(map);
3646                 if (real_map != map) {
3647                         vm_map_unlock(real_map);
3648                 }
3649                 kr = KERN_PROTECTION_FAILURE;
3650                 goto done;
3651         }
3652 #endif
3653
3654         /*
3655          * If this page is to be inserted in a copy delay object
3656          * for writing, and if the object has a copy, then the
3657          * copy delay strategy is implemented in the slow fault page.
3658          */
3659         if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3660             object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
3661                 goto handle_copy_delay;
3662         }
3663
3664         cur_object = object;
3665         cur_offset = offset;
3666
3667         grab_options = 0;
3668 #if CONFIG_SECLUDED_MEMORY
3669         if (object->can_grab_secluded) {
3670                 grab_options |= VM_PAGE_GRAB_SECLUDED;
3671         }
3672 #endif /* CONFIG_SECLUDED_MEMORY */
3673
3674         while (TRUE) {
3675                 if (!cur_object->pager_created &&
3676                     cur_object->phys_contiguous) { /* superpage */
3677                         break;
3678                 }
3679
3680                 if (cur_object->blocked_access) {
3681                         /*
3682                          * Access to this VM object has been blocked.
3683                          * Let the slow path handle it.
3684                          */
3685                         break;
3686                 }
3687
3688                 m = vm_page_lookup(cur_object, cur_offset);
3689                 m_object = NULL;
3690
3691                 if (m != VM_PAGE_NULL) {
3692                         m_object = cur_object;
3693
3694                         if (m->vmp_busy) {
3695                                 wait_result_t   result;
3696
3697                                 /*
3698                                  * in order to do the PAGE_ASSERT_WAIT, we must
3699                                  * have object that 'm' belongs to locked exclusively
3700                                  */
3701                                 if (object != cur_object) {
3702                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3703                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3704
3705                                                 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3706                                                         /*
3707                                                          * couldn't upgrade so go do a full retry
3708                                                          * immediately since we can no longer be
3709                                                          * certain about cur_object (since we
3710                                                          * don't hold a reference on it)...
3711                                                          * first drop the top object lock
3712                                                          */
3713                                                         vm_object_unlock(object);
3714
3715                                                         vm_map_unlock_read(map);
3716                                                         if (real_map != map) {
3717                                                                 vm_map_unlock(real_map);
3718                                                         }
3719
3720                                                         goto RetryFault;
3721                                                 }
3722                                         }
3723                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3724                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3725
3726                                         if (vm_object_lock_upgrade(object) == FALSE) {
3727                                                 /*
3728                                                  * couldn't upgrade, so explicitly take the lock
3729                                                  * exclusively and go relookup the page since we
3730                                                  * will have dropped the object lock and
3731                                                  * a different thread could have inserted
3732                                                  * a page at this offset
3733                                                  * no need for a full retry since we're
3734                                                  * at the top level of the object chain
3735                                                  */
3736                                                 vm_object_lock(object);
3737
3738                                                 continue;
3739                                         }
3740                                 }
3741                                 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
3742                                         /*
3743                                          * m->vmp_busy == TRUE and the object is locked exclusively
3744                                          * if m->pageout_queue == TRUE after we acquire the
3745                                          * queues lock, we are guaranteed that it is stable on
3746                                          * the pageout queue and therefore reclaimable
3747                                          *
3748                                          * NOTE: this is only true for the internal pageout queue
3749                                          * in the compressor world
3750                                          */
3751                                         assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3752
3753                                         vm_page_lock_queues();
3754
3755                                         if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
3756                                                 vm_pageout_throttle_up(m);
3757                                                 vm_page_unlock_queues();
3758
3759                                                 PAGE_WAKEUP_DONE(m);
3760                                                 goto reclaimed_from_pageout;
3761                                         }
3762                                         vm_page_unlock_queues();
3763                                 }
3764                                 if (object != cur_object) {
3765                                         vm_object_unlock(object);
3766                                 }
3767
3768                                 vm_map_unlock_read(map);
3769                                 if (real_map != map) {
3770                                         vm_map_unlock(real_map);
3771                                 }
3772
3773                                 result = PAGE_ASSERT_WAIT(m, interruptible);
3774
3775                                 vm_object_unlock(cur_object);
3776
3777                                 if (result == THREAD_WAITING) {
3778                                         result = thread_block(THREAD_CONTINUE_NULL);
3779
3780                                         counter(c_vm_fault_page_block_busy_kernel++);
3781                                 }
3782                                 if (result == THREAD_AWAKENED || result == THREAD_RESTART) {
3783                                         goto RetryFault;
3784                                 }
3785
3786                                 kr = KERN_ABORTED;
3787                                 goto done;
3788                         }
3789 reclaimed_from_pageout:
3790                         if (m->vmp_laundry) {
3791                                 if (object != cur_object) {
3792                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3793                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3794
3795                                                 vm_object_unlock(object);
3796                                                 vm_object_unlock(cur_object);
3797
3798                                                 vm_map_unlock_read(map);
3799                                                 if (real_map != map) {
3800                                                         vm_map_unlock(real_map);
3801                                                 }
3802
3803                                                 goto RetryFault;
3804                                         }
3805                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3806                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3807
3808                                         if (vm_object_lock_upgrade(object) == FALSE) {
3809                                                 /*
3810                                                  * couldn't upgrade, so explicitly take the lock
3811                                                  * exclusively and go relookup the page since we
3812                                                  * will have dropped the object lock and
3813                                                  * a different thread could have inserted
3814                                                  * a page at this offset
3815                                                  * no need for a full retry since we're
3816                                                  * at the top level of the object chain
3817                                                  */
3818                                                 vm_object_lock(object);
3819
3820                                                 continue;
3821                                         }
3822                                 }
3823                                 vm_pageout_steal_laundry(m, FALSE);
3824                         }
3825
3826                         if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3827                                 /*
3828                                  * Guard page: let the slow path deal with it
3829                                  */
3830                                 break;
3831                         }
3832                         if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
3833                                 /*
3834                                  * Unusual case... let the slow path deal with it
3835                                  */
3836                                 break;
3837                         }
3838                         if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
3839                                 if (object != cur_object) {
3840                                         vm_object_unlock(object);
3841                                 }
3842                                 vm_map_unlock_read(map);
3843                                 if (real_map != map) {
3844                                         vm_map_unlock(real_map);
3845                                 }
3846                                 vm_object_unlock(cur_object);
3847                                 kr = KERN_MEMORY_ERROR;
3848                                 goto done;
3849                         }
3850                         assert(m_object == VM_PAGE_OBJECT(m));
3851
3852                         if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
3853                             (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3854 upgrade_for_validation:
3855                                 /*
3856                                  * We might need to validate this page
3857                                  * against its code signature, so we
3858                                  * want to hold the VM object exclusively.
3859                                  */
3860                                 if (object != cur_object) {
3861                                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3862                                                 vm_object_unlock(object);
3863                                                 vm_object_unlock(cur_object);
3864
3865                                                 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3866
3867                                                 vm_map_unlock_read(map);
3868                                                 if (real_map != map) {
3869                                                         vm_map_unlock(real_map);
3870                                                 }
3871
3872                                                 goto RetryFault;
3873                                         }
3874                                 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3875                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3876
3877                                         if (vm_object_lock_upgrade(object) == FALSE) {
3878                                                 /*
3879                                                  * couldn't upgrade, so explicitly take the lock
3880                                                  * exclusively and go relookup the page since we
3881                                                  * will have dropped the object lock and
3882                                                  * a different thread could have inserted
3883                                                  * a page at this offset
3884                                                  * no need for a full retry since we're
3885                                                  * at the top level of the object chain
3886                                                  */
3887                                                 vm_object_lock(object);
3888
3889                                                 continue;
3890                                         }
3891                                 }
3892                         }
3893                         /*
3894                          *      Two cases of map in faults:
3895                          *          - At top level w/o copy object.
3896                          *          - Read fault anywhere.
3897                          *              --> must disallow write.
3898                          */
3899
3900                         if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3901                                 goto FastPmapEnter;
3902                         }
3903
3904                         if ((fault_type & VM_PROT_WRITE) == 0) {
3905                                 if (!pmap_has_prot_policy(prot)) {
3906                                         prot &= ~VM_PROT_WRITE;
3907                                 } else {
3908                                         /*
3909                                          * For a protection that the pmap cares
3910                                          * about, we must hand over the full
3911                                          * set of protections (so that the pmap
3912                                          * layer can apply any desired policy).
3913                                          * This means that cs_bypass must be
3914                                          * set, as this can force us to pass
3915                                          * RWX.
3916                                          */
3917                                         assert(fault_info.cs_bypass);
3918                                 }
3919
3920                                 if (object != cur_object) {
3921                                         /*
3922                                          * We still need to hold the top object
3923                                          * lock here to prevent a race between
3924                                          * a read fault (taking only "shared"
3925                                          * locks) and a write fault (taking
3926                                          * an "exclusive" lock on the top
3927                                          * object).
3928                                          * Otherwise, as soon as we release the
3929                                          * top lock, the write fault could
3930                                          * proceed and actually complete before
3931                                          * the read fault, and the copied page's
3932                                          * translation could then be overwritten
3933                                          * by the read fault's translation for
3934                                          * the original page.
3935                                          *
3936                                          * Let's just record what the top object
3937                                          * is and we'll release it later.
3938                                          */
3939                                         top_object = object;
3940
3941                                         /*
3942                                          * switch to the object that has the new page
3943                                          */
3944                                         object = cur_object;
3945                                         object_lock_type = cur_object_lock_type;
3946                                 }
3947 FastPmapEnter:
3948                                 assert(m_object == VM_PAGE_OBJECT(m));
3949
3950                                 /*
3951                                  * prepare for the pmap_enter...
3952                                  * object and map are both locked
3953                                  * m contains valid data
3954                                  * object == m->vmp_object
3955                                  * cur_object == NULL or it's been unlocked
3956                                  * no paging references on either object or cur_object
3957                                  */
3958                                 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
3959                                         need_retry_ptr = &need_retry;
3960                                 } else {
3961                                         need_retry_ptr = NULL;
3962                                 }
3963
3964                                 if (caller_pmap) {
3965                                         kr = vm_fault_enter(m,
3966                                             caller_pmap,
3967                                             caller_pmap_addr,
3968                                             prot,
3969                                             caller_prot,
3970                                             wired,
3971                                             change_wiring,
3972                                             wire_tag,
3973                                             &fault_info,
3974                                             need_retry_ptr,
3975                                             &type_of_fault);
3976                                 } else {
3977                                         kr = vm_fault_enter(m,
3978                                             pmap,
3979                                             vaddr,
3980                                             prot,
3981                                             caller_prot,
3982                                             wired,
3983                                             change_wiring,
3984                                             wire_tag,
3985                                             &fault_info,
3986                                             need_retry_ptr,
3987                                             &type_of_fault);
3988                                 }
3989 #if DEVELOPMENT || DEBUG
3990                                 {
3991                                         int     event_code = 0;
3992
3993                                         if (m_object->internal) {
3994                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3995                                         } else if (m_object->object_is_shared_cache) {
3996                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3997                                         } else {
3998                                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3999                                         }
4000
4001                                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
4002
4003                                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4004                                 }
4005 #endif
4006                                 if (kr == KERN_SUCCESS &&
4007                                     physpage_p != NULL) {
4008                                         /* for vm_map_wire_and_extract() */
4009                                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4010                                         if (prot & VM_PROT_WRITE) {
4011                                                 vm_object_lock_assert_exclusive(m_object);
4012                                                 m->vmp_dirty = TRUE;
4013                                         }
4014                                 }
4015
4016                                 if (top_object != VM_OBJECT_NULL) {
4017                                         /*
4018                                          * It's safe to drop the top object
4019                                          * now that we've done our
4020                                          * vm_fault_enter().  Any other fault
4021                                          * in progress for that virtual
4022                                          * address will either find our page
4023                                          * and translation or put in a new page
4024                                          * and translation.
4025                                          */
4026                                         vm_object_unlock(top_object);
4027                                         top_object = VM_OBJECT_NULL;
4028                                 }
4029
4030                                 if (need_collapse == TRUE) {
4031                                         vm_object_collapse(object, offset, TRUE);
4032                                 }
4033
4034                                 if (need_retry == FALSE &&
4035                                     (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4036                                         /*
4037                                          * evaluate access pattern and update state
4038                                          * vm_fault_deactivate_behind depends on the
4039                                          * state being up to date
4040                                          */
4041                                         vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior);
4042
4043                                         vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior);
4044                                 }
4045                                 /*
4046                                  * That's it, clean up and return.
4047                                  */
4048                                 if (m->vmp_busy) {
4049                                         PAGE_WAKEUP_DONE(m);
4050                                 }
4051
4052                                 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4053                                         vm_object_paging_begin(m_object);
4054
4055                                         assert(written_on_object == VM_OBJECT_NULL);
4056                                         written_on_object = m_object;
4057                                         written_on_pager = m_object->pager;
4058                                         written_on_offset = m_object->paging_offset + m->vmp_offset;
4059                                 }
4060                                 vm_object_unlock(object);
4061
4062                                 vm_map_unlock_read(map);
4063                                 if (real_map != map) {
4064                                         vm_map_unlock(real_map);
4065                                 }
4066
4067                                 if (need_retry == TRUE) {
4068                                         /*
4069                                          * vm_fault_enter couldn't complete the PMAP_ENTER...
4070                                          * at this point we don't hold any locks so it's safe
4071                                          * to ask the pmap layer to expand the page table to
4072                                          * accommodate this mapping... once expanded, we'll
4073                                          * re-drive the fault which should result in vm_fault_enter
4074                                          * being able to successfully enter the mapping this time around
4075                                          */
4076                                         (void)pmap_enter_options(
4077                                                 pmap, vaddr, 0, 0, 0, 0, 0,
4078                                                 PMAP_OPTIONS_NOENTER, NULL);
4079
4080                                         need_retry = FALSE;
4081                                         goto RetryFault;
4082                                 }
4083                                 goto done;
4084                         }
4085                         /*
4086                          * COPY ON WRITE FAULT
4087                          */
4088                         assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4089
4090                         /*
4091                          * If objects match, then
4092                          * object->copy must not be NULL (else control
4093                          * would be in previous code block), and we
4094                          * have a potential push into the copy object
4095                          * with which we can't cope here.
4096                          */
4097                         if (cur_object == object) {
4098                                 /*
4099                                  * must take the slow path to
4100                                  * deal with the copy push
4101                                  */
4102                                 break;
4103                         }
4104
4105                         /*
4106                          * This is now a shadow based copy on write
4107                          * fault -- it requires a copy up the shadow
4108                          * chain.
4109                          */
4110                         assert(m_object == VM_PAGE_OBJECT(m));
4111
4112                         if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4113                             VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
4114                                 goto upgrade_for_validation;
4115                         }
4116
4117                         /*
4118                          * Allocate a page in the original top level
4119                          * object. Give up if allocate fails.  Also
4120                          * need to remember current page, as it's the
4121                          * source of the copy.
4122                          *
4123                          * at this point we hold locks on both
4124                          * object and cur_object... no need to take
4125                          * paging refs or mark pages BUSY since
4126                          * we don't drop either object lock until
4127                          * the page has been copied and inserted
4128                          */
4129                         cur_m = m;
4130                         m = vm_page_grab_options(grab_options);
4131                         m_object = NULL;
4132
4133                         if (m == VM_PAGE_NULL) {
4134                                 /*
4135                                  * no free page currently available...
4136                                  * must take the slow path
4137                                  */
4138                                 break;
4139                         }
4140                         /*
4141                          * Now do the copy.  Mark the source page busy...
4142                          *
4143                          *      NOTE: This code holds the map lock across
4144                          *      the page copy.
4145                          */
4146                         vm_page_copy(cur_m, m);
4147                         vm_page_insert(m, object, offset);
4148                         m_object = object;
4149                         SET_PAGE_DIRTY(m, FALSE);
4150
4151                         /*
4152                          * Now cope with the source page and object
4153                          */
4154                         if (object->ref_count > 1 && cur_m->vmp_pmapped) {
4155                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4156                         }
4157
4158                         if (cur_m->vmp_clustered) {
4159                                 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4160                                 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4161                                 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4162                         }
4163                         need_collapse = TRUE;
4164
4165                         if (!cur_object->internal &&
4166                             cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4167                                 /*
4168                                  * The object from which we've just
4169                                  * copied a page is most probably backed
4170                                  * by a vnode.  We don't want to waste too
4171                                  * much time trying to collapse the VM objects
4172                                  * and create a bottleneck when several tasks
4173                                  * map the same file.
4174                                  */
4175                                 if (cur_object->copy == object) {
4176                                         /*
4177                                          * Shared mapping or no COW yet.
4178                                          * We can never collapse a copy
4179                                          * object into its backing object.
4180                                          */
4181                                         need_collapse = FALSE;
4182                                 } else if (cur_object->copy == object->shadow &&
4183                                     object->shadow->resident_page_count == 0) {
4184                                         /*
4185                                          * Shared mapping after a COW occurred.
4186                                          */
4187                                         need_collapse = FALSE;
4188                                 }
4189                         }
4190                         vm_object_unlock(cur_object);
4191
4192                         if (need_collapse == FALSE) {
4193                                 vm_fault_collapse_skipped++;
4194                         }
4195                         vm_fault_collapse_total++;
4196
4197                         type_of_fault = DBG_COW_FAULT;
4198                         VM_STAT_INCR(cow_faults);
4199                         DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4200                         current_task()->cow_faults++;
4201
4202                         goto FastPmapEnter;
4203                 } else {
4204                         /*
4205                          * No page at cur_object, cur_offset... m == NULL
4206                          */
4207                         if (cur_object->pager_created) {
4208                                 int     compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4209
4210                                 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4211                                         int             my_fault_type;
4212                                         int             c_flags = C_DONT_BLOCK;
4213                                         boolean_t       insert_cur_object = FALSE;
4214
4215                                         /*
4216                                          * May have to talk to a pager...
4217                                          * if so, take the slow path by
4218                                          * doing a 'break' from the while (TRUE) loop
4219                                          *
4220                                          * external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4221                                          * if the compressor is active and the page exists there
4222                                          */
4223                                         if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
4224                                                 break;
4225                                         }
4226
4227                                         if (map == kernel_map || real_map == kernel_map) {
4228                                                 /*
4229                                                  * can't call into the compressor with the kernel_map
4230                                                  * lock held, since the compressor may try to operate
4231                                                  * on the kernel map in order to return an empty c_segment
4232                                                  */
4233                                                 break;
4234                                         }
4235                                         if (object != cur_object) {
4236                                                 if (fault_type & VM_PROT_WRITE) {
4237                                                         c_flags |= C_KEEP;
4238                                                 } else {
4239                                                         insert_cur_object = TRUE;
4240                                                 }
4241                                         }
4242                                         if (insert_cur_object == TRUE) {
4243                                                 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4244                                                         cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4245
4246                                                         if (vm_object_lock_upgrade(cur_object) == FALSE) {
4247                                                                 /*
4248                                                                  * couldn't upgrade so go do a full retry
4249                                                                  * immediately since we can no longer be
4250                                                                  * certain about cur_object (since we
4251                                                                  * don't hold a reference on it)...
4252                                                                  * first drop the top object lock
4253                                                                  */
4254                                                                 vm_object_unlock(object);
4255
4256                                                                 vm_map_unlock_read(map);
4257                                                                 if (real_map != map) {
4258                                                                         vm_map_unlock(real_map);
4259                                                                 }
4260
4261                                                                 goto RetryFault;
4262                                                         }
4263                                                 }
4264                                         } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4265                                                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4266
4267                                                 if (object != cur_object) {
4268                                                         /*
4269                                                          * we can't go for the upgrade on the top
4270                                                          * lock since the upgrade may block waiting
4271                                                          * for readers to drain... since we hold
4272                                                          * cur_object locked at this point, waiting
4273                                                          * for the readers to drain would represent
4274                                                          * a lock order inversion since the lock order
4275                                                          * for objects is the reference order in the
4276                                                          * shadow chain
4277                                                          */
4278                                                         vm_object_unlock(object);
4279                                                         vm_object_unlock(cur_object);
4280
4281                                                         vm_map_unlock_read(map);
4282                                                         if (real_map != map) {
4283                                                                 vm_map_unlock(real_map);
4284                                                         }
4285
4286                                                         goto RetryFault;
4287                                                 }
4288                                                 if (vm_object_lock_upgrade(object) == FALSE) {
4289                                                         /*
4290                                                          * couldn't upgrade, so explicitly take the lock
4291                                                          * exclusively and go relookup the page since we
4292                                                          * will have dropped the object lock and
4293                                                          * a different thread could have inserted
4294                                                          * a page at this offset
4295                                                          * no need for a full retry since we're
4296                                                          * at the top level of the object chain
4297                                                          */
4298                                                         vm_object_lock(object);
4299
4300                                                         continue;
4301                                                 }
4302                                         }
4303                                         m = vm_page_grab_options(grab_options);
4304                                         m_object = NULL;
4305
4306                                         if (m == VM_PAGE_NULL) {
4307                                                 /*
4308                                                  * no free page currently available...
4309                                                  * must take the slow path
4310                                                  */
4311                                                 break;
4312                                         }
4313
4314                                         /*
4315                                          * The object is and remains locked
4316                                          * so no need to take a
4317                                          * "paging_in_progress" reference.
4318                                          */
4319                                         boolean_t shared_lock;
4320                                         if ((object == cur_object &&
4321                                             object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4322                                             (object != cur_object &&
4323                                             cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4324                                                 shared_lock = FALSE;
4325                                         } else {
4326                                                 shared_lock = TRUE;
4327                                         }
4328
4329                                         kr = vm_compressor_pager_get(
4330                                                 cur_object->pager,
4331                                                 (cur_offset +
4332                                                 cur_object->paging_offset),
4333                                                 VM_PAGE_GET_PHYS_PAGE(m),
4334                                                 &my_fault_type,
4335                                                 c_flags,
4336                                                 &compressed_count_delta);
4337
4338                                         vm_compressor_pager_count(
4339                                                 cur_object->pager,
4340                                                 compressed_count_delta,
4341                                                 shared_lock,
4342                                                 cur_object);
4343
4344                                         if (kr != KERN_SUCCESS) {
4345                                                 vm_page_release(m, FALSE);
4346                                                 m = VM_PAGE_NULL;
4347                                                 break;
4348                                         }
4349                                         m->vmp_dirty = TRUE;
4350
4351                                         /*
4352                                          * If the object is purgeable, its
4353                                          * owner's purgeable ledgers will be
4354                                          * updated in vm_page_insert() but the
4355                                          * page was also accounted for in a
4356                                          * "compressed purgeable" ledger, so
4357                                          * update that now.
4358                                          */
4359                                         if (object != cur_object &&
4360                                             !insert_cur_object) {
4361                                                 /*
4362                                                  * We're not going to insert
4363                                                  * the decompressed page into
4364                                                  * the object it came from.
4365                                                  *
4366                                                  * We're dealing with a
4367                                                  * copy-on-write fault on
4368                                                  * "object".
4369                                                  * We're going to decompress
4370                                                  * the page directly into the
4371                                                  * target "object" while
4372                                                  * keeping the compressed
4373                                                  * page for "cur_object", so
4374                                                  * no ledger update in that
4375                                                  * case.
4376                                                  */
4377                                         } else if (((cur_object->purgable ==
4378                                             VM_PURGABLE_DENY) &&
4379                                             (!cur_object->vo_ledger_tag)) ||
4380                                             (cur_object->vo_owner ==
4381                                             NULL)) {
4382                                                 /*
4383                                                  * "cur_object" is not purgeable
4384                                                  * and is not ledger-tagged, or
4385                                                  * there's no owner for it,
4386                                                  * so no owner's ledgers to
4387                                                  * update.
4388                                                  */
4389                                         } else {
4390                                                 /*
4391                                                  * One less compressed
4392                                                  * purgeable/tagged page for
4393                                                  * cur_object's owner.
4394                                                  */
4395                                                 vm_object_owner_compressed_update(
4396                                                         cur_object,
4397                                                         -1);
4398                                         }
4399
4400                                         if (insert_cur_object) {
4401                                                 vm_page_insert(m, cur_object, cur_offset);
4402                                                 m_object = cur_object;
4403                                         } else {
4404                                                 vm_page_insert(m, object, offset);
4405                                                 m_object = object;
4406                                         }
4407
4408                                         if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4409                                                 /*
4410                                                  * If the page is not cacheable,
4411                                                  * we can't let its contents
4412                                                  * linger in the data cache
4413                                                  * after the decompression.
4414                                                  */
4415                                                 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
4416                                         }
4417
4418                                         type_of_fault = my_fault_type;
4419
4420                                         VM_STAT_INCR(decompressions);
4421
4422                                         if (cur_object != object) {
4423                                                 if (insert_cur_object) {
4424                                                         top_object = object;
4425                                                         /*
4426                                                          * switch to the object that has the new page
4427                                                          */
4428                                                         object = cur_object;
4429                                                         object_lock_type = cur_object_lock_type;
4430                                                 } else {
4431                                                         vm_object_unlock(cur_object);
4432                                                         cur_object = object;
4433                                                 }
4434                                         }
4435                                         goto FastPmapEnter;
4436                                 }
4437                                 /*
4438                                  * existence map present and indicates
4439                                  * that the pager doesn't have this page
4440                                  */
4441                         }
4442                         if (cur_object->shadow == VM_OBJECT_NULL) {
4443                                 /*
4444                                  * Zero fill fault.  Page gets
4445                                  * inserted into the original object.
4446                                  */
4447                                 if (cur_object->shadow_severed ||
4448                                     VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
4449                                     cur_object == compressor_object ||
4450                                     cur_object == kernel_object ||
4451                                     cur_object == vm_submap_object) {
4452                                         if (object != cur_object) {
4453                                                 vm_object_unlock(cur_object);
4454                                         }
4455                                         vm_object_unlock(object);
4456
4457                                         vm_map_unlock_read(map);
4458                                         if (real_map != map) {
4459                                                 vm_map_unlock(real_map);
4460                                         }
4461
4462                                         kr = KERN_MEMORY_ERROR;
4463                                         goto done;
4464                                 }
4465                                 if (cur_object != object) {
4466                                         vm_object_unlock(cur_object);
4467
4468                                         cur_object = object;
4469                                 }
4470                                 if (object_lock_type == OBJECT_LOCK_SHARED) {
4471                                         object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4472
4473                                         if (vm_object_lock_upgrade(object) == FALSE) {
4474                                                 /*
4475                                                  * couldn't upgrade so do a full retry on the fault
4476                                                  * since we dropped the object lock which
4477                                                  * could allow another thread to insert
4478                                                  * a page at this offset
4479                                                  */
4480                                                 vm_map_unlock_read(map);
4481                                                 if (real_map != map) {
4482                                                         vm_map_unlock(real_map);
4483                                                 }
4484
4485                                                 goto RetryFault;
4486                                         }
4487                                 }
4488                                 m = vm_page_alloc(object, offset);
4489                                 m_object = NULL;
4490
4491                                 if (m == VM_PAGE_NULL) {
4492                                         /*
4493                                          * no free page currently available...
4494                                          * must take the slow path
4495                                          */
4496                                         break;
4497                                 }
4498                                 m_object = object;
4499
4500                                 /*
4501                                  * Now zero fill page...
4502                                  * the page is probably going to
4503                                  * be written soon, so don't bother
4504                                  * to clear the modified bit
4505                                  *
4506                                  *   NOTE: This code holds the map
4507                                  *   lock across the zero fill.
4508                                  */
4509                                 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4510
4511                                 goto FastPmapEnter;
4512                         }
4513                         /*
4514                          * On to the next level in the shadow chain
4515                          */
4516                         cur_offset += cur_object->vo_shadow_offset;
4517                         new_object = cur_object->shadow;
4518
4519                         /*
4520                          * take the new_object's lock with the indicated state
4521                          */
4522                         if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4523                                 vm_object_lock_shared(new_object);
4524                         } else {
4525                                 vm_object_lock(new_object);
4526                         }
4527
4528                         if (cur_object != object) {
4529                                 vm_object_unlock(cur_object);
4530                         }
4531
4532                         cur_object = new_object;
4533
4534                         continue;
4535                 }
4536         }
4537         /*
4538          * Cleanup from fast fault failure.  Drop any object
4539          * lock other than original and drop map lock.
4540          */
4541         if (object != cur_object) {
4542                 vm_object_unlock(cur_object);
4543         }
4544
4545         /*
4546          * must own the object lock exclusively at this point
4547          */
4548         if (object_lock_type == OBJECT_LOCK_SHARED) {
4549                 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4550
4551                 if (vm_object_lock_upgrade(object) == FALSE) {
4552                         /*
4553                          * couldn't upgrade, so explicitly
4554                          * take the lock exclusively
4555                          * no need to retry the fault at this
4556                          * point since "vm_fault_page" will
4557                          * completely re-evaluate the state
4558                          */
4559                         vm_object_lock(object);
4560                 }
4561         }
4562
4563 handle_copy_delay:
4564         vm_map_unlock_read(map);
4565         if (real_map != map) {
4566                 vm_map_unlock(real_map);
4567         }
4568
4569         if (__improbable(object == compressor_object ||
4570             object == kernel_object ||
4571             object == vm_submap_object)) {
4572                 /*
4573                  * These objects are explicitly managed and populated by the
4574                  * kernel.  The virtual ranges backed by these objects should
4575                  * either have wired pages or "holes" that are not supposed to
4576                  * be accessed at all until they get explicitly populated.
4577                  * We should never have to resolve a fault on a mapping backed
4578                  * by one of these VM objects and providing a zero-filled page
4579                  * would be wrong here, so let's fail the fault and let the
4580                  * caller crash or recover.
4581                  */
4582                 vm_object_unlock(object);
4583                 kr = KERN_MEMORY_ERROR;
4584                 goto done;
4585         }
4586
4587         assert(object != compressor_object);
4588         assert(object != kernel_object);
4589         assert(object != vm_submap_object);
4590
4591         /*
4592          * Make a reference to this object to
4593          * prevent its disposal while we are messing with
4594          * it.  Once we have the reference, the map is free
4595          * to be diddled.  Since objects reference their
4596          * shadows (and copies), they will stay around as well.
4597          */
4598         vm_object_reference_locked(object);
4599         vm_object_paging_begin(object);
4600
4601         XPR(XPR_VM_FAULT, "vm_fault -> vm_fault_page\n", 0, 0, 0, 0, 0);
4602
4603         error_code = 0;
4604
4605         result_page = VM_PAGE_NULL;
4606         kr = vm_fault_page(object, offset, fault_type,
4607             (change_wiring && !wired),
4608             FALSE,                /* page not looked up */
4609             &prot, &result_page, &top_page,
4610             &type_of_fault,
4611             &error_code, map->no_zero_fill,
4612             FALSE, &fault_info);
4613
4614         /*
4615          * if kr != VM_FAULT_SUCCESS, then the paging reference
4616          * has been dropped and the object unlocked... the ref_count
4617          * is still held
4618          *
4619          * if kr == VM_FAULT_SUCCESS, then the paging reference
4620          * is still held along with the ref_count on the original object
4621          *
4622          *      the object is returned locked with a paging reference
4623          *
4624          *      if top_page != NULL, then it's BUSY and the
4625          *      object it belongs to has a paging reference
4626          *      but is returned unlocked
4627          */
4628         if (kr != VM_FAULT_SUCCESS &&
4629             kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4630                 /*
4631                  * we didn't succeed, lose the object reference immediately.
4632                  */
4633                 vm_object_deallocate(object);
4634
4635                 /*
4636                  * See why we failed, and take corrective action.
4637                  */
4638                 switch (kr) {
4639                 case VM_FAULT_MEMORY_SHORTAGE:
4640                         if (vm_page_wait((change_wiring) ?
4641                             THREAD_UNINT :
4642                             THREAD_ABORTSAFE)) {
4643                                 goto RetryFault;
4644                         }
4645                 /*
4646                  * fall thru
4647                  */
4648                 case VM_FAULT_INTERRUPTED:
4649                         kr = KERN_ABORTED;
4650                         goto done;
4651                 case VM_FAULT_RETRY:
4652                         goto RetryFault;
4653                 case VM_FAULT_MEMORY_ERROR:
4654                         if (error_code) {
4655                                 kr = error_code;
4656                         } else {
4657                                 kr = KERN_MEMORY_ERROR;
4658                         }
4659                         goto done;
4660                 default:
4661                         panic("vm_fault: unexpected error 0x%x from "
4662                             "vm_fault_page()\n", kr);
4663                 }
4664         }
4665         m = result_page;
4666         m_object = NULL;
4667
4668         if (m != VM_PAGE_NULL) {
4669                 m_object = VM_PAGE_OBJECT(m);
4670                 assert((change_wiring && !wired) ?
4671                     (top_page == VM_PAGE_NULL) :
4672                     ((top_page == VM_PAGE_NULL) == (m_object == object)));
4673         }
4674
4675         /*
4676          * What to do with the resulting page from vm_fault_page
4677          * if it doesn't get entered into the physical map:
4678          */
4679 #define RELEASE_PAGE(m)                                 \
4680         MACRO_BEGIN                                     \
4681         PAGE_WAKEUP_DONE(m);                            \
4682         if ( !VM_PAGE_PAGEABLE(m)) {                    \
4683                 vm_page_lockspin_queues();              \
4684                 if ( !VM_PAGE_PAGEABLE(m))              \
4685                         vm_page_activate(m);            \
4686                 vm_page_unlock_queues();                \
4687         }                                               \
4688         MACRO_END
4689
4690
4691         object_locks_dropped = FALSE;
4692         /*
4693          * We must verify that the maps have not changed
4694          * since our last lookup. vm_map_verify() needs the
4695          * map lock (shared) but we are holding object locks.
4696          * So we do a try_lock() first and, if that fails, we
4697          * drop the object locks and go in for the map lock again.
4698          */
4699         if (!vm_map_try_lock_read(original_map)) {
4700                 if (m != VM_PAGE_NULL) {
4701                         old_copy_object = m_object->copy;
4702                         vm_object_unlock(m_object);
4703                 } else {
4704                         old_copy_object = VM_OBJECT_NULL;
4705                         vm_object_unlock(object);
4706                 }
4707
4708                 object_locks_dropped = TRUE;
4709
4710                 vm_map_lock_read(original_map);
4711         }
4712
4713         if ((map != original_map) || !vm_map_verify(map, &version)) {
4714                 if (object_locks_dropped == FALSE) {
4715                         if (m != VM_PAGE_NULL) {
4716                                 old_copy_object = m_object->copy;
4717                                 vm_object_unlock(m_object);
4718                         } else {
4719                                 old_copy_object = VM_OBJECT_NULL;
4720                                 vm_object_unlock(object);
4721                         }
4722
4723                         object_locks_dropped = TRUE;
4724                 }
4725
4726                 /*
4727                  * no object locks are held at this point
4728                  */
4729                 vm_object_t             retry_object;
4730                 vm_object_offset_t      retry_offset;
4731                 vm_prot_t               retry_prot;
4732
4733                 /*
4734                  * To avoid trying to write_lock the map while another
4735                  * thread has it read_locked (in vm_map_pageable), we
4736                  * do not try for write permission.  If the page is
4737                  * still writable, we will get write permission.  If it
4738                  * is not, or has been marked needs_copy, we enter the
4739                  * mapping without write permission, and will merely
4740                  * take another fault.
4741                  */
4742                 map = original_map;
4743
4744                 kr = vm_map_lookup_locked(&map, vaddr,
4745                     fault_type & ~VM_PROT_WRITE,
4746                     OBJECT_LOCK_EXCLUSIVE, &version,
4747                     &retry_object, &retry_offset, &retry_prot,
4748                     &wired,
4749                     &fault_info,
4750                     &real_map);
4751                 pmap = real_map->pmap;
4752
4753                 if (kr != KERN_SUCCESS) {
4754                         vm_map_unlock_read(map);
4755
4756                         if (m != VM_PAGE_NULL) {
4757                                 assert(VM_PAGE_OBJECT(m) == m_object);
4758
4759                                 /*
4760                                  * retake the lock so that
4761                                  * we can drop the paging reference
4762                                  * in vm_fault_cleanup and do the
4763                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4764                                  */
4765                                 vm_object_lock(m_object);
4766
4767                                 RELEASE_PAGE(m);
4768
4769                                 vm_fault_cleanup(m_object, top_page);
4770                         } else {
4771                                 /*
4772                                  * retake the lock so that
4773                                  * we can drop the paging reference
4774                                  * in vm_fault_cleanup
4775                                  */
4776                                 vm_object_lock(object);
4777
4778                                 vm_fault_cleanup(object, top_page);
4779                         }
4780                         vm_object_deallocate(object);
4781
4782                         goto done;
4783                 }
4784                 vm_object_unlock(retry_object);
4785
4786                 if ((retry_object != object) || (retry_offset != offset)) {
4787                         vm_map_unlock_read(map);
4788                         if (real_map != map) {
4789                                 vm_map_unlock(real_map);
4790                         }
4791
4792                         if (m != VM_PAGE_NULL) {
4793                                 assert(VM_PAGE_OBJECT(m) == m_object);
4794
4795                                 /*
4796                                  * retake the lock so that
4797                                  * we can drop the paging reference
4798                                  * in vm_fault_cleanup and do the
4799                                  * PAGE_WAKEUP_DONE in RELEASE_PAGE
4800                                  */
4801                                 vm_object_lock(m_object);
4802
4803                                 RELEASE_PAGE(m);
4804
4805                                 vm_fault_cleanup(m_object, top_page);
4806                         } else {
4807                                 /*
4808                                  * retake the lock so that
4809                                  * we can drop the paging reference
4810                                  * in vm_fault_cleanup
4811                                  */
4812                                 vm_object_lock(object);
4813
4814                                 vm_fault_cleanup(object, top_page);
4815                         }
4816                         vm_object_deallocate(object);
4817
4818                         goto RetryFault;
4819                 }
4820                 /*
4821                  * Check whether the protection has changed or the object
4822                  * has been copied while we left the map unlocked.
4823                  */
4824                 if (pmap_has_prot_policy(retry_prot)) {
4825                         /* If the pmap layer cares, pass the full set. */
4826                         prot = retry_prot;
4827                 } else {
4828                         prot &= retry_prot;
4829                 }
4830         }
4831
4832         if (object_locks_dropped == TRUE) {
4833                 if (m != VM_PAGE_NULL) {
4834                         vm_object_lock(m_object);
4835
4836                         if (m_object->copy != old_copy_object) {
4837                                 /*
4838                                  * The copy object changed while the top-level object
4839                                  * was unlocked, so take away write permission.
4840                                  */
4841                                 assert(!pmap_has_prot_policy(prot));
4842                                 prot &= ~VM_PROT_WRITE;
4843                         }
4844                 } else {
4845                         vm_object_lock(object);
4846                 }
4847
4848                 object_locks_dropped = FALSE;
4849         }
4850
4851         /*
4852          * If we want to wire down this page, but no longer have
4853          * adequate permissions, we must start all over.
4854          */
4855         if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4856                 vm_map_unlock_read(map);
4857                 if (real_map != map) {
4858                         vm_map_unlock(real_map);
4859                 }
4860
4861                 if (m != VM_PAGE_NULL) {
4862                         assert(VM_PAGE_OBJECT(m) == m_object);
4863
4864                         RELEASE_PAGE(m);
4865
4866                         vm_fault_cleanup(m_object, top_page);
4867                 } else {
4868                         vm_fault_cleanup(object, top_page);
4869                 }
4870
4871                 vm_object_deallocate(object);
4872
4873                 goto RetryFault;
4874         }
4875         if (m != VM_PAGE_NULL) {
4876                 /*
4877                  * Put this page into the physical map.
4878                  * We had to do the unlock above because pmap_enter
4879                  * may cause other faults.  The page may be on
4880                  * the pageout queues.  If the pageout daemon comes
4881                  * across the page, it will remove it from the queues.
4882                  */
4883                 if (caller_pmap) {
4884                         kr = vm_fault_enter(m,
4885                             caller_pmap,
4886                             caller_pmap_addr,
4887                             prot,
4888                             caller_prot,
4889                             wired,
4890                             change_wiring,
4891                             wire_tag,
4892                             &fault_info,
4893                             NULL,
4894                             &type_of_fault);
4895                 } else {
4896                         kr = vm_fault_enter(m,
4897                             pmap,
4898                             vaddr,
4899                             prot,
4900                             caller_prot,
4901                             wired,
4902                             change_wiring,
4903                             wire_tag,
4904                             &fault_info,
4905                             NULL,
4906                             &type_of_fault);
4907                 }
4908                 assert(VM_PAGE_OBJECT(m) == m_object);
4909
4910 #if DEVELOPMENT || DEBUG
4911                 {
4912                         int     event_code = 0;
4913
4914                         if (m_object->internal) {
4915                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4916                         } else if (m_object->object_is_shared_cache) {
4917                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4918                         } else {
4919                                 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4920                         }
4921
4922                         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
4923
4924                         DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4925                 }
4926 #endif
4927                 if (kr != KERN_SUCCESS) {
4928                         /* abort this page fault */
4929                         vm_map_unlock_read(map);
4930                         if (real_map != map) {
4931                                 vm_map_unlock(real_map);
4932                         }
4933                         PAGE_WAKEUP_DONE(m);
4934                         vm_fault_cleanup(m_object, top_page);
4935                         vm_object_deallocate(object);
4936                         goto done;
4937                 }
4938                 if (physpage_p != NULL) {
4939                         /* for vm_map_wire_and_extract() */
4940                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4941                         if (prot & VM_PROT_WRITE) {
4942                                 vm_object_lock_assert_exclusive(m_object);
4943                                 m->vmp_dirty = TRUE;
4944                         }
4945                 }
4946         } else {
4947                 vm_map_entry_t          entry;
4948                 vm_map_offset_t         laddr;
4949                 vm_map_offset_t         ldelta, hdelta;
4950
4951                 /*
4952                  * do a pmap block mapping from the physical address
4953                  * in the object
4954                  */
4955
4956                 if (real_map != map) {
4957                         vm_map_unlock(real_map);
4958                 }
4959
4960                 if (original_map != map) {
4961                         vm_map_unlock_read(map);
4962                         vm_map_lock_read(original_map);
4963                         map = original_map;
4964                 }
4965                 real_map = map;
4966
4967                 laddr = vaddr;
4968                 hdelta = 0xFFFFF000;
4969                 ldelta = 0xFFFFF000;
4970
4971                 while (vm_map_lookup_entry(map, laddr, &entry)) {
4972                         if (ldelta > (laddr - entry->vme_start)) {
4973                                 ldelta = laddr - entry->vme_start;
4974                         }
4975                         if (hdelta > (entry->vme_end - laddr)) {
4976                                 hdelta = entry->vme_end - laddr;
4977                         }
4978                         if (entry->is_sub_map) {
4979                                 laddr = ((laddr - entry->vme_start)
4980                                     + VME_OFFSET(entry));
4981                                 vm_map_lock_read(VME_SUBMAP(entry));
4982
4983                                 if (map != real_map) {
4984                                         vm_map_unlock_read(map);
4985                                 }
4986                                 if (entry->use_pmap) {
4987                                         vm_map_unlock_read(real_map);
4988                                         real_map = VME_SUBMAP(entry);
4989                                 }
4990                                 map = VME_SUBMAP(entry);
4991                         } else {
4992                                 break;
4993                         }
4994                 }
4995
4996                 if (vm_map_lookup_entry(map, laddr, &entry) &&
4997                     (VME_OBJECT(entry) != NULL) &&
4998                     (VME_OBJECT(entry) == object)) {
4999                         int superpage;
5000
5001                         if (!object->pager_created &&
5002                             object->phys_contiguous &&
5003                             VME_OFFSET(entry) == 0 &&
5004                             (entry->vme_end - entry->vme_start == object->vo_size) &&
5005                             VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - 1))) {
5006                                 superpage = VM_MEM_SUPERPAGE;
5007                         } else {
5008                                 superpage = 0;
5009                         }
5010
5011                         if (superpage && physpage_p) {
5012                                 /* for vm_map_wire_and_extract() */
5013                                 *physpage_p = (ppnum_t)
5014                                     ((((vm_map_offset_t)
5015                                     object->vo_shadow_offset)
5016                                     + VME_OFFSET(entry)
5017                                     + (laddr - entry->vme_start))
5018                                     >> PAGE_SHIFT);
5019                         }
5020
5021                         if (caller_pmap) {
5022                                 /*
5023                                  * Set up a block mapped area
5024                                  */
5025                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5026                                 kr = pmap_map_block(caller_pmap,
5027                                     (addr64_t)(caller_pmap_addr - ldelta),
5028                                     (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5029                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5030                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5031                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5032
5033                                 if (kr != KERN_SUCCESS) {
5034                                         goto cleanup;
5035                                 }
5036                         } else {
5037                                 /*
5038                                  * Set up a block mapped area
5039                                  */
5040                                 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5041                                 kr = pmap_map_block(real_map->pmap,
5042                                     (addr64_t)(vaddr - ldelta),
5043                                     (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5044                                     VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5045                                     (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5046                                     (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5047
5048                                 if (kr != KERN_SUCCESS) {
5049                                         goto cleanup;
5050                                 }
5051                         }
5052                 }
5053         }
5054
5055         /*
5056          * Success
5057          */
5058         kr = KERN_SUCCESS;
5059
5060         /*
5061          * TODO: could most of the done cases just use cleanup?
5062          */
5063 cleanup:
5064         /*
5065          * Unlock everything, and return
5066          */
5067         vm_map_unlock_read(map);
5068         if (real_map != map) {
5069                 vm_map_unlock(real_map);
5070         }
5071
5072         if (m != VM_PAGE_NULL) {
5073                 assert(VM_PAGE_OBJECT(m) == m_object);
5074
5075                 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5076                         vm_object_paging_begin(m_object);
5077
5078                         assert(written_on_object == VM_OBJECT_NULL);
5079                         written_on_object = m_object;
5080                         written_on_pager = m_object->pager;
5081                         written_on_offset = m_object->paging_offset + m->vmp_offset;
5082                 }
5083                 PAGE_WAKEUP_DONE(m);
5084
5085                 vm_fault_cleanup(m_object, top_page);
5086         } else {
5087                 vm_fault_cleanup(object, top_page);
5088         }
5089
5090         vm_object_deallocate(object);
5091
5092 #undef  RELEASE_PAGE
5093
5094 done:
5095         thread_interrupt_level(interruptible_state);
5096
5097         /*
5098          * Only I/O throttle on faults which cause a pagein/swapin.
5099          */
5100         if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5101                 throttle_lowpri_io(1);
5102         } else {
5103                 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5104                         if ((throttle_delay = vm_page_throttled(TRUE))) {
5105                                 if (vm_debug_events) {
5106                                         if (type_of_fault == DBG_COMPRESSOR_FAULT) {
5107                                                 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5108                                         } else if (type_of_fault == DBG_COW_FAULT) {
5109                                                 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5110                                         } else {
5111                                                 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5112                                         }
5113                                 }
5114                                 delay(throttle_delay);
5115                         }
5116                 }
5117         }
5118
5119         if (written_on_object) {
5120                 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
5121
5122                 vm_object_lock(written_on_object);
5123                 vm_object_paging_end(written_on_object);
5124                 vm_object_unlock(written_on_object);
5125
5126                 written_on_object = VM_OBJECT_NULL;
5127         }
5128
5129         if (rtfault) {
5130                 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
5131         }
5132
5133         KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5134             (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5135             ((uint64_t)trace_vaddr >> 32),
5136             trace_vaddr,
5137             kr,
5138             type_of_fault,
5139             0);
5140
5141         return kr;
5142 }
5143
5144 /*
5145  *      vm_fault_wire:
5146  *
5147  *      Wire down a range of virtual addresses in a map.
      *
      *      The range is that of "entry" (vme_start up to, but not including,
      *      vme_end), visited one PAGE_SIZE step at a time.  Each page is first
      *      tried with vm_fault_wire_fast(); if that does not return
      *      KERN_SUCCESS, the page is retried through vm_fault_internal().
      *
      *      Parameters:
      *              map             map containing "entry"
      *              entry           map entry to wire; asserted to be in_transition
      *              prot            protection passed on to the per-page fault calls
      *              wire_tag        VM tag passed on to the per-page fault calls
      *              pmap            physical map in which the pages are entered
      *              pmap_addr       address in "pmap" corresponding to
      *                              entry->vme_start
      *              physpage_p      passed through unchanged to both fault
      *                              routines (they label it as being for
      *                              vm_map_wire_and_extract())
      *
      *      Returns:
      *              KERN_SUCCESS if every page was wired, or if the entry maps a
      *              phys_contiguous object (in which case nothing is done).
      *              Otherwise the error from vm_fault_internal(); the pages
      *              already wired by this call are unwired with
      *              vm_fault_unwire() before returning.
5148  */
5149 kern_return_t
5150 vm_fault_wire(
5151         vm_map_t        map,
5152         vm_map_entry_t  entry,
5153         vm_prot_t       prot,
5154         vm_tag_t        wire_tag,
5155         pmap_t          pmap,
5156         vm_map_offset_t pmap_addr,
5157         ppnum_t         *physpage_p)
5158 {
5159         vm_map_offset_t va;
5160         vm_map_offset_t end_addr = entry->vme_end;
5161         kern_return_t   rc;
5162
5163         assert(entry->in_transition);
5164
             /*
              *      Nothing to do for a phys_contiguous object;
              *      vm_fault_unwire() makes the matching early return and
              *      describes such memory as wired by default.
              */
5165         if ((VME_OBJECT(entry) != NULL) &&
5166             !entry->is_sub_map &&
5167             VME_OBJECT(entry)->phys_contiguous) {
5168                 return KERN_SUCCESS;
5169         }
5170
5171         /*
5172          *      Inform the physical mapping system that the
5173          *      range of addresses may not fault, so that
5174          *      page tables and such can be locked down as well.
5175          */
5176
5177         pmap_pageable(pmap, pmap_addr,
5178             pmap_addr + (end_addr - entry->vme_start), FALSE);
5179
5180         /*
5181          *      We simulate a fault to get the page and enter it
5182          *      in the physical map.
5183          */
5184
5185         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5186                 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
5187                     pmap_addr + (va - entry->vme_start),
5188                     physpage_p);
5189                 if (rc != KERN_SUCCESS) {
                             /*
                              * The fast path gave up: take the full fault path.
                              * Faults against the kernel pmap are made
                              * uninterruptible (THREAD_UNINT); all others are
                              * THREAD_ABORTSAFE.
                              */
5190                         rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
5191                             ((pmap == kernel_pmap)
5192                             ? THREAD_UNINT
5193                             : THREAD_ABORTSAFE),
5194                             pmap,
5195                             (pmap_addr +
5196                             (va - entry->vme_start)),
5197                             physpage_p);
5198                         DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
5199                 }
5200
5201                 if (rc != KERN_SUCCESS) {
                             /*
                              * Work on a stack copy of the entry truncated at
                              * the failing address, so that only the pages in
                              * [vme_start, va) are unwired and "entry" itself
                              * is left unmodified.
                              */
5202                         struct vm_map_entry     tmp_entry = *entry;
5203
5204                         /* unwire wired pages */
5205                         tmp_entry.vme_end = va;
5206                         vm_fault_unwire(map,
5207                             &tmp_entry, FALSE, pmap, pmap_addr);
5208
5209                         return rc;
5210                 }
5211         }
5212         return KERN_SUCCESS;
5213 }
5214
5215 /*
5216  *      vm_fault_unwire:
5217  *
5218  *      Unwire a range of virtual addresses in a map.
      *
      *      The range is that of "entry" (vme_start up to, but not including,
      *      vme_end), visited one PAGE_SIZE step at a time.
      *
      *      Parameters:
      *              map             map containing "entry"
      *              entry           map entry whose range is unwired; its
      *                              zero_wired_pages flag may be cleared here
      *              deallocate      if TRUE, each page found is disconnected
      *                              from all pmaps and freed instead of being
      *                              unwired
      *              pmap            physical map whose wiring is changed; may
      *                              be NULL for the pmap_change_wiring() calls,
      *                              but is passed unconditionally to
      *                              pmap_pageable() at the end
      *              pmap_addr       address in "pmap" corresponding to
      *                              entry->vme_start
      *
      *      Returns nothing.  Panics if vm_fault_page() fails for a reason
      *      other than the two tolerated VM_FAULT_MEMORY_ERROR cases handled
      *      in the loop.
5219  */
5220 void
5221 vm_fault_unwire(
5222         vm_map_t        map,
5223         vm_map_entry_t  entry,
5224         boolean_t       deallocate,
5225         pmap_t          pmap,
5226         vm_map_offset_t pmap_addr)
5227 {
5228         vm_map_offset_t va;
5229         vm_map_offset_t end_addr = entry->vme_end;
5230         vm_object_t             object;
5231         struct vm_object_fault_info fault_info = {};
5232         unsigned int    unwired_pages;
5233
             /* A submap entry is handled like an entry with no object. */
5234         object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
5235
5236         /*
5237          * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
5238          * do anything since such memory is wired by default.  So we don't have
5239          * anything to undo here.
5240          */
5241
5242         if (object != VM_OBJECT_NULL && object->phys_contiguous) {
5243                 return;
5244         }
5245
             /*
              * Fault parameters shared by every vm_fault_page() call below;
              * only cluster_size is recomputed per page.
              */
5246         fault_info.interruptible = THREAD_UNINT;
5247         fault_info.behavior = entry->behavior;
5248         fault_info.user_tag = VME_ALIAS(entry);
5249         if (entry->iokit_acct ||
5250             (!entry->is_sub_map && !entry->use_pmap)) {
5251                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5252         }
5253         fault_info.lo_offset = VME_OFFSET(entry);
5254         fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
5255         fault_info.no_cache = entry->no_cache;
5256         fault_info.stealth = TRUE;
5257
             /* Number of pages found wired; used for tag accounting at the end. */
5258         unwired_pages = 0;
5259
5260         /*
5261          *      Since the pages are wired down, we must be able to
5262          *      get their mappings from the physical map system.
5263          */
5264
5265         for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5266                 if (object == VM_OBJECT_NULL) {
                             /*
                              * No object to look the page up in: drop the pmap
                              * wiring (if a pmap was given) and go through
                              * vm_fault(), whose result is deliberately ignored.
                              */
5267                         if (pmap) {
5268                                 pmap_change_wiring(pmap,
5269                                     pmap_addr + (va - entry->vme_start), FALSE);
5270                         }
                             /*
                              * NOTE(review): pmap_addr is passed here without the
                              * (va - entry->vme_start) offset that the
                              * pmap_change_wiring() call just above applies --
                              * confirm this is intended.
                              */
5271                         (void) vm_fault(map, va, VM_PROT_NONE,
5272                             TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
5273                 } else {
5274                         vm_prot_t       prot;
5275                         vm_page_t       result_page;
5276                         vm_page_t       top_page;
5277                         vm_object_t     result_object;
5278                         vm_fault_return_t result;
5279
5280                         /* cap cluster size at maximum UPL size */
                             /*
                              * The remaining length (end_addr - va) is stored into
                              * a upl_size_t; if it does not fit, 0 - PAGE_SIZE in
                              * that type is used instead (presumably the largest
                              * page-multiple the type can hold -- confirm).
                              */
5281                         upl_size_t cluster_size;
5282                         if (os_sub_overflow(end_addr, va, &cluster_size)) {
5283                                 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5284                         }
5285                         fault_info.cluster_size = cluster_size;
5286
                             /*
                              * Look the page up, repeating for as long as
                              * vm_fault_page() asks for a retry.  The object lock
                              * and paging reference are re-taken on every
                              * iteration.
                              */
5287                         do {
5288                                 prot = VM_PROT_NONE;
5289
5290                                 vm_object_lock(object);
5291                                 vm_object_paging_begin(object);
5292                                 XPR(XPR_VM_FAULT,
5293                                     "vm_fault_unwire -> vm_fault_page\n",
5294                                     0, 0, 0, 0, 0);
5295                                 result_page = VM_PAGE_NULL;
5296                                 result = vm_fault_page(
5297                                         object,
5298                                         (VME_OFFSET(entry) +
5299                                         (va - entry->vme_start)),
5300                                         VM_PROT_NONE, TRUE,
5301                                         FALSE, /* page not looked up */
5302                                         &prot, &result_page, &top_page,
5303                                         (int *)0,
5304                                         NULL, map->no_zero_fill,
5305                                         FALSE, &fault_info);
5306                         } while (result == VM_FAULT_RETRY);
5307
5308                         /*
5309                          * If this was a mapping to a file on a device that has been forcibly
5310                          * unmounted, then we won't get a page back from vm_fault_page().  Just
5311                          * move on to the next one in case the remaining pages are mapped from
5312                          * different objects.  During a forced unmount, the object is terminated
5313                          * so the alive flag will be false if this happens.  A forced unmount will
5314                          * occur when an external disk is unplugged before the user does an
5315                          * eject, so we don't want to panic in that situation.
5316                          */
5317
5318                         if (result == VM_FAULT_MEMORY_ERROR && !object->alive) {
5319                                 continue;
5320                         }
5321
5322                         if (result == VM_FAULT_MEMORY_ERROR &&
5323                             object == kernel_object) {
5324                                 /*
5325                                  * This must have been allocated with
5326                                  * KMA_KOBJECT and KMA_VAONLY and there's
5327                                  * no physical page at this offset.
5328                                  * We're done (no page to free).
5329                                  */
5330                                 assert(deallocate);
5331                                 continue;
5332                         }
5333
                             /* Any other failure on a supposedly wired page is fatal. */
5334                         if (result != VM_FAULT_SUCCESS) {
5335                                 panic("vm_fault_unwire: failure");
5336                         }
5337
                             /*
                              * The page's own object is used for the cleanup
                              * below; it is read from the page rather than
                              * assumed to be "object".
                              */
5338                         result_object = VM_PAGE_OBJECT(result_page);
5339
5340                         if (deallocate) {
                                     /*
                                      * Remove the page from every pmap and free
                                      * it, counting it if it was still wired.
                                      */
5341                                 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
5342                                     vm_page_fictitious_addr);
5343                                 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5344                                 if (VM_PAGE_WIRED(result_page)) {
5345                                         unwired_pages++;
5346                                 }
5347                                 VM_PAGE_FREE(result_page);
5348                         } else {
                                     /*
                                      * Guard pages are skipped here;
                                      * vm_fault_wire_fast() notes that they are
                                      * never entered into a pmap.
                                      */
5349                                 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) {
5350                                         pmap_change_wiring(pmap,
5351                                             pmap_addr + (va - entry->vme_start), FALSE);
5352                                 }
5353
5354
5355                                 if (VM_PAGE_WIRED(result_page)) {
5356                                         vm_page_lockspin_queues();
5357                                         vm_page_unwire(result_page, TRUE);
5358                                         vm_page_unlock_queues();
5359                                         unwired_pages++;
5360                                 }
                                     /*
                                      * NOTE(review): the flag is cleared as soon
                                      * as one page has been zeroed, so later
                                      * iterations of this loop skip the zeroing --
                                      * confirm this is intended for multi-page
                                      * entries.
                                      */
5361                                 if (entry->zero_wired_pages) {
5362                                         pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
5363                                         entry->zero_wired_pages = FALSE;
5364                                 }
5365
5366                                 PAGE_WAKEUP_DONE(result_page);
5367                         }
5368                         vm_fault_cleanup(result_object, top_page);
5369                 }
5370         }
5371
5372         /*
5373          *      Inform the physical mapping system that the range
5374          *      of addresses may fault, so that page tables and
5375          *      such may be unwired themselves.
5376          */
5377
5378         pmap_pageable(pmap, pmap_addr,
5379             pmap_addr + (end_addr - entry->vme_start), TRUE);
5380
             /*
              * Only kernel_object pages are charged back to the entry's tag:
              * its size is reduced by the byte size of the pages counted above.
              */
5381         if (kernel_object == object) {
5382                 vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages));
5383         }
5384 }
5385
5386 /*
5387  *      vm_fault_wire_fast:
5388  *
5389  *      Handle common case of a wire down page fault at the given address.
5390  *      If successful, the page is inserted into the associated physical map.
5391  *      The map entry is passed in to avoid the overhead of a map lookup.
5392  *
5393  *      NOTE: the given address should be truncated to the
5394  *      proper page address.
5395  *
5396  *      KERN_SUCCESS is returned if the page fault is handled; otherwise,
5397  *      a standard error specifying why the fault is fatal is returned.
5398  *
5399  *      The map in question must be referenced, and remains so.
5400  *      Caller has a read lock on the map.
5401  *
5402  *      This is a stripped version of vm_fault() for wiring pages.  Anything
5403  *      other than the common case will return KERN_FAILURE, and the caller
5404  *      is expected to call vm_fault().
      *
      *      Parameters:
      *              map             unused
      *              va              faulting address; must lie within "entry"
      *              caller_prot     unused -- the protection actually entered
      *                              is entry->protection
      *              wire_tag        VM tag handed to vm_page_wire() and
      *                              vm_fault_enter()
      *              entry           map entry covering "va"; a submap entry is
      *                              rejected with KERN_FAILURE
      *              pmap            physical map to enter the page into
      *              pmap_addr       address in "pmap" at which to enter the page
      *              physpage_p      if non-NULL, receives the physical page
      *                              number on success; not written on the
      *                              KERN_FAILURE paths
      *
      *      As written, the only values this routine returns are KERN_SUCCESS
      *      and KERN_FAILURE: a vm_fault_enter() error is also reported as
      *      KERN_FAILURE through GIVE_UP.  The caller visible in this file,
      *      vm_fault_wire(), falls back to vm_fault_internal().
5405  */
5406 static kern_return_t
5407 vm_fault_wire_fast(
5408         __unused vm_map_t       map,
5409         vm_map_offset_t va,
5410         __unused vm_prot_t       caller_prot,
5411         vm_tag_t        wire_tag,
5412         vm_map_entry_t  entry,
5413         pmap_t          pmap,
5414         vm_map_offset_t pmap_addr,
5415         ppnum_t         *physpage_p)
5416 {
5417         vm_object_t             object;
5418         vm_object_offset_t      offset;
5419         vm_page_t               m;
5420         vm_prot_t               prot;
5421         thread_t                thread = current_thread();
5422         int                     type_of_fault;
5423         kern_return_t           kr;
5424         struct vm_object_fault_info fault_info = {};
5425
             /* Count the fault globally and, when there is one, against the task. */
5426         VM_STAT_INCR(faults);
5427
5428         if (thread != THREAD_NULL && thread->task != TASK_NULL) {
5429                 thread->task->faults++;
5430         }
5431
5432 /*
5433  *      Recovery actions
5434  */
5435
     /*
      *      RELEASE_PAGE: PAGE_WAKEUP_DONE() on the page, then drop the wiring
      *      taken by this routine (under the page queues lock).
      */
5436 #undef  RELEASE_PAGE
5437 #define RELEASE_PAGE(m) {                               \
5438         PAGE_WAKEUP_DONE(m);                            \
5439         vm_page_lockspin_queues();                      \
5440         vm_page_unwire(m, TRUE);                        \
5441         vm_page_unlock_queues();                        \
5442 }
5443
5444
     /*
      *      UNLOCK_THINGS: end the paging reference taken on "object" below and
      *      unlock it.
      */
5445 #undef  UNLOCK_THINGS
5446 #define UNLOCK_THINGS   {                               \
5447         vm_object_paging_end(object);                      \
5448         vm_object_unlock(object);                          \
5449 }
5450
     /*
      *      UNLOCK_AND_DEALLOCATE: UNLOCK_THINGS, then drop the object reference
      *      taken with vm_object_reference_locked() below.
      */
5451 #undef  UNLOCK_AND_DEALLOCATE
5452 #define UNLOCK_AND_DEALLOCATE   {                       \
5453         UNLOCK_THINGS;                                  \
5454         vm_object_deallocate(object);                   \
5455 }
5456 /*
5457  *      Give up and have caller do things the hard way.
      *      Note that GIVE_UP returns from the enclosing function.
5458  */
5459
5460 #define GIVE_UP {                                       \
5461         UNLOCK_AND_DEALLOCATE;                          \
5462         return(KERN_FAILURE);                           \
5463 }
5464
5465
5466         /*
5467          *      If this entry is not directly to a vm_object, bail out.
              *      Nothing has been locked or referenced yet, so this is a
              *      plain return rather than GIVE_UP.
5468          */
5469         if (entry->is_sub_map) {
5470                 assert(physpage_p == NULL);
5471                 return KERN_FAILURE;
5472         }
5473
5474         /*
5475          *      Find the backing store object and offset into it.
5476          */
5477
5478         object = VME_OBJECT(entry);
5479         offset = (va - entry->vme_start) + VME_OFFSET(entry);
5480         prot = entry->protection;
5481
5482         /*
5483          *      Make a reference to this object to prevent its
5484          *      disposal while we are messing with it.
5485          */
5486
5487         vm_object_lock(object);
5488         vm_object_reference_locked(object);
5489         vm_object_paging_begin(object);
5490
5491         /*
5492          *      INVARIANTS (through entire routine):
5493          *
5494          *      1)      At all times, we must either have the object
5495          *              lock or a busy page in some object to prevent
5496          *              some other thread from trying to bring in
5497          *              the same page.
5498          *
5499          *      2)      Once we have a busy page, we must remove it from
5500          *              the pageout queues, so that the pageout daemon
5501          *              will not grab it away.
5502          *
5503          */
5504
5505         /*
5506          *      Look for page in top-level object.  If it's not there or
5507          *      there's something going on, give up.
5508          */
5509         m = vm_page_lookup(object, offset);
5510         if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
5511             (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
5512                 GIVE_UP;
5513         }
5514         if (m->vmp_fictitious &&
5515             VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
5516                 /*
5517                  * Guard pages are fictitious pages and are never
5518                  * entered into a pmap, so let's say it's been wired...
                      * The page is neither wired nor marked busy on this path.
5519                  */
5520                 kr = KERN_SUCCESS;
5521                 goto done;
5522         }
5523
5524         /*
5525          *      Wire the page down now.  All bail outs beyond this
5526          *      point must unwire the page.
5527          */
5528
5529         vm_page_lockspin_queues();
5530         vm_page_wire(m, wire_tag, TRUE);
5531         vm_page_unlock_queues();
5532
5533         /*
5534          *      Mark page busy for other threads.
5535          */
5536         assert(!m->vmp_busy);
5537         m->vmp_busy = TRUE;
5538         assert(!m->vmp_absent);
5539
5540         /*
5541          *      Give up if the page is being written and there's a copy object
5542          */
5543         if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5544                 RELEASE_PAGE(m);
5545                 GIVE_UP;
5546         }
5547
             /*
              *      Only the tag and the pmap accounting option are filled in;
              *      the rest of fault_info keeps its zero initialization.
              */
5548         fault_info.user_tag = VME_ALIAS(entry);
5549         fault_info.pmap_options = 0;
5550         if (entry->iokit_acct ||
5551             (!entry->is_sub_map && !entry->use_pmap)) {
5552                 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5553         }
5554
5555         /*
5556          *      Put this page into the physical map.
              *      The entry's protection is passed for both protection
              *      arguments.
5557          */
5558         type_of_fault = DBG_CACHE_HIT_FAULT;
5559         kr = vm_fault_enter(m,
5560             pmap,
5561             pmap_addr,
5562             prot,
5563             prot,
5564             TRUE,                  /* wired */
5565             FALSE,                 /* change_wiring */
5566             wire_tag,
5567             &fault_info,
5568             NULL,
5569             &type_of_fault);
5570         if (kr != KERN_SUCCESS) {
5571                 RELEASE_PAGE(m);
5572                 GIVE_UP;
5573         }
5574
5575 done:
5576         /*
5577          *      Unlock everything, and return
              *
              *      NOTE(review): both ways of reaching this label have
              *      kr == KERN_SUCCESS (every failure path returns through
              *      GIVE_UP), so the "*physpage_p = 0" branch below is not
              *      reachable as the routine is written.
5578          */
5579
5580         if (physpage_p) {
5581                 /* for vm_map_wire_and_extract() */
5582                 if (kr == KERN_SUCCESS) {
5583                         assert(object == VM_PAGE_OBJECT(m));
5584                         *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5585                         if (prot & VM_PROT_WRITE) {
5586                                 vm_object_lock_assert_exclusive(object);
5587                                 m->vmp_dirty = TRUE;
5588                         }
5589                 } else {
5590                         *physpage_p = 0;
5591                 }
5592         }
5593
5594         PAGE_WAKEUP_DONE(m);
5595         UNLOCK_AND_DEALLOCATE;
5596
5597         return kr;
5598 }
5599
5600 /*
5601  *      Routine:        vm_fault_copy_cleanup
5602  *      Purpose:
5603  *              Release a page used by vm_fault_copy.
      *
      *              Locks the page's object, does PAGE_WAKEUP_DONE() on the
      *              page, activates the page if it is not on a pageable queue,
      *              and then hands the object and "top_page" to
      *              vm_fault_cleanup().
      *
      *      Parameters:
      *              page            page to release; its object is read
      *                              unconditionally, with no VM_PAGE_NULL check
      *                              (unlike vm_fault_copy_dst_cleanup())
      *              top_page        passed through to vm_fault_cleanup()
      *
      *      The object lock taken here is not released in this routine;
      *      presumably vm_fault_cleanup() does that -- confirm against its
      *      definition.
5604  */
5605
5606 static void
5607 vm_fault_copy_cleanup(
5608         vm_page_t       page,
5609         vm_page_t       top_page)
5610 {
5611         vm_object_t     object = VM_PAGE_OBJECT(page);
5612
5613         vm_object_lock(object);
5614         PAGE_WAKEUP_DONE(page);
             /*
              * The pageable state is tested once without the page queues lock
              * and again once that lock is held, so the page is only activated
              * if it is still not pageable at that point.
              */
5615         if (!VM_PAGE_PAGEABLE(page)) {
5616                 vm_page_lockspin_queues();
5617                 if (!VM_PAGE_PAGEABLE(page)) {
5618                         vm_page_activate(page);
5619                 }
5620                 vm_page_unlock_queues();
5621         }
5622         vm_fault_cleanup(object, top_page);
5623 }
5624
     /*
      *      Routine:        vm_fault_copy_dst_cleanup
      *      Purpose:
      *              Release a page that was held wired: under the object lock,
      *              unwire the page (with the page queues lock held), end one
      *              paging reference on the page's object, and unlock the
      *              object.
      *
      *      Parameters:
      *              page            page to release; VM_PAGE_NULL is accepted
      *                              and makes this a no-op
      */
5625 static void
5626 vm_fault_copy_dst_cleanup(
5627         vm_page_t       page)
5628 {
5629         vm_object_t     object;
5630
5631         if (page != VM_PAGE_NULL) {
5632                 object = VM_PAGE_OBJECT(page);
5633                 vm_object_lock(object);
5634                 vm_page_lockspin_queues();
5635                 vm_page_unwire(page, TRUE);
5636                 vm_page_unlock_queues();
5637                 vm_object_paging_end(object);
5638                 vm_object_unlock(object);
5639         }
5640 }
5641
5642 /*
5643  *      Routine:        vm_fault_copy
5644  *
5645  *      Purpose:
5646  *              Copy pages from one virtual memory object to another --
5647  *              neither the source nor destination pages need be resident.
5648  *
5649  *              Before actually copying a page, the version associated with
5650  *              the destination address map will be verified.
5651  *
5652  *      In/out conditions:
5653  *              The caller must hold a reference, but not a lock, to
5654  *              each of the source and destination objects and to the
5655  *              destination map.
5656  *
5657  *      Results:
5658  *              Returns KERN_SUCCESS if no errors were encountered in
5659  *              reading or writing the data.  Returns KERN_INTERRUPTED if
5660  *              the operation was interrupted (only possible if the
5661  *              "interruptible" argument is asserted).  Other return values
5662  *              indicate a permanent error in copying the data.
5663  *
5664  *              The actual amount of data copied will be returned in the
5665  *              "copy_size" argument.  In the event that the destination map
5666  *              verification failed, this amount may be less than the amount
5667  *              requested.
5668  */
5669 kern_return_t
5670 vm_fault_copy(
5671         vm_object_t             src_object,
5672         vm_object_offset_t      src_offset,
5673         vm_map_size_t           *copy_size,             /* INOUT */
5674         vm_object_t             dst_object,
5675         vm_object_offset_t      dst_offset,
5676         vm_map_t                dst_map,
5677         vm_map_version_t         *dst_version,
5678         int                     interruptible)
5679 {
5680         vm_page_t               result_page;
5681
5682         vm_page_t               src_page;
5683         vm_page_t               src_top_page;
5684         vm_prot_t               src_prot;
5685
5686         vm_page_t               dst_page;
5687         vm_page_t               dst_top_page;
5688         vm_prot_t               dst_prot;
5689
5690         vm_map_size_t           amount_left;
5691         vm_object_t             old_copy_object;
5692         vm_object_t             result_page_object = NULL;
5693         kern_return_t           error = 0;
5694         vm_fault_return_t       result;
5695
5696         vm_map_size_t           part_size;
5697         struct vm_object_fault_info fault_info_src = {};
5698         struct vm_object_fault_info fault_info_dst = {};
5699
5700         /*
5701          * In order not to confuse the clustered pageins, align
5702          * the different offsets on a page boundary.
5703          */
5704
5705 #define RETURN(x)                                       \
5706         MACRO_BEGIN                                     \
5707         *copy_size -= amount_left;                      \
5708         MACRO_RETURN(x);                                \
5709         MACRO_END
5710
5711         amount_left = *copy_size;
5712
5713         fault_info_src.interruptible = interruptible;
5714         fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5715         fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5716         fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5717         fault_info_src.stealth = TRUE;
5718
5719         fault_info_dst.interruptible = interruptible;
5720         fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5721         fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5722         fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5723         fault_info_dst.stealth = TRUE;
5724
5725         do { /* while (amount_left > 0) */
5726                 /*
5727                  * There may be a deadlock if both source and destination
5728                  * pages are the same. To avoid this deadlock, the copy must
5729                  * start by getting the destination page in order to apply
5730                  * COW semantics if any.
5731                  */
5732
5733 RetryDestinationFault:;
5734
5735                 dst_prot = VM_PROT_WRITE | VM_PROT_READ;
5736
5737                 vm_object_lock(dst_object);
5738                 vm_object_paging_begin(dst_object);
5739
5740                 /* cap cluster size at maximum UPL size */
5741                 upl_size_t cluster_size;
5742                 if (os_convert_overflow(amount_left, &cluster_size)) {
5743                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5744                 }
5745                 fault_info_dst.cluster_size = cluster_size;
5746
5747                 XPR(XPR_VM_FAULT, "vm_fault_copy -> vm_fault_page\n", 0, 0, 0, 0, 0);
5748                 dst_page = VM_PAGE_NULL;
5749                 result = vm_fault_page(dst_object,
5750                     vm_object_trunc_page(dst_offset),
5751                     VM_PROT_WRITE | VM_PROT_READ,
5752                     FALSE,
5753                     FALSE,                    /* page not looked up */
5754                     &dst_prot, &dst_page, &dst_top_page,
5755                     (int *)0,
5756                     &error,
5757                     dst_map->no_zero_fill,
5758                     FALSE, &fault_info_dst);
5759                 switch (result) {
5760                 case VM_FAULT_SUCCESS:
5761                         break;
5762                 case VM_FAULT_RETRY:
5763                         goto RetryDestinationFault;
5764                 case VM_FAULT_MEMORY_SHORTAGE:
5765                         if (vm_page_wait(interruptible)) {
5766                                 goto RetryDestinationFault;
5767                         }
5768                 /* fall thru */
5769                 case VM_FAULT_INTERRUPTED:
5770                         RETURN(MACH_SEND_INTERRUPTED);
5771                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5772                         /* success but no VM page: fail the copy */
5773                         vm_object_paging_end(dst_object);
5774                         vm_object_unlock(dst_object);
5775                 /*FALLTHROUGH*/
5776                 case VM_FAULT_MEMORY_ERROR:
5777                         if (error) {
5778                                 return error;
5779                         } else {
5780                                 return KERN_MEMORY_ERROR;
5781                         }
5782                 default:
5783                         panic("vm_fault_copy: unexpected error 0x%x from "
5784                             "vm_fault_page()\n", result);
5785                 }
5786                 assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5787
5788                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5789                 old_copy_object = dst_object->copy;
5790
5791                 /*
5792                  * There exists the possiblity that the source and
5793                  * destination page are the same.  But we can't
5794                  * easily determine that now.  If they are the
5795                  * same, the call to vm_fault_page() for the
5796                  * destination page will deadlock.  To prevent this we
5797                  * wire the page so we can drop busy without having
5798                  * the page daemon steal the page.  We clean up the
5799                  * top page  but keep the paging reference on the object
5800                  * holding the dest page so it doesn't go away.
5801                  */
5802
5803                 vm_page_lockspin_queues();
5804                 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
5805                 vm_page_unlock_queues();
5806                 PAGE_WAKEUP_DONE(dst_page);
5807                 vm_object_unlock(dst_object);
5808
5809                 if (dst_top_page != VM_PAGE_NULL) {
5810                         vm_object_lock(dst_object);
5811                         VM_PAGE_FREE(dst_top_page);
5812                         vm_object_paging_end(dst_object);
5813                         vm_object_unlock(dst_object);
5814                 }
5815
5816 RetrySourceFault:;
5817
5818                 if (src_object == VM_OBJECT_NULL) {
5819                         /*
5820                          *      No source object.  We will just
5821                          *      zero-fill the page in dst_object.
5822                          */
5823                         src_page = VM_PAGE_NULL;
5824                         result_page = VM_PAGE_NULL;
5825                 } else {
5826                         vm_object_lock(src_object);
5827                         src_page = vm_page_lookup(src_object,
5828                             vm_object_trunc_page(src_offset));
5829                         if (src_page == dst_page) {
5830                                 src_prot = dst_prot;
5831                                 result_page = VM_PAGE_NULL;
5832                         } else {
5833                                 src_prot = VM_PROT_READ;
5834                                 vm_object_paging_begin(src_object);
5835
5836                                 /* cap cluster size at maximum UPL size */
5837                                 if (os_convert_overflow(amount_left, &cluster_size)) {
5838                                         cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5839                                 }
5840                                 fault_info_src.cluster_size = cluster_size;
5841
5842                                 XPR(XPR_VM_FAULT,
5843                                     "vm_fault_copy(2) -> vm_fault_page\n",
5844                                     0, 0, 0, 0, 0);
5845                                 result_page = VM_PAGE_NULL;
5846                                 result = vm_fault_page(
5847                                         src_object,
5848                                         vm_object_trunc_page(src_offset),
5849                                         VM_PROT_READ, FALSE,
5850                                         FALSE, /* page not looked up */
5851                                         &src_prot,
5852                                         &result_page, &src_top_page,
5853                                         (int *)0, &error, FALSE,
5854                                         FALSE, &fault_info_src);
5855
5856                                 switch (result) {
5857                                 case VM_FAULT_SUCCESS:
5858                                         break;
5859                                 case VM_FAULT_RETRY:
5860                                         goto RetrySourceFault;
5861                                 case VM_FAULT_MEMORY_SHORTAGE:
5862                                         if (vm_page_wait(interruptible)) {
5863                                                 goto RetrySourceFault;
5864                                         }
5865                                 /* fall thru */
5866                                 case VM_FAULT_INTERRUPTED:
5867                                         vm_fault_copy_dst_cleanup(dst_page);
5868                                         RETURN(MACH_SEND_INTERRUPTED);
5869                                 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5870                                         /* success but no VM page: fail */
5871                                         vm_object_paging_end(src_object);
5872                                         vm_object_unlock(src_object);
5873                                 /*FALLTHROUGH*/
5874                                 case VM_FAULT_MEMORY_ERROR:
5875                                         vm_fault_copy_dst_cleanup(dst_page);
5876                                         if (error) {
5877                                                 return error;
5878                                         } else {
5879                                                 return KERN_MEMORY_ERROR;
5880                                         }
5881                                 default:
5882                                         panic("vm_fault_copy(2): unexpected "
5883                                             "error 0x%x from "
5884                                             "vm_fault_page()\n", result);
5885                                 }
5886
5887                                 result_page_object = VM_PAGE_OBJECT(result_page);
5888                                 assert((src_top_page == VM_PAGE_NULL) ==
5889                                     (result_page_object == src_object));
5890                         }
5891                         assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5892                         vm_object_unlock(result_page_object);
5893                 }
5894
5895                 vm_map_lock_read(dst_map);
5896
5897                 if (!vm_map_verify(dst_map, dst_version)) {
5898                         vm_map_unlock_read(dst_map);
5899                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
5900                                 vm_fault_copy_cleanup(result_page, src_top_page);
5901                         }
5902                         vm_fault_copy_dst_cleanup(dst_page);
5903                         break;
5904                 }
5905                 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5906
5907                 vm_object_lock(dst_object);
5908
5909                 if (dst_object->copy != old_copy_object) {
5910                         vm_object_unlock(dst_object);
5911                         vm_map_unlock_read(dst_map);
5912                         if (result_page != VM_PAGE_NULL && src_page != dst_page) {
5913                                 vm_fault_copy_cleanup(result_page, src_top_page);
5914                         }
5915                         vm_fault_copy_dst_cleanup(dst_page);
5916                         break;
5917                 }
5918                 vm_object_unlock(dst_object);
5919
5920                 /*
5921                  *      Copy the page, and note that it is dirty
5922                  *      immediately.
5923                  */
5924
5925                 if (!page_aligned(src_offset) ||
5926                     !page_aligned(dst_offset) ||
5927                     !page_aligned(amount_left)) {
5928                         vm_object_offset_t      src_po,
5929                             dst_po;
5930
5931                         src_po = src_offset - vm_object_trunc_page(src_offset);
5932                         dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5933
5934                         if (dst_po > src_po) {
5935                                 part_size = PAGE_SIZE - dst_po;
5936                         } else {
5937                                 part_size = PAGE_SIZE - src_po;
5938                         }
5939                         if (part_size > (amount_left)) {
5940                                 part_size = amount_left;
5941                         }
5942
5943                         if (result_page == VM_PAGE_NULL) {
5944                                 assert((vm_offset_t) dst_po == dst_po);
5945                                 assert((vm_size_t) part_size == part_size);
5946                                 vm_page_part_zero_fill(dst_page,
5947                                     (vm_offset_t) dst_po,
5948                                     (vm_size_t) part_size);
5949                         } else {
5950                                 assert((vm_offset_t) src_po == src_po);
5951                                 assert((vm_offset_t) dst_po == dst_po);
5952                                 assert((vm_size_t) part_size == part_size);
5953                                 vm_page_part_copy(result_page,
5954                                     (vm_offset_t) src_po,
5955                                     dst_page,
5956                                     (vm_offset_t) dst_po,
5957                                     (vm_size_t)part_size);
5958                                 if (!dst_page->vmp_dirty) {
5959                                         vm_object_lock(dst_object);
5960                                         SET_PAGE_DIRTY(dst_page, TRUE);
5961                                         vm_object_unlock(dst_object);
5962                                 }
5963                         }
5964                 } else {
5965                         part_size = PAGE_SIZE;
5966
5967                         if (result_page == VM_PAGE_NULL) {
5968                                 vm_page_zero_fill(dst_page);
5969                         } else {
5970                                 vm_object_lock(result_page_object);
5971                                 vm_page_copy(result_page, dst_page);
5972                                 vm_object_unlock(result_page_object);
5973
5974                                 if (!dst_page->vmp_dirty) {
5975                                         vm_object_lock(dst_object);
5976                                         SET_PAGE_DIRTY(dst_page, TRUE);
5977                                         vm_object_unlock(dst_object);
5978                                 }
5979                         }
5980                 }
5981
5982                 /*
5983                  *      Unlock everything, and return
5984                  */
5985
5986                 vm_map_unlock_read(dst_map);
5987
5988                 if (result_page != VM_PAGE_NULL && src_page != dst_page) {
5989                         vm_fault_copy_cleanup(result_page, src_top_page);
5990                 }
5991                 vm_fault_copy_dst_cleanup(dst_page);
5992
5993                 amount_left -= part_size;
5994                 src_offset += part_size;
5995                 dst_offset += part_size;
5996         } while (amount_left > 0);
5997
5998         RETURN(KERN_SUCCESS);
5999 #undef  RETURN
6000
6001         /*NOTREACHED*/
6002 }
6003
6004 #if     VM_FAULT_CLASSIFY
6005 /*
6006  *      Temporary statistics gathering support.
6007  */
6008
6009 /*
6010  *      Statistics arrays:
6011  */
     /* Dimensions of vm_fault_stats: number of fault types x number of levels. */
6012 #define VM_FAULT_TYPES_MAX      5
6013 #define VM_FAULT_LEVEL_MAX      8
6014
     /*
      * Counters bumped by vm_fault_classify(), indexed as [type][level], where
      * "level" counts the shadow links followed before the fault was classified.
      */
6015 int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
6016
     /* Values for the first ("type") index of vm_fault_stats. */
6017 #define VM_FAULT_TYPE_ZERO_FILL 0
6018 #define VM_FAULT_TYPE_MAP_IN    1
6019 #define VM_FAULT_TYPE_PAGER     2
6020 #define VM_FAULT_TYPE_COPY      3
6021 #define VM_FAULT_TYPE_OTHER     4
6022
6023
6024 void
6025 vm_fault_classify(vm_object_t           object,
6026     vm_object_offset_t    offset,
6027     vm_prot_t             fault_type)
6028 {
6029         int             type, level = 0;
6030         vm_page_t       m;
6031
6032         while (TRUE) {
6033                 m = vm_page_lookup(object, offset);
6034                 if (m != VM_PAGE_NULL) {
6035                         if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6036                                 type = VM_FAULT_TYPE_OTHER;
6037                                 break;
6038                         }
6039                         if (((fault_type & VM_PROT_WRITE) == 0) ||
6040                             ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6041                                 type = VM_FAULT_TYPE_MAP_IN;
6042                                 break;
6043                         }
6044                         type = VM_FAULT_TYPE_COPY;
6045                         break;
6046                 } else {
6047                         if (object->pager_created) {
6048                                 type = VM_FAULT_TYPE_PAGER;
6049                                 break;
6050                         }
6051                         if (object->shadow == VM_OBJECT_NULL) {
6052                                 type = VM_FAULT_TYPE_ZERO_FILL;
6053                                 break;
6054                         }
6055
6056                         offset += object->vo_shadow_offset;
6057                         object = object->shadow;
6058                         level++;
6059                         continue;
6060                 }
6061         }
6062
6063         if (level > VM_FAULT_LEVEL_MAX) {
6064                 level = VM_FAULT_LEVEL_MAX;
6065         }
6066
6067         vm_fault_stats[type][level] += 1;
6068
6069         return;
6070 }
6071
6072 /* cleanup routine to call from debugger */
6073
6074 void
6075 vm_fault_classify_init(void)
6076 {
6077         int type, level;
6078
6079         for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
6080                 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
6081                         vm_fault_stats[type][level] = 0;
6082                 }
6083         }
6084
6085         return;
6086 }
6087 #endif  /* VM_FAULT_CLASSIFY */
6088
6089 vm_offset_t
6090 kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6091 {
6092         vm_map_entry_t  entry;
6093         vm_object_t     object;
6094         vm_offset_t     object_offset;
6095         vm_page_t       m;
6096         int             compressor_external_state, compressed_count_delta;
6097         int             compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6098         int             my_fault_type = VM_PROT_READ;
6099         kern_return_t   kr;
6100
6101         if (not_in_kdp) {
6102                 panic("kdp_lightweight_fault called from outside of debugger context");
6103         }
6104
6105         assert(map != VM_MAP_NULL);
6106
6107         assert((cur_target_addr & PAGE_MASK) == 0);
6108         if ((cur_target_addr & PAGE_MASK) != 0) {
6109                 return 0;
6110         }
6111
6112         if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
6113                 return 0;
6114         }
6115
6116         if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
6117                 return 0;
6118         }
6119
6120         if (entry->is_sub_map) {
6121                 return 0;
6122         }
6123
6124         object = VME_OBJECT(entry);
6125         if (object == VM_OBJECT_NULL) {
6126                 return 0;
6127         }
6128
6129         object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
6130
6131         while (TRUE) {
6132                 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
6133                         return 0;
6134                 }
6135
6136                 if (object->pager_created && (object->paging_in_progress ||
6137                     object->activity_in_progress)) {
6138                         return 0;
6139                 }
6140
6141                 m = kdp_vm_page_lookup(object, object_offset);
6142
6143                 if (m != VM_PAGE_NULL) {
6144                         if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
6145                                 return 0;
6146                         }
6147
6148                         if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
6149                             m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
6150                                 return 0;
6151                         }
6152
6153                         assert(!m->vmp_private);
6154                         if (m->vmp_private) {
6155                                 return 0;
6156                         }
6157
6158                         assert(!m->vmp_fictitious);
6159                         if (m->vmp_fictitious) {
6160                                 return 0;
6161                         }
6162
6163                         assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6164                         if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6165                                 return 0;
6166                         }
6167
6168                         return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
6169                 }
6170
6171                 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
6172
6173                 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
6174                         if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
6175                                 kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset),
6176                                     kdp_compressor_decompressed_page_ppnum, &my_fault_type,
6177                                     compressor_flags, &compressed_count_delta);
6178                                 if (kr == KERN_SUCCESS) {
6179                                         return kdp_compressor_decompressed_page_paddr;
6180                                 } else {
6181                                         return 0;
6182                                 }
6183                         }
6184                 }
6185
6186                 if (object->shadow == VM_OBJECT_NULL) {
6187                         return 0;
6188                 }
6189
6190                 object_offset += object->vo_shadow_offset;
6191                 object = object->shadow;
6192         }
6193 }
6194
6195 /*
6196  * vm_page_validate_cs_fast():
6197  * Performs a few quick checks to determine if the page's code signature
6198  * really needs to be fully validated.  It could:
6199  *      1. have been modified (i.e. automatically tainted),
6200  *      2. have already been validated,
6201  *      3. have already been found to be tainted,
6202  *      4. no longer have a backing store.
6203  * Returns FALSE if the page needs to be fully validated.
6204  */
6205 static boolean_t
6206 vm_page_validate_cs_fast(
6207         vm_page_t       page)
6208 {
6209         vm_object_t     object;
6210
6211         object = VM_PAGE_OBJECT(page);
6212         vm_object_lock_assert_held(object);
6213
6214         if (page->vmp_wpmapped && !page->vmp_cs_tainted) {
6215                 /*
6216                  * This page was mapped for "write" access sometime in the
6217                  * past and could still be modifiable in the future.
6218                  * Consider it tainted.
6219                  * [ If the page was already found to be "tainted", no
6220                  * need to re-validate. ]
6221                  */
6222                 vm_object_lock_assert_exclusive(object);
6223                 page->vmp_cs_validated = TRUE;
6224                 page->vmp_cs_tainted = TRUE;
6225                 if (cs_debug) {
6226                         printf("CODESIGNING: %s: "
6227                             "page %p obj %p off 0x%llx "
6228                             "was modified\n",
6229                             __FUNCTION__,
6230                             page, object, page->vmp_offset);
6231                 }
6232                 vm_cs_validated_dirtied++;
6233         }
6234
6235         if (page->vmp_cs_validated || page->vmp_cs_tainted) {
6236                 return TRUE;
6237         }
6238         vm_object_lock_assert_exclusive(object);
6239
6240 #if CHECK_CS_VALIDATION_BITMAP
6241         kern_return_t kr;
6242
6243         kr = vnode_pager_cs_check_validation_bitmap(
6244                 object->pager,
6245                 page->vmp_offset + object->paging_offset,
6246                 CS_BITMAP_CHECK);
6247         if (kr == KERN_SUCCESS) {
6248                 page->vmp_cs_validated = TRUE;
6249                 page->vmp_cs_tainted = FALSE;
6250                 vm_cs_bitmap_validated++;
6251                 return TRUE;
6252         }
6253 #endif /* CHECK_CS_VALIDATION_BITMAP */
6254
6255         if (!object->alive || object->terminating || object->pager == NULL) {
6256                 /*
6257                  * The object is terminating and we don't have its pager
6258                  * so we can't validate the data...
6259                  */
6260                 return TRUE;
6261         }
6262
6263         /* we need to really validate this page */
6264         vm_object_lock_assert_exclusive(object);
6265         return FALSE;
6266 }
6267
6268 void
6269 vm_page_validate_cs_mapped_slow(
6270         vm_page_t       page,
6271         const void      *kaddr)
6272 {
6273         vm_object_t             object;
6274         memory_object_offset_t  mo_offset;
6275         memory_object_t         pager;
6276         struct vnode            *vnode;
6277         boolean_t               validated;
6278         unsigned                tainted;
6279
6280         assert(page->vmp_busy);
6281         object = VM_PAGE_OBJECT(page);
6282         vm_object_lock_assert_exclusive(object);
6283
6284         vm_cs_validates++;
6285
6286         /*
6287          * Since we get here to validate a page that was brought in by
6288          * the pager, we know that this pager is all setup and ready
6289          * by now.
6290          */
6291         assert(object->code_signed);
6292         assert(!object->internal);
6293         assert(object->pager != NULL);
6294         assert(object->pager_ready);
6295
6296         pager = object->pager;
6297         assert(object->paging_in_progress);
6298         vnode = vnode_pager_lookup_vnode(pager);
6299         mo_offset = page->vmp_offset + object->paging_offset;
6300
6301         /* verify the SHA1 hash for this page */
6302         tainted = 0;
6303         validated = cs_validate_range(vnode,
6304             pager,
6305             mo_offset,
6306             (const void *)((const char *)kaddr),
6307             PAGE_SIZE_64,
6308             &tainted);
6309
6310         if (tainted & CS_VALIDATE_TAINTED) {
6311                 page->vmp_cs_tainted = TRUE;
6312         }
6313         if (tainted & CS_VALIDATE_NX) {
6314                 page->vmp_cs_nx = TRUE;
6315         }
6316         if (validated) {
6317                 page->vmp_cs_validated = TRUE;
6318         }
6319
6320 #if CHECK_CS_VALIDATION_BITMAP
6321         if (page->vmp_cs_validated && !page->vmp_cs_tainted) {
6322                 vnode_pager_cs_check_validation_bitmap(object->pager,
6323                     mo_offset,
6324                     CS_BITMAP_SET);
6325         }
6326 #endif /* CHECK_CS_VALIDATION_BITMAP */
6327 }
6328
6329 void
6330 vm_page_validate_cs_mapped(
6331         vm_page_t       page,
6332         const void      *kaddr)
6333 {
6334         if (!vm_page_validate_cs_fast(page)) {
6335                 vm_page_validate_cs_mapped_slow(page, kaddr);
6336         }
6337 }
6338
6339 void
6340 vm_page_validate_cs(
6341         vm_page_t       page)
6342 {
6343         vm_object_t             object;
6344         vm_object_offset_t      offset;
6345         vm_map_offset_t         koffset;
6346         vm_map_size_t           ksize;
6347         vm_offset_t             kaddr;
6348         kern_return_t           kr;
6349         boolean_t               busy_page;
6350         boolean_t               need_unmap;
6351
6352         object = VM_PAGE_OBJECT(page);
6353         vm_object_lock_assert_held(object);
6354
6355         if (vm_page_validate_cs_fast(page)) {
6356                 return;
6357         }
6358         vm_object_lock_assert_exclusive(object);
6359
6360         assert(object->code_signed);
6361         offset = page->vmp_offset;
6362
6363         busy_page = page->vmp_busy;
6364         if (!busy_page) {
6365                 /* keep page busy while we map (and unlock) the VM object */
6366                 page->vmp_busy = TRUE;
6367         }
6368
6369         /*
6370          * Take a paging reference on the VM object
6371          * to protect it from collapse or bypass,
6372          * and keep it from disappearing too.
6373          */
6374         vm_object_paging_begin(object);
6375
6376         /* map the page in the kernel address space */
6377         ksize = PAGE_SIZE_64;
6378         koffset = 0;
6379         need_unmap = FALSE;
6380         kr = vm_paging_map_object(page,
6381             object,
6382             offset,
6383             VM_PROT_READ,
6384             FALSE,                       /* can't unlock object ! */
6385             &ksize,
6386             &koffset,
6387             &need_unmap);
6388         if (kr != KERN_SUCCESS) {
6389                 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
6390         }
6391         kaddr = CAST_DOWN(vm_offset_t, koffset);
6392
6393         /* validate the mapped page */
6394         vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
6395
6396         assert(page->vmp_busy);
6397         assert(object == VM_PAGE_OBJECT(page));
6398         vm_object_lock_assert_exclusive(object);
6399
6400         if (!busy_page) {
6401                 PAGE_WAKEUP_DONE(page);
6402         }
6403         if (need_unmap) {
6404                 /* unmap the map from the kernel address space */
6405                 vm_paging_unmap_object(object, koffset, koffset + ksize);
6406                 koffset = 0;
6407                 ksize = 0;
6408                 kaddr = 0;
6409         }
6410         vm_object_paging_end(object);
6411 }
6412
6413 void
6414 vm_page_validate_cs_mapped_chunk(
6415         vm_page_t       page,
6416         const void      *kaddr,
6417         vm_offset_t     chunk_offset,
6418         vm_size_t       chunk_size,
6419         boolean_t       *validated_p,
6420         unsigned        *tainted_p)
6421 {
6422         vm_object_t             object;
6423         vm_object_offset_t      offset, offset_in_page;
6424         memory_object_t         pager;
6425         struct vnode            *vnode;
6426         boolean_t               validated;
6427         unsigned                tainted;
6428
6429         *validated_p = FALSE;
6430         *tainted_p = 0;
6431
6432         assert(page->vmp_busy);
6433         object = VM_PAGE_OBJECT(page);
6434         vm_object_lock_assert_exclusive(object);
6435
6436         assert(object->code_signed);
6437         offset = page->vmp_offset;
6438
6439         if (!object->alive || object->terminating || object->pager == NULL) {
6440                 /*
6441                  * The object is terminating and we don't have its pager
6442                  * so we can't validate the data...
6443                  */
6444                 return;
6445         }
6446         /*
6447          * Since we get here to validate a page that was brought in by
6448          * the pager, we know that this pager is all setup and ready
6449          * by now.
6450          */
6451         assert(!object->internal);
6452         assert(object->pager != NULL);
6453         assert(object->pager_ready);
6454
6455         pager = object->pager;
6456         assert(object->paging_in_progress);
6457         vnode = vnode_pager_lookup_vnode(pager);
6458
6459         /* verify the signature for this chunk */
6460         offset_in_page = chunk_offset;
6461         assert(offset_in_page < PAGE_SIZE);
6462
6463         tainted = 0;
6464         validated = cs_validate_range(vnode,
6465             pager,
6466             (object->paging_offset +
6467             offset +
6468             offset_in_page),
6469             (const void *)((const char *)kaddr
6470             + offset_in_page),
6471             chunk_size,
6472             &tainted);
6473         if (validated) {
6474                 *validated_p = TRUE;
6475         }
6476         if (tainted) {
6477                 *tainted_p = tainted;
6478         }
6479 }
6480
6481 static void
6482 vm_rtfrecord_lock(void)
6483 {
6484         lck_spin_lock(&vm_rtfr_slock);
6485 }
6486
6487 static void
6488 vm_rtfrecord_unlock(void)
6489 {
6490         lck_spin_unlock(&vm_rtfr_slock);
6491 }
6492
6493 unsigned int
6494 vmrtfaultinfo_bufsz(void)
6495 {
6496         return vmrtf_num_records * sizeof(vm_rtfault_record_t);
6497 }
6498
6499 #include <kern/backtrace.h>
6500
6501 static void
6502 vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
6503 {
6504         uint64_t fend = mach_continuous_time();
6505
6506         uint64_t cfpc = 0;
6507         uint64_t ctid = cthread->thread_id;
6508         uint64_t cupid = get_current_unique_pid();
6509
6510         uintptr_t bpc = 0;
6511         uint32_t bfrs = 0;
6512         bool u64 = false;
6513
6514         /* Capture a single-frame backtrace; this extracts just the program
6515          * counter at the point of the fault into "bpc", and should perform no
6516          * further user stack traversals, thus avoiding copyin()s and further
6517          * faults.
6518          */
6519         int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64);
6520
6521         if ((btr == 0) && (bfrs > 0)) {
6522                 cfpc = bpc;
6523         }
6524
6525         assert((fstart != 0) && fend >= fstart);
6526         vm_rtfrecord_lock();
6527         assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
6528
6529         vmrtfrs.vmrtf_total++;
6530         vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
6531
6532         cvmr->rtfabstime = fstart;
6533         cvmr->rtfduration = fend - fstart;
6534         cvmr->rtfaddr = fault_vaddr;
6535         cvmr->rtfpc = cfpc;
6536         cvmr->rtftype = type_of_fault;
6537         cvmr->rtfupid = cupid;
6538         cvmr->rtftid = ctid;
6539
6540         if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
6541                 vmrtfrs.vmrtfr_curi = 0;
6542         }
6543
6544         vm_rtfrecord_unlock();
6545 }
6546
6547 int
6548 vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv)
6549 {
6550         vm_rtfault_record_t *cvmrd = vrecords;
6551         size_t residue = vrecordsz;
6552         int numextracted = 0;
6553         boolean_t early_exit = FALSE;
6554
6555         vm_rtfrecord_lock();
6556
6557         for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
6558                 if (residue < sizeof(vm_rtfault_record_t)) {
6559                         early_exit = TRUE;
6560                         break;
6561                 }
6562
6563                 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
6564 #if     DEVELOPMENT || DEBUG
6565                         if (isroot == FALSE) {
6566                                 continue;
6567                         }
6568 #else
6569                         continue;
6570 #endif /* DEVDEBUG */
6571                 }
6572
6573                 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
6574                 cvmrd++;
6575                 residue -= sizeof(vm_rtfault_record_t);
6576                 numextracted++;
6577         }
6578
6579         vm_rtfrecord_unlock();
6580
6581         *vmrtfrv = numextracted;
6582         return early_exit;
6583 }